Initial commit for cesu8 crate. am: a7e4bc004f am: e0d2931950 am: 82f44c6e92 am: 9a771a31e3

Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/cesu8/+/1804098 Change-Id: I44bbede8e60e4e5c6f2b525c3f5013c8b77f5a9b
author: Ivan Lozano <ivanlozano@google.com> 2021-08-20 23:37:39 +0000
committer: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> 2021-08-20 23:37:39 +0000
commit: 5e39cdae8df55b50c2da9d77a1a6da3bbbc7e9a7 (patch)
tree: 90d87dcb0faf6c6e2805cab833d9f173ac4ed786
parent: d22939409e8930f2426b1b6fe55634859e66104b (diff)
parent: 9a771a31e3b075cae3bae31d5fc4ba0b6e114667 (diff)
download: cesu8-5e39cdae8df55b50c2da9d77a1a6da3bbbc7e9a7.tar.gz
9 files changed, 983 insertions, 0 deletions
diff --git a/COPYRIGHT-RUST.txt b/COPYRIGHT-RUST.txt
new file mode 100644
index 0000000..52039ea
--- /dev/null
+++ b/COPYRIGHT-RUST.txt
@@ -0,0 +1,422 @@
+Short version for non-lawyers:
+
+The Rust Project is dual-licensed under Apache 2.0 and MIT
+terms.
+
+
+Longer version:
+
+The Rust Project is copyright 2014, The Rust Project
+Developers (given in the file AUTHORS.txt).
+
+Licensed under the Apache License, Version 2.0
+<LICENSE-APACHE or
+http://www.apache.org/licenses/LICENSE-2.0> or the MIT
+license <LICENSE-MIT or http://opensource.org/licenses/MIT>,
+at your option. All files in the project carrying such
+notice may not be copied, modified, or distributed except
+according to those terms.
+
+
+The Rust Project includes packages written by third parties.
+The following third party packages are included, and carry
+their own copyright notices and license terms:
+
+* Two header files that are part of the Valgrind
+  package. These files are found at src/rt/vg/valgrind.h and
+  src/rt/vg/memcheck.h, within this distribution. These files
+  are redistributed under the following terms, as noted in
+  them:
+
+  for src/rt/vg/valgrind.h:
+
+    This file is part of Valgrind, a dynamic binary
+    instrumentation framework.
+
+    Copyright (C) 2000-2010 Julian Seward.  All rights
+    reserved.
+
+    Redistribution and use in source and binary forms, with
+    or without modification, are permitted provided that the
+    following conditions are met:
+
+    1. Redistributions of source code must retain the above
+       copyright notice, this list of conditions and the
+       following disclaimer.
+
+    2. The origin of this software must not be
+       misrepresented; you must not claim that you wrote the
+       original software.  If you use this software in a
+       product, an acknowledgment in the product
+       documentation would be appreciated but is not
+       required.
+
+    3. Altered source versions must be plainly marked as
+       such, and must not be misrepresented as being the
+       original software.
+
+    4. The name of the author may not be used to endorse or
+       promote products derived from this software without
+       specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+    AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN
+    NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+    INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+    USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+    USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
+    OF SUCH DAMAGE.
+
+  for src/rt/vg/memcheck.h:
+
+    This file is part of MemCheck, a heavyweight Valgrind
+    tool for detecting memory errors.
+
+    Copyright (C) 2000-2010 Julian Seward.  All rights
+    reserved.
+
+    Redistribution and use in source and binary forms, with
+    or without modification, are permitted provided that the
+    following conditions are met:
+
+    1. Redistributions of source code must retain the above
+       copyright notice, this list of conditions and the
+       following disclaimer.
+
+    2. The origin of this software must not be
+       misrepresented; you must not claim that you wrote the
+       original software.  If you use this software in a
+       product, an acknowledgment in the product
+       documentation would be appreciated but is not
+       required.
+
+    3. Altered source versions must be plainly marked as
+       such, and must not be misrepresented as being the
+       original software.
+
+    4. The name of the author may not be used to endorse or
+       promote products derived from this software without
+       specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+    AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN
+    NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+    INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+    USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+    USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
+    OF SUCH DAMAGE.
+
+* The auxiliary file src/etc/pkg/modpath.iss contains a
+  library routine compiled, by Inno Setup, into the Windows
+  installer binary. This file is licensed under the LGPL,
+  version 3, but, in our legal interpretation, this does not
+  affect the aggregate "collected work" license of the Rust
+  distribution (MIT/ASL2) nor any other components of it. We
+  believe that the terms governing distribution of the
+  binary Windows installer built from modpath.iss are
+  therefore LGPL, but not the terms governing distribution
+  of any of the files installed by such an installer (such
+  as the Rust compiler or runtime libraries themselves).
+
+* The src/rt/miniz.c file, carrying an implementation of
+  RFC1950/RFC1951 DEFLATE, by Rich Geldreich
+  <richgel99@gmail.com>. All uses of this file are
+  permitted by the embedded "unlicense" notice
+  (effectively: public domain with warranty disclaimer).
+
+* LLVM. Code for this package is found in src/llvm.
+
+    Copyright (c) 2003-2013 University of Illinois at
+    Urbana-Champaign.  All rights reserved.
+
+    Developed by:
+
+        LLVM Team
+
+        University of Illinois at Urbana-Champaign
+
+        http://llvm.org
+
+    Permission is hereby granted, free of charge, to any
+    person obtaining a copy of this software and associated
+    documentation files (the "Software"), to deal with the
+    Software without restriction, including without
+    limitation the rights to use, copy, modify, merge,
+    publish, distribute, sublicense, and/or sell copies of
+    the Software, and to permit persons to whom the Software
+    is furnished to do so, subject to the following
+    conditions:
+
+        * Redistributions of source code must retain the
+          above copyright notice, this list of conditions
+          and the following disclaimers.
+
+        * Redistributions in binary form must reproduce the
+          above copyright notice, this list of conditions
+          and the following disclaimers in the documentation
+          and/or other materials provided with the
+          distribution.
+
+        * Neither the names of the LLVM Team, University of
+          Illinois at Urbana-Champaign, nor the names of its
+          contributors may be used to endorse or promote
+          products derived from this Software without
+          specific prior written permission.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+    ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+    TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+    PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
+    SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE
+    FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
+    OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+    OTHER DEALINGS WITH THE SOFTWARE.
+
+* Additional libraries included in LLVM carry separate
+  BSD-compatible licenses. See src/llvm/LICENSE.txt for
+  details.
+
+* compiler-rt, in src/compiler-rt is dual licensed under
+  LLVM's license and MIT:
+
+    Copyright (c) 2009-2014 by the contributors listed in
+    CREDITS.TXT
+
+    All rights reserved.
+
+    Developed by:
+
+        LLVM Team
+
+        University of Illinois at Urbana-Champaign
+
+        http://llvm.org
+
+    Permission is hereby granted, free of charge, to any
+    person obtaining a copy of this software and associated
+    documentation files (the "Software"), to deal with the
+    Software without restriction, including without
+    limitation the rights to use, copy, modify, merge,
+    publish, distribute, sublicense, and/or sell copies of
+    the Software, and to permit persons to whom the Software
+    is furnished to do so, subject to the following
+    conditions:
+
+        * Redistributions of source code must retain the
+          above copyright notice, this list of conditions
+          and the following disclaimers.
+
+        * Redistributions in binary form must reproduce the
+          above copyright notice, this list of conditions
+          and the following disclaimers in the documentation
+          and/or other materials provided with the
+          distribution.
+
+        * Neither the names of the LLVM Team, University of
+          Illinois at Urbana-Champaign, nor the names of its
+          contributors may be used to endorse or promote
+          products derived from this Software without
+          specific prior written permission.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+    ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+    TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+    PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
+    SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE
+    FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
+    OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+    OTHER DEALINGS WITH THE SOFTWARE.
+
+    ========================================================
+
+    Copyright (c) 2009-2014 by the contributors listed in
+    CREDITS.TXT
+
+    Permission is hereby granted, free of charge, to any
+    person obtaining a copy of this software and associated
+    documentation files (the "Software"), to deal in the
+    Software without restriction, including without
+    limitation the rights to use, copy, modify, merge,
+    publish, distribute, sublicense, and/or sell copies of
+    the Software, and to permit persons to whom the Software
+    is furnished to do so, subject to the following
+    conditions:
+
+    The above copyright notice and this permission notice
+    shall be included in all copies or substantial portions
+    of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+    ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+    TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+    PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+    SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+    CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+    OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+    IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+    DEALINGS IN THE SOFTWARE.
+
+* Portions of the FFI code for interacting with the native ABI
+  is derived from the Clay programming language, which carries
+  the following license.
+
+    Copyright (C) 2008-2010 Tachyon Technologies.
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with
+    or without modification, are permitted provided that the
+    following conditions are met:
+
+    1. Redistributions of source code must retain the above
+       copyright notice, this list of conditions and the
+       following disclaimer.
+
+    2. Redistributions in binary form must reproduce the
+       above copyright notice, this list of conditions and
+       the following disclaimer in the documentation and/or
+       other materials provided with the distribution.
+
+    THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR
+    IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+    DEVELOPERS AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+    INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+    USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+    USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
+    OF SUCH DAMAGE.
+
+* Hoedown, the markdown parser, under src/rt/hoedown, is
+  licensed as follows.
+
+    Copyright (c) 2008, Natacha Porté
+    Copyright (c) 2011, Vicent Martí
+    Copyright (c) 2013, Devin Torres and the Hoedown authors
+
+    Permission to use, copy, modify, and distribute this
+    software for any purpose with or without fee is hereby
+    granted, provided that the above copyright notice and
+    this permission notice appear in all copies.
+
+    THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR
+    DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE
+    INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+    FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+    SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR
+    ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
+    OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+    OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+    CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+* libbacktrace, under src/libbacktrace:
+
+    Copyright (C) 2012-2014 Free Software Foundation, Inc.
+    Written by Ian Lance Taylor, Google.
+
+    Redistribution and use in source and binary forms, with
+    or without modification, are permitted provided that the
+    following conditions are met:
+
+        (1) Redistributions of source code must retain the
+        above copyright notice, this list of conditions and
+        the following disclaimer.
+
+        (2) Redistributions in binary form must reproduce
+        the above copyright notice, this list of conditions
+        and the following disclaimer in the documentation
+        and/or other materials provided with the
+        distribution.
+
+        (3) The name of the author may not be used to
+        endorse or promote products derived from this
+        software without specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+    AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
+    NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+    INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+    USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+    USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
+    OF SUCH DAMAGE.  */
+
+* jemalloc, under src/jemalloc:
+
+    Copyright (C) 2002-2014 Jason Evans
+    <jasone@canonware.com>. All rights reserved.
+    Copyright (C) 2007-2012 Mozilla Foundation.
+    All rights reserved.
+    Copyright (C) 2009-2014 Facebook, Inc.
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are met:
+    1. Redistributions of source code must retain the above copyright notice(s),
+       this list of conditions and the following disclaimer.
+    2. Redistributions in binary form must reproduce the above copyright notice(s),
+       this list of conditions and the following disclaimer in the documentation
+       and/or other materials provided with the distribution.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S)
+    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+    DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S)
+    BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+    HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+    IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+    USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
+    OF SUCH DAMAGE.
+
+* Additional copyright may be retained by contributors other
+  than Mozilla, the Rust Project Developers, or the parties
+  enumerated in this file. Such copyright can be determined
+  on a case-by-case basis by examining the author of each
+  portion of a file in the revision-control commit records
+  of the project, or by consulting representative comments
+  claiming copyright ownership for a file.
+
+  For example, the text:
+
+      "Copyright (c) 2011 Google Inc."
+
+  appears in some files, and these files thereby denote
+  that their author and copyright-holder is Google Inc.
+
+  In all such cases, the absence of explicit licensing text
+  indicates that the contributor chose to license their work
+  for distribution under identical terms to those Mozilla
+  has chosen for the collective work, enumerated at the top
+  of this file. The only difference is the retention of
+  copyright itself, held by the contributor.
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..0d8bd9a
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+
+name = "cesu8"
+version = "1.1.0"
+authors = ["Eric Kidd <git@randomhacks.net>"]
+
+description = "Convert to and from CESU-8 encoding (similar to UTF-8)"
+license = "Apache-2.0/MIT"
+readme = "README.md"
+
+repository = "https://github.com/emk/cesu8-rs"
+documentation = "http://emk.github.io/cesu8-rs/cesu8/index.html"
+
+[features]
+# Allow access to unstable features when being built with a nightly compiler,
+# to keep travis-cargo happy and enable access to benchmarks if we want them.
+unstable = []
diff --git a/LICENSE b/LICENSE
new file mode 120000
index 0000000..004484b
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1 @@
+COPYRIGHT-RUST.txt
+\ No newline at end of file
diff --git a/METADATA b/METADATA
new file mode 100644
index 0000000..ae7ceba
--- /dev/null
+++ b/METADATA
@@ -0,0 +1,19 @@
+name: "cesu8"
+description: "Convert to and from CESU-8 encoding (similar to UTF-8)"
+third_party {
+  url {
+    type: HOMEPAGE
+    value: "https://crates.io/crates/cesu8"
+  }
+  url {
+    type: ARCHIVE
+    value: "https://static.crates.io/crates/cesu8/cesu8-1.1.0.crate"
+  }
+  version: "1.1.0"
+  license_type: NOTICE
+  last_upgrade_date {
+    year: 2021
+    month: 7
+    day: 30
+  }
+}
diff --git a/MODULE_LICENSE_APACHE2 b/MODULE_LICENSE_APACHE2
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/MODULE_LICENSE_APACHE2
diff --git a/OWNERS b/OWNERS
new file mode 100644
index 0000000..45dc4dd
--- /dev/null
+++ b/OWNERS
@@ -0,0 +1 @@
+include platform/prebuilts/rust:master:/OWNERS
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..161f865
--- /dev/null
+++ b/README.md
@@ -0,0 +1,33 @@
+# CESU-8 encoder/decoder for Rust
+
+[![Build Status](https://travis-ci.org/emk/cesu8-rs.svg)](https://travis-ci.org/emk/cesu8-rs) [![Latest version](https://img.shields.io/crates/v/cesu8.svg)](https://crates.io/crates/cesu8) [![License](https://img.shields.io/crates/l/cesu8.svg)](https://crates.io/crates/cesu8)
+
+[Documentation][apidoc].
+
+[apidoc]: http://emk.github.io/cesu8-rs/cesu8/index.html
+
+Convert between ordinary UTF-8 and [CESU-8][] encodings.
+
+CESU-8 encodes characters outside the Basic Multilingual Plane as two
+UTF-16 surrogate characters, which are then further re-encoded as invalid,
+3-byte UTF-8 characters.  This means that 4-byte UTF-8 sequences become
+6-byte CESU-8 sequences.
+
+**Note that CESU-8 is only intended for internal use within tightly-coupled
+systems, and not for data interchange.**
+
+This encoding is sometimes needed when working with Java, Oracle or MySQL,
+and when trying to store emoji, hieroglyphs, or other characters on the
+Supplementary Multilingual Plane or the Supplementary Ideographic Plane.
+
+[CESU-8]: http://www.unicode.org/reports/tr26/tr26-2.html
+
+## License
+
+Some of this code is adapted from Rust's [`src/libcore/str.rs` file][str.rs].
+This code is covered by COPYRIGHT-RUST.txt and copyright by The Rust Project
+Developers and individual Rust contributors, as described in that file.
+
+The new code in this project is distributed under the same terms.
+
+[str.rs]: https://github.com/rust-lang/rust/blob/master/src/libcore/str.rs
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..7fac5ff
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,453 @@
+// Copyright 2012-2014 The Rust Project Developers and Eric Kidd.  See the
+// COPYRIGHT-RUST.txt file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed except
+// according to those terms.
+
+
+//! A simple library implementing the [CESU-8 compatibility encoding
+//! scheme](http://www.unicode.org/reports/tr26/tr26-2.html).  This is a
+//! non-standard variant of UTF-8 that is used internally by some systems
+//! that need to represent UTF-16 data as 8-bit characters.  Yes, this is
+//! ugly.
+//!
+//! Use of this encoding is discouraged by the Unicode Consortium.  It's OK
+//! for working with existing internal APIs, but it should not be used for
+//! transmitting or storing data.
+//!
+//! ```
+//! use std::borrow::Cow;
+//! use cesu8::{from_cesu8, to_cesu8};
+//!
+//! // 16-bit Unicode characters are the same in UTF-8 and CESU-8.
+//! assert_eq!(Cow::Borrowed("aé日".as_bytes()),
+//!            to_cesu8("aé日"));
+//! assert_eq!(Cow::Borrowed("aé日"),
+//!            from_cesu8("aé日".as_bytes()).unwrap());
+//!
+//! // This string is CESU-8 data containing a 6-byte surrogate pair,
+//! // which decodes to a 4-byte UTF-8 string.
+//! let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
+//! assert_eq!(Cow::Borrowed("\u{10401}"),
+//!            from_cesu8(data).unwrap());
+//! ```
+//!
+//! ### A note about security
+//!
+//! As a general rule, this library is intended to fail on malformed or
+//! unexpected input.  CESU-8 is supposed to be an internal-only format,
+//! and if we're seeing malformed data, we assume that it's either a bug in
+//! somebody's code, or an attacker is trying to improperly encode data to
+//! evade security checks.
+//!
+//! If you have a use case for lossy conversion to UTF-8, or conversion
+//! from mixed UTF-8/CESU-8 data, please feel free to submit a pull request
+//! for `from_cesu8_lossy_permissive` with appropriate behavior.
+//!
+//! ### Java and U+0000, and other variants
+//!
+//! Java uses the CESU-8 encoding as described above, but with one
+//! difference: The null character U+0000 is represented as an overlong
+//! UTF-8 sequence `C0 80`. This is supported by the `from_java_cesu8` and
+//! `to_java_cesu8` methods.
+//!
+//! ### Surrogate pairs and UTF-8
+//!
+//! The UTF-16 encoding uses "surrogate pairs" to represent Unicode code
+//! points in the range from U+10000 to U+10FFFF.  Each half of a pair is a
+//! 16-bit number in the range 0xD800 to 0xDFFF.
+//!
+//! * 0xD800 to 0xDBFF: First half of surrogate pair.  When encoded as
+//!   CESU-8, these become **1110**1101 **10**100000 **10**000000 to
+//!   **1110**1101 **10**101111 **10**111111.
+//!
+//! * 0xDC00 to 0xDFFF: Second half of surrogate pair.  These become
+//!   **1110**1101 **10**110000 **10**000000 to
+//!   **1110**1101 **10**111111 **10**111111.
+//!
+//! Wikipedia [explains](http://en.wikipedia.org/wiki/UTF-16) the
+//! code point to UTF-16 conversion process:
+//!
+//! > Consider the encoding of U+10437 (𐐷):
+//! >
+//! > * Subtract 0x10000 from 0x10437. The result is 0x00437, 0000 0000 0100
+//! >   0011 0111.
+//! > * Split this into the high 10-bit value and the low 10-bit value:
+//! >   0000000001 and 0000110111.
+//! > * Add 0xD800 to the high value to form the high surrogate: 0xD800 +
+//! >   0x0001 = 0xD801.
+//! > * Add 0xDC00 to the low value to form the low surrogate: 0xDC00 +
+//! >   0x0037 = 0xDC37.
+
+#![warn(missing_docs)]
+
+
+use std::borrow::Cow;
+use std::error::Error;
+use std::fmt;
+use std::result::Result;
+use std::slice;
+use std::str::{from_utf8, from_utf8_unchecked};
+use unicode::utf8_char_width;
+
+mod unicode;
+
+/// Mask of the value bits of a continuation byte.
+const CONT_MASK: u8 = 0b0011_1111u8;
+/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte.
+const TAG_CONT_U8: u8 = 0b1000_0000u8;
+
+/// The CESU-8 data could not be decoded as valid UTF-8 data.
+#[derive(Clone, Copy, Debug)]
+pub struct Cesu8DecodingError;
+
+impl Error for Cesu8DecodingError {
+    fn description(&self) -> &str { "decoding error" }
+    fn cause(&self) -> Option<&Error> { None }
+}
+
+impl fmt::Display for Cesu8DecodingError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "could not convert CESU-8 data to UTF-8")
+    }
+}
+
+/// Which variant of the encoding are we working with?
+#[derive(PartialEq, Eq)]
+enum Variant {
+    /// Regular CESU-8, with '\0' represented by itself.
+    Standard,
+    /// This is technically Java's "Modified UTF-8", which is like CESU-8
+    /// except that U+0000 is written as the overlong two-byte sequence
+    /// `C0 80` rather than as a single zero byte.
+    Java,
+}
+
+/// Convert CESU-8 data to a Rust string, re-encoding only if necessary.
+/// Returns an error if the data cannot be represented as valid UTF-8.
+///
+/// ```
+/// use std::borrow::Cow;
+/// use cesu8::from_cesu8;
+///
+/// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
+/// // and we can convert it without allocating memory.
+/// assert_eq!(Cow::Borrowed("aé日"),
+///            from_cesu8("aé日".as_bytes()).unwrap());
+///
+/// // This string is CESU-8 data containing a 6-byte surrogate pair,
+/// // which becomes a 4-byte UTF-8 string.
+/// let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
+/// assert_eq!(Cow::Borrowed("\u{10401}"),
+///            from_cesu8(data).unwrap());
+/// ```
+pub fn from_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError> {
+    from_cesu8_internal(bytes, Variant::Standard)
+}
+
+/// Convert Java's modified UTF-8 data to a Rust string, re-encoding only if
+/// necessary. Returns an error if the data cannot be represented as valid
+/// UTF-8.
+///
+/// ```
+/// use std::borrow::Cow;
+/// use cesu8::from_java_cesu8;
+///
+/// // This string is valid as UTF-8 or modified UTF-8, so it doesn't change,
+/// // and we can convert it without allocating memory.
+/// assert_eq!(Cow::Borrowed("aé日"),
+///            from_java_cesu8("aé日".as_bytes()).unwrap());
+///
+/// // This string is modified UTF-8 data containing a 6-byte surrogate pair,
+/// // which becomes a 4-byte UTF-8 string.
+/// let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
+/// assert_eq!(Cow::Borrowed("\u{10401}"),
+///            from_java_cesu8(data).unwrap());
+///
+/// // This string is modified UTF-8 data containing null code-points.
+/// let data = &[0xC0, 0x80, 0xC0, 0x80];
+/// assert_eq!(Cow::Borrowed("\0\0"),
+///            from_java_cesu8(data).unwrap());
+/// ```
+pub fn from_java_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError> {
+    from_cesu8_internal(bytes, Variant::Java)
+}
+
+/// Do the actual work of decoding.
+fn from_cesu8_internal(bytes: &[u8], variant: Variant) ->
+    Result<Cow<str>, Cesu8DecodingError>
+{
+    match from_utf8(bytes) {
+        Ok(str) => Ok(Cow::Borrowed(str)),
+        _ => {
+            let mut decoded = Vec::with_capacity(bytes.len());
+            if decode_from_iter(&mut decoded, &mut bytes.iter(), variant) {
+                // Keep this assertion in debug mode only.  It's important
+                // that this assertion is true, because Rust assumes that
+                // every `String` contains valid UTF-8.
+                debug_assert!(from_utf8(&decoded[..]).is_ok());
+                Ok(Cow::Owned(unsafe { String::from_utf8_unchecked(decoded) }))
+            } else {
+                Err(Cesu8DecodingError)
+            }
+        }
+    }
+}
+
+#[test]
+fn test_from_cesu8() {
+    // The surrogate-encoded character below is from the ICU library's
+    // icu/source/test/testdata/conversion.txt test case.
+    let data = &[0x4D, 0xE6, 0x97, 0xA5, 0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81, 0x7F];
+    assert_eq!(Cow::Borrowed("M日\u{10401}\u{7F}"),
+               from_cesu8(data).unwrap());
+
+    // We used to have test data from the CESU-8 specification, but when we
+    // worked it through manually, we got the wrong answer:
+    //
+    // Input: [0xED, 0xAE, 0x80, 0xED, 0xB0, 0x80]
+    // Binary: 11101101 10101110 10000000 11101101 10110000 10000000
+    //
+    // 0b1101_101110_000000 -> 0xDB80
+    // 0b1101_110000_000000 -> 0xDC00
+    //
+    // ((0xDB80 - 0xD800) << 10) | (0xDC00 - 0xDC00) -> 0xE0000
+    // 0x10000 + 0xE0000 -> 0xF0000
+    //
+    // The spec claims that we are supposed to get 0x10000, not 0xF0000.
+    // Since I can't reconcile this example data with the text of the
+    // specification, I decided to use a test character from ICU instead.
+}
+
+// Internal decoder, modeled on Rust's is_utf8 implementation.  Appends decoded
+// UTF-8 to `decoded`; returns `true` when `iter` ends on a code point boundary
+// and `false` at the first malformed sequence (partial output is not undone).
+fn decode_from_iter(decoded: &mut Vec<u8>, iter: &mut slice::Iter<u8>, variant: Variant) -> bool {
+    macro_rules! err {
+        () => { return false }
+    }
+    macro_rules! next {
+        () => {
+            match iter.next() {
+                Some(a) => *a,
+                // We needed data, but there was none: error!
+                None => err!()
+            }
+        }
+    }
+    macro_rules! next_cont {
+        () => {
+            {
+                let byte = next!();
+                if (byte & !CONT_MASK) == TAG_CONT_U8 { byte } else { err!() }
+            }
+        }
+    }
+
+    loop {
+        let first = match iter.next() {
+            Some(&b) => b,
+            // We're at the end of the iterator and a codepoint boundary at
+            // the same time, so this string is valid.
+            None => return true
+        };
+
+        if variant == Variant::Java && first == 0 {
+            // Java's modified UTF-8 should never contain \0 directly.
+            err!();
+        } else if first < 128 {
+            // Pass ASCII through directly.
+            decoded.push(first);
+        } else if first == 0xc0 && variant == Variant::Java {
+            // Java's modified UTF-8 spells U+0000 as the pair 0xC0 0x80.
+            match next!() {
+                0x80 => decoded.push(0),
+                _ => err!(),
+            }
+        } else {
+            let w = utf8_char_width(first);
+            let second = next_cont!();
+            match w {
+                // Two-byte sequences can be used directly.
+                2 => decoded.extend_from_slice(&[first, second]),
+                3 => {
+                    let third = next_cont!();
+                    match (first, second) {
+                        // These are valid UTF-8, so pass them through.
+                        (0xE0        , 0xA0..=0xBF) |
+                        (0xE1..=0xEC, 0x80..=0xBF) |
+                        (0xED        , 0x80..=0x9F) |
+                        (0xEE..=0xEF, 0x80..=0xBF) => {
+                            decoded.extend_from_slice(&[first, second, third]);
+                        }
+                        // First half of a surrogate pair, so decode.
+                        (0xED        , 0xA0..=0xAF) => {
+                            // The low surrogate (0xED, 0xB0..=0xBF, cont) must follow directly.
+                            if next!() != 0xED { err!() }
+                            let fifth = next_cont!();
+                            if fifth < 0xB0 || 0xBF < fifth { err!() }
+                            let sixth = next_cont!();
+                            let s = dec_surrogates(second, third, fifth, sixth);
+                            decoded.extend_from_slice(&s);
+                        }
+                        _ => err!()
+                    }
+                }
+                _ => err!()
+            }
+        }
+    }
+}
+
+/// Rebuild a UTF-16 surrogate value from the second and third bytes of its
+/// three-byte CESU-8 encoding; the leading byte only contributes `0xD000`.
+fn dec_surrogate(second: u8, third: u8) -> u32 {
+    0xD000 | (u32::from(second & CONT_MASK) << 6) | u32::from(third & CONT_MASK)
+}
+
+/// Turn the four payload bytes of a CESU-8 surrogate pair (bytes two, three,
+/// five and six of the six-byte form) into the equivalent four-byte UTF-8
+/// sequence.  Assumes input is valid; panics if the result is out of range.
+fn dec_surrogates(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8; 4] {
+    // Combine both halves into one supplementary-plane code point.
+    let high = dec_surrogate(second, third);
+    let low = dec_surrogate(fifth, sixth);
+    let code = 0x10000 + (((high - 0xD800) << 10) | (low - 0xDC00));
+    assert!(0x010000 <= code && code <= 0x10FFFF);
+
+    // Emit 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx, most significant bits first.
+    let six_bits = |shift: u32| TAG_CONT_U8 | ((code >> shift) & 0x3F) as u8;
+    [
+        0b1111_0000u8 | ((code >> 18) & 0b111) as u8,
+        six_bits(12),
+        six_bits(6),
+        six_bits(0),
+    ]
+}
+
+/// Encode a Rust `&str` as CESU-8, borrowing the input when it is already
+/// valid CESU-8 and allocating only when it must be re-encoded.
+///
+/// ```
+/// use std::borrow::Cow;
+/// use cesu8::to_cesu8;
+///
+/// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
+/// // and we can convert it without allocating memory.
+/// assert_eq!(Cow::Borrowed("aé日".as_bytes()), to_cesu8("aé日"));
+///
+/// // This string is a 4-byte UTF-8 string, which becomes a 6-byte CESU-8
+/// // vector.
+/// assert_eq!(Cow::Borrowed(&[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]),
+///            to_cesu8("\u{10401}"));
+/// ```
+pub fn to_cesu8(text: &str) -> Cow<[u8]> {
+    if !is_valid_cesu8(text) {
+        return Cow::Owned(to_cesu8_internal(text, Variant::Standard));
+    }
+    Cow::Borrowed(text.as_bytes())
+}
+
+/// Encode a Rust `&str` as Java's modified UTF-8, borrowing the input when it
+/// needs no changes and allocating only when it must be re-encoded.
+///
+/// ```
+/// use std::borrow::Cow;
+/// use cesu8::to_java_cesu8;
+///
+/// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
+/// // and we can convert it without allocating memory.
+/// assert_eq!(Cow::Borrowed("aé日".as_bytes()), to_java_cesu8("aé日"));
+///
+/// // This string is a 4-byte UTF-8 string, which becomes a 6-byte modified
+/// // UTF-8 vector.
+/// assert_eq!(Cow::Borrowed(&[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]),
+///            to_java_cesu8("\u{10401}"));
+///
+/// // This string contains null, which becomes 2-byte modified UTF-8 encoding
+/// assert_eq!(Cow::Borrowed(&[0xC0, 0x80, 0xC0, 0x80]),
+///            to_java_cesu8("\0\0"));
+/// ```
+pub fn to_java_cesu8(text: &str) -> Cow<[u8]> {
+    if !is_valid_java_cesu8(text) {
+        return Cow::Owned(to_cesu8_internal(text, Variant::Java));
+    }
+    Cow::Borrowed(text.as_bytes())
+}
+
+/// Re-encode `text` into a freshly allocated CESU-8 (or, for `Variant::Java`,
+/// modified UTF-8) byte vector: each four-byte UTF-8 sequence becomes two
+/// three-byte surrogates, and Java additionally maps NUL to `0xC0 0x80`.
+fn to_cesu8_internal(text: &str, variant: Variant) -> Vec<u8> {
+    let bytes = text.as_bytes();
+    // Reserve 25% headroom.  The parentheses matter: `+` binds tighter than
+    // `>>`, so without them only half the input length would be reserved.
+    let mut encoded: Vec<u8> = Vec::with_capacity(bytes.len() + (bytes.len() >> 2));
+    let mut i = 0;
+    while i < bytes.len() {
+        let b = bytes[i];
+        if variant == Variant::Java && b == 0 {
+            encoded.extend_from_slice(&[0xc0, 0x80]);
+            i += 1;
+        } else if b < 128 {
+            // Pass ASCII through quickly.
+            encoded.push(b);
+            i += 1;
+        } else {
+            // Figure out how many bytes we need for this character.
+            let w = utf8_char_width(b);
+            assert!(w <= 4);
+            assert!(i + w <= bytes.len());
+            if w != 4 {
+                // Pass through short UTF-8 sequences unmodified.
+                encoded.extend_from_slice(&bytes[i..i + w]);
+            } else {
+                // Encode 4-byte sequences as 6 bytes.
+                // SAFETY: `bytes` comes from a `&str` and `i..i + w` spans
+                // exactly one character, so the slice is valid UTF-8.
+                let s = unsafe { from_utf8_unchecked(&bytes[i..i + w]) };
+                let c = s.chars().next().unwrap() as u32 - 0x10000;
+                let high = ((c >> 10) as u16) | 0xD800;
+                let low = ((c & 0x3FF) as u16) | 0xDC00;
+                encoded.extend_from_slice(&enc_surrogate(high));
+                encoded.extend_from_slice(&enc_surrogate(low));
+            }
+            i += w;
+        }
+    }
+    encoded
+}
+
+/// Report whether `text` can be used as CESU-8 without re-encoding.
+pub fn is_valid_cesu8(text: &str) -> bool {
+    // `text` is guaranteed to be valid UTF-8, so it is also valid CESU-8
+    // unless some lead byte starts a sequence wider than three bytes.
+    text.bytes()
+        .all(|b| (b & !CONT_MASK) == TAG_CONT_U8 || utf8_char_width(b) <= 3)
+}
+
+/// Report whether `text` can be used as Java's modified UTF-8 without
+/// re-encoding: it must be valid CESU-8 and free of NUL characters.
+pub fn is_valid_java_cesu8(text: &str) -> bool {
+    if text.contains('\0') {
+        return false;
+    }
+    is_valid_cesu8(text)
+}
+
+#[test]
+fn test_valid_cesu8() {
+    // (input, valid as CESU-8, valid as Java's modified UTF-8)
+    let cases = [
+        ("aé日", true, true),
+        ("\u{10401}", false, false),
+        ("\0\0", true, false),
+    ];
+    for &(text, cesu8, java) in cases.iter() {
+        assert_eq!(cesu8, is_valid_cesu8(text));
+        assert_eq!(java, is_valid_java_cesu8(text));
+    }
+}
+
+
+/// Encode one UTF-16 surrogate (`0xD800..=0xDFFF`) as three CESU-8 bytes.
+/// Panics if `surrogate` lies outside that range.
+fn enc_surrogate(surrogate: u16) -> [u8; 3] {
+    assert!(0xD800 <= surrogate && surrogate <= 0xDFFF);
+    // 1110xxxx 10xxxxxx 10xxxxxx
+    let lead = 0b1110_0000 | (surrogate >> 12) as u8;
+    let mid = TAG_CONT_U8 | ((surrogate >> 6) & 0x3F) as u8;
+    let tail = TAG_CONT_U8 | (surrogate & 0x3F) as u8;
+    [lead, mid, tail]
+}
diff --git a/src/unicode.rs b/src/unicode.rs
new file mode 100644
index 0000000..a236d31
--- /dev/null
+++ b/src/unicode.rs
@@ -0,0 +1,37 @@
+// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
+// file at http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// (This used to be in the Rust unicode crate, which is now gone, so we'll
+// just include it inline.)
+
+// UTF-8 sequence width per lead byte, 0 = invalid lead; https://tools.ietf.org/html/rfc3629
+static UTF8_CHAR_WIDTH: [u8; 256] = [
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
+0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
+3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
+4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
+];
+
+/// Width in bytes of the UTF-8 sequence led by `b`; 0 if `b` cannot start one.
+#[inline]
+pub fn utf8_char_width(b: u8) -> usize {
+    usize::from(UTF8_CHAR_WIDTH[usize::from(b)])
+}
author	Ivan Lozano <ivanlozano@google.com>	2021-08-20 23:37:39 +0000
committer	Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>	2021-08-20 23:37:39 +0000
commit	5e39cdae8df55b50c2da9d77a1a6da3bbbc7e9a7 (patch)
tree	90d87dcb0faf6c6e2805cab833d9f173ac4ed786
parent	d22939409e8930f2426b1b6fe55634859e66104b (diff)
parent	9a771a31e3b075cae3bae31d5fc4ba0b6e114667 (diff)
download	cesu8-5e39cdae8df55b50c2da9d77a1a6da3bbbc7e9a7.tar.gz