From b0837045171dae3e9ad24059de17ee039b7d1b48 Mon Sep 17 00:00:00 2001 From: Hyun Jae Moon Date: Fri, 23 Jun 2023 16:06:43 +0000 Subject: Import utf-8 crate Bug: 288312145 Bug: 286134481 Test: libutf8 Change-Id: Ibd8a0299dccb53909ebc19f3f7514ef3babc40a3 --- Android.bp | 38 +++++++++ Cargo.toml | 29 +++++++ Cargo.toml.orig | 20 +++++ LICENSE | 1 + LICENSE-APACHE | 176 ++++++++++++++++++++++++++++++++++++++++ LICENSE-MIT | 23 ++++++ METADATA | 20 +++++ MODULE_LICENSE_APACHE2 | 0 OWNERS | 1 + README.md | 5 ++ benches/from_utf8_lossy.rs | 30 +++++++ src/lib.rs | 186 ++++++++++++++++++++++++++++++++++++++++++ src/lossy.rs | 92 +++++++++++++++++++++ src/read.rs | 167 ++++++++++++++++++++++++++++++++++++++ tests/unit.rs | 197 +++++++++++++++++++++++++++++++++++++++++++++ 15 files changed, 985 insertions(+) create mode 100644 Android.bp create mode 100644 Cargo.toml create mode 100644 Cargo.toml.orig create mode 120000 LICENSE create mode 100644 LICENSE-APACHE create mode 100644 LICENSE-MIT create mode 100644 METADATA create mode 100644 MODULE_LICENSE_APACHE2 create mode 100644 OWNERS create mode 100644 README.md create mode 100644 benches/from_utf8_lossy.rs create mode 100644 src/lib.rs create mode 100644 src/lossy.rs create mode 100644 src/read.rs create mode 100644 tests/unit.rs diff --git a/Android.bp b/Android.bp new file mode 100644 index 0000000..b5771ee --- /dev/null +++ b/Android.bp @@ -0,0 +1,38 @@ +// This file is generated by cargo2android.py --run --device --tests. +// Do not modify this file as changes will be overridden on upgrade. + + + +rust_library { + name: "libutf8", + host_supported: true, + crate_name: "utf8", + cargo_env_compat: true, + cargo_pkg_version: "0.7.6", + srcs: ["src/lib.rs"], + edition: "2015", + apex_available: [ + "//apex_available:platform", + "//apex_available:anyapex", + ], + product_available: true, + vendor_available: true, +} + +rust_test { + name: "utf-8_test_tests_unit", + host_supported: true, + crate_name: "unit", + cargo_env_compat: true, + cargo_pkg_version: "0.7.6", + srcs: ["tests/unit.rs"], + test_suites: ["general-tests"], + auto_gen_config: true, + test_options: { + unit_test: true, + }, + edition: "2015", + rustlibs: [ + "libutf8", + ], +} diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..c01a69d --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,29 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies +# +# If you believe there's an error in this file please file an +# issue against the rust-lang/cargo repository. If you're +# editing this file be aware that the upstream Cargo.toml +# will likely look very different (and much more reasonable) + +[package] +name = "utf-8" +version = "0.7.6" +authors = ["Simon Sapin "] +description = "Incremental, zero-copy UTF-8 decoding with error handling" +license = "MIT OR Apache-2.0" +repository = "https://github.com/SimonSapin/rust-utf8" +[profile.bench] + +[profile.test] + +[lib] +name = "utf8" +test = false +bench = false + +[dependencies] diff --git a/Cargo.toml.orig b/Cargo.toml.orig new file mode 100644 index 0000000..c9a377d --- /dev/null +++ b/Cargo.toml.orig @@ -0,0 +1,20 @@ +[package] +name = "utf-8" +version = "0.7.6" +authors = ["Simon Sapin "] +description = "Incremental, zero-copy UTF-8 decoding with error handling" +license = "MIT OR Apache-2.0" +repository = "https://github.com/SimonSapin/rust-utf8" + +[lib] +name = "utf8" +test = false +bench = false + +[dependencies] + +[profile.test] +#opt-level = 3 + +[profile.bench] +#debug = true diff --git a/LICENSE b/LICENSE new file mode 120000 index 0000000..6b579aa --- /dev/null +++ b/LICENSE @@ -0,0 +1 @@ +LICENSE-APACHE \ No newline at end of file diff --git a/LICENSE-APACHE b/LICENSE-APACHE new file mode 100644 index 0000000..1b5ec8b --- /dev/null +++ b/LICENSE-APACHE @@ -0,0 +1,176 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS diff --git a/LICENSE-MIT b/LICENSE-MIT new file mode 100644 index 0000000..31aa793 --- /dev/null +++ b/LICENSE-MIT @@ -0,0 +1,23 @@ +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/METADATA b/METADATA new file mode 100644 index 0000000..178a89e --- /dev/null +++ b/METADATA @@ -0,0 +1,20 @@ +name: "utf-8" +description: "Incremental, zero-copy UTF-8 decoding with error handling" +third_party { + url { + type: HOMEPAGE + value: "https://crates.io/crates/utf-8" + } + url { + type: ARCHIVE + value: "https://static.crates.io/crates/utf-8/utf-8-0.7.6.crate" + } + version: "0.7.6" + # Dual-licensed, using the least restrictive per go/thirdpartylicenses#same. + license_type: NOTICE + last_upgrade_date { + year: 2023 + month: 6 + day: 2 + } +} diff --git a/MODULE_LICENSE_APACHE2 b/MODULE_LICENSE_APACHE2 new file mode 100644 index 0000000..e69de29 diff --git a/OWNERS b/OWNERS new file mode 100644 index 0000000..45dc4dd --- /dev/null +++ b/OWNERS @@ -0,0 +1 @@ +include platform/prebuilts/rust:master:/OWNERS diff --git a/README.md b/README.md new file mode 100644 index 0000000..145889b --- /dev/null +++ b/README.md @@ -0,0 +1,5 @@ +# rust-utf8 + +Incremental, zero-copy UTF-8 decoding for Rust + +[Documentation](https://docs.rs/utf-8/) diff --git a/benches/from_utf8_lossy.rs b/benches/from_utf8_lossy.rs new file mode 100644 index 0000000..95d9edf --- /dev/null +++ b/benches/from_utf8_lossy.rs @@ -0,0 +1,30 @@ +#![feature(test)] + +extern crate test; +extern crate utf8; + +#[path = "../tests/shared/data.rs"] +mod data; + +#[path = "../tests/shared/string_from_utf8_lossy.rs"] +mod string_from_utf8_lossy; + +#[bench] +fn bench_our_string_from_utf8_lossy(bencher: &mut test::Bencher) { + bencher.bytes = data::DECODED_LOSSY.iter().map(|&(input, _expected)| input.len() as u64).sum(); + bencher.iter(|| { + for &(input, _expected) in data::DECODED_LOSSY { + test::black_box(string_from_utf8_lossy::string_from_utf8_lossy(input)); + } + }) +} + +#[bench] +fn bench_std_string_from_utf8_lossy(bencher: &mut test::Bencher) { + bencher.bytes = data::DECODED_LOSSY.iter().map(|&(input, _expected)| input.len() as u64).sum(); + bencher.iter(|| { + for &(input, _expected) in data::DECODED_LOSSY { + test::black_box(String::from_utf8_lossy(input)); + } + }) +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..ec223f2 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,186 @@ +mod lossy; +mod read; + +pub use lossy::LossyDecoder; +pub use read::{BufReadDecoder, BufReadDecoderError}; + +use std::cmp; +use std::error::Error; +use std::fmt; +use std::str; + +/// The replacement character, U+FFFD. In lossy decoding, insert it for every decoding error. +pub const REPLACEMENT_CHARACTER: &'static str = "\u{FFFD}"; + +#[derive(Debug, Copy, Clone)] +pub enum DecodeError<'a> { + /// In lossy decoding insert `valid_prefix`, then `"\u{FFFD}"`, + /// then call `decode()` again with `remaining_input`. + Invalid { + valid_prefix: &'a str, + invalid_sequence: &'a [u8], + remaining_input: &'a [u8], + }, + + /// Call the `incomplete_suffix.try_complete` method with more input when available. + /// If no more input is available, this is an invalid byte sequence. + Incomplete { + valid_prefix: &'a str, + incomplete_suffix: Incomplete, + }, +} + +impl<'a> fmt::Display for DecodeError<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + DecodeError::Invalid { + valid_prefix, + invalid_sequence, + remaining_input, + } => write!( + f, + "found invalid byte sequence {invalid_sequence:02x?} after \ + {valid_byte_count} valid bytes, followed by {unprocessed_byte_count} more \ + unprocessed bytes", + invalid_sequence = invalid_sequence, + valid_byte_count = valid_prefix.len(), + unprocessed_byte_count = remaining_input.len() + ), + DecodeError::Incomplete { + valid_prefix, + incomplete_suffix, + } => write!( + f, + "found incomplete byte sequence {incomplete_suffix:02x?} after \ + {valid_byte_count} bytes", + incomplete_suffix = incomplete_suffix, + valid_byte_count = valid_prefix.len() + ), + } + } +} + +impl<'a> Error for DecodeError<'a> {} + +#[derive(Debug, Copy, Clone)] +pub struct Incomplete { + pub buffer: [u8; 4], + pub buffer_len: u8, +} + +pub fn decode(input: &[u8]) -> Result<&str, DecodeError> { + let error = match str::from_utf8(input) { + Ok(valid) => return Ok(valid), + Err(error) => error, + }; + + // FIXME: separate function from here to guide inlining? + let (valid, after_valid) = input.split_at(error.valid_up_to()); + let valid = unsafe { + str::from_utf8_unchecked(valid) + }; + + match error.error_len() { + Some(invalid_sequence_length) => { + let (invalid, rest) = after_valid.split_at(invalid_sequence_length); + Err(DecodeError::Invalid { + valid_prefix: valid, + invalid_sequence: invalid, + remaining_input: rest + }) + } + None => { + Err(DecodeError::Incomplete { + valid_prefix: valid, + incomplete_suffix: Incomplete::new(after_valid), + }) + } + } +} + +impl Incomplete { + pub fn empty() -> Self { + Incomplete { + buffer: [0, 0, 0, 0], + buffer_len: 0, + } + } + + pub fn is_empty(&self) -> bool { + self.buffer_len == 0 + } + + pub fn new(bytes: &[u8]) -> Self { + let mut buffer = [0, 0, 0, 0]; + let len = bytes.len(); + buffer[..len].copy_from_slice(bytes); + Incomplete { + buffer: buffer, + buffer_len: len as u8, + } + } + + /// * `None`: still incomplete, call `try_complete` again with more input. + /// If no more input is available, this is invalid byte sequence. + /// * `Some((result, remaining_input))`: We’re done with this `Incomplete`. + /// To keep decoding, pass `remaining_input` to `decode()`. + pub fn try_complete<'input>(&mut self, input: &'input [u8]) + -> Option<(Result<&str, &[u8]>, &'input [u8])> { + let (consumed, opt_result) = self.try_complete_offsets(input); + let result = opt_result?; + let remaining_input = &input[consumed..]; + let result_bytes = self.take_buffer(); + let result = match result { + Ok(()) => Ok(unsafe { str::from_utf8_unchecked(result_bytes) }), + Err(()) => Err(result_bytes), + }; + Some((result, remaining_input)) + } + + fn take_buffer(&mut self) -> &[u8] { + let len = self.buffer_len as usize; + self.buffer_len = 0; + &self.buffer[..len as usize] + } + + /// (consumed_from_input, None): not enough input + /// (consumed_from_input, Some(Err(()))): error bytes in buffer + /// (consumed_from_input, Some(Ok(()))): UTF-8 string in buffer + fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option>) { + let initial_buffer_len = self.buffer_len as usize; + let copied_from_input; + { + let unwritten = &mut self.buffer[initial_buffer_len..]; + copied_from_input = cmp::min(unwritten.len(), input.len()); + unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]); + } + let spliced = &self.buffer[..initial_buffer_len + copied_from_input]; + match str::from_utf8(spliced) { + Ok(_) => { + self.buffer_len = spliced.len() as u8; + (copied_from_input, Some(Ok(()))) + } + Err(error) => { + let valid_up_to = error.valid_up_to(); + if valid_up_to > 0 { + let consumed = valid_up_to.checked_sub(initial_buffer_len).unwrap(); + self.buffer_len = valid_up_to as u8; + (consumed, Some(Ok(()))) + } else { + match error.error_len() { + Some(invalid_sequence_length) => { + let consumed = invalid_sequence_length + .checked_sub(initial_buffer_len).unwrap(); + self.buffer_len = invalid_sequence_length as u8; + (consumed, Some(Err(()))) + } + None => { + self.buffer_len = spliced.len() as u8; + (copied_from_input, None) + } + } + } + } + } + } +} diff --git a/src/lossy.rs b/src/lossy.rs new file mode 100644 index 0000000..00bcdec --- /dev/null +++ b/src/lossy.rs @@ -0,0 +1,92 @@ +use super::*; + +/// A push-based, lossy decoder for UTF-8. +/// Errors are replaced with the U+FFFD replacement character. +/// +/// Users “push” bytes into the decoder, which in turn “pushes” `&str` slices into a callback. +/// +/// For example, `String::from_utf8_lossy` (but returning `String` instead of `Cow`) +/// can be rewritten as: +/// +/// ```rust +/// fn string_from_utf8_lossy(input: &[u8]) -> String { +/// let mut string = String::new(); +/// utf8::LossyDecoder::new(|s| string.push_str(s)).feed(input); +/// string +/// } +/// ``` +/// +/// **Note:** Dropping the decoder signals the end of the input: +/// If the last input chunk ended with an incomplete byte sequence for a code point, +/// this is an error and a replacement character is emitted. +/// Use `std::mem::forget` to inhibit this behavior. +pub struct LossyDecoder { + push_str: F, + incomplete: Incomplete, +} + +impl LossyDecoder { + /// Create a new decoder from a callback. + #[inline] + pub fn new(push_str: F) -> Self { + LossyDecoder { + push_str: push_str, + incomplete: Incomplete { + buffer: [0, 0, 0, 0], + buffer_len: 0, + }, + } + } + + /// Feed one chunk of input into the decoder. + /// + /// The input is decoded lossily + /// and the callback called once or more with `&str` string slices. + /// + /// If the UTF-8 byte sequence for one code point was split into this bytes chunk + /// and previous bytes chunks, it will be correctly pieced back together. + pub fn feed(&mut self, mut input: &[u8]) { + if self.incomplete.buffer_len > 0 { + match self.incomplete.try_complete(input) { + Some((Ok(s), remaining)) => { + (self.push_str)(s); + input = remaining + } + Some((Err(_), remaining)) => { + (self.push_str)(REPLACEMENT_CHARACTER); + input = remaining + } + None => { + return + } + } + } + loop { + match decode(input) { + Ok(s) => { + (self.push_str)(s); + return + } + Err(DecodeError::Incomplete { valid_prefix, incomplete_suffix }) => { + (self.push_str)(valid_prefix); + self.incomplete = incomplete_suffix; + return + } + Err(DecodeError::Invalid { valid_prefix, remaining_input, .. }) => { + (self.push_str)(valid_prefix); + (self.push_str)(REPLACEMENT_CHARACTER); + input = remaining_input + } + } + } + } +} + +impl Drop for LossyDecoder { + #[inline] + fn drop(&mut self) { + if self.incomplete.buffer_len > 0 { + (self.push_str)(REPLACEMENT_CHARACTER) + } + } +} diff --git a/src/read.rs b/src/read.rs new file mode 100644 index 0000000..5e38f54 --- /dev/null +++ b/src/read.rs @@ -0,0 +1,167 @@ +use std::io::{self, BufRead}; +use std::error::Error; +use std::fmt; +use std::str; +use super::*; + +/// Wraps a `std::io::BufRead` buffered byte stream and decode it as UTF-8. +pub struct BufReadDecoder { + buf_read: B, + bytes_consumed: usize, + incomplete: Incomplete, +} + +#[derive(Debug)] +pub enum BufReadDecoderError<'a> { + /// Represents one UTF-8 error in the byte stream. + /// + /// In lossy decoding, each such error should be replaced with U+FFFD. + /// (See `BufReadDecoder::next_lossy` and `BufReadDecoderError::lossy`.) + InvalidByteSequence(&'a [u8]), + + /// An I/O error from the underlying byte stream + Io(io::Error), +} + +impl<'a> BufReadDecoderError<'a> { + /// Replace UTF-8 errors with U+FFFD + pub fn lossy(self) -> Result<&'static str, io::Error> { + match self { + BufReadDecoderError::Io(error) => Err(error), + BufReadDecoderError::InvalidByteSequence(_) => Ok(REPLACEMENT_CHARACTER), + } + } +} + +impl<'a> fmt::Display for BufReadDecoderError<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + BufReadDecoderError::InvalidByteSequence(bytes) => { + write!(f, "invalid byte sequence: {:02x?}", bytes) + } + BufReadDecoderError::Io(ref err) => write!(f, "underlying bytestream error: {}", err), + } + } +} + +impl<'a> Error for BufReadDecoderError<'a> { + fn source(&self) -> Option<&(dyn Error + 'static)> { + match *self { + BufReadDecoderError::InvalidByteSequence(_) => None, + BufReadDecoderError::Io(ref err) => Some(err), + } + } +} + +impl BufReadDecoder { + /// This is to `Read::read_to_string` what `String::from_utf8_lossy` is to `String::from_utf8`. + pub fn read_to_string_lossy(buf_read: B) -> io::Result { + let mut decoder = Self::new(buf_read); + let mut string = String::new(); + while let Some(result) = decoder.next_lossy() { + string.push_str(result?) + } + Ok(string) + } + + pub fn new(buf_read: B) -> Self { + Self { + buf_read, + bytes_consumed: 0, + incomplete: Incomplete::empty(), + } + } + + /// Same as `BufReadDecoder::next_strict`, but replace UTF-8 errors with U+FFFD. + pub fn next_lossy(&mut self) -> Option> { + self.next_strict().map(|result| result.or_else(|e| e.lossy())) + } + + /// Decode and consume the next chunk of UTF-8 input. + /// + /// This method is intended to be called repeatedly until it returns `None`, + /// which represents EOF from the underlying byte stream. + /// This is similar to `Iterator::next`, + /// except that decoded chunks borrow the decoder (~iterator) + /// so they need to be handled or copied before the next chunk can start decoding. + pub fn next_strict(&mut self) -> Option> { + enum BytesSource { + BufRead(usize), + Incomplete, + } + macro_rules! try_io { + ($io_result: expr) => { + match $io_result { + Ok(value) => value, + Err(error) => return Some(Err(BufReadDecoderError::Io(error))) + } + } + } + let (source, result) = loop { + if self.bytes_consumed > 0 { + self.buf_read.consume(self.bytes_consumed); + self.bytes_consumed = 0; + } + let buf = try_io!(self.buf_read.fill_buf()); + + // Force loop iteration to go through an explicit `continue` + enum Unreachable {} + let _: Unreachable = if self.incomplete.is_empty() { + if buf.is_empty() { + return None // EOF + } + match str::from_utf8(buf) { + Ok(_) => { + break (BytesSource::BufRead(buf.len()), Ok(())) + } + Err(error) => { + let valid_up_to = error.valid_up_to(); + if valid_up_to > 0 { + break (BytesSource::BufRead(valid_up_to), Ok(())) + } + match error.error_len() { + Some(invalid_sequence_length) => { + break (BytesSource::BufRead(invalid_sequence_length), Err(())) + } + None => { + self.bytes_consumed = buf.len(); + self.incomplete = Incomplete::new(buf); + // need more input bytes + continue + } + } + } + } + } else { + if buf.is_empty() { + break (BytesSource::Incomplete, Err(())) // EOF with incomplete code point + } + let (consumed, opt_result) = self.incomplete.try_complete_offsets(buf); + self.bytes_consumed = consumed; + match opt_result { + None => { + // need more input bytes + continue + } + Some(result) => { + break (BytesSource::Incomplete, result) + } + } + }; + }; + let bytes = match source { + BytesSource::BufRead(byte_count) => { + self.bytes_consumed = byte_count; + let buf = try_io!(self.buf_read.fill_buf()); + &buf[..byte_count] + } + BytesSource::Incomplete => { + self.incomplete.take_buffer() + } + }; + match result { + Ok(()) => Some(Ok(unsafe { str::from_utf8_unchecked(bytes) })), + Err(()) => Some(Err(BufReadDecoderError::InvalidByteSequence(bytes))), + } + } +} diff --git a/tests/unit.rs b/tests/unit.rs new file mode 100644 index 0000000..6839e84 --- /dev/null +++ b/tests/unit.rs @@ -0,0 +1,197 @@ +extern crate utf8; + +use std::borrow::Cow; +use std::collections::VecDeque; +use std::io; +use utf8::*; + +/// A re-implementation of std::str::from_utf8 +pub fn str_from_utf8(input: &[u8]) -> Result<&str, usize> { + match decode(input) { + Ok(s) => return Ok(s), + Err(DecodeError::Invalid { valid_prefix, .. }) | + Err(DecodeError::Incomplete { valid_prefix, .. }) => Err(valid_prefix.len()), + } +} + +#[test] +fn test_str_from_utf8() { + let xs = b"hello"; + assert_eq!(str_from_utf8(xs), Ok("hello")); + + let xs = "ศไทย中华Việt Nam".as_bytes(); + assert_eq!(str_from_utf8(xs), Ok("ศไทย中华Việt Nam")); + + let xs = b"hello\xFF"; + assert!(str_from_utf8(xs).is_err()); +} + +#[test] +fn test_is_utf8() { + // Chars of 1, 2, 3, and 4 bytes + assert!(str_from_utf8("eé€\u{10000}".as_bytes()).is_ok()); + // invalid prefix + assert!(str_from_utf8(&[0x80]).is_err()); + // invalid 2 byte prefix + assert!(str_from_utf8(&[0xc0]).is_err()); + assert!(str_from_utf8(&[0xc0, 0x10]).is_err()); + // invalid 3 byte prefix + assert!(str_from_utf8(&[0xe0]).is_err()); + assert!(str_from_utf8(&[0xe0, 0x10]).is_err()); + assert!(str_from_utf8(&[0xe0, 0xff, 0x10]).is_err()); + // invalid 4 byte prefix + assert!(str_from_utf8(&[0xf0]).is_err()); + assert!(str_from_utf8(&[0xf0, 0x10]).is_err()); + assert!(str_from_utf8(&[0xf0, 0xff, 0x10]).is_err()); + assert!(str_from_utf8(&[0xf0, 0xff, 0xff, 0x10]).is_err()); + + // deny overlong encodings + assert!(str_from_utf8(&[0xc0, 0x80]).is_err()); + assert!(str_from_utf8(&[0xc0, 0xae]).is_err()); + assert!(str_from_utf8(&[0xe0, 0x80, 0x80]).is_err()); + assert!(str_from_utf8(&[0xe0, 0x80, 0xaf]).is_err()); + assert!(str_from_utf8(&[0xe0, 0x81, 0x81]).is_err()); + assert!(str_from_utf8(&[0xf0, 0x82, 0x82, 0xac]).is_err()); + assert!(str_from_utf8(&[0xf4, 0x90, 0x80, 0x80]).is_err()); + + // deny surrogates + assert!(str_from_utf8(&[0xED, 0xA0, 0x80]).is_err()); + assert!(str_from_utf8(&[0xED, 0xBF, 0xBF]).is_err()); + + assert!(str_from_utf8(&[0xC2, 0x80]).is_ok()); + assert!(str_from_utf8(&[0xDF, 0xBF]).is_ok()); + assert!(str_from_utf8(&[0xE0, 0xA0, 0x80]).is_ok()); + assert!(str_from_utf8(&[0xED, 0x9F, 0xBF]).is_ok()); + assert!(str_from_utf8(&[0xEE, 0x80, 0x80]).is_ok()); + assert!(str_from_utf8(&[0xEF, 0xBF, 0xBF]).is_ok()); + assert!(str_from_utf8(&[0xF0, 0x90, 0x80, 0x80]).is_ok()); + assert!(str_from_utf8(&[0xF4, 0x8F, 0xBF, 0xBF]).is_ok()); +} + +/// A re-implementation of String::from_utf8_lossy +pub fn string_from_utf8_lossy(input: &[u8]) -> Cow { + let mut result = decode(input); + if let Ok(s) = result { + return s.into() + } + let mut string = String::with_capacity(input.len() + REPLACEMENT_CHARACTER.len()); + loop { + match result { + Ok(s) => { + string.push_str(s); + return string.into() + } + Err(DecodeError::Incomplete { valid_prefix, .. }) => { + string.push_str(valid_prefix); + string.push_str(REPLACEMENT_CHARACTER); + return string.into() + } + Err(DecodeError::Invalid { valid_prefix, remaining_input, .. }) => { + string.push_str(valid_prefix); + string.push_str(REPLACEMENT_CHARACTER); + result = decode(remaining_input); + } + } + } +} + +pub const DECODED_LOSSY: &'static [(&'static [u8], &'static str)] = &[ + (b"hello", "hello"), + (b"\xe0\xb8\xa8\xe0\xb9\x84\xe0\xb8\x97\xe0\xb8\xa2\xe4\xb8\xad\xe5\x8d\x8e", "ศไทย中华"), + (b"Vi\xe1\xbb\x87t Nam", "Việt Nam"), + (b"Hello\xC2 There\xFF ", "Hello\u{FFFD} There\u{FFFD} "), + (b"Hello\xC0\x80 There", "Hello\u{FFFD}\u{FFFD} There"), + (b"\xE6\x83 Goodbye", "\u{FFFD} Goodbye"), + (b"\xF5foo\xF5\x80bar", "\u{FFFD}foo\u{FFFD}\u{FFFD}bar"), + (b"\xF5foo\xF5\xC2", "\u{FFFD}foo\u{FFFD}\u{FFFD}"), + (b"\xF1foo\xF1\x80bar\xF1\x80\x80baz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}baz"), + (b"\xF4foo\xF4\x80bar\xF4\xBFbaz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}\u{FFFD}baz"), + (b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar", "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}foo\u{10000}bar"), + (b"\xF0\x90\x80foo", "\u{FFFD}foo"), + // surrogates + (b"\xED\xA0\x80foo\xED\xBF\xBFbar", "\u{FFFD}\u{FFFD}\u{FFFD}foo\u{FFFD}\u{FFFD}\u{FFFD}bar"), +]; + +#[test] +fn test_string_from_utf8_lossy() { + for &(input, expected) in DECODED_LOSSY { + assert_eq!(string_from_utf8_lossy(input), expected); + } +} + +pub fn all_partitions<'a, F>(input: &'a [u8], f: F) + where F: Fn(&[&[u8]]) +{ + + fn all_partitions_inner<'a, F>(chunks: &mut Vec<&'a [u8]>, input: &'a [u8], f: &F) + where F: Fn(&[&[u8]]) + { + if input.is_empty() { + f(chunks) + } + for i in 1..(input.len() + 1) { + chunks.push(&input[..i]); + all_partitions_inner(chunks, &input[i..], f); + chunks.pop(); + } + } + + let mut chunks = Vec::new(); + all_partitions_inner(&mut chunks, input, &f); + assert_eq!(chunks.len(), 0); +} + +#[test] +fn test_incremental_decoder() { + for &(input, expected) in DECODED_LOSSY { + all_partitions(input, |chunks| { + let mut string = String::new(); + { + let mut decoder = LossyDecoder::new(|s| string.push_str(s)); + for &chunk in &*chunks { + decoder.feed(chunk); + } + } + assert_eq!(string, expected); + }); + } +} + +#[test] +fn test_bufread_decoder() { + for &(input, expected) in DECODED_LOSSY { + all_partitions(input, |chunks| { + let chunks = Chunks(chunks.to_vec().into()); + let string = BufReadDecoder::read_to_string_lossy(chunks).unwrap(); + assert_eq!(string, expected) + }); + } +} + +struct Chunks<'a>(VecDeque<&'a [u8]>); + +impl<'a> io::Read for Chunks<'a> { + fn read(&mut self, _: &mut [u8]) -> io::Result { + unimplemented!() + } +} + +impl<'a> io::BufRead for Chunks<'a> { + fn fill_buf(&mut self) -> io::Result<&[u8]> { + Ok(*self.0.front().unwrap()) + } + + fn consume(&mut self, bytes: usize) { + { + let front = self.0.front_mut().unwrap(); + *front = &front[bytes..]; + if !front.is_empty() { + return + } + } + if self.0.len() > 1 { + self.0.pop_front(); + } + } + +} -- cgit v1.2.3