about | summary | refs | log | tree | commit | diff
path: root/src/read.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/read.rs')
-rw-r--r-- src/read.rs 167
1 files changed, 167 insertions, 0 deletions
diff --git a/src/read.rs b/src/read.rs
new file mode 100644
index 0000000..5e38f54
--- /dev/null
+++ b/src/read.rs
@@ -0,0 +1,167 @@
+use std::io::{self, BufRead};
+use std::error::Error;
+use std::fmt;
+use std::str;
+use super::*;
+
+/// Wraps a `std::io::BufRead` buffered byte stream and decodes it as UTF-8, one borrowed chunk at a time.
+pub struct BufReadDecoder<B: BufRead> {
+ buf_read: B, // the underlying buffered byte stream
+ bytes_consumed: usize, // bytes of `buf_read`'s buffer to `consume` before the next `fill_buf` (0 = nothing pending)
+ incomplete: Incomplete, // bytes of a code point that was cut off at the end of a buffer; empty when none is pending
+}
+
+#[derive(Debug)] // error type yielded by `BufReadDecoder::next_strict`
+pub enum BufReadDecoderError<'a> {
+ /// Represents one UTF-8 error in the byte stream; the payload is the offending bytes.
+ ///
+ /// In lossy decoding, each such error should be replaced with U+FFFD.
+ /// (See `BufReadDecoder::next_lossy` and `BufReadDecoderError::lossy`.)
+ InvalidByteSequence(&'a [u8]),
+
+ /// An I/O error from the underlying byte stream (as returned by its `fill_buf`).
+ Io(io::Error),
+}
+
+impl<'a> BufReadDecoderError<'a> {
+ /// Replace a UTF-8 error with U+FFFD (`Ok(REPLACEMENT_CHARACTER)`); an I/O error is returned unchanged as `Err`.
+ pub fn lossy(self) -> Result<&'static str, io::Error> {
+ match self {
+ BufReadDecoderError::Io(error) => Err(error), // I/O failures cannot be papered over
+ BufReadDecoderError::InvalidByteSequence(_) => Ok(REPLACEMENT_CHARACTER), // the invalid bytes themselves are dropped
+ }
+ }
+}
+
+impl<'a> fmt::Display for BufReadDecoderError<'a> { // human-readable message for either variant
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ match *self {
+ BufReadDecoderError::InvalidByteSequence(bytes) => {
+ write!(f, "invalid byte sequence: {:02x?}", bytes) // `{:02x?}`: debug list, each byte as two-digit lowercase hex
+ }
+ BufReadDecoderError::Io(ref err) => write!(f, "underlying bytestream error: {}", err), // defers to the `io::Error`'s own Display
+ }
+ }
+}
+
+impl<'a> Error for BufReadDecoderError<'a> { // exposes the wrapped `io::Error` as the error source
+ fn source(&self) -> Option<&(dyn Error + 'static)> {
+ match *self {
+ BufReadDecoderError::InvalidByteSequence(_) => None, // a decoding error has no underlying cause
+ BufReadDecoderError::Io(ref err) => Some(err), // the I/O error is the cause
+ }
+ }
+}
+
+impl<B: BufRead> BufReadDecoder<B> {
+ /// Reads `buf_read` to EOF into a new `String`, replacing UTF-8 errors with U+FFFD: this is to `Read::read_to_string` what `String::from_utf8_lossy` is to `String::from_utf8`.
+ pub fn read_to_string_lossy(buf_read: B) -> io::Result<String> {
+ let mut decoder = Self::new(buf_read);
+ let mut string = String::new();
+ while let Some(result) = decoder.next_lossy() { // `None` means EOF
+ string.push_str(result?) // `?` returns the first I/O error to the caller
+ }
+ Ok(string)
+ }
+
+ pub fn new(buf_read: B) -> Self { // creates a decoder over `buf_read` with nothing to consume and no pending partial code point
+ Self {
+ buf_read,
+ bytes_consumed: 0,
+ incomplete: Incomplete::empty(),
+ }
+ }
+
+ /// Same as `BufReadDecoder::next_strict`, but replace UTF-8 errors with U+FFFD (I/O errors are still returned as `Err`).
+ pub fn next_lossy(&mut self) -> Option<io::Result<&str>> {
+ self.next_strict().map(|result| result.or_else(|e| e.lossy()))
+ }
+
+ /// Decode and consume the next chunk of UTF-8 input.
+ ///
+ /// This method is intended to be called repeatedly until it returns `None`,
+ /// which represents EOF from the underlying byte stream.
+ /// This is similar to `Iterator::next`,
+ /// except that decoded chunks borrow the decoder (~iterator)
+ /// so they need to be handled or copied before the next chunk can start decoding.
+ pub fn next_strict(&mut self) -> Option<Result<&str, BufReadDecoderError>> {
+ enum BytesSource { // where the bytes of the chunk being returned live
+ BufRead(usize), // the first N bytes of `buf_read`'s current buffer
+ Incomplete, // the buffer held by `self.incomplete`
+ }
+ macro_rules! try_io { // like `?`, but wraps the I/O error to fit this method's `Option<Result<..>>` return type
+ ($io_result: expr) => {
+ match $io_result {
+ Ok(value) => value,
+ Err(error) => return Some(Err(BufReadDecoderError::Io(error)))
+ }
+ }
+ }
+ let (source, result) = loop { // `result`: Ok(()) = valid UTF-8 chunk, Err(()) = invalid byte sequence
+ if self.bytes_consumed > 0 { // release the bytes handed out (or stashed) by the previous step
+ self.buf_read.consume(self.bytes_consumed);
+ self.bytes_consumed = 0;
+ }
+ let buf = try_io!(self.buf_read.fill_buf());
+
+ // Force loop iteration to go through an explicit `continue`: every branch of the `if` below diverges (`return`/`break`/`continue`), so it can be typed as the uninhabited `Unreachable`
+ enum Unreachable {}
+ let _: Unreachable = if self.incomplete.is_empty() {
+ if buf.is_empty() {
+ return None // EOF
+ }
+ match str::from_utf8(buf) {
+ Ok(_) => {
+ break (BytesSource::BufRead(buf.len()), Ok(())) // the whole buffer is valid UTF-8
+ }
+ Err(error) => {
+ let valid_up_to = error.valid_up_to();
+ if valid_up_to > 0 {
+ break (BytesSource::BufRead(valid_up_to), Ok(())) // yield the valid prefix now; the bytes after it are re-examined on the next call
+ }
+ match error.error_len() {
+ Some(invalid_sequence_length) => {
+ break (BytesSource::BufRead(invalid_sequence_length), Err(())) // the buffer starts with an invalid sequence of this length
+ }
+ None => {
+ self.bytes_consumed = buf.len(); // the whole buffer is consumed at the top of the next iteration
+ self.incomplete = Incomplete::new(buf); // stash the truncated code point
+ // need more input bytes
+ continue
+ }
+ }
+ }
+ }
+ } else {
+ if buf.is_empty() {
+ break (BytesSource::Incomplete, Err(())) // EOF with incomplete code point
+ }
+ let (consumed, opt_result) = self.incomplete.try_complete_offsets(buf); // presumably feeds bytes from `buf` into the pending sequence; confirm in `Incomplete`
+ self.bytes_consumed = consumed; // these bytes of `buf` are released at the next `consume`
+ match opt_result {
+ None => {
+ // need more input bytes
+ continue
+ }
+ Some(result) => {
+ break (BytesSource::Incomplete, result)
+ }
+ }
+ };
+ };
+ let bytes = match source {
+ BytesSource::BufRead(byte_count) => {
+ self.bytes_consumed = byte_count; // consumed lazily on the next call, because the returned chunk borrows the buffer
+ let buf = try_io!(self.buf_read.fill_buf()); // NOTE(review): re-borrow; relies on `fill_buf` returning the same bytes until `consume` is called
+ &buf[..byte_count]
+ }
+ BytesSource::Incomplete => {
+ self.incomplete.take_buffer() // presumably returns the stashed bytes and leaves `incomplete` empty; confirm in `Incomplete`
+ }
+ };
+ match result {
+ Ok(()) => Some(Ok(unsafe { str::from_utf8_unchecked(bytes) })), // NOTE(review): sound only if `bytes` are exactly the bytes validated above (same `fill_buf` contents, or a sequence `try_complete_offsets` reported as Ok)
+ Err(()) => Some(Err(BufReadDecoderError::InvalidByteSequence(bytes))),
+ }
+ }
+}