From 7e6f3508c31f81272c9094b76f3e60aebce6c9fb Mon Sep 17 00:00:00 2001 From: Jakub Kotur Date: Mon, 21 Dec 2020 17:28:15 +0100 Subject: Initial import of csv-core-0.1.10. Bug: 155309706 Change-Id: If5440be388a5c0ea5fcff8a0315b1dad21d672e2 --- .cargo_vcs_info.json | 5 + Cargo.toml | 43 ++ Cargo.toml.orig | 31 + README.md | 113 +++ benches/bench.rs | 94 +++ src/lib.rs | 189 +++++ src/reader.rs | 2005 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/writer.rs | 1047 ++++++++++++++++++++++++++ 8 files changed, 3527 insertions(+) create mode 100644 .cargo_vcs_info.json create mode 100644 Cargo.toml create mode 100644 Cargo.toml.orig create mode 100644 README.md create mode 100644 benches/bench.rs create mode 100644 src/lib.rs create mode 100644 src/reader.rs create mode 100644 src/writer.rs diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json new file mode 100644 index 0000000..6de0194 --- /dev/null +++ b/.cargo_vcs_info.json @@ -0,0 +1,5 @@ +{ + "git": { + "sha1": "70c8600b29349f9ee0501577284d8300ae9c8055" + } +} diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..0f43cb0 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,43 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies +# +# If you believe there's an error in this file please file an +# issue against the rust-lang/cargo repository. If you're +# editing this file be aware that the upstream Cargo.toml +# will likely look very different (and much more reasonable) + +[package] +edition = "2018" +name = "csv-core" +version = "0.1.10" +authors = ["Andrew Gallant "] +description = "Bare bones CSV parsing with no_std support." +homepage = "https://github.com/BurntSushi/rust-csv" +documentation = "https://docs.rs/csv-core" +readme = "README.md" +keywords = ["csv", "comma", "parser", "delimited", "no_std"] +categories = ["encoding", "no-std", "parser-implementations"] +license = "Unlicense/MIT" +repository = "https://github.com/BurntSushi/rust-csv" + +[lib] +bench = false +[dependencies.memchr] +version = "2" +default-features = false +[dev-dependencies.arrayvec] +version = "0.5" +default-features = false + +[features] +default = [] +libc = ["memchr/libc"] +[badges.appveyor] +repository = "BurntSushi/rust-csv" + +[badges.travis-ci] +repository = "BurntSushi/rust-csv" diff --git a/Cargo.toml.orig b/Cargo.toml.orig new file mode 100644 index 0000000..110ee06 --- /dev/null +++ b/Cargo.toml.orig @@ -0,0 +1,31 @@ +[package] +name = "csv-core" +version = "0.1.10" #:version +authors = ["Andrew Gallant "] +description = "Bare bones CSV parsing with no_std support." +documentation = "https://docs.rs/csv-core" +homepage = "https://github.com/BurntSushi/rust-csv" +repository = "https://github.com/BurntSushi/rust-csv" +readme = "README.md" +keywords = ["csv", "comma", "parser", "delimited", "no_std"] +license = "Unlicense/MIT" +categories = ["encoding", "no-std", "parser-implementations"] +workspace = ".." 
+edition = "2018" + +[badges] +travis-ci = { repository = "BurntSushi/rust-csv" } +appveyor = { repository = "BurntSushi/rust-csv" } + +[lib] +bench = false + +[features] +default = [] +libc = ["memchr/libc"] + +[dependencies] +memchr = { version = "2", default-features = false } + +[dev-dependencies] +arrayvec = { version = "0.5", default-features = false } diff --git a/README.md b/README.md new file mode 100644 index 0000000..07afd99 --- /dev/null +++ b/README.md @@ -0,0 +1,113 @@ +csv-core +======== +A fast CSV reader and write for use in a `no_std` context. This crate will +never use the Rust standard library. + +[![Linux build status](https://api.travis-ci.org/BurntSushi/rust-csv.png)](https://travis-ci.org/BurntSushi/rust-csv) +[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/rust-csv?svg=true)](https://ci.appveyor.com/project/BurntSushi/rust-csv) +[![](http://meritbadge.herokuapp.com/csv-core)](https://crates.io/crates/csv-core) + +Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). + +### Documentation + +https://docs.rs/csv-core + +### Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +csv-core = "0.1.6" +``` + +### Build features + +This crate by default links with `libc`, which is done via the `libc` feature. +Disabling this feature will drop `csv-core`'s dependency on `libc`. + + +### Example: reading CSV + +This example shows how to count the number of fields and records in CSV data. + +```rust +use csv_core::{Reader, ReadFieldResult}; + +let data = " +foo,bar,baz +a,b,c +xxx,yyy,zzz +"; + +let mut rdr = Reader::new(); +let mut bytes = data.as_bytes(); +let mut count_fields = 0; +let mut count_records = 0; +loop { + // We skip handling the output since we don't need it for counting. + let (result, nin, _) = rdr.read_field(bytes, &mut [0; 1024]); + bytes = &bytes[nin..]; + match result { + ReadFieldResult::InputEmpty => {}, + ReadFieldResult::OutputFull => panic!("field too large"), + ReadFieldResult::Field { record_end } => { + count_fields += 1; + if record_end { + count_records += 1; + } + } + ReadFieldResult::End => break, + } +} +assert_eq!(3, count_records); +assert_eq!(9, count_fields); +``` + + +### Example: writing CSV + +This example shows how to use the `Writer` API to write valid CSV data. Proper +quoting is handled automatically. + +```rust +use csv_core::Writer; + +// This is where we'll write out CSV data. +let mut out = &mut [0; 1024]; +// The number of bytes we've written to `out`. +let mut nout = 0; +// Create a CSV writer with a default configuration. +let mut wtr = Writer::new(); + +// Write a single field. Note that we ignore the `WriteResult` and the number +// of input bytes consumed since we're doing this by hand. +let (_, _, n) = wtr.field(&b"foo"[..], &mut out[nout..]); +nout += n; + +// Write a delimiter and then another field that requires quotes. +let (_, n) = wtr.delimiter(&mut out[nout..]); +nout += n; +let (_, _, n) = wtr.field(&b"bar,baz"[..], &mut out[nout..]); +nout += n; +let (_, n) = wtr.terminator(&mut out[nout..]); +nout += n; + +// Now write another record. +let (_, _, n) = wtr.field(&b"a \"b\" c"[..], &mut out[nout..]); +nout += n; +let (_, n) = wtr.delimiter(&mut out[nout..]); +nout += n; +let (_, _, n) = wtr.field(&b"quux"[..], &mut out[nout..]); +nout += n; + +// We must always call finish once done writing. +// This ensures that any closing quotes are written. 
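+// (If `out` were too small at this point, `finish` would report
+// `WriteResult::OutputFull` and would need to be called again with more
+// output space.)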
+let (_, n) = wtr.finish(&mut out[nout..]); +nout += n; + +assert_eq!(&out[..nout], &b"\ +foo,\"bar,baz\" +\"a \"\"b\"\" c\",quux"[..]); +``` diff --git a/benches/bench.rs b/benches/bench.rs new file mode 100644 index 0000000..2aa24a9 --- /dev/null +++ b/benches/bench.rs @@ -0,0 +1,94 @@ +#![feature(test)] + +extern crate test; + +use test::Bencher; + +use csv_core::{Reader, ReaderBuilder}; + +static NFL: &'static str = include_str!("../../examples/data/bench/nfl.csv"); +static GAME: &'static str = include_str!("../../examples/data/bench/game.csv"); +static POP: &'static str = + include_str!("../../examples/data/bench/worldcitiespop.csv"); +static MBTA: &'static str = + include_str!("../../examples/data/bench/gtfs-mbta-stop-times.csv"); + +macro_rules! bench { + ($name:ident, $data:ident, $counter:ident, $result:expr) => { + bench!($name, $data, $counter, $result, false); + }; + ($name:ident, $data:ident, $counter:ident, $result:expr, NFA) => { + bench!($name, $data, $counter, $result, true); + }; + ($name:ident, $data:ident, $counter:ident, $result:expr, $nfa:expr) => { + #[bench] + fn $name(b: &mut Bencher) { + let data = $data.as_bytes(); + b.bytes = data.len() as u64; + let mut rdr = ReaderBuilder::new().nfa($nfa).build(); + b.iter(|| { + rdr.reset(); + assert_eq!($counter(&mut rdr, data), $result); + }) + } + }; +} + +bench!(count_nfl_field_copy_dfa, NFL, count_fields, 130000); +bench!(count_nfl_field_copy_nfa, NFL, count_fields, 130000, NFA); +bench!(count_nfl_record_copy_dfa, NFL, count_records, 10000); +bench!(count_nfl_record_copy_nfa, NFL, count_records, 10000, NFA); + +bench!(count_game_field_copy_dfa, GAME, count_fields, 600000); +bench!(count_game_field_copy_nfa, GAME, count_fields, 600000, NFA); +bench!(count_game_record_copy_dfa, GAME, count_records, 100000); +bench!(count_game_record_copy_nfa, GAME, count_records, 100000, NFA); + +bench!(count_pop_field_copy_dfa, POP, count_fields, 140007); +bench!(count_pop_field_copy_nfa, POP, count_fields, 140007, NFA); +bench!(count_pop_record_copy_dfa, POP, count_records, 20001); +bench!(count_pop_record_copy_nfa, POP, count_records, 20001, NFA); + +bench!(count_mbta_field_copy_dfa, MBTA, count_fields, 90000); +bench!(count_mbta_field_copy_nfa, MBTA, count_fields, 90000, NFA); +bench!(count_mbta_record_copy_dfa, MBTA, count_records, 10000); +bench!(count_mbta_record_copy_nfa, MBTA, count_records, 10000, NFA); + +fn count_fields(rdr: &mut Reader, mut data: &[u8]) -> u64 { + use csv_core::ReadFieldResult::*; + + let mut count = 0; + let mut field = [0u8; 1024]; + loop { + let (res, nin, _) = rdr.read_field(data, &mut field); + data = &data[nin..]; + match res { + InputEmpty => {} + OutputFull => panic!("field too large"), + Field { .. } => { + count += 1; + } + End => break, + } + } + count +} + +fn count_records(rdr: &mut Reader, mut data: &[u8]) -> u64 { + use csv_core::ReadRecordResult::*; + + let mut count = 0; + let mut record = [0; 8192]; + let mut ends = [0; 32]; + loop { + let (res, nin, _, _) = rdr.read_record(data, &mut record, &mut ends); + data = &data[nin..]; + match res { + InputEmpty => {} + OutputFull | OutputEndsFull => panic!("field too large"), + Record => count += 1, + End => break, + } + } + count +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..747e58d --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,189 @@ +/*! +`csv-core` provides a fast CSV reader and writer for use in a `no_std` context. + +This crate will never use the standard library. `no_std` support is therefore +enabled by default. 
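+
+(Since the crate is `no_std`, all reading and writing is driven through
+caller-provided byte buffers, as the examples below demonstrate.)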
+ +If you're looking for more ergonomic CSV parsing routines, please use the +[`csv`](https://docs.rs/csv) crate. + +# Overview + +This crate has two primary APIs. The `Reader` API provides a CSV parser, and +the `Writer` API provides a CSV writer. + +# Example: reading CSV + +This example shows how to count the number of fields and records in CSV data. + +``` +use csv_core::{Reader, ReadFieldResult}; + +let data = " +foo,bar,baz +a,b,c +xxx,yyy,zzz +"; + +let mut rdr = Reader::new(); +let mut bytes = data.as_bytes(); +let mut count_fields = 0; +let mut count_records = 0; +loop { + // We skip handling the output since we don't need it for counting. + let (result, nin, _) = rdr.read_field(bytes, &mut [0; 1024]); + bytes = &bytes[nin..]; + match result { + ReadFieldResult::InputEmpty => {}, + ReadFieldResult::OutputFull => panic!("field too large"), + ReadFieldResult::Field { record_end } => { + count_fields += 1; + if record_end { + count_records += 1; + } + } + ReadFieldResult::End => break, + } +} +assert_eq!(3, count_records); +assert_eq!(9, count_fields); +``` + +# Example: writing CSV + +This example shows how to use the `Writer` API to write valid CSV data. Proper +quoting is handled automatically. + +``` +use csv_core::Writer; + +// This is where we'll write out CSV data. +let mut out = &mut [0; 1024]; +// The number of bytes we've written to `out`. +let mut nout = 0; +// Create a CSV writer with a default configuration. +let mut wtr = Writer::new(); + +// Write a single field. Note that we ignore the `WriteResult` and the number +// of input bytes consumed since we're doing this by hand. +let (_, _, n) = wtr.field(&b"foo"[..], &mut out[nout..]); +nout += n; + +// Write a delimiter and then another field that requires quotes. +let (_, n) = wtr.delimiter(&mut out[nout..]); +nout += n; +let (_, _, n) = wtr.field(&b"bar,baz"[..], &mut out[nout..]); +nout += n; +let (_, n) = wtr.terminator(&mut out[nout..]); +nout += n; + +// Now write another record. +let (_, _, n) = wtr.field(&b"a \"b\" c"[..], &mut out[nout..]); +nout += n; +let (_, n) = wtr.delimiter(&mut out[nout..]); +nout += n; +let (_, _, n) = wtr.field(&b"quux"[..], &mut out[nout..]); +nout += n; + +// We must always call finish once done writing. +// This ensures that any closing quotes are written. +let (_, n) = wtr.finish(&mut out[nout..]); +nout += n; + +assert_eq!(&out[..nout], &b"\ +foo,\"bar,baz\" +\"a \"\"b\"\" c\",quux"[..]); +``` +*/ + +#![deny(missing_docs)] +#![no_std] + +pub use crate::reader::{ + ReadFieldNoCopyResult, ReadFieldResult, ReadRecordNoCopyResult, + ReadRecordResult, Reader, ReaderBuilder, +}; +pub use crate::writer::{ + is_non_numeric, quote, WriteResult, Writer, WriterBuilder, +}; + +mod reader; +mod writer; + +/// A record terminator. +/// +/// Use this to specify the record terminator while parsing CSV. The default is +/// CRLF, which treats `\r`, `\n` or `\r\n` as a single record terminator. +#[derive(Clone, Copy, Debug)] +pub enum Terminator { + /// Parses `\r`, `\n` or `\r\n` as a single record terminator. + CRLF, + /// Parses the byte given as a record terminator. + Any(u8), + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +impl Terminator { + /// Checks whether the terminator is set to CRLF. 
+ fn is_crlf(&self) -> bool { + match *self { + Terminator::CRLF => true, + Terminator::Any(_) => false, + _ => unreachable!(), + } + } + + fn equals(&self, other: u8) -> bool { + match *self { + Terminator::CRLF => other == b'\r' || other == b'\n', + Terminator::Any(b) => other == b, + _ => unreachable!(), + } + } +} + +impl Default for Terminator { + fn default() -> Terminator { + Terminator::CRLF + } +} + +/// The quoting style to use when writing CSV data. +#[derive(Clone, Copy, Debug)] +pub enum QuoteStyle { + /// This puts quotes around every field. Always. + Always, + /// This puts quotes around fields only when necessary. + /// + /// They are necessary when fields contain a quote, delimiter or record + /// terminator. Quotes are also necessary when writing an empty record + /// (which is indistinguishable from a record with one empty field). + /// + /// This is the default. + Necessary, + /// This puts quotes around all fields that are non-numeric. Namely, when + /// writing a field that does not parse as a valid float or integer, then + /// quotes will be used even if they aren't strictly necessary. + NonNumeric, + /// This *never* writes quotes, even if it would produce invalid CSV data. + Never, + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +impl Default for QuoteStyle { + fn default() -> QuoteStyle { + QuoteStyle::Necessary + } +} diff --git a/src/reader.rs b/src/reader.rs new file mode 100644 index 0000000..dbd6dc3 --- /dev/null +++ b/src/reader.rs @@ -0,0 +1,2005 @@ +use core::fmt; + +use crate::Terminator; + +// BE ADVISED +// +// This may just be one of the more complicated CSV parsers you'll come across. +// The implementation never allocates and consists of both a functional NFA +// parser and a DFA parser. The DFA parser is the work horse and we could elide +// much of the work involved in making the NFA parser work, but the NFA parser +// is much easier to debug. The NFA parser is tested alongside the DFA parser, +// so they should never be out of sync. +// +// The basic structure of the implementation is to encode the NFA parser as +// an explicit state machine in code. The DFA is then generated by populating +// a transition table on the stack by exhaustively enumerating all possible +// states on all possible inputs (this is possible because the number of states +// and the number of inputs is very small). +// +// Note that some pieces of the NFA parser (such as the NFA state machine) are +// required. In particular, the translation from the NFA to the DFA depends on +// the configuration of the CSV parser as given by the caller, and indeed, this +// is one of the key performance benefits of the DFA: it doesn't have any +// overhead (other than a bigger transition table) associated with the number +// of configuration options. +// +// ADVICE FOR HACKERS +// +// This code is too clever for its own good. As such, changes to some parts of +// the code may have a non-obvious impact on other parts. This is mostly +// motivated by trying to keep the DFA transition table as small as possible, +// since it is stored on the stack. Here are some tips that may save you some +// time: +// +// * If you add a new NFA state, then you also need to consider how it impacts +// the DFA. 
If all of the incoming transitions into an NFA state are +// epsilon transitions, then it probably isn't materialized in the DFA. +// If the NFA state indicates that a field or a record has been parsed, then +// it should be considered final. Let the comments in `NfaState` be your +// guide. +// * If you add a new configuration knob to the parser, then you may need to +// modify the `TRANS_CLASSES` constant below. The `TRANS_CLASSES` constant +// indicates the total number of discriminating bytes in the DFA. And if you +// modify `TRANS_CLASSES`, you probably also need to modify `build_dfa` to +// add a new class. For example, in order to add parsing support for +// comments, I bumped `TRANS_CLASSES` from `6` to `7` and added the comment +// byte (if one exists) to the list of classes in `build_dfa`. +// * The special DFA start state doubles as the final state once all input +// from the caller has been exhausted. We must be careful to guard this +// case analysis on whether the input is actually exhausted, since the start +// state is an otherwise valid state. + +/// A pull based CSV reader. +/// +/// This reader parses CSV data using a finite state machine. Callers can +/// extract parsed data incrementally using one of the `read` methods. +/// +/// Note that this CSV reader is somewhat encoding agnostic. The source data +/// needs to be at least ASCII compatible. There is no support for specifying +/// the full gamut of Unicode delimiters/terminators/quotes/escapes. Instead, +/// any byte can be used, although callers probably want to stick to the ASCII +/// subset (`<= 0x7F`). +/// +/// # Usage +/// +/// A reader has two different ways to read CSV data, each with their own +/// trade offs. +/// +/// * `read_field` - Copies a single CSV field into an output buffer while +/// unescaping quotes. This is simple to use and doesn't require storing an +/// entire record contiguously in memory, but it is slower. +/// * `read_record` - Copies an entire CSV record into an output buffer while +/// unescaping quotes. The ending positions of each field are copied into +/// an additional buffer. This is harder to use and requires larger output +/// buffers, but it is faster than `read_field` since it amortizes more +/// costs. +/// +/// # RFC 4180 +/// +/// [RFC 4180](https://tools.ietf.org/html/rfc4180) +/// is the closest thing to a specification for CSV data. Unfortunately, +/// CSV data that is seen in the wild can vary significantly. Often, the CSV +/// data is outright invalid. Instead of fixing the producers of bad CSV data, +/// we have seen fit to make consumers much more flexible in what they accept. +/// This reader continues that tradition, and therefore, isn't technically +/// compliant with RFC 4180. In particular, this reader will never return an +/// error and will always find *a* parse. +/// +/// Here are some detailed differences from RFC 4180: +/// +/// * CRLF, LF and CR are each treated as a single record terminator by +/// default. +/// * Records are permitted to be of varying length. +/// * Empty lines (that do not include other whitespace) are ignored. +#[derive(Clone, Debug)] +pub struct Reader { + /// A table-based DFA for parsing CSV. + dfa: Dfa, + /// The current DFA state, if the DFA is used. + dfa_state: DfaState, + /// The current NFA state, if the NFA is used. + nfa_state: NfaState, + /// The delimiter that separates fields. + delimiter: u8, + /// The terminator that separates records. + term: Terminator, + /// The quotation byte. 
+    quote: u8,
+    /// Whether to recognize escaped quotes.
+    escape: Option<u8>,
+    /// Whether to recognize doubled quotes.
+    double_quote: bool,
+    /// If enabled, lines beginning with this byte are ignored.
+    comment: Option<u8>,
+    /// If enabled (the default), then quotes are respected. When disabled,
+    /// quotes are not treated specially.
+    quoting: bool,
+    /// Whether to use the NFA for parsing.
+    ///
+    /// Generally this is for debugging. There's otherwise no good reason
+    /// to avoid the DFA.
+    use_nfa: bool,
+    /// The current line number.
+    line: u64,
+    /// Whether this parser has ever read anything.
+    has_read: bool,
+    /// The current position in the output buffer when reading a record.
+    output_pos: usize,
+}
+
+impl Default for Reader {
+    fn default() -> Reader {
+        Reader {
+            dfa: Dfa::new(),
+            dfa_state: DfaState::start(),
+            nfa_state: NfaState::StartRecord,
+            delimiter: b',',
+            term: Terminator::default(),
+            quote: b'"',
+            escape: None,
+            double_quote: true,
+            comment: None,
+            quoting: true,
+            use_nfa: false,
+            line: 1,
+            has_read: false,
+            output_pos: 0,
+        }
+    }
+}
+
+/// Builds a CSV reader with various configuration knobs.
+///
+/// This builder can be used to tweak the field delimiter, record terminator
+/// and more for parsing CSV. Once a CSV `Reader` is built, its configuration
+/// cannot be changed.
+#[derive(Debug, Default)]
+pub struct ReaderBuilder {
+    rdr: Reader,
+}
+
+impl ReaderBuilder {
+    /// Create a new builder.
+    pub fn new() -> ReaderBuilder {
+        ReaderBuilder::default()
+    }
+
+    /// Build a CSV parser from this configuration.
+    pub fn build(&self) -> Reader {
+        let mut rdr = self.rdr.clone();
+        rdr.build_dfa();
+        rdr
+    }
+
+    /// The field delimiter to use when parsing CSV.
+    ///
+    /// The default is `b','`.
+    pub fn delimiter(&mut self, delimiter: u8) -> &mut ReaderBuilder {
+        self.rdr.delimiter = delimiter;
+        self
+    }
+
+    /// The record terminator to use when parsing CSV.
+    ///
+    /// A record terminator can be any single byte. The default is a special
+    /// value, `Terminator::CRLF`, which treats any occurrence of `\r`, `\n`
+    /// or `\r\n` as a single record terminator.
+    pub fn terminator(&mut self, term: Terminator) -> &mut ReaderBuilder {
+        self.rdr.term = term;
+        self
+    }
+
+    /// The quote character to use when parsing CSV.
+    ///
+    /// The default is `b'"'`.
+    pub fn quote(&mut self, quote: u8) -> &mut ReaderBuilder {
+        self.rdr.quote = quote;
+        self
+    }
+
+    /// The escape character to use when parsing CSV.
+    ///
+    /// In some variants of CSV, quotes are escaped using a special escape
+    /// character like `\` (instead of escaping quotes by doubling them).
+    ///
+    /// By default, recognizing these idiosyncratic escapes is disabled.
+    pub fn escape(&mut self, escape: Option<u8>) -> &mut ReaderBuilder {
+        self.rdr.escape = escape;
+        self
+    }
+
+    /// Enable double quote escapes.
+    ///
+    /// This is enabled by default, but it may be disabled. When disabled,
+    /// doubled quotes are not interpreted as escapes.
+    pub fn double_quote(&mut self, yes: bool) -> &mut ReaderBuilder {
+        self.rdr.double_quote = yes;
+        self
+    }
+
+    /// Enable or disable quoting.
+    ///
+    /// This is enabled by default, but it may be disabled. When disabled,
+    /// quotes are not treated specially.
+    pub fn quoting(&mut self, yes: bool) -> &mut ReaderBuilder {
+        self.rdr.quoting = yes;
+        self
+    }
+
+    /// The comment character to use when parsing CSV.
+    ///
+    /// If the start of a record begins with the byte given here, then that
+    /// line is ignored by the CSV parser.
+    ///
+    /// This is disabled by default.
+    pub fn comment(&mut self, comment: Option<u8>) -> &mut ReaderBuilder {
+        self.rdr.comment = comment;
+        self
+    }
+
+    /// A convenience method for specifying a configuration to read ASCII
+    /// delimited text.
+    ///
+    /// This sets the delimiter and record terminator to the ASCII unit
+    /// separator (`\x1F`) and record separator (`\x1E`), respectively.
+    pub fn ascii(&mut self) -> &mut ReaderBuilder {
+        self.delimiter(b'\x1F').terminator(Terminator::Any(b'\x1E'))
+    }
+
+    /// Enable or disable the NFA for parsing CSV.
+    ///
+    /// This is intended as a debugging aid. The NFA is always slower than
+    /// the DFA.
+    #[doc(hidden)]
+    pub fn nfa(&mut self, yes: bool) -> &mut ReaderBuilder {
+        self.rdr.use_nfa = yes;
+        self
+    }
+}
+
+/// The result of parsing at most one field from CSV data.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ReadFieldResult {
+    /// The caller provided input was exhausted before the end of a field or
+    /// record was found.
+    InputEmpty,
+    /// The caller provided output buffer was filled before an entire field
+    /// could be written to it.
+    OutputFull,
+    /// The end of a field was found.
+    ///
+    /// Note that when `record_end` is true, then the end of this field also
+    /// corresponds to the end of a record.
+    Field {
+        /// Whether this was the last field in a record or not.
+        record_end: bool,
+    },
+    /// All CSV data has been read.
+    ///
+    /// This state can only be returned when an empty input buffer is provided
+    /// by the caller.
+    End,
+}
+
+impl ReadFieldResult {
+    fn from_nfa(
+        state: NfaState,
+        inpdone: bool,
+        outdone: bool,
+    ) -> ReadFieldResult {
+        match state {
+            NfaState::End => ReadFieldResult::End,
+            NfaState::EndRecord | NfaState::CRLF => {
+                ReadFieldResult::Field { record_end: true }
+            }
+            NfaState::EndFieldDelim => {
+                ReadFieldResult::Field { record_end: false }
+            }
+            _ => {
+                assert!(!state.is_field_final());
+                if !inpdone && outdone {
+                    ReadFieldResult::OutputFull
+                } else {
+                    ReadFieldResult::InputEmpty
+                }
+            }
+        }
+    }
+}
+
+/// The result of parsing at most one field from CSV data while ignoring the
+/// output.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ReadFieldNoCopyResult {
+    /// The caller provided input was exhausted before the end of a field or
+    /// record was found.
+    InputEmpty,
+    /// The end of a field was found.
+    ///
+    /// Note that when `record_end` is true, then the end of this field also
+    /// corresponds to the end of a record.
+    Field {
+        /// Whether this was the last field in a record or not.
+        record_end: bool,
+    },
+    /// All CSV data has been read.
+    ///
+    /// This state can only be returned when an empty input buffer is provided
+    /// by the caller.
+    End,
+}
+
+/// The result of parsing at most one record from CSV data.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ReadRecordResult {
+    /// The caller provided input was exhausted before the end of a record was
+    /// found.
+    InputEmpty,
+    /// The caller provided output buffer was filled before an entire field
+    /// could be written to it.
+    OutputFull,
+    /// The caller provided output buffer of field end positions was filled
+    /// before the next field could be parsed.
+    OutputEndsFull,
+    /// The end of a record was found.
+    Record,
+    /// All CSV data has been read.
+    ///
+    /// This state can only be returned when an empty input buffer is provided
+    /// by the caller.
+ End, +} + +impl ReadRecordResult { + fn is_record(&self) -> bool { + *self == ReadRecordResult::Record + } + + fn from_nfa( + state: NfaState, + inpdone: bool, + outdone: bool, + endsdone: bool, + ) -> ReadRecordResult { + match state { + NfaState::End => ReadRecordResult::End, + NfaState::EndRecord | NfaState::CRLF => ReadRecordResult::Record, + _ => { + assert!(!state.is_record_final()); + if !inpdone && outdone { + ReadRecordResult::OutputFull + } else if !inpdone && endsdone { + ReadRecordResult::OutputEndsFull + } else { + ReadRecordResult::InputEmpty + } + } + } + } +} + +/// The result of parsing at most one record from CSV data while ignoring +/// output. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ReadRecordNoCopyResult { + /// The caller provided input was exhausted before the end of a record was + /// found. + InputEmpty, + /// The end of a record was found. + Record, + /// All CSV data has been read. + /// + /// This state can only be returned when an empty input buffer is provided + /// by the caller. + End, +} + +/// What should be done with input bytes during an NFA transition +#[derive(Clone, Debug, Eq, PartialEq)] +enum NfaInputAction { + // Do not consume an input byte + Epsilon, + // Copy input byte to a caller-provided output buffer + CopyToOutput, + // Consume but do not copy input byte (for example, seeing a field + // delimiter will consume an input byte but should not copy it to the + // output buffer. + Discard, +} + +/// An NFA state is a state that can be visited in the NFA parser. +/// +/// Given the simplicity of the machine, a subset of NFA states double as DFA +/// states. NFA states that only have incoming epsilon transitions are +/// optimized out when converting the machine to a DFA. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +enum NfaState { + // These states aren't used in the DFA, so we + // assign them meaningless numbers. + EndFieldTerm = 200, + InRecordTerm = 201, + End = 202, + + // All states below are DFA states. + StartRecord = 0, + StartField = 1, + InField = 2, + InQuotedField = 3, + InEscapedQuote = 4, + InDoubleEscapedQuote = 5, + InComment = 6, + // All states below are "final field" states. + // Namely, they indicate that a field has been parsed. + EndFieldDelim = 7, + // All states below are "final record" states. + // Namely, they indicate that a record has been parsed. + EndRecord = 8, + CRLF = 9, +} + +/// A list of NFA states that have an explicit representation in the DFA. +const NFA_STATES: &'static [NfaState] = &[ + NfaState::StartRecord, + NfaState::StartField, + NfaState::EndFieldDelim, + NfaState::InField, + NfaState::InQuotedField, + NfaState::InEscapedQuote, + NfaState::InDoubleEscapedQuote, + NfaState::InComment, + NfaState::EndRecord, + NfaState::CRLF, +]; + +impl NfaState { + /// Returns true if this state indicates that a field has been parsed. + fn is_field_final(&self) -> bool { + match *self { + NfaState::End + | NfaState::EndRecord + | NfaState::CRLF + | NfaState::EndFieldDelim => true, + _ => false, + } + } + + /// Returns true if this state indicates that a record has been parsed. + fn is_record_final(&self) -> bool { + match *self { + NfaState::End | NfaState::EndRecord | NfaState::CRLF => true, + _ => false, + } + } +} + +impl Reader { + /// Create a new CSV reader with a default parser configuration. + pub fn new() -> Reader { + ReaderBuilder::new().build() + } + + /// Reset the parser such that it behaves as if it had never been used. 
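+    ///
+    /// (The parser configuration, including the compiled DFA, is preserved;
+    /// only positional state, such as the current line number, is cleared.)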
+    ///
+    /// This may be useful when reading CSV data in a random access pattern.
+    pub fn reset(&mut self) {
+        self.dfa_state = self.dfa.new_state(NfaState::StartRecord);
+        self.nfa_state = NfaState::StartRecord;
+        self.line = 1;
+        self.has_read = false;
+    }
+
+    /// Return the current line number as measured by the number of occurrences
+    /// of `\n`.
+    ///
+    /// Line numbers start at `1` and are reset when `reset` is called.
+    pub fn line(&self) -> u64 {
+        self.line
+    }
+
+    /// Set the line number.
+    ///
+    /// This is useful after a call to `reset` where the caller knows the
+    /// line number from some additional context.
+    pub fn set_line(&mut self, line: u64) {
+        self.line = line;
+    }
+
+    /// Parse a single CSV field in `input` and copy field data to `output`.
+    ///
+    /// This routine requires a caller provided buffer of CSV data as the
+    /// `input` and a caller provided buffer, `output`, in which to store field
+    /// data extracted from `input`. The field data copied to `output` will
+    /// have its quotes unescaped.
+    ///
+    /// Calling this routine parses at most a single field and returns
+    /// three values indicating the state of the parser. The first value, a
+    /// `ReadFieldResult`, tells the caller what to do next. For example, if
+    /// the entire input was read or if the output buffer was filled before
+    /// a full field had been read, then `ReadFieldResult::InputEmpty` or
+    /// `ReadFieldResult::OutputFull` is returned, respectively. See the
+    /// documentation for `ReadFieldResult` for more details.
+    ///
+    /// The other two values returned correspond to the number of bytes
+    /// read from `input` and written to `output`, respectively.
+    ///
+    /// # Termination
+    ///
+    /// This reader interprets an empty `input` buffer as an indication that
+    /// there is no CSV data left to read. Namely, when the caller has
+    /// exhausted all CSV data, the caller should continue to call `read` with
+    /// an empty input buffer until `ReadFieldResult::End` is returned.
+    ///
+    /// # Errors
+    ///
+    /// This CSV reader can never return an error. Instead, it prefers *a*
+    /// parse over *no* parse.
+    pub fn read_field(
+        &mut self,
+        input: &[u8],
+        output: &mut [u8],
+    ) -> (ReadFieldResult, usize, usize) {
+        let (input, bom_nin) = self.strip_utf8_bom(input);
+        let (res, nin, nout) = if self.use_nfa {
+            self.read_field_nfa(input, output)
+        } else {
+            self.read_field_dfa(input, output)
+        };
+        self.has_read = true;
+        (res, nin + bom_nin, nout)
+    }
+
+    /// Parse a single CSV record in `input` and copy each field contiguously
+    /// to `output`, with the end position of each field written to `ends`.
+    ///
+    /// **NOTE**: This method is more cumbersome to use than `read_field`, but
+    /// it can be faster since it amortizes more work.
+    ///
+    /// This routine requires a caller provided buffer of CSV data as the
+    /// `input` and two caller provided buffers to store the unescaped field
+    /// data (`output`) and the end position of each field in the record
+    /// (`ends`).
+    ///
+    /// Calling this routine parses at most a single record and returns four
+    /// values indicating the state of the parser. The first value, a
+    /// `ReadRecordResult`, tells the caller what to do next. For example, if
+    /// the entire input was read or if the output buffer was filled before a
+    /// full field had been read, then `ReadRecordResult::InputEmpty` or
+    /// `ReadRecordResult::OutputFull` is returned, respectively.
Similarly, if + /// the `ends` buffer is full, then `ReadRecordResult::OutputEndsFull` is + /// returned. See the documentation for `ReadRecordResult` for more + /// details. + /// + /// The other three values correspond to the number of bytes read from + /// `input`, the number of bytes written to `output` and the number of + /// end positions written to `ends`, respectively. + /// + /// The end positions written to `ends` are constructed as if there was + /// a single contiguous buffer in memory containing the entire row, even + /// if `ReadRecordResult::OutputFull` was returned in the middle of reading + /// a row. + /// + /// # Termination + /// + /// This reader interprets an empty `input` buffer as an indication that + /// there is no CSV data left to read. Namely, when the caller has + /// exhausted all CSV data, the caller should continue to call `read` with + /// an empty input buffer until `ReadRecordResult::End` is returned. + /// + /// # Errors + /// + /// This CSV reader can never return an error. Instead, it prefers *a* + /// parse over *no* parse. + pub fn read_record( + &mut self, + input: &[u8], + output: &mut [u8], + ends: &mut [usize], + ) -> (ReadRecordResult, usize, usize, usize) { + let (input, bom_nin) = self.strip_utf8_bom(input); + let (res, nin, nout, nend) = if self.use_nfa { + self.read_record_nfa(input, output, ends) + } else { + self.read_record_dfa(input, output, ends) + }; + self.has_read = true; + (res, nin + bom_nin, nout, nend) + } + + /// Strip off a possible UTF-8 BOM at the start of a file. Quick note that + /// this method will fail to strip off the BOM if only part of the BOM is + /// buffered. Hopefully that won't happen very often. + fn strip_utf8_bom<'a>(&self, input: &'a [u8]) -> (&'a [u8], usize) { + let (input, nin) = if { + !self.has_read + && input.len() >= 3 + && &input[0..3] == b"\xef\xbb\xbf" + } { + (&input[3..], 3) + } else { + (input, 0) + }; + (input, nin) + } + + #[inline(always)] + fn read_record_dfa( + &mut self, + input: &[u8], + output: &mut [u8], + ends: &mut [usize], + ) -> (ReadRecordResult, usize, usize, usize) { + if input.is_empty() { + let s = self.transition_final_dfa(self.dfa_state); + let res = + self.dfa.new_read_record_result(s, true, false, false, false); + // This part is a little tricky. When reading the final record, + // the last result the caller will get is an InputEmpty, and while + // they'll have everything they need in `output`, they'll be + // missing the final end position of the final field in `ends`. + // We insert that here, but we must take care to handle the case + // where `ends` doesn't have enough space. If it doesn't have + // enough space, then we also can't transition to the next state. 
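+            // (Note that the `OutputEndsFull` arm below returns without
+            // updating `self.dfa_state`, so the caller can retry with a
+            // larger `ends` buffer.)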
+            return match res {
+                ReadRecordResult::Record => {
+                    if ends.is_empty() {
+                        return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
+                    }
+                    self.dfa_state = s;
+                    ends[0] = self.output_pos;
+                    self.output_pos = 0;
+                    (res, 0, 0, 1)
+                }
+                _ => {
+                    self.dfa_state = s;
+                    (res, 0, 0, 0)
+                }
+            };
+        }
+        if output.is_empty() {
+            return (ReadRecordResult::OutputFull, 0, 0, 0);
+        }
+        if ends.is_empty() {
+            return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
+        }
+        let (mut nin, mut nout, mut nend) = (0, 0, 0);
+        let mut state = self.dfa_state;
+        while nin < input.len() && nout < output.len() && nend < ends.len() {
+            let (s, has_out) = self.dfa.get_output(state, input[nin]);
+            self.line += (input[nin] == b'\n') as u64;
+            state = s;
+            if has_out {
+                output[nout] = input[nin];
+                nout += 1;
+            }
+            nin += 1;
+            if state >= self.dfa.final_field {
+                ends[nend] = self.output_pos + nout;
+                nend += 1;
+                if state > self.dfa.final_field {
+                    break;
+                }
+            }
+            if state == self.dfa.in_field || state == self.dfa.in_quoted {
+                self.dfa
+                    .classes
+                    .scan_and_copy(input, &mut nin, output, &mut nout);
+            }
+        }
+        let res = self.dfa.new_read_record_result(
+            state,
+            false,
+            nin >= input.len(),
+            nout >= output.len(),
+            nend >= ends.len(),
+        );
+        self.dfa_state = state;
+        if res.is_record() {
+            self.output_pos = 0;
+        } else {
+            self.output_pos += nout;
+        }
+        (res, nin, nout, nend)
+    }
+
+    #[inline(always)]
+    fn read_field_dfa(
+        &mut self,
+        input: &[u8],
+        output: &mut [u8],
+    ) -> (ReadFieldResult, usize, usize) {
+        if input.is_empty() {
+            self.dfa_state = self.transition_final_dfa(self.dfa_state);
+            let res = self.dfa.new_read_field_result(
+                self.dfa_state,
+                true,
+                false,
+                false,
+            );
+            return (res, 0, 0);
+        }
+        if output.is_empty() {
+            return (ReadFieldResult::OutputFull, 0, 0);
+        }
+        let (mut nin, mut nout) = (0, 0);
+        let mut state = self.dfa_state;
+        while nin < input.len() && nout < output.len() {
+            let b = input[nin];
+            self.line += (b == b'\n') as u64;
+            let (s, has_out) = self.dfa.get_output(state, b);
+            state = s;
+            if has_out {
+                output[nout] = b;
+                nout += 1;
+            }
+            nin += 1;
+            if state >= self.dfa.final_field {
+                break;
+            }
+        }
+        let res = self.dfa.new_read_field_result(
+            state,
+            false,
+            nin >= input.len(),
+            nout >= output.len(),
+        );
+        self.dfa_state = state;
+        (res, nin, nout)
+    }
+
+    /// Perform the final state transition, i.e., when the caller indicates
+    /// that the input has been exhausted.
+    fn transition_final_dfa(&self, state: DfaState) -> DfaState {
+        // If we've already emitted a record or think we're ready to start
+        // parsing a new record, then we should sink into the final state
+        // and never move from there. (pro-tip: the start state doubles as
+        // the final state!)
+        if state >= self.dfa.final_record || state.is_start() {
+            self.dfa.new_state_final_end()
+        } else {
+            self.dfa.new_state_final_record()
+        }
+    }
+
+    /// Write the transition tables for the DFA based on this parser's
+    /// configuration.
+    fn build_dfa(&mut self) {
+        // A naive DFA transition table has
+        // `cells = (# number of states) * (# size of alphabet)`. While we
+        // could get away with that, the table would have `10 * 256 = 2560`
+        // entries. Even worse, in order to avoid a multiplication instruction
+        // when computing the next transition, we store the starting index of
+        // each state's row, which would not be representable in a single byte.
+        // So we'd need a `u16`, which doubles our transition table size to
+        // ~5KB.
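+        // (Concretely: 2,560 entries * 2 bytes per entry = 5,120 bytes.)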
This is a lot to put on the stack, even though it probably + // fits in the L1 cache of most modern CPUs. + // + // To avoid this, we note that while our "true" alphabet + // has 256 distinct possibilities, the DFA itself is only + // discriminatory on a very small subset of that alphabet. For + // example, assuming neither `a` nor `b` are set as special + // quote/comment/escape/delimiter/terminator bytes, they are otherwise + // indistinguishable to the DFA, so it would be OK to treat them as + // if they were equivalent. That is, they are in the same equivalence + // class. + // + // As it turns out, using this logic, we can shrink our effective + // alphabet down to 7 equivalence classes: + // + // 1. The field delimiter. + // 2. The record terminator. + // 3. If the record terminator is CRLF, then CR and LF are + // distinct equivalence classes. + // 4. The quote byte. + // 5. The escape byte. + // 6. The comment byte. + // 7. Everything else. + // + // We add those equivalence classes here. If more configuration knobs + // are added to the parser with more discriminating bytes, then this + // logic will need to be adjusted further. + // + // Even though this requires an extra bit of indirection when computing + // the next transition, microbenchmarks say that it doesn't make much + // of a difference. Perhaps because everything fits into the L1 cache. + self.dfa.classes.add(self.delimiter); + if self.quoting { + self.dfa.classes.add(self.quote); + if let Some(escape) = self.escape { + self.dfa.classes.add(escape); + } + } + if let Some(comment) = self.comment { + self.dfa.classes.add(comment); + } + match self.term { + Terminator::Any(b) => self.dfa.classes.add(b), + Terminator::CRLF => { + self.dfa.classes.add(b'\r'); + self.dfa.classes.add(b'\n'); + } + _ => unreachable!(), + } + // Build the DFA transition table by computing the DFA state for all + // possible combinations of state and input byte. + for &state in NFA_STATES { + for c in (0..256).map(|c| c as u8) { + let mut nfa_result = (state, NfaInputAction::Epsilon); + // Consume NFA states until we hit a non-epsilon transition. + while nfa_result.0 != NfaState::End + && nfa_result.1 == NfaInputAction::Epsilon + { + nfa_result = self.transition_nfa(nfa_result.0, c); + } + let from = self.dfa.new_state(state); + let to = self.dfa.new_state(nfa_result.0); + self.dfa.set( + from, + c, + to, + nfa_result.1 == NfaInputAction::CopyToOutput, + ); + } + } + self.dfa_state = self.dfa.new_state(NfaState::StartRecord); + self.dfa.finish(); + } + + // The NFA implementation follows. The transition_final_nfa and + // transition_nfa methods are required for the DFA to operate. The + // rest are included for completeness (and debugging). Note that this + // NFA implementation is included in most of the CSV parser tests below. 
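+    // As a quick orientation, hand-tracing `transition_nfa` below on the
+    // input `a,b\n` visits:
+    //
+    //   StartRecord -> StartField      (epsilon)
+    //   StartField -> InField          (copy `a`)
+    //   InField -> EndFieldDelim       (discard `,`)
+    //   EndFieldDelim -> StartField    (epsilon)
+    //   StartField -> InField          (copy `b`)
+    //   InField -> EndFieldTerm        (epsilon on `\n`)
+    //   EndFieldTerm -> InRecordTerm   (epsilon)
+    //   InRecordTerm -> EndRecord      (discard `\n`)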
+ + #[inline(always)] + fn read_record_nfa( + &mut self, + input: &[u8], + output: &mut [u8], + ends: &mut [usize], + ) -> (ReadRecordResult, usize, usize, usize) { + if input.is_empty() { + let s = self.transition_final_nfa(self.nfa_state); + let res = ReadRecordResult::from_nfa(s, false, false, false); + return match res { + ReadRecordResult::Record => { + if ends.is_empty() { + return (ReadRecordResult::OutputEndsFull, 0, 0, 0); + } + self.nfa_state = s; + ends[0] = self.output_pos; + self.output_pos = 0; + (res, 0, 0, 1) + } + _ => { + self.nfa_state = s; + (res, 0, 0, 0) + } + }; + } + if output.is_empty() { + return (ReadRecordResult::OutputFull, 0, 0, 0); + } + if ends.is_empty() { + return (ReadRecordResult::OutputEndsFull, 0, 0, 0); + } + let (mut nin, mut nout, mut nend) = (0, self.output_pos, 0); + let mut state = self.nfa_state; + while nin < input.len() && nout < output.len() && nend < ends.len() { + let (s, io) = self.transition_nfa(state, input[nin]); + match io { + NfaInputAction::CopyToOutput => { + output[nout] = input[nin]; + nout += 1; + nin += 1; + } + NfaInputAction::Discard => { + nin += 1; + } + NfaInputAction::Epsilon => {} + } + state = s; + if state.is_field_final() { + ends[nend] = nout; + nend += 1; + if state != NfaState::EndFieldDelim { + break; + } + } + } + let res = ReadRecordResult::from_nfa( + state, + nin >= input.len(), + nout >= output.len(), + nend >= ends.len(), + ); + self.nfa_state = state; + self.output_pos = if res.is_record() { 0 } else { nout }; + (res, nin, nout, nend) + } + + #[inline(always)] + fn read_field_nfa( + &mut self, + input: &[u8], + output: &mut [u8], + ) -> (ReadFieldResult, usize, usize) { + if input.is_empty() { + self.nfa_state = self.transition_final_nfa(self.nfa_state); + let res = ReadFieldResult::from_nfa(self.nfa_state, false, false); + return (res, 0, 0); + } + if output.is_empty() { + // If the output buffer is empty, then we can never make progress, + // so just quit now. + return (ReadFieldResult::OutputFull, 0, 0); + } + let (mut nin, mut nout) = (0, 0); + let mut state = self.nfa_state; + while nin < input.len() && nout < output.len() { + let (s, io) = self.transition_nfa(state, input[nin]); + match io { + NfaInputAction::CopyToOutput => { + output[nout] = input[nin]; + nout += 1; + nin += 1; + } + NfaInputAction::Discard => { + nin += 1; + } + NfaInputAction::Epsilon => (), + } + state = s; + if state.is_field_final() { + break; + } + } + let res = ReadFieldResult::from_nfa( + state, + nin >= input.len(), + nout >= output.len(), + ); + self.nfa_state = state; + (res, nin, nout) + } + + /// Compute the final NFA transition after all caller-provided input has + /// been exhausted. + #[inline(always)] + fn transition_final_nfa(&self, state: NfaState) -> NfaState { + use self::NfaState::*; + match state { + End | StartRecord | EndRecord | InComment | CRLF => End, + StartField | EndFieldDelim | EndFieldTerm | InField + | InQuotedField | InEscapedQuote | InDoubleEscapedQuote + | InRecordTerm => EndRecord, + } + } + + /// Compute the next NFA state given the current NFA state and the current + /// input byte. + /// + /// This returns the next NFA state along with an NfaInputAction that + /// indicates what should be done with the input byte (nothing for an epsilon + /// transition, copied to a caller provided output buffer, or discarded). 
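+    ///
+    /// For example, in the `StartField` state, the configured quote byte
+    /// transitions to `InQuotedField` and is discarded, while an ordinary
+    /// byte transitions to `InField` and is copied to the output.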
+    #[inline(always)]
+    fn transition_nfa(
+        &self,
+        state: NfaState,
+        c: u8,
+    ) -> (NfaState, NfaInputAction) {
+        use self::NfaState::*;
+        match state {
+            End => (End, NfaInputAction::Epsilon),
+            StartRecord => {
+                if self.term.equals(c) {
+                    (StartRecord, NfaInputAction::Discard)
+                } else if self.comment == Some(c) {
+                    (InComment, NfaInputAction::Discard)
+                } else {
+                    (StartField, NfaInputAction::Epsilon)
+                }
+            }
+            EndRecord => (StartRecord, NfaInputAction::Epsilon),
+            StartField => {
+                if self.quoting && self.quote == c {
+                    (InQuotedField, NfaInputAction::Discard)
+                } else if self.delimiter == c {
+                    (EndFieldDelim, NfaInputAction::Discard)
+                } else if self.term.equals(c) {
+                    (EndFieldTerm, NfaInputAction::Epsilon)
+                } else {
+                    (InField, NfaInputAction::CopyToOutput)
+                }
+            }
+            EndFieldDelim => (StartField, NfaInputAction::Epsilon),
+            EndFieldTerm => (InRecordTerm, NfaInputAction::Epsilon),
+            InField => {
+                if self.delimiter == c {
+                    (EndFieldDelim, NfaInputAction::Discard)
+                } else if self.term.equals(c) {
+                    (EndFieldTerm, NfaInputAction::Epsilon)
+                } else {
+                    (InField, NfaInputAction::CopyToOutput)
+                }
+            }
+            InQuotedField => {
+                if self.quoting && self.quote == c {
+                    (InDoubleEscapedQuote, NfaInputAction::Discard)
+                } else if self.quoting && self.escape == Some(c) {
+                    (InEscapedQuote, NfaInputAction::Discard)
+                } else {
+                    (InQuotedField, NfaInputAction::CopyToOutput)
+                }
+            }
+            InEscapedQuote => (InQuotedField, NfaInputAction::CopyToOutput),
+            InDoubleEscapedQuote => {
+                if self.quoting && self.double_quote && self.quote == c {
+                    (InQuotedField, NfaInputAction::CopyToOutput)
+                } else if self.delimiter == c {
+                    (EndFieldDelim, NfaInputAction::Discard)
+                } else if self.term.equals(c) {
+                    (EndFieldTerm, NfaInputAction::Epsilon)
+                } else {
+                    (InField, NfaInputAction::CopyToOutput)
+                }
+            }
+            InComment => {
+                if b'\n' == c {
+                    (StartRecord, NfaInputAction::Discard)
+                } else {
+                    (InComment, NfaInputAction::Discard)
+                }
+            }
+            InRecordTerm => {
+                if self.term.is_crlf() && b'\r' == c {
+                    (CRLF, NfaInputAction::Discard)
+                } else {
+                    (EndRecord, NfaInputAction::Discard)
+                }
+            }
+            CRLF => {
+                if b'\n' == c {
+                    (StartRecord, NfaInputAction::Discard)
+                } else {
+                    (StartRecord, NfaInputAction::Epsilon)
+                }
+            }
+        }
+    }
+}
+
+/// The number of slots in the DFA transition table.
+///
+/// This number is computed by multiplying the maximum number of transition
+/// classes (7) by the total number of NFA states that are used in the DFA
+/// (10).
+///
+/// The number of transition classes is determined by an equivalence class of
+/// bytes, where every byte in the same equivalence class is
+/// indistinguishable from any other byte with respect to the DFA. For example,
+/// if neither `a` nor `b` are specified as a delimiter/quote/terminator/escape,
+/// then the DFA will never discriminate between `a` or `b`, so they can
+/// effectively be treated as identical. This reduces storage space
+/// substantially.
+///
+/// The total number of NFA states (13) is greater than the total number of
+/// NFA states that are in the DFA. In particular, any NFA state that can only
+/// be reached by epsilon transitions will never have explicit usage in the
+/// DFA.
+const TRANS_CLASSES: usize = 7;
+const DFA_STATES: usize = 10;
+const TRANS_SIZE: usize = TRANS_CLASSES * DFA_STATES;
+
+/// The size of the per-byte equivalence class map: one entry for each
+/// possible byte value.
+const CLASS_SIZE: usize = 256;
+
+/// A representation of a DFA.
+///
+/// For the most part, this is a transition table, but various optimizations
+/// have been applied to reduce its memory footprint.
+struct Dfa {
+    /// The core transition table. Each row corresponds to the transitions for
+    /// each input equivalence class. (Input bytes are mapped to their
+    /// corresponding equivalence class with the `classes` map.)
+    ///
+    /// DFA states are represented as an index corresponding to the start of
+    /// its row in this table.
+    trans: [DfaState; TRANS_SIZE],
+    /// A table with the same layout as `trans`, except its values indicate
+    /// whether a particular `(state, equivalence class)` pair should emit an
+    /// output byte.
+    has_output: [bool; TRANS_SIZE],
+    /// A map from input byte to equivalence class.
+    ///
+    /// This is responsible for reducing the effective alphabet size from
+    /// 256 to `TRANS_CLASSES`.
+    classes: DfaClasses,
+    /// The DFA state corresponding to being inside an unquoted field.
+    in_field: DfaState,
+    /// The DFA state corresponding to being inside a quoted field.
+    in_quoted: DfaState,
+    /// The minimum DFA state that indicates a field has been parsed. All DFA
+    /// states greater than this are also final-field states.
+    final_field: DfaState,
+    /// The minimum DFA state that indicates a record has been parsed. All DFA
+    /// states greater than this are also final-record states.
+    final_record: DfaState,
+}
+
+impl Dfa {
+    fn new() -> Dfa {
+        Dfa {
+            trans: [DfaState(0); TRANS_SIZE],
+            has_output: [false; TRANS_SIZE],
+            classes: DfaClasses::new(),
+            in_field: DfaState(0),
+            in_quoted: DfaState(0),
+            final_field: DfaState(0),
+            final_record: DfaState(0),
+        }
+    }
+
+    fn new_state(&self, nfa_state: NfaState) -> DfaState {
+        let nclasses = self.classes.num_classes() as u8;
+        let idx = (nfa_state as u8).checked_mul(nclasses).unwrap();
+        DfaState(idx)
+    }
+
+    fn new_state_final_end(&self) -> DfaState {
+        self.new_state(NfaState::StartRecord)
+    }
+
+    fn new_state_final_record(&self) -> DfaState {
+        self.new_state(NfaState::EndRecord)
+    }
+
+    fn get_output(&self, state: DfaState, c: u8) -> (DfaState, bool) {
+        let cls = self.classes.classes[c as usize];
+        let idx = state.0 as usize + cls as usize;
+        (self.trans[idx], self.has_output[idx])
+    }
+
+    fn set(&mut self, from: DfaState, c: u8, to: DfaState, output: bool) {
+        let cls = self.classes.classes[c as usize];
+        let idx = from.0 as usize + cls as usize;
+        self.trans[idx] = to;
+        self.has_output[idx] = output;
+    }
+
+    fn finish(&mut self) {
+        self.in_field = self.new_state(NfaState::InField);
+        self.in_quoted = self.new_state(NfaState::InQuotedField);
+        self.final_field = self.new_state(NfaState::EndFieldDelim);
+        self.final_record = self.new_state(NfaState::EndRecord);
+    }
+
+    fn new_read_field_result(
+        &self,
+        state: DfaState,
+        is_final_trans: bool,
+        inpdone: bool,
+        outdone: bool,
+    ) -> ReadFieldResult {
+        if state >= self.final_record {
+            ReadFieldResult::Field { record_end: true }
+        } else if state == self.final_field {
+            ReadFieldResult::Field { record_end: false }
+        } else if is_final_trans && state.is_start() {
+            ReadFieldResult::End
+        } else {
+            debug_assert!(state < self.final_field);
+            if !inpdone && outdone {
+                ReadFieldResult::OutputFull
+            } else {
+                ReadFieldResult::InputEmpty
+            }
+        }
+    }
+
+    fn new_read_record_result(
+        &self,
+        state: DfaState,
+        is_final_trans: bool,
+        inpdone: bool,
+        outdone: bool,
+        endsdone: bool,
+    ) -> ReadRecordResult {
+        if state >= self.final_record {
+            ReadRecordResult::Record
+        } else if is_final_trans && state.is_start() {
+            ReadRecordResult::End
+        } else {
+            debug_assert!(state < self.final_record);
+            if !inpdone && outdone {
+                ReadRecordResult::OutputFull
+            } else if !inpdone && endsdone {
+                ReadRecordResult::OutputEndsFull
+            } else {
+                ReadRecordResult::InputEmpty
+            }
+        }
+    }
+}
+
+/// A map from input byte to equivalence class.
+struct DfaClasses {
+    classes: [u8; CLASS_SIZE],
+    next_class: usize,
+}
+
+impl DfaClasses {
+    fn new() -> DfaClasses {
+        DfaClasses { classes: [0; CLASS_SIZE], next_class: 1 }
+    }
+
+    fn add(&mut self, b: u8) {
+        if self.next_class > CLASS_SIZE {
+            panic!("added too many classes")
+        }
+        self.classes[b as usize] = self.next_class as u8;
+        self.next_class = self.next_class + 1;
+    }
+
+    fn num_classes(&self) -> usize {
+        self.next_class as usize
+    }
+
+    /// Scan and copy the input bytes to the output buffer quickly.
+    ///
+    /// This assumes that the current state of the DFA is either `InField` or
+    /// `InQuotedField`. In this case, all bytes corresponding to the first
+    /// equivalence class (i.e., not a delimiter/quote/escape/etc.) are
+    /// guaranteed to never result in a state transition out of the current
+    /// state. This function takes advantage of that and copies every byte
+    /// from `input` in the first equivalence class to `output`. Once a byte
+    /// is seen outside the first equivalence class, we quit and should fall
+    /// back to the main DFA loop.
+    #[inline(always)]
+    fn scan_and_copy(
+        &self,
+        input: &[u8],
+        nin: &mut usize,
+        output: &mut [u8],
+        nout: &mut usize,
+    ) {
+        while *nin < input.len()
+            && *nout < output.len()
+            && self.classes[input[*nin] as usize] == 0
+        {
+            output[*nout] = input[*nin];
+            *nin += 1;
+            *nout += 1;
+        }
+    }
+}
+
+/// A single DFA state.
+///
+/// A DFA state is represented by the starting index of its corresponding row
+/// in the DFA transition table. This representation allows us to elide a
+/// single multiplication instruction when computing the next transition for
+/// a particular input byte.
+#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
+struct DfaState(u8);
+
+impl DfaState {
+    fn start() -> DfaState {
+        DfaState(0)
+    }
+
+    fn is_start(&self) -> bool {
+        self.0 == 0
+    }
+}
+
+impl fmt::Debug for Dfa {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "Dfa(N/A)")
+    }
+}
+
+impl fmt::Debug for DfaClasses {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(
+            f,
+            "DfaClasses {{ classes: N/A, next_class: {:?} }}",
+            self.next_class
+        )
+    }
+}
+
+impl Clone for Dfa {
+    fn clone(&self) -> Dfa {
+        let mut dfa = Dfa::new();
+        dfa.trans.copy_from_slice(&self.trans);
+        dfa
+    }
+}
+
+impl Clone for DfaClasses {
+    fn clone(&self) -> DfaClasses {
+        let mut x = DfaClasses::new();
+        x.classes.copy_from_slice(&self.classes);
+        x
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use core::str;
+
+    use arrayvec::{ArrayString, ArrayVec};
+
+    use super::{ReadFieldResult, Reader, ReaderBuilder, Terminator};
+
+    type Csv = ArrayVec<[Row; 10]>;
+    type Row = ArrayVec<[Field; 10]>;
+    type Field = ArrayString<[u8; 10]>;
+
+    // OMG I HATE BYTE STRING LITERALS SO MUCH.
+    fn b(s: &str) -> &[u8] {
+        s.as_bytes()
+    }
+
+    macro_rules! csv {
+        ($([$($field:expr),*]),*) => {{
+            #[allow(unused_mut)]
+            fn x() -> Csv {
+                let mut csv = Csv::new();
+                $(
+                    let mut row = Row::new();
+                    $(
+                        row.push(Field::from($field).unwrap());
+                    )*
+                    csv.push(row);
+                )*
+                csv
+            }
+            x()
+        }}
+    }
+
+    macro_rules!
parses_to { + ($name:ident, $data:expr, $expected:expr) => { + parses_to!($name, $data, $expected, |builder| builder); + }; + ($name:ident, $data:expr, $expected:expr, $config:expr) => { + #[test] + fn $name() { + let mut builder = ReaderBuilder::new(); + builder.nfa(true); + $config(&mut builder); + let mut rdr = builder.build(); + let got = parse_by_field(&mut rdr, $data); + let expected = $expected; + assert_eq!(expected, got, "nfa by field"); + + let mut builder = ReaderBuilder::new(); + builder.nfa(true); + $config(&mut builder); + let mut rdr = builder.build(); + let got = parse_by_record(&mut rdr, $data); + let expected = $expected; + assert_eq!(expected, got, "nfa by record"); + + let mut builder = ReaderBuilder::new(); + $config(&mut builder); + let mut rdr = builder.build(); + let got = parse_by_field(&mut rdr, $data); + let expected = $expected; + assert_eq!(expected, got, "dfa by field"); + + let mut builder = ReaderBuilder::new(); + $config(&mut builder); + let mut rdr = builder.build(); + let got = parse_by_record(&mut rdr, $data); + let expected = $expected; + assert_eq!(expected, got, "dfa by record"); + } + }; + } + + fn parse_by_field(rdr: &mut Reader, data: &str) -> Csv { + let mut data = data.as_bytes(); + let mut field = [0u8; 10]; + let mut csv = Csv::new(); + let mut row = Row::new(); + let mut outpos = 0; + loop { + let (res, nin, nout) = rdr.read_field(data, &mut field[outpos..]); + data = &data[nin..]; + outpos += nout; + + match res { + ReadFieldResult::InputEmpty => { + if !data.is_empty() { + panic!("missing input data") + } + } + ReadFieldResult::OutputFull => panic!("field too large"), + ReadFieldResult::Field { record_end } => { + let s = str::from_utf8(&field[..outpos]).unwrap(); + row.push(Field::from(s).unwrap()); + outpos = 0; + if record_end { + csv.push(row); + row = Row::new(); + } + } + ReadFieldResult::End => { + return csv; + } + } + } + } + + fn parse_by_record(rdr: &mut Reader, data: &str) -> Csv { + use crate::ReadRecordResult::*; + + let mut data = data.as_bytes(); + let mut record = [0; 1024]; + let mut ends = [0; 10]; + + let mut csv = Csv::new(); + let (mut outpos, mut endpos) = (0, 0); + loop { + let (res, nin, nout, nend) = rdr.read_record( + data, + &mut record[outpos..], + &mut ends[endpos..], + ); + data = &data[nin..]; + outpos += nout; + endpos += nend; + + match res { + InputEmpty => { + if !data.is_empty() { + panic!("missing input data") + } + } + OutputFull => panic!("record too large (out buffer)"), + OutputEndsFull => panic!("record too large (end buffer)"), + Record => { + let s = str::from_utf8(&record[..outpos]).unwrap(); + let mut start = 0; + let mut row = Row::new(); + for &end in &ends[..endpos] { + row.push(Field::from(&s[start..end]).unwrap()); + start = end; + } + csv.push(row); + outpos = 0; + endpos = 0; + } + End => return csv, + } + } + } + + parses_to!(one_row_one_field, "a", csv![["a"]]); + parses_to!(one_row_many_fields, "a,b,c", csv![["a", "b", "c"]]); + parses_to!(one_row_trailing_comma, "a,b,", csv![["a", "b", ""]]); + parses_to!(one_row_one_field_lf, "a\n", csv![["a"]]); + parses_to!(one_row_many_fields_lf, "a,b,c\n", csv![["a", "b", "c"]]); + parses_to!(one_row_trailing_comma_lf, "a,b,\n", csv![["a", "b", ""]]); + parses_to!(one_row_one_field_crlf, "a\r\n", csv![["a"]]); + parses_to!(one_row_many_fields_crlf, "a,b,c\r\n", csv![["a", "b", "c"]]); + parses_to!(one_row_trailing_comma_crlf, "a,b,\r\n", csv![["a", "b", ""]]); + parses_to!(one_row_one_field_cr, "a\r", csv![["a"]]); + 
+
+    parses_to!(one_row_one_field, "a", csv![["a"]]);
+    parses_to!(one_row_many_fields, "a,b,c", csv![["a", "b", "c"]]);
+    parses_to!(one_row_trailing_comma, "a,b,", csv![["a", "b", ""]]);
+    parses_to!(one_row_one_field_lf, "a\n", csv![["a"]]);
+    parses_to!(one_row_many_fields_lf, "a,b,c\n", csv![["a", "b", "c"]]);
+    parses_to!(one_row_trailing_comma_lf, "a,b,\n", csv![["a", "b", ""]]);
+    parses_to!(one_row_one_field_crlf, "a\r\n", csv![["a"]]);
+    parses_to!(one_row_many_fields_crlf, "a,b,c\r\n", csv![["a", "b", "c"]]);
+    parses_to!(one_row_trailing_comma_crlf, "a,b,\r\n", csv![["a", "b", ""]]);
+    parses_to!(one_row_one_field_cr, "a\r", csv![["a"]]);
+    parses_to!(one_row_many_fields_cr, "a,b,c\r", csv![["a", "b", "c"]]);
+    parses_to!(one_row_trailing_comma_cr, "a,b,\r", csv![["a", "b", ""]]);
+
+    parses_to!(many_rows_one_field, "a\nb", csv![["a"], ["b"]]);
+    parses_to!(
+        many_rows_many_fields,
+        "a,b,c\nx,y,z",
+        csv![["a", "b", "c"], ["x", "y", "z"]]
+    );
+    parses_to!(
+        many_rows_trailing_comma,
+        "a,b,\nx,y,",
+        csv![["a", "b", ""], ["x", "y", ""]]
+    );
+    parses_to!(many_rows_one_field_lf, "a\nb\n", csv![["a"], ["b"]]);
+    parses_to!(
+        many_rows_many_fields_lf,
+        "a,b,c\nx,y,z\n",
+        csv![["a", "b", "c"], ["x", "y", "z"]]
+    );
+    parses_to!(
+        many_rows_trailing_comma_lf,
+        "a,b,\nx,y,\n",
+        csv![["a", "b", ""], ["x", "y", ""]]
+    );
+    parses_to!(many_rows_one_field_crlf, "a\r\nb\r\n", csv![["a"], ["b"]]);
+    parses_to!(
+        many_rows_many_fields_crlf,
+        "a,b,c\r\nx,y,z\r\n",
+        csv![["a", "b", "c"], ["x", "y", "z"]]
+    );
+    parses_to!(
+        many_rows_trailing_comma_crlf,
+        "a,b,\r\nx,y,\r\n",
+        csv![["a", "b", ""], ["x", "y", ""]]
+    );
+    parses_to!(many_rows_one_field_cr, "a\rb\r", csv![["a"], ["b"]]);
+    parses_to!(
+        many_rows_many_fields_cr,
+        "a,b,c\rx,y,z\r",
+        csv![["a", "b", "c"], ["x", "y", "z"]]
+    );
+    parses_to!(
+        many_rows_trailing_comma_cr,
+        "a,b,\rx,y,\r",
+        csv![["a", "b", ""], ["x", "y", ""]]
+    );
+
+    parses_to!(
+        trailing_lines_no_record,
+        "\n\n\na,b,c\nx,y,z\n\n\n",
+        csv![["a", "b", "c"], ["x", "y", "z"]]
+    );
+    parses_to!(
+        trailing_lines_no_record_cr,
+        "\r\r\ra,b,c\rx,y,z\r\r\r",
+        csv![["a", "b", "c"], ["x", "y", "z"]]
+    );
+    parses_to!(
+        trailing_lines_no_record_crlf,
+        "\r\n\r\n\r\na,b,c\r\nx,y,z\r\n\r\n\r\n",
+        csv![["a", "b", "c"], ["x", "y", "z"]]
+    );
+
+    parses_to!(empty, "", csv![]);
+    parses_to!(empty_lines, "\n\n\n\n", csv![]);
+    parses_to!(
+        empty_lines_interspersed,
+        "\n\na,b\n\n\nx,y\n\n\nm,n\n",
+        csv![["a", "b"], ["x", "y"], ["m", "n"]]
+    );
+    parses_to!(empty_lines_crlf, "\r\n\r\n\r\n\r\n", csv![]);
+    parses_to!(
+        empty_lines_interspersed_crlf,
+        "\r\n\r\na,b\r\n\r\n\r\nx,y\r\n\r\n\r\nm,n\r\n",
+        csv![["a", "b"], ["x", "y"], ["m", "n"]]
+    );
+    parses_to!(empty_lines_mixed, "\r\n\n\r\n\n", csv![]);
+    parses_to!(
+        empty_lines_interspersed_mixed,
+        "\n\r\na,b\r\n\n\r\nx,y\r\n\n\r\nm,n\r\n",
+        csv![["a", "b"], ["x", "y"], ["m", "n"]]
+    );
+    parses_to!(empty_lines_cr, "\r\r\r\r", csv![]);
+    parses_to!(
+        empty_lines_interspersed_cr,
+        "\r\ra,b\r\r\rx,y\r\r\rm,n\r",
+        csv![["a", "b"], ["x", "y"], ["m", "n"]]
+    );
+
+    parses_to!(
+        term_weird,
+        "zza,bzc,dzz",
+        csv![["a", "b"], ["c", "d"]],
+        |b: &mut ReaderBuilder| {
+            b.terminator(Terminator::Any(b'z'));
+        }
+    );
+
+    parses_to!(
+        ascii_delimited,
+        "a\x1fb\x1ec\x1fd",
+        csv![["a", "b"], ["c", "d"]],
+        |b: &mut ReaderBuilder| {
+            b.ascii();
+        }
+    );
+
+    parses_to!(bom_at_start, "\u{feff}a", csv![["a"]]);
+    parses_to!(bom_in_field, "a\u{feff}", csv![["a\u{feff}"]]);
+    parses_to!(bom_at_field_start, "a,\u{feff}b", csv![["a", "\u{feff}b"]]);
+
+    parses_to!(quote_empty, "\"\"", csv![[""]]);
+    parses_to!(quote_lf, "\"\"\n", csv![[""]]);
+    parses_to!(quote_space, "\" \"", csv![[" "]]);
+    parses_to!(quote_inner_space, "\" a \"", csv![[" a "]]);
+    parses_to!(quote_outer_space, " \"a\" ", csv![[" \"a\" "]]);
+
+    parses_to!(quote_change, "zaz", csv![["a"]], |b: &mut ReaderBuilder| {
+        b.quote(b'z');
+    });
+
+    // This one is pretty hokey.
+    // I don't really know what the "right" behavior is.
+    parses_to!(
+        quote_delimiter,
+        ",a,,b",
+        csv![["a,b"]],
+        |b: &mut ReaderBuilder| {
+            b.quote(b',');
+        }
+    );
+
+    parses_to!(quote_no_escapes, r#""a\"b""#, csv![[r#"a\b""#]]);
+    parses_to!(
+        quote_escapes_no_double,
+        r#""a""b""#,
+        csv![[r#"a"b""#]],
+        |b: &mut ReaderBuilder| {
+            b.double_quote(false);
+        }
+    );
+    parses_to!(
+        quote_escapes,
+        r#""a\"b""#,
+        csv![[r#"a"b"#]],
+        |b: &mut ReaderBuilder| {
+            b.escape(Some(b'\\'));
+        }
+    );
+    parses_to!(
+        quote_escapes_change,
+        r#""az"b""#,
+        csv![[r#"a"b"#]],
+        |b: &mut ReaderBuilder| {
+            b.escape(Some(b'z'));
+        }
+    );
+
+    parses_to!(
+        quote_escapes_with_comma,
+        r#""\"A,B\"""#,
+        csv![[r#""A,B""#]],
+        |b: &mut ReaderBuilder| {
+            b.escape(Some(b'\\')).double_quote(false);
+        }
+    );
+
+    parses_to!(
+        quoting_disabled,
+        r#""abc,foo""#,
+        csv![[r#""abc"#, r#"foo""#]],
+        |b: &mut ReaderBuilder| {
+            b.quoting(false);
+        }
+    );
+
+    parses_to!(
+        delimiter_tabs,
+        "a\tb",
+        csv![["a", "b"]],
+        |b: &mut ReaderBuilder| {
+            b.delimiter(b'\t');
+        }
+    );
+    parses_to!(
+        delimiter_weird,
+        "azb",
+        csv![["a", "b"]],
+        |b: &mut ReaderBuilder| {
+            b.delimiter(b'z');
+        }
+    );
+
+    parses_to!(extra_record_crlf_1, "foo\n1\n", csv![["foo"], ["1"]]);
+    parses_to!(extra_record_crlf_2, "foo\r\n1\r\n", csv![["foo"], ["1"]]);
+
+    parses_to!(
+        comment_1,
+        "foo\n# hi\nbar\n",
+        csv![["foo"], ["bar"]],
+        |b: &mut ReaderBuilder| {
+            b.comment(Some(b'#'));
+        }
+    );
+    parses_to!(
+        comment_2,
+        "foo\n # hi\nbar\n",
+        csv![["foo"], [" # hi"], ["bar"]],
+        |b: &mut ReaderBuilder| {
+            b.comment(Some(b'#'));
+        }
+    );
+    parses_to!(
+        comment_3,
+        "foo\n# hi\nbar\n",
+        csv![["foo"], ["# hi"], ["bar"]],
+        |b: &mut ReaderBuilder| {
+            b.comment(Some(b'\n'));
+        }
+    );
+    parses_to!(
+        comment_4,
+        "foo,b#ar,baz",
+        csv![["foo", "b#ar", "baz"]],
+        |b: &mut ReaderBuilder| {
+            b.comment(Some(b'#'));
+        }
+    );
+    parses_to!(
+        comment_5,
+        "foo,#bar,baz",
+        csv![["foo", "#bar", "baz"]],
+        |b: &mut ReaderBuilder| {
+            b.comment(Some(b'#'));
+        }
+    );
+
+    macro_rules! assert_read {
+        (
+            $rdr:expr, $input:expr, $output:expr,
+            $expect_in:expr, $expect_out:expr, $expect_res:expr
+        ) => {{
+            let (res, nin, nout) = $rdr.read_field($input, $output);
+            assert_eq!($expect_in, nin);
+            assert_eq!($expect_out, nout);
+            assert_eq!($expect_res, res);
+        }};
+    }
+
+    // This tests that feeding a new reader with an empty buffer sends us
+    // straight to End.
+    #[test]
+    fn stream_empty() {
+        use crate::ReadFieldResult::*;
+
+        let mut rdr = Reader::new();
+        assert_read!(rdr, &[], &mut [], 0, 0, End);
+    }
+
+    // Test that a single space is treated as a single field.
+    #[test]
+    fn stream_space() {
+        use crate::ReadFieldResult::*;
+
+        let mut rdr = Reader::new();
+        assert_read!(rdr, b(" "), &mut [0], 1, 1, InputEmpty);
+        assert_read!(rdr, &[], &mut [0], 0, 0, Field { record_end: true });
+        assert_read!(rdr, &[], &mut [0], 0, 0, End);
+    }
+
+    // Test that a single comma is parsed as two empty fields in one record.
+    #[test]
+    fn stream_comma() {
+        use crate::ReadFieldResult::*;
+
+        let mut rdr = Reader::new();
+        assert_read!(rdr, b(","), &mut [0], 1, 0, Field { record_end: false });
+        assert_read!(rdr, &[], &mut [0], 0, 0, Field { record_end: true });
+        assert_read!(rdr, &[], &mut [0], 0, 0, End);
+    }
+
+    // Test that we can read a single large field in multiple output
+    // buffers.
+    #[test]
+    fn stream_output_chunks() {
+        use crate::ReadFieldResult::*;
+
+        let mut inp = b("fooquux");
+        let out = &mut [0; 2];
+        let mut rdr = Reader::new();
+
+        assert_read!(rdr, inp, out, 2, 2, OutputFull);
+        assert_eq!(out, b("fo"));
+        inp = &inp[2..];
+
+        assert_read!(rdr, inp, out, 2, 2, OutputFull);
+        assert_eq!(out, b("oq"));
+        inp = &inp[2..];
+
+        assert_read!(rdr, inp, out, 2, 2, OutputFull);
+        assert_eq!(out, b("uu"));
+        inp = &inp[2..];
+
+        assert_read!(rdr, inp, out, 1, 1, InputEmpty);
+        assert_eq!(&out[..1], b("x"));
+        inp = &inp[1..];
+        assert!(inp.is_empty());
+
+        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
+        assert_read!(rdr, inp, out, 0, 0, End);
+    }
+
+    // Test that we can read a single large field across multiple input
+    // buffers.
+    #[test]
+    fn stream_input_chunks() {
+        use crate::ReadFieldResult::*;
+
+        let out = &mut [0; 10];
+        let mut rdr = Reader::new();
+
+        assert_read!(rdr, b("fo"), out, 2, 2, InputEmpty);
+        assert_eq!(&out[..2], b("fo"));
+
+        assert_read!(rdr, b("oq"), &mut out[2..], 2, 2, InputEmpty);
+        assert_eq!(&out[..4], b("fooq"));
+
+        assert_read!(rdr, b("uu"), &mut out[4..], 2, 2, InputEmpty);
+        assert_eq!(&out[..6], b("fooquu"));
+
+        assert_read!(rdr, b("x"), &mut out[6..], 1, 1, InputEmpty);
+        assert_eq!(&out[..7], b("fooquux"));
+
+        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
+        assert_read!(rdr, &[], out, 0, 0, End);
+    }
+
+    // Test that we can read doubled quotes correctly in a stream.
+    #[test]
+    fn stream_doubled_quotes() {
+        use crate::ReadFieldResult::*;
+
+        let out = &mut [0; 10];
+        let mut rdr = Reader::new();
+
+        assert_read!(rdr, b("\"fo\""), out, 4, 2, InputEmpty);
+        assert_eq!(&out[..2], b("fo"));
+
+        assert_read!(rdr, b("\"o"), &mut out[2..], 2, 2, InputEmpty);
+        assert_eq!(&out[..4], b("fo\"o"));
+
+        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
+        assert_read!(rdr, &[], out, 0, 0, End);
+    }
+
+    // Test that we can read escaped quotes correctly in a stream.
+    #[test]
+    fn stream_escaped_quotes() {
+        use crate::ReadFieldResult::*;
+
+        let out = &mut [0; 10];
+        let mut builder = ReaderBuilder::new();
+        let mut rdr = builder.escape(Some(b'\\')).build();
+
+        assert_read!(rdr, b("\"fo\\"), out, 4, 2, InputEmpty);
+        assert_eq!(&out[..2], b("fo"));
+
+        assert_read!(rdr, b("\"o"), &mut out[2..], 2, 2, InputEmpty);
+        assert_eq!(&out[..4], b("fo\"o"));
+
+        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
+        assert_read!(rdr, &[], out, 0, 0, End);
+    }
+
+    // Test that empty output buffers don't wreak havoc.
+    #[test]
+    fn stream_empty_output() {
+        use crate::ReadFieldResult::*;
+
+        let out = &mut [0; 10];
+        let mut rdr = Reader::new();
+
+        assert_read!(
+            rdr,
+            b("foo,bar"),
+            out,
+            4,
+            3,
+            Field { record_end: false }
+        );
+        assert_eq!(&out[..3], b("foo"));
+
+        assert_read!(rdr, b("bar"), &mut [], 0, 0, OutputFull);
+
+        assert_read!(rdr, b("bar"), out, 3, 3, InputEmpty);
+        assert_eq!(&out[..3], b("bar"));
+
+        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
+        assert_read!(rdr, &[], out, 0, 0, End);
+    }
+
+    // Test that we can reset the parser mid-stream and count on it to do
+    // the right thing.
+    #[test]
+    fn reset_works() {
+        use crate::ReadFieldResult::*;
+
+        let out = &mut [0; 10];
+        let mut rdr = Reader::new();
+
+        assert_read!(rdr, b("\"foo"), out, 4, 3, InputEmpty);
+        assert_eq!(&out[..3], b("foo"));
+
+        // Without resetting the parser state, the reader will remember that
+        // we're in a quoted field, and therefore interpret the leading double
+        // quotes below as a single quote and the trailing quote as a matching
+        // terminator. With the reset, however, the parser forgets the quoted
+        // field and treats the leading double quotes as a syntax quirk and
+        // drops them, in addition to hanging on to the trailing unmatched
+        // quote. (Matches Python's behavior.)
+        rdr.reset();
+
+        assert_read!(rdr, b("\"\"bar\""), out, 6, 4, InputEmpty);
+        assert_eq!(&out[..4], b("bar\""));
+    }
+
+    // Test that the line number reporting is correct.
+    #[test]
+    fn line_numbers() {
+        use crate::ReadFieldResult::*;
+
+        let out = &mut [0; 10];
+        let mut rdr = Reader::new();
+
+        assert_eq!(1, rdr.line());
+
+        assert_read!(rdr, b("\n\n\n\n"), out, 4, 0, InputEmpty);
+        assert_eq!(5, rdr.line());
+
+        assert_read!(rdr, b("foo,"), out, 4, 3, Field { record_end: false });
+        assert_eq!(5, rdr.line());
+
+        assert_read!(rdr, b("bar\n"), out, 4, 3, Field { record_end: true });
+        assert_eq!(6, rdr.line());
+
+        assert_read!(rdr, &[], &mut [0], 0, 0, End);
+        assert_eq!(6, rdr.line());
+    }
+
+    macro_rules! assert_read_record {
+        (
+            $rdr:expr, $input:expr, $output:expr, $ends:expr,
+            $expect_in:expr, $expect_out:expr,
+            $expect_end:expr, $expect_res:expr
+        ) => {{
+            let (res, nin, nout, nend) =
+                $rdr.read_record($input, $output, $ends);
+            assert_eq!($expect_res, res, "result");
+            assert_eq!($expect_in, nin, "input");
+            assert_eq!($expect_out, nout, "output");
+            assert_eq!($expect_end, nend, "ends");
+        }};
+    }
+
+    // Test that we can incrementally read a record.
+    #[test]
+    fn stream_record() {
+        use crate::ReadRecordResult::*;
+
+        let mut inp = b("foo,bar\nbaz");
+        let out = &mut [0; 1024];
+        let ends = &mut [0; 10];
+        let mut rdr = Reader::new();
+
+        assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record);
+        assert_eq!(ends[0], 3);
+        assert_eq!(ends[1], 6);
+        inp = &inp[8..];
+
+        assert_read_record!(rdr, &inp, out, ends, 3, 3, 0, InputEmpty);
+        inp = &inp[3..];
+
+        assert_read_record!(rdr, &inp, out, ends, 0, 0, 1, Record);
+        assert_eq!(ends[0], 3);
+
+        assert_read_record!(rdr, &inp, out, ends, 0, 0, 0, End);
+    }
+
+    // Test that if our output ends are full during the last read, we get an
+    // appropriate state returned.
+    #[test]
+    fn stream_record_last_end_output_full() {
+        use crate::ReadRecordResult::*;
+
+        let mut inp = b("foo,bar\nbaz");
+        let out = &mut [0; 1024];
+        let ends = &mut [0; 10];
+        let mut rdr = Reader::new();
+
+        assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record);
+        assert_eq!(ends[0], 3);
+        assert_eq!(ends[1], 6);
+        inp = &inp[8..];
+
+        assert_read_record!(rdr, &inp, out, ends, 3, 3, 0, InputEmpty);
+        inp = &inp[3..];
+
+        assert_read_record!(rdr, &inp, out, &mut [], 0, 0, 0, OutputEndsFull);
+        assert_read_record!(rdr, &inp, out, ends, 0, 0, 1, Record);
+        assert_eq!(ends[0], 3);
+
+        assert_read_record!(rdr, &inp, out, ends, 0, 0, 0, End);
+    }
+}
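Before moving on to the writer, the `read_record` contract exercised by the streaming tests above is worth seeing in isolation. This is a minimal sketch assuming the whole input is already in memory; a true streaming caller must carry the output and ends offsets across calls, as `parse_by_record` above does. Buffer sizes are arbitrary:

```rust
use csv_core::{ReadRecordResult, Reader};

fn main() {
    let mut rdr = Reader::new();
    let mut input = &b"foo,bar\nbaz,quux\n"[..];
    let mut out = [0u8; 1024]; // field bytes for one record
    let mut ends = [0usize; 16]; // end offset of each field within `out`

    loop {
        let (res, nin, nout, nend) = rdr.read_record(input, &mut out, &mut ends);
        input = &input[nin..];
        match res {
            ReadRecordResult::Record => {
                // `ends[..nend]` delimits the fields inside `out[..nout]`.
                let fields = &out[..nout];
                let mut start = 0;
                for &end in &ends[..nend] {
                    println!("field: {:?}", &fields[start..end]);
                    start = end;
                }
            }
            ReadRecordResult::InputEmpty => {} // would feed more input here
            ReadRecordResult::OutputFull | ReadRecordResult::OutputEndsFull => {
                panic!("record too large for the buffers")
            }
            ReadRecordResult::End => break,
        }
    }
}
```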
diff --git a/src/writer.rs b/src/writer.rs
new file mode 100644
index 0000000..4f94301
--- /dev/null
+++ b/src/writer.rs
@@ -0,0 +1,1047 @@
+use core::fmt;
+use core::str;
+
+use memchr::memchr;
+
+use crate::{QuoteStyle, Terminator};
+
+/// A builder for configuring a CSV writer.
+///
+/// This builder permits specifying the CSV delimiter, terminator, quoting
+/// style and more.
+#[derive(Debug)]
+pub struct WriterBuilder {
+    wtr: Writer,
+}
+
+impl WriterBuilder {
+    /// Create a new builder for configuring a CSV writer.
+    pub fn new() -> WriterBuilder {
+        let wtr = Writer {
+            state: WriterState::default(),
+            requires_quotes: [false; 256],
+            delimiter: b',',
+            term: Terminator::Any(b'\n'),
+            style: QuoteStyle::default(),
+            quote: b'"',
+            escape: b'\\',
+            double_quote: true,
+        };
+        WriterBuilder { wtr: wtr }
+    }
+
+    /// Build a CSV writer from this configuration.
+    pub fn build(&self) -> Writer {
+        use crate::Terminator::*;
+
+        let mut wtr = self.wtr.clone();
+        wtr.requires_quotes[self.wtr.delimiter as usize] = true;
+        wtr.requires_quotes[self.wtr.quote as usize] = true;
+        if !self.wtr.double_quote {
+            // We only need to quote the escape character if the escape
+            // character is used for escaping quotes.
+            wtr.requires_quotes[self.wtr.escape as usize] = true;
+        }
+        match self.wtr.term {
+            CRLF | Any(b'\n') | Any(b'\r') => {
+                // This is a bit hokey. By default, the record terminator
+                // is '\n', but we still need to quote '\r' (even if our
+                // terminator is only `\n`) because the reader interprets '\r'
+                // as a record terminator by default.
+                wtr.requires_quotes[b'\r' as usize] = true;
+                wtr.requires_quotes[b'\n' as usize] = true;
+            }
+            Any(b) => {
+                wtr.requires_quotes[b as usize] = true;
+            }
+            _ => unreachable!(),
+        }
+        wtr
+    }
+
+    /// The field delimiter to use when writing CSV.
+    ///
+    /// The default is `b','`.
+    pub fn delimiter(&mut self, delimiter: u8) -> &mut WriterBuilder {
+        self.wtr.delimiter = delimiter;
+        self
+    }
+
+    /// The record terminator to use when writing CSV.
+    ///
+    /// A record terminator can be any single byte. The default is `\n`.
+    ///
+    /// Note that RFC 4180 specifies that record terminators should be `\r\n`.
+    /// To use `\r\n`, use the special `Terminator::CRLF` value.
+    pub fn terminator(&mut self, term: Terminator) -> &mut WriterBuilder {
+        self.wtr.term = term;
+        self
+    }
+
+    /// The quoting style to use when writing CSV.
+    ///
+    /// By default, this is set to `QuoteStyle::Necessary`, which will only
+    /// use quotes when they are necessary to preserve the integrity of data.
+    ///
+    /// Note that unless the quote style is set to `Never`, an empty field is
+    /// quoted if it is the only field in a record.
+    pub fn quote_style(&mut self, style: QuoteStyle) -> &mut WriterBuilder {
+        self.wtr.style = style;
+        self
+    }
+
+    /// The quote character to use when writing CSV.
+    ///
+    /// The default value is `b'"'`.
+    pub fn quote(&mut self, quote: u8) -> &mut WriterBuilder {
+        self.wtr.quote = quote;
+        self
+    }
+
+    /// The escape character to use when writing CSV.
+    ///
+    /// This is only used when `double_quote` is set to `false`.
+    ///
+    /// The default value is `b'\\'`.
+    pub fn escape(&mut self, escape: u8) -> &mut WriterBuilder {
+        self.wtr.escape = escape;
+        self
+    }
+
+    /// The quoting escape mechanism to use when writing CSV.
+    ///
+    /// When enabled (which is the default), quotes are escaped by doubling
+    /// them. e.g., `"` escapes to `""`.
+    ///
+    /// When disabled, quotes are escaped with the escape character (which
+    /// is `\\` by default).
+    pub fn double_quote(&mut self, yes: bool) -> &mut WriterBuilder {
+        self.wtr.double_quote = yes;
+        self
+    }
+}
+
+impl Default for WriterBuilder {
+    fn default() -> WriterBuilder {
+        WriterBuilder::new()
+    }
+}
+
+/// The result of writing CSV data.
+/// +/// A value of this type is returned from every interaction with `Writer`. It +/// informs the caller how to proceed, namely, by indicating whether more +/// input should be given (`InputEmpty`) or if a bigger output buffer is needed +/// (`OutputFull`). +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum WriteResult { + /// This result occurs when all of the bytes from the given input have + /// been processed. + InputEmpty, + /// This result occurs when the output buffer was too small to process + /// all of the input bytes. Generally, this means the caller must call + /// the corresponding method again with the rest of the input and more + /// room in the output buffer. + OutputFull, +} + +/// A writer for CSV data. +/// +/// # RFC 4180 +/// +/// This writer conforms to RFC 4180 with one exception: it doesn't guarantee +/// that all records written are of the same length. Instead, the onus is on +/// the caller to ensure that all records written are of the same length. +/// +/// Note that the default configuration of a `Writer` uses `\n` for record +/// terminators instead of `\r\n` as specified by RFC 4180. Use the +/// `terminator` method on `WriterBuilder` to set the terminator to `\r\n` if +/// it's desired. +pub struct Writer { + state: WriterState, + requires_quotes: [bool; 256], + delimiter: u8, + term: Terminator, + style: QuoteStyle, + quote: u8, + escape: u8, + double_quote: bool, +} + +impl Clone for Writer { + fn clone(&self) -> Writer { + let mut requires_quotes = [false; 256]; + for i in 0..256 { + requires_quotes[i] = self.requires_quotes[i]; + } + Writer { + state: self.state.clone(), + requires_quotes: requires_quotes, + delimiter: self.delimiter, + term: self.term, + style: self.style, + quote: self.quote, + escape: self.escape, + double_quote: self.double_quote, + } + } +} + +impl fmt::Debug for Writer { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("Writer") + .field("state", &self.state) + .field("delimiter", &self.delimiter) + .field("term", &self.term) + .field("style", &self.style) + .field("quote", &self.quote) + .field("escape", &self.escape) + .field("double_quote", &self.double_quote) + .finish() + } +} + +#[derive(Clone, Debug)] +struct WriterState { + /// This is set whenever we've begun writing the contents of a field, even + /// if the contents are empty. We use it to avoid re-computing whether + /// quotes are necessary. + in_field: bool, + /// This is set whenever we've started writing a field that is enclosed in + /// quotes. When the writer is finished, or if a delimiter or terminator + /// are written, then a closing quote is inserted when this is true. + quoting: bool, + /// The number of total bytes written for the current record. + /// + /// If the writer is finished or a terminator is written when this is `0`, + /// then an empty field is added as a pair of adjacent quotes. + record_bytes: u64, +} + +impl Writer { + /// Creates a new CSV writer with the default configuration. + pub fn new() -> Writer { + Writer::default() + } + + /// Finish writing CSV data to `output`. + /// + /// This must be called when one is done writing CSV data to `output`. + /// In particular, it will write closing quotes if necessary. 
+ pub fn finish(&mut self, mut output: &mut [u8]) -> (WriteResult, usize) { + let mut nout = 0; + if self.state.record_bytes == 0 && self.state.in_field { + assert!(!self.state.quoting); + let (res, o) = self.write(&[self.quote, self.quote], output); + if o == 0 { + return (res, 0); + } + output = &mut moving(output)[o..]; + nout += o; + self.state.record_bytes += o as u64; + } + if !self.state.quoting { + return (WriteResult::InputEmpty, nout); + } + let (res, o) = self.write(&[self.quote], output); + if o == 0 { + return (res, nout); + } + nout += o; + self.state.record_bytes = 0; + self.state.in_field = false; + self.state.quoting = false; + (res, nout) + } + + /// Write a single CSV field from `input` to `output` while employing this + /// writer's quoting style. + /// + /// This returns the result of writing field data, in addition to the + /// number of bytes consumed from `input` and the number of bytes + /// written to `output`. + /// + /// The result of writing field data is either `WriteResult::InputEmpty` + /// or `WriteResult::OutputFull`. The former occurs when all bytes in + /// `input` were copied to `output`, while the latter occurs when `output` + /// is too small to fit everything from `input`. The maximum number of + /// bytes that can be written to `output` is `2 + (2 * input.len())` + /// because of quoting. (The worst case is a field consisting entirely + /// of quotes.) + /// + /// Multiple successive calls to `field` will write more data to the same + /// field. Subsequent fields can be written by calling either `delimiter` + /// or `terminator` first. + /// + /// If this writer's quoting style is `QuoteStyle::Necessary`, then `input` + /// should contain the *entire* field. Otherwise, whether the field needs + /// to be quoted or not cannot be determined. + pub fn field( + &mut self, + input: &[u8], + mut output: &mut [u8], + ) -> (WriteResult, usize, usize) { + let (mut nin, mut nout) = (0, 0); + + if !self.state.in_field { + self.state.quoting = self.should_quote(input); + if self.state.quoting { + let (res, o) = self.write(&[self.quote], output); + if o == 0 { + return (res, 0, 0); + } + output = &mut moving(output)[o..]; + nout += o; + self.state.record_bytes += o as u64; + } + self.state.in_field = true; + } + let (res, i, o) = if self.state.quoting { + quote(input, output, self.quote, self.escape, self.double_quote) + } else { + write_optimistic(input, output) + }; + nin += i; + nout += o; + self.state.record_bytes += o as u64; + (res, nin, nout) + } + + /// Write the configured field delimiter to `output`. + /// + /// If the output buffer does not have enough room to fit + /// a field delimiter, then nothing is written to `output` + /// and `WriteResult::OutputFull` is returned. Otherwise, + /// `WriteResult::InputEmpty` is returned along with the number of bytes + /// written to `output` (which is `1` in case of an unquoted + /// field, or `2` in case of an end quote and a field separator). 
+ pub fn delimiter( + &mut self, + mut output: &mut [u8], + ) -> (WriteResult, usize) { + let mut nout = 0; + if self.state.quoting { + let (res, o) = self.write(&[self.quote], output); + if o == 0 { + return (res, o); + } + output = &mut moving(output)[o..]; + nout += o; + self.state.record_bytes += o as u64; + self.state.quoting = false; + } + let (res, o) = self.write(&[self.delimiter], output); + if o == 0 { + return (res, nout); + } + nout += o; + self.state.record_bytes += o as u64; + self.state.in_field = false; + (res, nout) + } + + /// Write the configured record terminator to `output`. + /// + /// If the output buffer does not have enough room to fit a record + /// terminator, then no part of the terminator is written and + /// `WriteResult::OutputFull` is returned. Otherwise, + /// `WriteResult::InputEmpty` is returned along with the number of bytes + /// written to `output` (which is always `1` or `2`). + pub fn terminator( + &mut self, + mut output: &mut [u8], + ) -> (WriteResult, usize) { + let mut nout = 0; + if self.state.record_bytes == 0 { + assert!(!self.state.quoting); + let (res, o) = self.write(&[self.quote, self.quote], output); + if o == 0 { + return (res, 0); + } + output = &mut moving(output)[o..]; + nout += o; + self.state.record_bytes += o as u64; + } + if self.state.quoting { + let (res, o) = self.write(&[self.quote], output); + if o == 0 { + return (res, o); + } + output = &mut moving(output)[o..]; + nout += o; + self.state.record_bytes += o as u64; + self.state.quoting = false; + } + let (res, o) = match self.term { + Terminator::CRLF => write_pessimistic(&[b'\r', b'\n'], output), + Terminator::Any(b) => write_pessimistic(&[b], output), + _ => unreachable!(), + }; + if o == 0 { + return (res, nout); + } + nout += o; + self.state.record_bytes = 0; + self.state.in_field = false; + (res, nout) + } + + /// Returns true if and only if the given input field *requires* quotes to + /// preserve the integrity of `input` while taking into account the current + /// configuration of this writer (except for the configured quoting style). + #[inline] + fn needs_quotes(&self, mut input: &[u8]) -> bool { + let mut needs = false; + while !needs && input.len() >= 8 { + needs = self.requires_quotes[input[0] as usize] + || self.requires_quotes[input[1] as usize] + || self.requires_quotes[input[2] as usize] + || self.requires_quotes[input[3] as usize] + || self.requires_quotes[input[4] as usize] + || self.requires_quotes[input[5] as usize] + || self.requires_quotes[input[6] as usize] + || self.requires_quotes[input[7] as usize]; + input = &input[8..]; + } + needs || input.iter().any(|&b| self.is_special_byte(b)) + } + + /// Returns true if and only if the given byte corresponds to a special + /// byte in this CSV writer's configuration. + /// + /// Note that this does **not** take into account this writer's quoting + /// style. + #[inline] + pub fn is_special_byte(&self, b: u8) -> bool { + self.requires_quotes[b as usize] + } + + /// Returns true if and only if we should put the given field data + /// in quotes. This takes the quoting style into account. + #[inline] + pub fn should_quote(&self, input: &[u8]) -> bool { + match self.style { + QuoteStyle::Always => true, + QuoteStyle::Never => false, + QuoteStyle::NonNumeric => is_non_numeric(input), + QuoteStyle::Necessary => self.needs_quotes(input), + _ => unreachable!(), + } + } + + /// Return the delimiter used for this writer. 
+    #[inline]
+    pub fn get_delimiter(&self) -> u8 {
+        self.delimiter
+    }
+
+    /// Return the terminator used for this writer.
+    #[inline]
+    pub fn get_terminator(&self) -> Terminator {
+        self.term
+    }
+
+    /// Return the quoting style used for this writer.
+    #[inline]
+    pub fn get_quote_style(&self) -> QuoteStyle {
+        self.style
+    }
+
+    /// Return the quote character used for this writer.
+    #[inline]
+    pub fn get_quote(&self) -> u8 {
+        self.quote
+    }
+
+    /// Return the escape character used for this writer.
+    #[inline]
+    pub fn get_escape(&self) -> u8 {
+        self.escape
+    }
+
+    /// Return whether this writer doubles quotes or not. When the writer
+    /// does not double quotes, it will escape them using the escape
+    /// character.
+    #[inline]
+    pub fn get_double_quote(&self) -> bool {
+        self.double_quote
+    }
+
+    fn write(&self, data: &[u8], output: &mut [u8]) -> (WriteResult, usize) {
+        if data.len() > output.len() {
+            (WriteResult::OutputFull, 0)
+        } else {
+            output[..data.len()].copy_from_slice(data);
+            (WriteResult::InputEmpty, data.len())
+        }
+    }
+}
+
+impl Default for Writer {
+    fn default() -> Writer {
+        WriterBuilder::new().build()
+    }
+}
+
+impl Default for WriterState {
+    fn default() -> WriterState {
+        WriterState { in_field: false, quoting: false, record_bytes: 0 }
+    }
+}
+
+/// Returns true if and only if the given input is non-numeric.
+pub fn is_non_numeric(input: &[u8]) -> bool {
+    let s = match str::from_utf8(input) {
+        Err(_) => return true,
+        Ok(s) => s,
+    };
+    // I suppose this could be faster if we wrote validators of numbers
+    // instead of using the actual parser, but that's probably a lot of work
+    // for a bit of a niche feature.
+    !s.parse::<f64>().is_ok() && !s.parse::<i128>().is_ok()
+}
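Since `QuoteStyle::NonNumeric` delegates to `is_non_numeric` via `should_quote`, its behavior can be pinned down with a few assertions. A quick sketch, assuming the crate-root re-export `csv_core::is_non_numeric`:

```rust
use csv_core::is_non_numeric;

fn main() {
    // Anything that parses as a number counts as numeric...
    assert!(!is_non_numeric(b"42"));
    assert!(!is_non_numeric(b"-3.14"));
    // ...and everything else, including invalid UTF-8, is non-numeric.
    assert!(is_non_numeric(b"1,000"));
    assert!(is_non_numeric(b"abc"));
    assert!(is_non_numeric(b"\xff"));
}
```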
+
+/// Escapes quotes in `input` and writes the result to `output`.
+///
+/// If `input` does not contain a `quote`, then the contents of `input` are
+/// copied verbatim to `output`.
+///
+/// If `output` is not big enough to store the fully quoted contents of
+/// `input`, then `WriteResult::OutputFull` is returned. The `output` buffer
+/// will require a maximum of `2 * input.len()` bytes of storage in the worst
+/// case (where every byte is a quote).
+///
+/// In streaming contexts, `quote` should be called in a loop until
+/// `WriteResult::InputEmpty` is returned. It is possible to loop forever if
+/// your output buffer is less than 2 bytes in length (the minimum storage
+/// space required to store an escaped quote).
+///
+/// In addition to the `WriteResult`, the number of consumed bytes from
+/// `input` and the number of bytes written to `output` are also returned.
+///
+/// `quote` is the quote byte and `escape` is the escape byte. If
+/// `double_quote` is true, then quotes are escaped by doubling them,
+/// otherwise, quotes are escaped with the `escape` byte.
+///
+/// N.B. This function is provided for low level usage. It is called
+/// automatically if you're using a `Writer`.
+pub fn quote(
+    mut input: &[u8],
+    mut output: &mut [u8],
+    quote: u8,
+    escape: u8,
+    double_quote: bool,
+) -> (WriteResult, usize, usize) {
+    let (mut nin, mut nout) = (0, 0);
+    loop {
+        match memchr(quote, input) {
+            None => {
+                let (res, i, o) = write_optimistic(input, output);
+                nin += i;
+                nout += o;
+                return (res, nin, nout);
+            }
+            Some(next_quote) => {
+                let (res, i, o) =
+                    write_optimistic(&input[..next_quote], output);
+                input = &input[i..];
+                output = &mut moving(output)[o..];
+                nin += i;
+                nout += o;
+                if let WriteResult::OutputFull = res {
+                    return (res, nin, nout);
+                }
+                if double_quote {
+                    let (res, o) = write_pessimistic(&[quote, quote], output);
+                    if let WriteResult::OutputFull = res {
+                        return (res, nin, nout);
+                    }
+                    nout += o;
+                    output = &mut moving(output)[o..];
+                } else {
+                    let (res, o) = write_pessimistic(&[escape, quote], output);
+                    if let WriteResult::OutputFull = res {
+                        return (res, nin, nout);
+                    }
+                    nout += o;
+                    output = &mut moving(output)[o..];
+                }
+                nin += 1;
+                input = &input[1..];
+            }
+        }
+    }
+}
+
+/// Copy the bytes from `input` to `output`. If `output` is too small to fit
+/// everything from `input`, then copy `output.len()` bytes from `input`.
+/// Otherwise, copy everything from `input` into `output`.
+///
+/// In the first case (`output` is too small), `WriteResult::OutputFull` is
+/// returned, in addition to the number of bytes consumed from `input` and
+/// the number of bytes written to `output`.
+///
+/// In the second case (`input` is no bigger than `output`),
+/// `WriteResult::InputEmpty` is returned, in addition to the number of bytes
+/// consumed from `input` and the number of bytes written to `output`.
+fn write_optimistic(
+    input: &[u8],
+    output: &mut [u8],
+) -> (WriteResult, usize, usize) {
+    if input.len() > output.len() {
+        let input = &input[..output.len()];
+        output.copy_from_slice(input);
+        (WriteResult::OutputFull, output.len(), output.len())
+    } else {
+        output[..input.len()].copy_from_slice(input);
+        (WriteResult::InputEmpty, input.len(), input.len())
+    }
+}
+
+/// Copy the bytes from `input` to `output` only if `input` is no bigger than
+/// `output`. If `input` is bigger than `output`, then return
+/// `WriteResult::OutputFull` and copy nothing into `output`. Otherwise,
+/// return `WriteResult::InputEmpty` and the number of bytes copied into
+/// `output`.
+fn write_pessimistic(input: &[u8], output: &mut [u8]) -> (WriteResult, usize) {
+    if input.len() > output.len() {
+        (WriteResult::OutputFull, 0)
+    } else {
+        output[..input.len()].copy_from_slice(input);
+        (WriteResult::InputEmpty, input.len())
+    }
+}
+
+/// This avoids reborrowing.
+/// See: https://bluss.github.io/rust/fun/2015/10/11/stuff-the-identity-function-does/
+fn moving<T>(x: T) -> T {
+    x
+}
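The resumable contract documented on `quote` above is easiest to see with a deliberately small output buffer: keep calling with the unconsumed input and fresh output space until `InputEmpty` comes back. A minimal sketch, assuming the crate-root re-exports `csv_core::quote` and `csv_core::WriteResult`:

```rust
use csv_core::{quote, WriteResult};

fn main() {
    let mut input = &b"say \"hi\""[..];
    let mut out = [0u8; 4]; // deliberately tiny to force OutputFull
    let mut escaped = Vec::new();

    loop {
        let (res, nin, nout) = quote(input, &mut out, b'"', b'\\', true);
        escaped.extend_from_slice(&out[..nout]);
        input = &input[nin..];
        match res {
            WriteResult::OutputFull => continue, // more output space needed
            WriteResult::InputEmpty => break,    // all input escaped
        }
    }
    assert_eq!(escaped, b"say \"\"hi\"\"");
}
```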
assert_write { + ( + $wtr:expr, $which:ident, $out:expr, + $expect_out:expr, $expect_res:expr, $expect_data:expr + ) => {{ + let (res, o) = $wtr.$which($out); + assert_eq!($expect_res, res, "result"); + assert_eq!($expect_out, o, "output"); + assert_eq!($expect_data, s(&$out[..o]), "data"); + }}; + } + + #[test] + fn writer_one_field() { + let mut wtr = Writer::new(); + let out = &mut [0; 1024]; + let mut n = 0; + + assert_field!(wtr, b("abc"), &mut out[n..], 3, 3, InputEmpty, "abc"); + n += 3; + + assert_write!(wtr, finish, &mut out[n..], 0, InputEmpty, ""); + } + + #[test] + fn writer_one_empty_field_terminator() { + let mut wtr = Writer::new(); + let out = &mut [0; 1024]; + + assert_field!(wtr, b(""), &mut out[..], 0, 0, InputEmpty, ""); + assert_write!(wtr, terminator, &mut out[..], 3, InputEmpty, "\"\"\n"); + assert_write!(wtr, finish, &mut out[..], 0, InputEmpty, ""); + } + + #[test] + fn writer_one_empty_field_finish() { + let mut wtr = Writer::new(); + let out = &mut [0; 1024]; + + assert_field!(wtr, b(""), &mut out[..], 0, 0, InputEmpty, ""); + assert_write!(wtr, finish, &mut out[..], 2, InputEmpty, "\"\""); + } + + #[test] + fn writer_many_one_empty_field_finish() { + let mut wtr = Writer::new(); + let out = &mut [0; 1024]; + + assert_field!(wtr, b(""), &mut out[..], 0, 0, InputEmpty, ""); + assert_write!(wtr, terminator, &mut out[..], 3, InputEmpty, "\"\"\n"); + assert_field!(wtr, b(""), &mut out[..], 0, 0, InputEmpty, ""); + assert_write!(wtr, finish, &mut out[..], 2, InputEmpty, "\"\""); + } + + #[test] + fn writer_many_one_empty_field_terminator() { + let mut wtr = Writer::new(); + let out = &mut [0; 1024]; + + assert_field!(wtr, b(""), &mut out[..], 0, 0, InputEmpty, ""); + assert_write!(wtr, terminator, &mut out[..], 3, InputEmpty, "\"\"\n"); + assert_field!(wtr, b(""), &mut out[..], 0, 0, InputEmpty, ""); + assert_write!(wtr, terminator, &mut out[..], 3, InputEmpty, "\"\"\n"); + assert_write!(wtr, finish, &mut out[..], 0, InputEmpty, ""); + } + + #[test] + fn writer_one_field_quote() { + let mut wtr = Writer::new(); + let out = &mut [0; 1024]; + let mut n = 0; + + assert_field!( + wtr, + b("a\"bc"), + &mut out[n..], + 4, + 6, + InputEmpty, + "\"a\"\"bc" + ); + n += 6; + + assert_write!(wtr, finish, &mut out[n..], 1, InputEmpty, "\""); + } + + #[test] + fn writer_one_field_stream() { + let mut wtr = Writer::new(); + let out = &mut [0; 1024]; + let mut n = 0; + + assert_field!(wtr, b("abc"), &mut out[n..], 3, 3, InputEmpty, "abc"); + n += 3; + assert_field!(wtr, b("x"), &mut out[n..], 1, 1, InputEmpty, "x"); + n += 1; + + assert_write!(wtr, finish, &mut out[n..], 0, InputEmpty, ""); + } + + #[test] + fn writer_one_field_stream_quote() { + let mut wtr = Writer::new(); + let out = &mut [0; 1024]; + let mut n = 0; + + assert_field!( + wtr, + b("abc\""), + &mut out[n..], + 4, + 6, + InputEmpty, + "\"abc\"\"" + ); + n += 6; + assert_field!(wtr, b("x"), &mut out[n..], 1, 1, InputEmpty, "x"); + n += 1; + + assert_write!(wtr, finish, &mut out[n..], 1, InputEmpty, "\""); + } + + #[test] + fn writer_one_field_stream_quote_partial() { + let mut wtr = Writer::new(); + let out = &mut [0; 4]; + + assert_field!(wtr, b("ab\"xyz"), out, 2, 3, OutputFull, "\"ab"); + assert_field!(wtr, b("\"xyz"), out, 3, 4, OutputFull, "\"\"xy"); + assert_field!(wtr, b("z"), out, 1, 1, InputEmpty, "z"); + assert_write!(wtr, finish, out, 1, InputEmpty, "\""); + } + + #[test] + fn writer_two_fields() { + let mut wtr = Writer::new(); + let out = &mut [0; 1024]; + let mut n = 0; + + assert_field!(wtr, b("abc"), 
+        n += 3;
+        assert_write!(wtr, delimiter, &mut out[n..], 1, InputEmpty, ",");
+        n += 1;
+        assert_field!(wtr, b("yz"), &mut out[n..], 2, 2, InputEmpty, "yz");
+        n += 2;
+
+        assert_write!(wtr, finish, &mut out[n..], 0, InputEmpty, "");
+
+        assert_eq!("abc,yz", s(&out[..n]));
+    }
+
+    #[test]
+    fn writer_two_fields_non_numeric() {
+        let mut wtr =
+            WriterBuilder::new().quote_style(QuoteStyle::NonNumeric).build();
+        let out = &mut [0; 1024];
+        let mut n = 0;
+
+        assert_field!(wtr, b("abc"), &mut out[n..], 3, 4, InputEmpty, "\"abc");
+        n += 4;
+        assert_write!(wtr, delimiter, &mut out[n..], 2, InputEmpty, "\",");
+        n += 2;
+        assert_field!(wtr, b("5.2"), &mut out[n..], 3, 3, InputEmpty, "5.2");
+        n += 3;
+        assert_write!(wtr, delimiter, &mut out[n..], 1, InputEmpty, ",");
+        n += 1;
+        assert_field!(wtr, b("98"), &mut out[n..], 2, 2, InputEmpty, "98");
+        n += 2;
+
+        assert_write!(wtr, finish, &mut out[n..], 0, InputEmpty, "");
+
+        assert_eq!("\"abc\",5.2,98", s(&out[..n]));
+    }
+
+    #[test]
+    fn writer_two_fields_quote() {
+        let mut wtr = Writer::new();
+        let out = &mut [0; 1024];
+        let mut n = 0;
+
+        assert_field!(
+            wtr,
+            b("a,bc"),
+            &mut out[n..],
+            4,
+            5,
+            InputEmpty,
+            "\"a,bc"
+        );
+        n += 5;
+        assert_write!(wtr, delimiter, &mut out[n..], 2, InputEmpty, "\",");
+        n += 2;
+        assert_field!(wtr, b("\nz"), &mut out[n..], 2, 3, InputEmpty, "\"\nz");
+        n += 3;
+
+        assert_write!(wtr, finish, &mut out[n..], 1, InputEmpty, "\"");
+        n += 1;
+
+        assert_eq!("\"a,bc\",\"\nz\"", s(&out[..n]));
+    }
+
+    #[test]
+    fn writer_two_fields_two_records() {
+        let mut wtr = Writer::new();
+        let out = &mut [0; 1024];
+        let mut n = 0;
+
+        assert_field!(wtr, b("abc"), &mut out[n..], 3, 3, InputEmpty, "abc");
+        n += 3;
+        assert_write!(wtr, delimiter, &mut out[n..], 1, InputEmpty, ",");
+        n += 1;
+        assert_field!(wtr, b("yz"), &mut out[n..], 2, 2, InputEmpty, "yz");
+        n += 2;
+        assert_write!(wtr, terminator, &mut out[n..], 1, InputEmpty, "\n");
+        n += 1;
+        assert_field!(wtr, b("foo"), &mut out[n..], 3, 3, InputEmpty, "foo");
+        n += 3;
+        assert_write!(wtr, delimiter, &mut out[n..], 1, InputEmpty, ",");
+        n += 1;
+        assert_field!(wtr, b("quux"), &mut out[n..], 4, 4, InputEmpty, "quux");
+        n += 4;
+
+        assert_write!(wtr, finish, &mut out[n..], 0, InputEmpty, "");
+
+        assert_eq!("abc,yz\nfoo,quux", s(&out[..n]));
+    }
+
+    #[test]
+    fn writer_two_fields_two_records_quote() {
+        let mut wtr = Writer::new();
+        let out = &mut [0; 1024];
+        let mut n = 0;
+
+        assert_field!(
+            wtr,
+            b("a,bc"),
+            &mut out[n..],
+            4,
+            5,
+            InputEmpty,
+            "\"a,bc"
+        );
+        n += 5;
+        assert_write!(wtr, delimiter, &mut out[n..], 2, InputEmpty, "\",");
+        n += 2;
+        assert_field!(wtr, b("\nz"), &mut out[n..], 2, 3, InputEmpty, "\"\nz");
+        n += 3;
+        assert_write!(wtr, terminator, &mut out[n..], 2, InputEmpty, "\"\n");
+        n += 2;
+        assert_field!(
+            wtr,
+            b("f\"oo"),
+            &mut out[n..],
+            4,
+            6,
+            InputEmpty,
+            "\"f\"\"oo"
+        );
+        n += 6;
+        assert_write!(wtr, delimiter, &mut out[n..], 2, InputEmpty, "\",");
+        n += 2;
+        assert_field!(
+            wtr,
+            b("quux,"),
+            &mut out[n..],
+            5,
+            6,
+            InputEmpty,
+            "\"quux,"
+        );
+        n += 6;
+
+        assert_write!(wtr, finish, &mut out[n..], 1, InputEmpty, "\"");
+        n += 1;
+
+        assert_eq!("\"a,bc\",\"\nz\"\n\"f\"\"oo\",\"quux,\"", s(&out[..n]));
+    }
+
+    macro_rules! assert_quote {
assert_quote { + ( + $inp:expr, $out:expr, + $expect_in:expr, $expect_out:expr, + $expect_res:expr, $expect_data:expr + ) => { + assert_quote!( + $inp, + $out, + $expect_in, + $expect_out, + $expect_res, + $expect_data, + true + ); + }; + ( + $inp:expr, $out:expr, + $expect_in:expr, $expect_out:expr, + $expect_res:expr, $expect_data:expr, + $double_quote:expr + ) => {{ + let (res, i, o) = quote($inp, $out, b'"', b'\\', $double_quote); + assert_eq!($expect_res, res, "result"); + assert_eq!($expect_in, i, "input"); + assert_eq!($expect_out, o, "output"); + assert_eq!(b($expect_data), &$out[..o], "data"); + }}; + } + + #[test] + fn quote_empty() { + let inp = b(""); + let out = &mut [0; 1024]; + + assert_quote!(inp, out, 0, 0, InputEmpty, ""); + } + + #[test] + fn quote_no_quotes() { + let inp = b("foobar"); + let out = &mut [0; 1024]; + + assert_quote!(inp, out, 6, 6, InputEmpty, "foobar"); + } + + #[test] + fn quote_one_quote() { + let inp = b("\""); + let out = &mut [0; 1024]; + + assert_quote!(inp, out, 1, 2, InputEmpty, r#""""#); + } + + #[test] + fn quote_two_quotes() { + let inp = b("\"\""); + let out = &mut [0; 1024]; + + assert_quote!(inp, out, 2, 4, InputEmpty, r#""""""#); + } + + #[test] + fn quote_escaped_one() { + let inp = b("\""); + let out = &mut [0; 1024]; + + assert_quote!(inp, out, 1, 2, InputEmpty, r#"\""#, false); + } + + #[test] + fn quote_escaped_two() { + let inp = b("\"\""); + let out = &mut [0; 1024]; + + assert_quote!(inp, out, 2, 4, InputEmpty, r#"\"\""#, false); + } + + #[test] + fn quote_misc() { + let inp = b(r#"foo "bar" baz "quux"?"#); + let out = &mut [0; 1024]; + + assert_quote!( + inp, + out, + 21, + 25, + InputEmpty, + r#"foo ""bar"" baz ""quux""?"# + ); + } + + #[test] + fn quote_stream_no_quotes() { + let mut inp = b("fooba"); + let out = &mut [0; 2]; + + assert_quote!(inp, out, 2, 2, OutputFull, "fo"); + inp = &inp[2..]; + assert_quote!(inp, out, 2, 2, OutputFull, "ob"); + inp = &inp[2..]; + assert_quote!(inp, out, 1, 1, InputEmpty, "a"); + } + + #[test] + fn quote_stream_quotes() { + let mut inp = b(r#"a"bc"d""#); + let out = &mut [0; 2]; + + assert_quote!(inp, out, 1, 1, OutputFull, "a"); + inp = &inp[1..]; + assert_quote!(inp, out, 1, 2, OutputFull, r#""""#); + inp = &inp[1..]; + assert_quote!(inp, out, 2, 2, OutputFull, "bc"); + inp = &inp[2..]; + assert_quote!(inp, out, 1, 2, OutputFull, r#""""#); + inp = &inp[1..]; + assert_quote!(inp, out, 1, 1, OutputFull, "d"); + inp = &inp[1..]; + assert_quote!(inp, out, 1, 2, InputEmpty, r#""""#); + } +} -- cgit v1.2.3