use std::{ fs::File, io::{self, BufRead, Seek}, marker::PhantomData, path::Path, result, }; use { csv_core::{Reader as CoreReader, ReaderBuilder as CoreReaderBuilder}, serde::de::DeserializeOwned, }; use crate::{ byte_record::{ByteRecord, Position}, error::{Error, ErrorKind, Result, Utf8Error}, string_record::StringRecord, {Terminator, Trim}, }; /// Builds a CSV reader with various configuration knobs. /// /// This builder can be used to tweak the field delimiter, record terminator /// and more. Once a CSV `Reader` is built, its configuration cannot be /// changed. #[derive(Debug)] pub struct ReaderBuilder { capacity: usize, flexible: bool, has_headers: bool, trim: Trim, /// The underlying CSV parser builder. /// /// We explicitly put this on the heap because CoreReaderBuilder embeds an /// entire DFA transition table, which along with other things, tallies up /// to almost 500 bytes on the stack. builder: Box, } impl Default for ReaderBuilder { fn default() -> ReaderBuilder { ReaderBuilder { capacity: 8 * (1 << 10), flexible: false, has_headers: true, trim: Trim::default(), builder: Box::new(CoreReaderBuilder::default()), } } } impl ReaderBuilder { /// Create a new builder for configuring CSV parsing. /// /// To convert a builder into a reader, call one of the methods starting /// with `from_`. 
/// /// # Example /// /// ``` /// use std::error::Error; /// use csv::{ReaderBuilder, StringRecord}; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,pop /// Boston,United States,4628910 /// Concord,United States,42695 /// "; /// let mut rdr = ReaderBuilder::new().from_reader(data.as_bytes()); /// /// let records = rdr /// .records() /// .collect::, csv::Error>>()?; /// assert_eq!(records, vec![ /// vec!["Boston", "United States", "4628910"], /// vec!["Concord", "United States", "42695"], /// ]); /// Ok(()) /// } /// ``` pub fn new() -> ReaderBuilder { ReaderBuilder::default() } /// Build a CSV parser from this configuration that reads data from the /// given file path. /// /// If there was a problem opening the file at the given path, then this /// returns the corresponding error. /// /// # Example /// /// ```no_run /// use std::error::Error; /// use csv::ReaderBuilder; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let mut rdr = ReaderBuilder::new().from_path("foo.csv")?; /// for result in rdr.records() { /// let record = result?; /// println!("{:?}", record); /// } /// Ok(()) /// } /// ``` pub fn from_path>(&self, path: P) -> Result> { Ok(Reader::new(self, File::open(path)?)) } /// Build a CSV parser from this configuration that reads data from `rdr`. /// /// Note that the CSV reader is buffered automatically, so you should not /// wrap `rdr` in a buffered reader like `io::BufReader`. 
/// /// # Example /// /// ``` /// use std::error::Error; /// use csv::ReaderBuilder; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,pop /// Boston,United States,4628910 /// Concord,United States,42695 /// "; /// let mut rdr = ReaderBuilder::new().from_reader(data.as_bytes()); /// for result in rdr.records() { /// let record = result?; /// println!("{:?}", record); /// } /// Ok(()) /// } /// ``` pub fn from_reader(&self, rdr: R) -> Reader { Reader::new(self, rdr) } /// The field delimiter to use when parsing CSV. /// /// The default is `b','`. /// /// # Example /// /// ``` /// use std::error::Error; /// use csv::ReaderBuilder; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city;country;pop /// Boston;United States;4628910 /// "; /// let mut rdr = ReaderBuilder::new() /// .delimiter(b';') /// .from_reader(data.as_bytes()); /// /// if let Some(result) = rdr.records().next() { /// let record = result?; /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` pub fn delimiter(&mut self, delimiter: u8) -> &mut ReaderBuilder { self.builder.delimiter(delimiter); self } /// Whether to treat the first row as a special header row. /// /// By default, the first row is treated as a special header row, which /// means the header is never returned by any of the record reading methods /// or iterators. When this is disabled (`yes` set to `false`), the first /// row is not treated specially. /// /// Note that the `headers` and `byte_headers` methods are unaffected by /// whether this is set. Those methods always return the first record. /// /// # Example /// /// This example shows what happens when `has_headers` is disabled. /// Namely, the first row is treated just like any other row. 
/// /// ``` /// use std::error::Error; /// use csv::ReaderBuilder; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,pop /// Boston,United States,4628910 /// "; /// let mut rdr = ReaderBuilder::new() /// .has_headers(false) /// .from_reader(data.as_bytes()); /// let mut iter = rdr.records(); /// /// // Read the first record. /// if let Some(result) = iter.next() { /// let record = result?; /// assert_eq!(record, vec!["city", "country", "pop"]); /// } else { /// return Err(From::from( /// "expected at least two records but got none")); /// } /// /// // Read the second record. /// if let Some(result) = iter.next() { /// let record = result?; /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); /// } else { /// return Err(From::from( /// "expected at least two records but got one")) /// } /// Ok(()) /// } /// ``` pub fn has_headers(&mut self, yes: bool) -> &mut ReaderBuilder { self.has_headers = yes; self } /// Whether the number of fields in records is allowed to change or not. /// /// When disabled (which is the default), parsing CSV data will return an /// error if a record is found with a number of fields different from the /// number of fields in a previous record. /// /// When enabled, this error checking is turned off. /// /// # Example: flexible records enabled /// /// ``` /// use std::error::Error; /// use csv::ReaderBuilder; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// // Notice that the first row is missing the population count. 
/// let data = "\ /// city,country,pop /// Boston,United States /// "; /// let mut rdr = ReaderBuilder::new() /// .flexible(true) /// .from_reader(data.as_bytes()); /// /// if let Some(result) = rdr.records().next() { /// let record = result?; /// assert_eq!(record, vec!["Boston", "United States"]); /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` /// /// # Example: flexible records disabled /// /// This shows the error that appears when records of unequal length /// are found and flexible records have been disabled (which is the /// default). /// /// ``` /// use std::error::Error; /// use csv::{ErrorKind, ReaderBuilder}; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// // Notice that the first row is missing the population count. /// let data = "\ /// city,country,pop /// Boston,United States /// "; /// let mut rdr = ReaderBuilder::new() /// .flexible(false) /// .from_reader(data.as_bytes()); /// /// if let Some(Err(err)) = rdr.records().next() { /// match *err.kind() { /// ErrorKind::UnequalLengths { expected_len, len, .. } => { /// // The header row has 3 fields... /// assert_eq!(expected_len, 3); /// // ... but the first row has only 2 fields. /// assert_eq!(len, 2); /// Ok(()) /// } /// ref wrong => { /// Err(From::from(format!( /// "expected UnequalLengths error but got {:?}", /// wrong))) /// } /// } /// } else { /// Err(From::from( /// "expected at least one errored record but got none")) /// } /// } /// ``` pub fn flexible(&mut self, yes: bool) -> &mut ReaderBuilder { self.flexible = yes; self } /// Whether fields are trimmed of leading and trailing whitespace or not. /// /// By default, no trimming is performed. This method permits one to /// override that behavior and choose one of the following options: /// /// 1. `Trim::Headers` trims only header values. /// 2. `Trim::Fields` trims only non-header or "field" values. /// 3. 
`Trim::All` trims both header and non-header values. /// /// A value is only interpreted as a header value if this CSV reader is /// configured to read a header record (which is the default). /// /// When reading string records, characters meeting the definition of /// Unicode whitespace are trimmed. When reading byte records, characters /// meeting the definition of ASCII whitespace are trimmed. ASCII /// whitespace characters correspond to the set `[\t\n\v\f\r ]`. /// /// # Example /// /// This example shows what happens when all values are trimmed. /// /// ``` /// use std::error::Error; /// use csv::{ReaderBuilder, StringRecord, Trim}; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city , country , pop /// Boston,\" /// United States\",4628910 /// Concord, United States ,42695 /// "; /// let mut rdr = ReaderBuilder::new() /// .trim(Trim::All) /// .from_reader(data.as_bytes()); /// let records = rdr /// .records() /// .collect::, csv::Error>>()?; /// assert_eq!(records, vec![ /// vec!["Boston", "United States", "4628910"], /// vec!["Concord", "United States", "42695"], /// ]); /// Ok(()) /// } /// ``` pub fn trim(&mut self, trim: Trim) -> &mut ReaderBuilder { self.trim = trim; self } /// The record terminator to use when parsing CSV. /// /// A record terminator can be any single byte. The default is a special /// value, `Terminator::CRLF`, which treats any occurrence of `\r`, `\n` /// or `\r\n` as a single record terminator. 
/// /// # Example: `$` as a record terminator /// /// ``` /// use std::error::Error; /// use csv::{ReaderBuilder, Terminator}; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "city,country,pop$Boston,United States,4628910"; /// let mut rdr = ReaderBuilder::new() /// .terminator(Terminator::Any(b'$')) /// .from_reader(data.as_bytes()); /// /// if let Some(result) = rdr.records().next() { /// let record = result?; /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` pub fn terminator(&mut self, term: Terminator) -> &mut ReaderBuilder { self.builder.terminator(term.to_core()); self } /// The quote character to use when parsing CSV. /// /// The default is `b'"'`. /// /// # Example: single quotes instead of double quotes /// /// ``` /// use std::error::Error; /// use csv::ReaderBuilder; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,pop /// Boston,'United States',4628910 /// "; /// let mut rdr = ReaderBuilder::new() /// .quote(b'\'') /// .from_reader(data.as_bytes()); /// /// if let Some(result) = rdr.records().next() { /// let record = result?; /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` pub fn quote(&mut self, quote: u8) -> &mut ReaderBuilder { self.builder.quote(quote); self } /// The escape character to use when parsing CSV. /// /// In some variants of CSV, quotes are escaped using a special escape /// character like `\` (instead of escaping quotes by doubling them). /// /// By default, recognizing these idiosyncratic escapes is disabled. 
/// /// # Example /// /// ``` /// use std::error::Error; /// use csv::ReaderBuilder; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,pop /// Boston,\"The \\\"United\\\" States\",4628910 /// "; /// let mut rdr = ReaderBuilder::new() /// .escape(Some(b'\\')) /// .from_reader(data.as_bytes()); /// /// if let Some(result) = rdr.records().next() { /// let record = result?; /// assert_eq!(record, vec![ /// "Boston", "The \"United\" States", "4628910", /// ]); /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` pub fn escape(&mut self, escape: Option) -> &mut ReaderBuilder { self.builder.escape(escape); self } /// Enable double quote escapes. /// /// This is enabled by default, but it may be disabled. When disabled, /// doubled quotes are not interpreted as escapes. /// /// # Example /// /// ``` /// use std::error::Error; /// use csv::ReaderBuilder; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,pop /// Boston,\"The \"\"United\"\" States\",4628910 /// "; /// let mut rdr = ReaderBuilder::new() /// .double_quote(false) /// .from_reader(data.as_bytes()); /// /// if let Some(result) = rdr.records().next() { /// let record = result?; /// assert_eq!(record, vec![ /// "Boston", "The \"United\"\" States\"", "4628910", /// ]); /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` pub fn double_quote(&mut self, yes: bool) -> &mut ReaderBuilder { self.builder.double_quote(yes); self } /// Enable or disable quoting. /// /// This is enabled by default, but it may be disabled. When disabled, /// quotes are not treated specially. 
/// /// # Example /// /// ``` /// use std::error::Error; /// use csv::ReaderBuilder; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,pop /// Boston,\"The United States,4628910 /// "; /// let mut rdr = ReaderBuilder::new() /// .quoting(false) /// .from_reader(data.as_bytes()); /// /// if let Some(result) = rdr.records().next() { /// let record = result?; /// assert_eq!(record, vec![ /// "Boston", "\"The United States", "4628910", /// ]); /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` pub fn quoting(&mut self, yes: bool) -> &mut ReaderBuilder { self.builder.quoting(yes); self } /// The comment character to use when parsing CSV. /// /// If the start of a record begins with the byte given here, then that /// line is ignored by the CSV parser. /// /// This is disabled by default. /// /// # Example /// /// ``` /// use std::error::Error; /// use csv::ReaderBuilder; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,pop /// #Concord,United States,42695 /// Boston,United States,4628910 /// "; /// let mut rdr = ReaderBuilder::new() /// .comment(Some(b'#')) /// .from_reader(data.as_bytes()); /// /// if let Some(result) = rdr.records().next() { /// let record = result?; /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` pub fn comment(&mut self, comment: Option) -> &mut ReaderBuilder { self.builder.comment(comment); self } /// A convenience method for specifying a configuration to read ASCII /// delimited text. /// /// This sets the delimiter and record terminator to the ASCII unit /// separator (`\x1F`) and record separator (`\x1E`), respectively. 
/// /// # Example /// /// ``` /// use std::error::Error; /// use csv::ReaderBuilder; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city\x1Fcountry\x1Fpop\x1EBoston\x1FUnited States\x1F4628910"; /// let mut rdr = ReaderBuilder::new() /// .ascii() /// .from_reader(data.as_bytes()); /// /// if let Some(result) = rdr.records().next() { /// let record = result?; /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` pub fn ascii(&mut self) -> &mut ReaderBuilder { self.builder.ascii(); self } /// Set the capacity (in bytes) of the buffer used in the CSV reader. /// This defaults to a reasonable setting. pub fn buffer_capacity(&mut self, capacity: usize) -> &mut ReaderBuilder { self.capacity = capacity; self } /// Enable or disable the NFA for parsing CSV. /// /// This is intended to be a debug option. The NFA is always slower than /// the DFA. #[doc(hidden)] pub fn nfa(&mut self, yes: bool) -> &mut ReaderBuilder { self.builder.nfa(yes); self } } /// A already configured CSV reader. /// /// A CSV reader takes as input CSV data and transforms that into standard Rust /// values. The most flexible way to read CSV data is as a sequence of records, /// where a record is a sequence of fields and each field is a string. However, /// a reader can also deserialize CSV data into Rust types like `i64` or /// `(String, f64, f64, f64)` or even a custom struct automatically using /// Serde. /// /// # Configuration /// /// A CSV reader has a couple convenient constructor methods like `from_path` /// and `from_reader`. However, if you want to configure the CSV reader to use /// a different delimiter or quote character (among many other things), then /// you should use a [`ReaderBuilder`](struct.ReaderBuilder.html) to construct /// a `Reader`. 
For example, to change the field delimiter: /// /// ``` /// use std::error::Error; /// use csv::ReaderBuilder; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city;country;pop /// Boston;United States;4628910 /// "; /// let mut rdr = ReaderBuilder::new() /// .delimiter(b';') /// .from_reader(data.as_bytes()); /// /// if let Some(result) = rdr.records().next() { /// let record = result?; /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` /// /// # Error handling /// /// In general, CSV *parsing* does not ever return an error. That is, there is /// no such thing as malformed CSV data. Instead, this reader will prioritize /// finding a parse over rejecting CSV data that it does not understand. This /// choice was inspired by other popular CSV parsers, but also because it is /// pragmatic. CSV data varies wildly, so even if the CSV data is malformed, /// it might still be possible to work with the data. In the land of CSV, there /// is no "right" or "wrong," only "right" and "less right." /// /// With that said, a number of errors can occur while reading CSV data: /// /// * By default, all records in CSV data must have the same number of fields. /// If a record is found with a different number of fields than a prior /// record, then an error is returned. This behavior can be disabled by /// enabling flexible parsing via the `flexible` method on /// [`ReaderBuilder`](struct.ReaderBuilder.html). /// * When reading CSV data from a resource (like a file), it is possible for /// reading from the underlying resource to fail. This will return an error. 
/// For subsequent calls to the `Reader` after encountering a such error /// (unless `seek` is used), it will behave as if end of file had been /// reached, in order to avoid running into infinite loops when still /// attempting to read the next record when one has errored. /// * When reading CSV data into `String` or `&str` fields (e.g., via a /// [`StringRecord`](struct.StringRecord.html)), UTF-8 is strictly /// enforced. If CSV data is invalid UTF-8, then an error is returned. If /// you want to read invalid UTF-8, then you should use the byte oriented /// APIs such as [`ByteRecord`](struct.ByteRecord.html). If you need explicit /// support for another encoding entirely, then you'll need to use another /// crate to transcode your CSV data to UTF-8 before parsing it. /// * When using Serde to deserialize CSV data into Rust types, it is possible /// for a number of additional errors to occur. For example, deserializing /// a field `xyz` into an `i32` field will result in an error. /// /// For more details on the precise semantics of errors, see the /// [`Error`](enum.Error.html) type. #[derive(Debug)] pub struct Reader { /// The underlying CSV parser. /// /// We explicitly put this on the heap because CoreReader embeds an entire /// DFA transition table, which along with other things, tallies up to /// almost 500 bytes on the stack. core: Box, /// The underlying reader. rdr: io::BufReader, /// Various state tracking. /// /// There is more state embedded in the `CoreReader`. state: ReaderState, } #[derive(Debug)] struct ReaderState { /// When set, this contains the first row of any parsed CSV data. /// /// This is always populated, regardless of whether `has_headers` is set. headers: Option, /// When set, the first row of parsed CSV data is excluded from things /// that read records, like iterators and `read_record`. has_headers: bool, /// When set, there is no restriction on the length of records. 
When not /// set, every record must have the same number of fields, or else an error /// is reported. flexible: bool, trim: Trim, /// The number of fields in the first record parsed. first_field_count: Option, /// The current position of the parser. /// /// Note that this position is only observable by callers at the start /// of a record. More granular positions are not supported. cur_pos: Position, /// Whether the first record has been read or not. first: bool, /// Whether the reader has been seeked or not. seeked: bool, /// Whether EOF of the underlying reader has been reached or not. /// /// IO errors on the underlying reader will be considered as an EOF for /// subsequent read attempts, as it would be incorrect to keep on trying /// to read when the underlying reader has broken. /// /// For clarity, having the best `Debug` impl and in case they need to be /// treated differently at some point, we store whether the `EOF` is /// considered because an actual EOF happened, or because we encoundered /// an IO error. /// This has no additional runtime cost. eof: ReaderEofState, } /// Whether EOF of the underlying reader has been reached or not. /// /// IO errors on the underlying reader will be considered as an EOF for /// subsequent read attempts, as it would be incorrect to keep on trying /// to read when the underlying reader has broken. /// /// For clarity, having the best `Debug` impl and in case they need to be /// treated differently at some point, we store whether the `EOF` is /// considered because an actual EOF happened, or because we encoundered /// an IO error #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum ReaderEofState { NotEof, Eof, IOError, } /// Headers encapsulates any data associated with the headers of CSV data. /// /// The headers always correspond to the first row. #[derive(Debug)] struct Headers { /// The header, as raw bytes. byte_record: ByteRecord, /// The header, as valid UTF-8 (or a UTF-8 error). 
string_record: result::Result, } impl Reader> { /// Create a new CSV parser with a default configuration for the given /// file path. /// /// To customize CSV parsing, use a `ReaderBuilder`. /// /// # Example /// /// ```no_run /// use std::error::Error; /// use csv::Reader; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let mut rdr = Reader::from_path("foo.csv")?; /// for result in rdr.records() { /// let record = result?; /// println!("{:?}", record); /// } /// Ok(()) /// } /// ``` pub fn from_path>(path: P) -> Result> { ReaderBuilder::new().from_path(path) } } impl Reader { /// Create a new CSV reader given a builder and a source of underlying /// bytes. fn new(builder: &ReaderBuilder, rdr: R) -> Reader { Reader { core: Box::new(builder.builder.build()), rdr: io::BufReader::with_capacity(builder.capacity, rdr), state: ReaderState { headers: None, has_headers: builder.has_headers, flexible: builder.flexible, trim: builder.trim, first_field_count: None, cur_pos: Position::new(), first: false, seeked: false, eof: ReaderEofState::NotEof, }, } } /// Create a new CSV parser with a default configuration for the given /// reader. /// /// To customize CSV parsing, use a `ReaderBuilder`. /// /// # Example /// /// ``` /// use std::error::Error; /// use csv::Reader; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,pop /// Boston,United States,4628910 /// Concord,United States,42695 /// "; /// let mut rdr = Reader::from_reader(data.as_bytes()); /// for result in rdr.records() { /// let record = result?; /// println!("{:?}", record); /// } /// Ok(()) /// } /// ``` pub fn from_reader(rdr: R) -> Reader { ReaderBuilder::new().from_reader(rdr) } /// Returns a borrowed iterator over deserialized records. /// /// Each item yielded by this iterator is a `Result`. 
/// Therefore, in order to access the record, callers must handle the /// possibility of error (typically with `try!` or `?`). /// /// If `has_headers` was enabled via a `ReaderBuilder` (which is the /// default), then this does not include the first record. Additionally, /// if `has_headers` is enabled, then deserializing into a struct will /// automatically align the values in each row to the fields of a struct /// based on the header row. /// /// # Example /// /// This shows how to deserialize CSV data into normal Rust structs. The /// fields of the header row are used to match up the values in each row /// to the fields of the struct. /// /// ``` /// use std::error::Error; /// /// #[derive(Debug, serde::Deserialize, Eq, PartialEq)] /// struct Row { /// city: String, /// country: String, /// #[serde(rename = "popcount")] /// population: u64, /// } /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,popcount /// Boston,United States,4628910 /// "; /// let mut rdr = csv::Reader::from_reader(data.as_bytes()); /// let mut iter = rdr.deserialize(); /// /// if let Some(result) = iter.next() { /// let record: Row = result?; /// assert_eq!(record, Row { /// city: "Boston".to_string(), /// country: "United States".to_string(), /// population: 4628910, /// }); /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` /// /// # Rules /// /// For the most part, any Rust type that maps straight-forwardly to a CSV /// record is supported. This includes maps, structs, tuples and tuple /// structs. Other Rust types, such as `Vec`s, arrays, and enums have /// a more complicated story. In general, when working with CSV data, one /// should avoid *nested sequences* as much as possible. /// /// Maps, structs, tuples and tuple structs map to CSV records in a simple /// way. Tuples and tuple structs decode their fields in the order that /// they are defined. 
Structs will do the same only if `has_headers` has /// been disabled using [`ReaderBuilder`](struct.ReaderBuilder.html), /// otherwise, structs and maps are deserialized based on the fields /// defined in the header row. (If there is no header row, then /// deserializing into a map will result in an error.) /// /// Nested sequences are supported in a limited capacity. Namely, they /// are flattened. As a result, it's often useful to use a `Vec` to capture /// a "tail" of fields in a record: /// /// ``` /// use std::error::Error; /// /// #[derive(Debug, serde::Deserialize, Eq, PartialEq)] /// struct Row { /// label: String, /// values: Vec, /// } /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "foo,1,2,3"; /// let mut rdr = csv::ReaderBuilder::new() /// .has_headers(false) /// .from_reader(data.as_bytes()); /// let mut iter = rdr.deserialize(); /// /// if let Some(result) = iter.next() { /// let record: Row = result?; /// assert_eq!(record, Row { /// label: "foo".to_string(), /// values: vec![1, 2, 3], /// }); /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` /// /// In the above example, adding another field to the `Row` struct after /// the `values` field will result in a deserialization error. This is /// because the deserializer doesn't know when to stop reading fields /// into the `values` vector, so it will consume the rest of the fields in /// the record leaving none left over for the additional field. /// /// Finally, simple enums in Rust can be deserialized as well. Namely, /// enums must either be variants with no arguments or variants with a /// single argument. Variants with no arguments are deserialized based on /// which variant name the field matches. Variants with a single argument /// are deserialized based on which variant can store the data. The latter /// is only supported when using "untagged" enum deserialization. 
The /// following example shows both forms in action: /// /// ``` /// use std::error::Error; /// /// #[derive(Debug, serde::Deserialize, PartialEq)] /// struct Row { /// label: Label, /// value: Number, /// } /// /// #[derive(Debug, serde::Deserialize, PartialEq)] /// #[serde(rename_all = "lowercase")] /// enum Label { /// Celsius, /// Fahrenheit, /// } /// /// #[derive(Debug, serde::Deserialize, PartialEq)] /// #[serde(untagged)] /// enum Number { /// Integer(i64), /// Float(f64), /// } /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// label,value /// celsius,22.2222 /// fahrenheit,72 /// "; /// let mut rdr = csv::Reader::from_reader(data.as_bytes()); /// let mut iter = rdr.deserialize(); /// /// // Read the first record. /// if let Some(result) = iter.next() { /// let record: Row = result?; /// assert_eq!(record, Row { /// label: Label::Celsius, /// value: Number::Float(22.2222), /// }); /// } else { /// return Err(From::from( /// "expected at least two records but got none")); /// } /// /// // Read the second record. /// if let Some(result) = iter.next() { /// let record: Row = result?; /// assert_eq!(record, Row { /// label: Label::Fahrenheit, /// value: Number::Integer(72), /// }); /// Ok(()) /// } else { /// Err(From::from( /// "expected at least two records but got only one")) /// } /// } /// ``` pub fn deserialize(&mut self) -> DeserializeRecordsIter where D: DeserializeOwned, { DeserializeRecordsIter::new(self) } /// Returns an owned iterator over deserialized records. /// /// Each item yielded by this iterator is a `Result`. /// Therefore, in order to access the record, callers must handle the /// possibility of error (typically with `try!` or `?`). /// /// This is mostly useful when you want to return a CSV iterator or store /// it somewhere. /// /// If `has_headers` was enabled via a `ReaderBuilder` (which is the /// default), then this does not include the first record. 
Additionally, /// if `has_headers` is enabled, then deserializing into a struct will /// automatically align the values in each row to the fields of a struct /// based on the header row. /// /// For more detailed deserialization rules, see the documentation on the /// `deserialize` method. /// /// # Example /// /// ``` /// use std::error::Error; /// /// #[derive(Debug, serde::Deserialize, Eq, PartialEq)] /// struct Row { /// city: String, /// country: String, /// #[serde(rename = "popcount")] /// population: u64, /// } /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,popcount /// Boston,United States,4628910 /// "; /// let rdr = csv::Reader::from_reader(data.as_bytes()); /// let mut iter = rdr.into_deserialize(); /// /// if let Some(result) = iter.next() { /// let record: Row = result?; /// assert_eq!(record, Row { /// city: "Boston".to_string(), /// country: "United States".to_string(), /// population: 4628910, /// }); /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` pub fn into_deserialize(self) -> DeserializeRecordsIntoIter where D: DeserializeOwned, { DeserializeRecordsIntoIter::new(self) } /// Returns a borrowed iterator over all records as strings. /// /// Each item yielded by this iterator is a `Result`. /// Therefore, in order to access the record, callers must handle the /// possibility of error (typically with `try!` or `?`). /// /// If `has_headers` was enabled via a `ReaderBuilder` (which is the /// default), then this does not include the first record. 
/// /// # Example /// /// ``` /// use std::error::Error; /// use csv::Reader; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,pop /// Boston,United States,4628910 /// "; /// let mut rdr = Reader::from_reader(data.as_bytes()); /// let mut iter = rdr.records(); /// /// if let Some(result) = iter.next() { /// let record = result?; /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` pub fn records(&mut self) -> StringRecordsIter { StringRecordsIter::new(self) } /// Returns an owned iterator over all records as strings. /// /// Each item yielded by this iterator is a `Result`. /// Therefore, in order to access the record, callers must handle the /// possibility of error (typically with `try!` or `?`). /// /// This is mostly useful when you want to return a CSV iterator or store /// it somewhere. /// /// If `has_headers` was enabled via a `ReaderBuilder` (which is the /// default), then this does not include the first record. /// /// # Example /// /// ``` /// use std::error::Error; /// use csv::Reader; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,pop /// Boston,United States,4628910 /// "; /// let rdr = Reader::from_reader(data.as_bytes()); /// let mut iter = rdr.into_records(); /// /// if let Some(result) = iter.next() { /// let record = result?; /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` pub fn into_records(self) -> StringRecordsIntoIter { StringRecordsIntoIter::new(self) } /// Returns a borrowed iterator over all records as raw bytes. /// /// Each item yielded by this iterator is a `Result`. 
/// Therefore, in order to access the record, callers must handle the /// possibility of error (typically with `try!` or `?`). /// /// If `has_headers` was enabled via a `ReaderBuilder` (which is the /// default), then this does not include the first record. /// /// # Example /// /// ``` /// use std::error::Error; /// use csv::Reader; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,pop /// Boston,United States,4628910 /// "; /// let mut rdr = Reader::from_reader(data.as_bytes()); /// let mut iter = rdr.byte_records(); /// /// if let Some(result) = iter.next() { /// let record = result?; /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` pub fn byte_records(&mut self) -> ByteRecordsIter { ByteRecordsIter::new(self) } /// Returns an owned iterator over all records as raw bytes. /// /// Each item yielded by this iterator is a `Result`. /// Therefore, in order to access the record, callers must handle the /// possibility of error (typically with `try!` or `?`). /// /// This is mostly useful when you want to return a CSV iterator or store /// it somewhere. /// /// If `has_headers` was enabled via a `ReaderBuilder` (which is the /// default), then this does not include the first record. 
/// /// # Example /// /// ``` /// use std::error::Error; /// use csv::Reader; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,pop /// Boston,United States,4628910 /// "; /// let rdr = Reader::from_reader(data.as_bytes()); /// let mut iter = rdr.into_byte_records(); /// /// if let Some(result) = iter.next() { /// let record = result?; /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` pub fn into_byte_records(self) -> ByteRecordsIntoIter { ByteRecordsIntoIter::new(self) } /// Returns a reference to the first row read by this parser. /// /// If no row has been read yet, then this will force parsing of the first /// row. /// /// If there was a problem parsing the row or if it wasn't valid UTF-8, /// then this returns an error. /// /// If the underlying reader emits EOF before any data, then this returns /// an empty record. /// /// Note that this method may be used regardless of whether `has_headers` /// was enabled (but it is enabled by default). /// /// # Example /// /// This example shows how to get the header row of CSV data. Notice that /// the header row does not appear as a record in the iterator! /// /// ``` /// use std::error::Error; /// use csv::Reader; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,pop /// Boston,United States,4628910 /// "; /// let mut rdr = Reader::from_reader(data.as_bytes()); /// /// // We can read the headers before iterating. /// { /// // `headers` borrows from the reader, so we put this in its /// // own scope. That way, the borrow ends before we try iterating /// // below. Alternatively, we could clone the headers. 
/// let headers = rdr.headers()?; /// assert_eq!(headers, vec!["city", "country", "pop"]); /// } /// /// if let Some(result) = rdr.records().next() { /// let record = result?; /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); /// } else { /// return Err(From::from( /// "expected at least one record but got none")) /// } /// /// // We can also read the headers after iterating. /// let headers = rdr.headers()?; /// assert_eq!(headers, vec!["city", "country", "pop"]); /// Ok(()) /// } /// ``` pub fn headers(&mut self) -> Result<&StringRecord> { if self.state.headers.is_none() { let mut record = ByteRecord::new(); self.read_byte_record_impl(&mut record)?; self.set_headers_impl(Err(record)); } let headers = self.state.headers.as_ref().unwrap(); match headers.string_record { Ok(ref record) => Ok(record), Err(ref err) => Err(Error::new(ErrorKind::Utf8 { pos: headers.byte_record.position().map(Clone::clone), err: err.clone(), })), } } /// Returns a reference to the first row read by this parser as raw bytes. /// /// If no row has been read yet, then this will force parsing of the first /// row. /// /// If there was a problem parsing the row then this returns an error. /// /// If the underlying reader emits EOF before any data, then this returns /// an empty record. /// /// Note that this method may be used regardless of whether `has_headers` /// was enabled (but it is enabled by default). /// /// # Example /// /// This example shows how to get the header row of CSV data. Notice that /// the header row does not appear as a record in the iterator! /// /// ``` /// use std::error::Error; /// use csv::Reader; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,pop /// Boston,United States,4628910 /// "; /// let mut rdr = Reader::from_reader(data.as_bytes()); /// /// // We can read the headers before iterating. /// { /// // `headers` borrows from the reader, so we put this in its /// // own scope. 
That way, the borrow ends before we try iterating /// // below. Alternatively, we could clone the headers. /// let headers = rdr.byte_headers()?; /// assert_eq!(headers, vec!["city", "country", "pop"]); /// } /// /// if let Some(result) = rdr.byte_records().next() { /// let record = result?; /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); /// } else { /// return Err(From::from( /// "expected at least one record but got none")) /// } /// /// // We can also read the headers after iterating. /// let headers = rdr.byte_headers()?; /// assert_eq!(headers, vec!["city", "country", "pop"]); /// Ok(()) /// } /// ``` pub fn byte_headers(&mut self) -> Result<&ByteRecord> { if self.state.headers.is_none() { let mut record = ByteRecord::new(); self.read_byte_record_impl(&mut record)?; self.set_headers_impl(Err(record)); } Ok(&self.state.headers.as_ref().unwrap().byte_record) } /// Set the headers of this CSV parser manually. /// /// This overrides any other setting (including `set_byte_headers`). Any /// automatic detection of headers is disabled. This may be called at any /// time. /// /// # Example /// /// ``` /// use std::error::Error; /// use csv::{Reader, StringRecord}; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,pop /// Boston,United States,4628910 /// "; /// let mut rdr = Reader::from_reader(data.as_bytes()); /// /// assert_eq!(rdr.headers()?, vec!["city", "country", "pop"]); /// rdr.set_headers(StringRecord::from(vec!["a", "b", "c"])); /// assert_eq!(rdr.headers()?, vec!["a", "b", "c"]); /// /// Ok(()) /// } /// ``` pub fn set_headers(&mut self, headers: StringRecord) { self.set_headers_impl(Ok(headers)); } /// Set the headers of this CSV parser manually as raw bytes. /// /// This overrides any other setting (including `set_headers`). Any /// automatic detection of headers is disabled. This may be called at any /// time. 
/// /// # Example /// /// ``` /// use std::error::Error; /// use csv::{Reader, ByteRecord}; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,pop /// Boston,United States,4628910 /// "; /// let mut rdr = Reader::from_reader(data.as_bytes()); /// /// assert_eq!(rdr.byte_headers()?, vec!["city", "country", "pop"]); /// rdr.set_byte_headers(ByteRecord::from(vec!["a", "b", "c"])); /// assert_eq!(rdr.byte_headers()?, vec!["a", "b", "c"]); /// /// Ok(()) /// } /// ``` pub fn set_byte_headers(&mut self, headers: ByteRecord) { self.set_headers_impl(Err(headers)); } fn set_headers_impl( &mut self, headers: result::Result, ) { // If we have string headers, then get byte headers. But if we have // byte headers, then get the string headers (or a UTF-8 error). let (mut str_headers, mut byte_headers) = match headers { Ok(string) => { let bytes = string.clone().into_byte_record(); (Ok(string), bytes) } Err(bytes) => { match StringRecord::from_byte_record(bytes.clone()) { Ok(str_headers) => (Ok(str_headers), bytes), Err(err) => (Err(err.utf8_error().clone()), bytes), } } }; if self.state.trim.should_trim_headers() { if let Ok(ref mut str_headers) = str_headers.as_mut() { str_headers.trim(); } byte_headers.trim(); } self.state.headers = Some(Headers { byte_record: byte_headers, string_record: str_headers, }); } /// Read a single row into the given record. Returns false when no more /// records could be read. /// /// If `has_headers` was enabled via a `ReaderBuilder` (which is the /// default), then this will never read the first record. /// /// This method is useful when you want to read records as fast as /// as possible. It's less ergonomic than an iterator, but it permits the /// caller to reuse the `StringRecord` allocation, which usually results /// in higher throughput. 
/// /// Records read via this method are guaranteed to have a position set /// on them, even if the reader is at EOF or if an error is returned. /// /// # Example /// /// ``` /// use std::error::Error; /// use csv::{Reader, StringRecord}; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,pop /// Boston,United States,4628910 /// "; /// let mut rdr = Reader::from_reader(data.as_bytes()); /// let mut record = StringRecord::new(); /// /// if rdr.read_record(&mut record)? { /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` pub fn read_record(&mut self, record: &mut StringRecord) -> Result { let result = record.read(self); // We need to trim again because trimming string records includes // Unicode whitespace. (ByteRecord trimming only includes ASCII // whitespace.) if self.state.trim.should_trim_fields() { record.trim(); } result } /// Read a single row into the given byte record. Returns false when no /// more records could be read. /// /// If `has_headers` was enabled via a `ReaderBuilder` (which is the /// default), then this will never read the first record. /// /// This method is useful when you want to read records as fast as /// as possible. It's less ergonomic than an iterator, but it permits the /// caller to reuse the `ByteRecord` allocation, which usually results /// in higher throughput. /// /// Records read via this method are guaranteed to have a position set /// on them, even if the reader is at EOF or if an error is returned. 
/// /// # Example /// /// ``` /// use std::error::Error; /// use csv::{ByteRecord, Reader}; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,pop /// Boston,United States,4628910 /// "; /// let mut rdr = Reader::from_reader(data.as_bytes()); /// let mut record = ByteRecord::new(); /// /// if rdr.read_byte_record(&mut record)? { /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` pub fn read_byte_record( &mut self, record: &mut ByteRecord, ) -> Result { if !self.state.seeked && !self.state.has_headers && !self.state.first { // If the caller indicated "no headers" and we haven't yielded the // first record yet, then we should yield our header row if we have // one. if let Some(ref headers) = self.state.headers { self.state.first = true; record.clone_from(&headers.byte_record); if self.state.trim.should_trim_fields() { record.trim(); } return Ok(!record.is_empty()); } } let ok = self.read_byte_record_impl(record)?; self.state.first = true; if !self.state.seeked && self.state.headers.is_none() { self.set_headers_impl(Err(record.clone())); // If the end user indicated that we have headers, then we should // never return the first row. Instead, we should attempt to // read and return the next one. if self.state.has_headers { let result = self.read_byte_record_impl(record); if self.state.trim.should_trim_fields() { record.trim(); } return result; } } else if self.state.trim.should_trim_fields() { record.trim(); } Ok(ok) } /// Read a byte record from the underlying CSV reader, without accounting /// for headers. 
#[inline(always)] fn read_byte_record_impl( &mut self, record: &mut ByteRecord, ) -> Result { use csv_core::ReadRecordResult::*; record.clear(); record.set_position(Some(self.state.cur_pos.clone())); if self.state.eof != ReaderEofState::NotEof { return Ok(false); } let (mut outlen, mut endlen) = (0, 0); loop { let (res, nin, nout, nend) = { let input_res = self.rdr.fill_buf(); if input_res.is_err() { self.state.eof = ReaderEofState::IOError; } let input = input_res?; let (fields, ends) = record.as_parts(); self.core.read_record( input, &mut fields[outlen..], &mut ends[endlen..], ) }; self.rdr.consume(nin); let byte = self.state.cur_pos.byte(); self.state .cur_pos .set_byte(byte + nin as u64) .set_line(self.core.line()); outlen += nout; endlen += nend; match res { InputEmpty => continue, OutputFull => { record.expand_fields(); continue; } OutputEndsFull => { record.expand_ends(); continue; } Record => { record.set_len(endlen); self.state.add_record(record)?; return Ok(true); } End => { self.state.eof = ReaderEofState::Eof; return Ok(false); } } } } /// Return the current position of this CSV reader. /// /// The byte offset in the position returned can be used to `seek` this /// reader. In particular, seeking to a position returned here on the same /// data will result in parsing the same subsequent record. /// /// # Example: reading the position /// /// ``` /// use std::{error::Error, io}; /// use csv::{Reader, Position}; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,popcount /// Boston,United States,4628910 /// Concord,United States,42695 /// "; /// let rdr = Reader::from_reader(io::Cursor::new(data)); /// let mut iter = rdr.into_records(); /// let mut pos = Position::new(); /// loop { /// // Read the position immediately before each record. 
/// let next_pos = iter.reader().position().clone(); /// if iter.next().is_none() { /// break; /// } /// pos = next_pos; /// } /// /// // `pos` should now be the position immediately before the last /// // record. /// assert_eq!(pos.byte(), 51); /// assert_eq!(pos.line(), 3); /// assert_eq!(pos.record(), 2); /// Ok(()) /// } /// ``` pub fn position(&self) -> &Position { &self.state.cur_pos } /// Returns true if and only if this reader has been exhausted. /// /// When this returns true, no more records can be read from this reader /// (unless it has been seeked to another position). /// /// # Example /// /// ``` /// use std::{error::Error, io}; /// use csv::{Reader, Position}; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,popcount /// Boston,United States,4628910 /// Concord,United States,42695 /// "; /// let mut rdr = Reader::from_reader(io::Cursor::new(data)); /// assert!(!rdr.is_done()); /// for result in rdr.records() { /// let _ = result?; /// } /// assert!(rdr.is_done()); /// Ok(()) /// } /// ``` pub fn is_done(&self) -> bool { self.state.eof != ReaderEofState::NotEof } /// Returns true if and only if this reader has been configured to /// interpret the first record as a header record. pub fn has_headers(&self) -> bool { self.state.has_headers } /// Returns a reference to the underlying reader. pub fn get_ref(&self) -> &R { self.rdr.get_ref() } /// Returns a mutable reference to the underlying reader. pub fn get_mut(&mut self) -> &mut R { self.rdr.get_mut() } /// Unwraps this CSV reader, returning the underlying reader. /// /// Note that any leftover data inside this reader's internal buffer is /// lost. pub fn into_inner(self) -> R { self.rdr.into_inner() } } impl Reader { /// Seeks the underlying reader to the position given. /// /// This comes with a few caveats: /// /// * Any internal buffer associated with this reader is cleared. 
/// * If the given position does not correspond to a position immediately /// before the start of a record, then the behavior of this reader is /// unspecified. /// * Any special logic that skips the first record in the CSV reader /// when reading or iterating over records is disabled. /// /// If the given position has a byte offset equivalent to the current /// position, then no seeking is performed. /// /// If the header row has not already been read, then this will attempt /// to read the header row before seeking. Therefore, it is possible that /// this returns an error associated with reading CSV data. /// /// Note that seeking is performed based only on the byte offset in the /// given position. Namely, the record or line numbers in the position may /// be incorrect, but this will cause any future position generated by /// this CSV reader to be similarly incorrect. /// /// # Example: seek to parse a record twice /// /// ``` /// use std::{error::Error, io}; /// use csv::{Reader, Position}; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let data = "\ /// city,country,popcount /// Boston,United States,4628910 /// Concord,United States,42695 /// "; /// let rdr = Reader::from_reader(io::Cursor::new(data)); /// let mut iter = rdr.into_records(); /// let mut pos = Position::new(); /// loop { /// // Read the position immediately before each record. /// let next_pos = iter.reader().position().clone(); /// if iter.next().is_none() { /// break; /// } /// pos = next_pos; /// } /// /// // Now seek the reader back to `pos`. This will let us read the /// // last record again. 
/// iter.reader_mut().seek(pos)?; /// let mut iter = iter.into_reader().into_records(); /// if let Some(result) = iter.next() { /// let record = result?; /// assert_eq!(record, vec!["Concord", "United States", "42695"]); /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` pub fn seek(&mut self, pos: Position) -> Result<()> { self.byte_headers()?; self.state.seeked = true; if pos.byte() == self.state.cur_pos.byte() { return Ok(()); } self.rdr.seek(io::SeekFrom::Start(pos.byte()))?; self.core.reset(); self.core.set_line(pos.line()); self.state.cur_pos = pos; self.state.eof = ReaderEofState::NotEof; Ok(()) } /// This is like `seek`, but provides direct control over how the seeking /// operation is performed via `io::SeekFrom`. /// /// The `pos` position given *should* correspond the position indicated /// by `seek_from`, but there is no requirement. If the `pos` position /// given is incorrect, then the position information returned by this /// reader will be similarly incorrect. /// /// If the header row has not already been read, then this will attempt /// to read the header row before seeking. Therefore, it is possible that /// this returns an error associated with reading CSV data. /// /// Unlike `seek`, this will always cause an actual seek to be performed. 
pub fn seek_raw( &mut self, seek_from: io::SeekFrom, pos: Position, ) -> Result<()> { self.byte_headers()?; self.state.seeked = true; self.rdr.seek(seek_from)?; self.core.reset(); self.core.set_line(pos.line()); self.state.cur_pos = pos; self.state.eof = ReaderEofState::NotEof; Ok(()) } } impl ReaderState { #[inline(always)] fn add_record(&mut self, record: &ByteRecord) -> Result<()> { let i = self.cur_pos.record(); self.cur_pos.set_record(i.checked_add(1).unwrap()); if !self.flexible { match self.first_field_count { None => self.first_field_count = Some(record.len() as u64), Some(expected) => { if record.len() as u64 != expected { return Err(Error::new(ErrorKind::UnequalLengths { pos: record.position().map(Clone::clone), expected_len: expected, len: record.len() as u64, })); } } } } Ok(()) } } /// An owned iterator over deserialized records. /// /// The type parameter `R` refers to the underlying `io::Read` type, and `D` /// refers to the type that this iterator will deserialize a record into. pub struct DeserializeRecordsIntoIter { rdr: Reader, rec: StringRecord, headers: Option, _priv: PhantomData, } impl DeserializeRecordsIntoIter { fn new(mut rdr: Reader) -> DeserializeRecordsIntoIter { let headers = if !rdr.state.has_headers { None } else { rdr.headers().ok().map(Clone::clone) }; DeserializeRecordsIntoIter { rdr, rec: StringRecord::new(), headers, _priv: PhantomData, } } /// Return a reference to the underlying CSV reader. pub fn reader(&self) -> &Reader { &self.rdr } /// Return a mutable reference to the underlying CSV reader. pub fn reader_mut(&mut self) -> &mut Reader { &mut self.rdr } /// Drop this iterator and return the underlying CSV reader. 
pub fn into_reader(self) -> Reader { self.rdr } } impl Iterator for DeserializeRecordsIntoIter { type Item = Result; fn next(&mut self) -> Option> { match self.rdr.read_record(&mut self.rec) { Err(err) => Some(Err(err)), Ok(false) => None, Ok(true) => Some(self.rec.deserialize(self.headers.as_ref())), } } } /// A borrowed iterator over deserialized records. /// /// The lifetime parameter `'r` refers to the lifetime of the underlying /// CSV `Reader`. The type parameter `R` refers to the underlying `io::Read` /// type, and `D` refers to the type that this iterator will deserialize a /// record into. pub struct DeserializeRecordsIter<'r, R: 'r, D> { rdr: &'r mut Reader, rec: StringRecord, headers: Option, _priv: PhantomData, } impl<'r, R: io::Read, D: DeserializeOwned> DeserializeRecordsIter<'r, R, D> { fn new(rdr: &'r mut Reader) -> DeserializeRecordsIter<'r, R, D> { let headers = if !rdr.state.has_headers { None } else { rdr.headers().ok().map(Clone::clone) }; DeserializeRecordsIter { rdr, rec: StringRecord::new(), headers, _priv: PhantomData, } } /// Return a reference to the underlying CSV reader. pub fn reader(&self) -> &Reader { &self.rdr } /// Return a mutable reference to the underlying CSV reader. pub fn reader_mut(&mut self) -> &mut Reader { &mut self.rdr } } impl<'r, R: io::Read, D: DeserializeOwned> Iterator for DeserializeRecordsIter<'r, R, D> { type Item = Result; fn next(&mut self) -> Option> { match self.rdr.read_record(&mut self.rec) { Err(err) => Some(Err(err)), Ok(false) => None, Ok(true) => Some(self.rec.deserialize(self.headers.as_ref())), } } } /// An owned iterator over records as strings. pub struct StringRecordsIntoIter { rdr: Reader, rec: StringRecord, } impl StringRecordsIntoIter { fn new(rdr: Reader) -> StringRecordsIntoIter { StringRecordsIntoIter { rdr, rec: StringRecord::new() } } /// Return a reference to the underlying CSV reader. 
pub fn reader(&self) -> &Reader { &self.rdr } /// Return a mutable reference to the underlying CSV reader. pub fn reader_mut(&mut self) -> &mut Reader { &mut self.rdr } /// Drop this iterator and return the underlying CSV reader. pub fn into_reader(self) -> Reader { self.rdr } } impl Iterator for StringRecordsIntoIter { type Item = Result; fn next(&mut self) -> Option> { match self.rdr.read_record(&mut self.rec) { Err(err) => Some(Err(err)), Ok(true) => Some(Ok(self.rec.clone_truncated())), Ok(false) => None, } } } /// A borrowed iterator over records as strings. /// /// The lifetime parameter `'r` refers to the lifetime of the underlying /// CSV `Reader`. pub struct StringRecordsIter<'r, R: 'r> { rdr: &'r mut Reader, rec: StringRecord, } impl<'r, R: io::Read> StringRecordsIter<'r, R> { fn new(rdr: &'r mut Reader) -> StringRecordsIter<'r, R> { StringRecordsIter { rdr, rec: StringRecord::new() } } /// Return a reference to the underlying CSV reader. pub fn reader(&self) -> &Reader { &self.rdr } /// Return a mutable reference to the underlying CSV reader. pub fn reader_mut(&mut self) -> &mut Reader { &mut self.rdr } } impl<'r, R: io::Read> Iterator for StringRecordsIter<'r, R> { type Item = Result; fn next(&mut self) -> Option> { match self.rdr.read_record(&mut self.rec) { Err(err) => Some(Err(err)), Ok(true) => Some(Ok(self.rec.clone_truncated())), Ok(false) => None, } } } /// An owned iterator over records as raw bytes. pub struct ByteRecordsIntoIter { rdr: Reader, rec: ByteRecord, } impl ByteRecordsIntoIter { fn new(rdr: Reader) -> ByteRecordsIntoIter { ByteRecordsIntoIter { rdr, rec: ByteRecord::new() } } /// Return a reference to the underlying CSV reader. pub fn reader(&self) -> &Reader { &self.rdr } /// Return a mutable reference to the underlying CSV reader. pub fn reader_mut(&mut self) -> &mut Reader { &mut self.rdr } /// Drop this iterator and return the underlying CSV reader. 
pub fn into_reader(self) -> Reader { self.rdr } } impl Iterator for ByteRecordsIntoIter { type Item = Result; fn next(&mut self) -> Option> { match self.rdr.read_byte_record(&mut self.rec) { Err(err) => Some(Err(err)), Ok(true) => Some(Ok(self.rec.clone_truncated())), Ok(false) => None, } } } /// A borrowed iterator over records as raw bytes. /// /// The lifetime parameter `'r` refers to the lifetime of the underlying /// CSV `Reader`. pub struct ByteRecordsIter<'r, R: 'r> { rdr: &'r mut Reader, rec: ByteRecord, } impl<'r, R: io::Read> ByteRecordsIter<'r, R> { fn new(rdr: &'r mut Reader) -> ByteRecordsIter<'r, R> { ByteRecordsIter { rdr, rec: ByteRecord::new() } } /// Return a reference to the underlying CSV reader. pub fn reader(&self) -> &Reader { &self.rdr } /// Return a mutable reference to the underlying CSV reader. pub fn reader_mut(&mut self) -> &mut Reader { &mut self.rdr } } impl<'r, R: io::Read> Iterator for ByteRecordsIter<'r, R> { type Item = Result; fn next(&mut self) -> Option> { match self.rdr.read_byte_record(&mut self.rec) { Err(err) => Some(Err(err)), Ok(true) => Some(Ok(self.rec.clone_truncated())), Ok(false) => None, } } } #[cfg(test)] mod tests { use std::io; use crate::{ byte_record::ByteRecord, error::ErrorKind, string_record::StringRecord, }; use super::{Position, ReaderBuilder, Trim}; fn b(s: &str) -> &[u8] { s.as_bytes() } fn s(b: &[u8]) -> &str { ::std::str::from_utf8(b).unwrap() } fn newpos(byte: u64, line: u64, record: u64) -> Position { let mut p = Position::new(); p.set_byte(byte).set_line(line).set_record(record); p } #[test] fn read_byte_record() { let data = b("foo,\"b,ar\",baz\nabc,mno,xyz"); let mut rdr = ReaderBuilder::new().has_headers(false).from_reader(data); let mut rec = ByteRecord::new(); assert!(rdr.read_byte_record(&mut rec).unwrap()); assert_eq!(3, rec.len()); assert_eq!("foo", s(&rec[0])); assert_eq!("b,ar", s(&rec[1])); assert_eq!("baz", s(&rec[2])); assert!(rdr.read_byte_record(&mut rec).unwrap()); assert_eq!(3, 
rec.len()); assert_eq!("abc", s(&rec[0])); assert_eq!("mno", s(&rec[1])); assert_eq!("xyz", s(&rec[2])); assert!(!rdr.read_byte_record(&mut rec).unwrap()); } #[test] fn read_trimmed_records_and_headers() { let data = b("foo, bar,\tbaz\n 1, 2, 3\n1\t,\t,3\t\t"); let mut rdr = ReaderBuilder::new() .has_headers(true) .trim(Trim::All) .from_reader(data); let mut rec = ByteRecord::new(); assert!(rdr.read_byte_record(&mut rec).unwrap()); assert_eq!("1", s(&rec[0])); assert_eq!("2", s(&rec[1])); assert_eq!("3", s(&rec[2])); let mut rec = StringRecord::new(); assert!(rdr.read_record(&mut rec).unwrap()); assert_eq!("1", &rec[0]); assert_eq!("", &rec[1]); assert_eq!("3", &rec[2]); { let headers = rdr.headers().unwrap(); assert_eq!(3, headers.len()); assert_eq!("foo", &headers[0]); assert_eq!("bar", &headers[1]); assert_eq!("baz", &headers[2]); } } #[test] fn read_trimmed_header() { let data = b("foo, bar,\tbaz\n 1, 2, 3\n1\t,\t,3\t\t"); let mut rdr = ReaderBuilder::new() .has_headers(true) .trim(Trim::Headers) .from_reader(data); let mut rec = ByteRecord::new(); assert!(rdr.read_byte_record(&mut rec).unwrap()); assert_eq!(" 1", s(&rec[0])); assert_eq!(" 2", s(&rec[1])); assert_eq!(" 3", s(&rec[2])); { let headers = rdr.headers().unwrap(); assert_eq!(3, headers.len()); assert_eq!("foo", &headers[0]); assert_eq!("bar", &headers[1]); assert_eq!("baz", &headers[2]); } } #[test] fn read_trimed_header_invalid_utf8() { let data = &b"foo, b\xFFar,\tbaz\na,b,c\nd,e,f"[..]; let mut rdr = ReaderBuilder::new() .has_headers(true) .trim(Trim::Headers) .from_reader(data); let mut rec = StringRecord::new(); // force the headers to be read let _ = rdr.read_record(&mut rec); // Check the byte headers are trimmed { let headers = rdr.byte_headers().unwrap(); assert_eq!(3, headers.len()); assert_eq!(b"foo", &headers[0]); assert_eq!(b"b\xFFar", &headers[1]); assert_eq!(b"baz", &headers[2]); } match *rdr.headers().unwrap_err().kind() { ErrorKind::Utf8 { pos: Some(ref pos), ref err } => { 
assert_eq!(pos, &newpos(0, 1, 0)); assert_eq!(err.field(), 1); assert_eq!(err.valid_up_to(), 3); } ref err => panic!("match failed, got {:?}", err), } } #[test] fn read_trimmed_records() { let data = b("foo, bar,\tbaz\n 1, 2, 3\n1\t,\t,3\t\t"); let mut rdr = ReaderBuilder::new() .has_headers(true) .trim(Trim::Fields) .from_reader(data); let mut rec = ByteRecord::new(); assert!(rdr.read_byte_record(&mut rec).unwrap()); assert_eq!("1", s(&rec[0])); assert_eq!("2", s(&rec[1])); assert_eq!("3", s(&rec[2])); { let headers = rdr.headers().unwrap(); assert_eq!(3, headers.len()); assert_eq!("foo", &headers[0]); assert_eq!(" bar", &headers[1]); assert_eq!("\tbaz", &headers[2]); } } #[test] fn read_record_unequal_fails() { let data = b("foo\nbar,baz"); let mut rdr = ReaderBuilder::new().has_headers(false).from_reader(data); let mut rec = ByteRecord::new(); assert!(rdr.read_byte_record(&mut rec).unwrap()); assert_eq!(1, rec.len()); assert_eq!("foo", s(&rec[0])); match rdr.read_byte_record(&mut rec) { Err(err) => match *err.kind() { ErrorKind::UnequalLengths { expected_len: 1, ref pos, len: 2, } => { assert_eq!(pos, &Some(newpos(4, 2, 1))); } ref wrong => panic!("match failed, got {:?}", wrong), }, wrong => panic!("match failed, got {:?}", wrong), } } #[test] fn read_record_unequal_ok() { let data = b("foo\nbar,baz"); let mut rdr = ReaderBuilder::new() .has_headers(false) .flexible(true) .from_reader(data); let mut rec = ByteRecord::new(); assert!(rdr.read_byte_record(&mut rec).unwrap()); assert_eq!(1, rec.len()); assert_eq!("foo", s(&rec[0])); assert!(rdr.read_byte_record(&mut rec).unwrap()); assert_eq!(2, rec.len()); assert_eq!("bar", s(&rec[0])); assert_eq!("baz", s(&rec[1])); assert!(!rdr.read_byte_record(&mut rec).unwrap()); } // This tests that even if we get a CSV error, we can continue reading // if we want. 
#[test]
fn read_record_unequal_continue() {
    // A length-mismatch error must not poison the reader: records after
    // the bad one are still readable.
    let input = b("foo\nbar,baz\nquux");
    let mut reader =
        ReaderBuilder::new().has_headers(false).from_reader(input);
    let mut record = ByteRecord::new();
    assert!(reader.read_byte_record(&mut record).unwrap());
    assert_eq!(1, record.len());
    assert_eq!("foo", s(&record[0]));
    match reader.read_byte_record(&mut record) {
        Err(err) => match *err.kind() {
            ErrorKind::UnequalLengths { expected_len: 1, ref pos, len: 2 } => {
                assert_eq!(pos, &Some(newpos(4, 2, 1)));
            }
            ref other => panic!("match failed, got {:?}", other),
        },
        other => panic!("match failed, got {:?}", other),
    }
    assert!(reader.read_byte_record(&mut record).unwrap());
    assert_eq!(1, record.len());
    assert_eq!("quux", s(&record[0]));
    assert!(!reader.read_byte_record(&mut record).unwrap());
}

#[test]
fn read_record_headers() {
    // With has_headers(true) the first row is consumed as headers and is
    // not yielded as a record; it stays retrievable in both forms.
    let input = b("foo,bar,baz\na,b,c\nd,e,f");
    let mut reader = ReaderBuilder::new().has_headers(true).from_reader(input);
    let mut record = StringRecord::new();
    for &expected in &["a", "d"] {
        assert!(reader.read_record(&mut record).unwrap());
        assert_eq!(3, record.len());
        assert_eq!(expected, &record[0]);
    }
    assert!(!reader.read_record(&mut record).unwrap());
    {
        let hdrs = reader.byte_headers().unwrap();
        assert_eq!(3, hdrs.len());
        assert_eq!(b"foo", &hdrs[0]);
        assert_eq!(b"bar", &hdrs[1]);
        assert_eq!(b"baz", &hdrs[2]);
    }
    {
        let hdrs = reader.headers().unwrap();
        assert_eq!(3, hdrs.len());
        assert_eq!("foo", &hdrs[0]);
        assert_eq!("bar", &hdrs[1]);
        assert_eq!("baz", &hdrs[2]);
    }
}

#[test]
fn read_record_headers_invalid_utf8() {
    let input = &b"foo,b\xFFar,baz\na,b,c\nd,e,f"[..];
    let mut reader = ReaderBuilder::new().has_headers(true).from_reader(input);
    let mut record = StringRecord::new();
    for &expected in &["a", "d"] {
        assert!(reader.read_record(&mut record).unwrap());
        assert_eq!(3, record.len());
        assert_eq!(expected, &record[0]);
    }
    assert!(!reader.read_record(&mut record).unwrap());

    // The headers are readable as raw bytes, but reading them as strings
    // must produce an appropriate UTF-8 error.
    {
        let hdrs = reader.byte_headers().unwrap();
        assert_eq!(3, hdrs.len());
        assert_eq!(b"foo", &hdrs[0]);
        assert_eq!(b"b\xFFar", &hdrs[1]);
        assert_eq!(b"baz", &hdrs[2]);
    }
    match *reader.headers().unwrap_err().kind() {
        ErrorKind::Utf8 { pos: Some(ref pos), ref err } => {
            assert_eq!(pos, &newpos(0, 1, 0));
            assert_eq!(err.field(), 1);
            assert_eq!(err.valid_up_to(), 1);
        }
        ref other => panic!("match failed, got {:?}", other),
    }
}

#[test]
fn read_record_no_headers_before() {
    // With has_headers(false), asking for headers returns the first row,
    // but that row is still yielded afterwards as a regular record.
    let input = b("foo,bar,baz\na,b,c\nd,e,f");
    let mut reader =
        ReaderBuilder::new().has_headers(false).from_reader(input);
    let mut record = StringRecord::new();
    {
        let hdrs = reader.headers().unwrap();
        assert_eq!(3, hdrs.len());
        assert_eq!("foo", &hdrs[0]);
        assert_eq!("bar", &hdrs[1]);
        assert_eq!("baz", &hdrs[2]);
    }
    for &expected in &["foo", "a", "d"] {
        assert!(reader.read_record(&mut record).unwrap());
        assert_eq!(3, record.len());
        assert_eq!(expected, &record[0]);
    }
    assert!(!reader.read_record(&mut record).unwrap());
}

#[test]
fn read_record_no_headers_after() {
    // Headers can still be requested after all records were read; with
    // has_headers(false) they are simply the first record.
    let input = b("foo,bar,baz\na,b,c\nd,e,f");
    let mut reader =
        ReaderBuilder::new().has_headers(false).from_reader(input);
    let mut record = StringRecord::new();
    for &expected in &["foo", "a", "d"] {
        assert!(reader.read_record(&mut record).unwrap());
        assert_eq!(3, record.len());
        assert_eq!(expected, &record[0]);
    }
    assert!(!reader.read_record(&mut record).unwrap());
    let hdrs = reader.headers().unwrap();
    assert_eq!(3, hdrs.len());
    assert_eq!("foo", &hdrs[0]);
    assert_eq!("bar", &hdrs[1]);
    assert_eq!("baz", &hdrs[2]);
}

#[test]
fn seek() {
    // Seeking to a saved position resumes reading at that record and the
    // byte/line/record bookkeeping stays accurate afterwards.
    let input = b("foo,bar,baz\na,b,c\nd,e,f\ng,h,i");
    let mut reader = ReaderBuilder::new().from_reader(io::Cursor::new(input));
    reader.seek(newpos(18, 3, 2)).unwrap();
    let mut record = StringRecord::new();
    assert_eq!(18, reader.position().byte());
    assert!(reader.read_record(&mut record).unwrap());
    assert_eq!(3, record.len());
    assert_eq!("d", &record[0]);
    assert_eq!(24, reader.position().byte());
    assert_eq!(4, reader.position().line());
    assert_eq!(3, reader.position().record());
    assert!(reader.read_record(&mut record).unwrap());
    assert_eq!(3, record.len());
    assert_eq!("g", &record[0]);
    assert!(!reader.read_record(&mut record).unwrap());
}

// Test that we can read headers after seeking even if the headers weren't
// explicitly read before seeking.
#[test]
fn seek_headers_after() {
    let input = b("foo,bar,baz\na,b,c\nd,e,f\ng,h,i");
    let mut reader = ReaderBuilder::new().from_reader(io::Cursor::new(input));
    reader.seek(newpos(18, 3, 2)).unwrap();
    assert_eq!(reader.headers().unwrap(), vec!["foo", "bar", "baz"]);
}

// Test that we can read headers after seeking if the headers were read
// before seeking.
#[test]
fn seek_headers_before_after() {
    let input = b("foo,bar,baz\na,b,c\nd,e,f\ng,h,i");
    let mut reader = ReaderBuilder::new().from_reader(io::Cursor::new(input));
    let saved = reader.headers().unwrap().clone();
    reader.seek(newpos(18, 3, 2)).unwrap();
    assert_eq!(&saved, reader.headers().unwrap());
}

// Test that even if we didn't read headers before seeking, if we seek to
// the current byte offset, then no seeking is done and therefore we can
// still read headers after seeking.
#[test]
fn seek_headers_no_actual_seek() {
    let input = b("foo,bar,baz\na,b,c\nd,e,f\ng,h,i");
    let mut reader = ReaderBuilder::new().from_reader(io::Cursor::new(input));
    reader.seek(Position::new()).unwrap();
    assert_eq!("foo", &reader.headers().unwrap()[0]);
}

// Test that position info is reported correctly in absence of headers.
#[test] fn positions_no_headers() { let mut rdr = ReaderBuilder::new() .has_headers(false) .from_reader("a,b,c\nx,y,z".as_bytes()) .into_records(); let pos = rdr.next().unwrap().unwrap().position().unwrap().clone(); assert_eq!(pos.byte(), 0); assert_eq!(pos.line(), 1); assert_eq!(pos.record(), 0); let pos = rdr.next().unwrap().unwrap().position().unwrap().clone(); assert_eq!(pos.byte(), 6); assert_eq!(pos.line(), 2); assert_eq!(pos.record(), 1); } // Test that position info is reported correctly with headers. #[test] fn positions_headers() { let mut rdr = ReaderBuilder::new() .has_headers(true) .from_reader("a,b,c\nx,y,z".as_bytes()) .into_records(); let pos = rdr.next().unwrap().unwrap().position().unwrap().clone(); assert_eq!(pos.byte(), 6); assert_eq!(pos.line(), 2); assert_eq!(pos.record(), 1); } // Test that reading headers on empty data yields an empty record. #[test] fn headers_on_empty_data() { let mut rdr = ReaderBuilder::new().from_reader("".as_bytes()); let r = rdr.byte_headers().unwrap(); assert_eq!(r.len(), 0); } // Test that reading the first record on empty data works. #[test] fn no_headers_on_empty_data() { let mut rdr = ReaderBuilder::new().has_headers(false).from_reader("".as_bytes()); assert_eq!(rdr.records().count(), 0); } // Test that reading the first record on empty data works, even if // we've tried to read headers before hand. #[test] fn no_headers_on_empty_data_after_headers() { let mut rdr = ReaderBuilder::new().has_headers(false).from_reader("".as_bytes()); assert_eq!(rdr.headers().unwrap().len(), 0); assert_eq!(rdr.records().count(), 0); } }