13 files changed, 314 insertions, 88 deletions
diff --git a/src/reader/config.rs b/src/reader/config.rs
index 3351997..686d0d4 100644
--- a/src/reader/config.rs
+++ b/src/reader/config.rs
@@ -5,7 +5,11 @@ use std::io::Read;
 use crate::reader::EventReader;
 use crate::util::Encoding;
 
-/// Parser configuration structure.
+/// Limits to defend from billion laughs attack
+const DEFAULT_MAX_ENTITY_EXPANSION_LENGTH: usize = 1_000_000;
+const DEFAULT_MAX_ENTITY_EXPANSION_DEPTH: u8 = 10;
+
+/// Parser configuration structure. **There are more config methods than public fields — see methods below**.
 ///
 /// This structure contains various configuration options which affect
 /// behavior of the parser.
@@ -88,6 +92,8 @@ pub struct ParserConfig {
     ///
     /// By default any whitespace that is not enclosed within at least one level of elements will be
     /// ignored. Setting this value to false will cause root level whitespace events to be emitted.
+    ///
+    /// **There are more configuration options – see the methods below**
     pub ignore_root_level_whitespace: bool,
 }
 
@@ -198,6 +204,23 @@ pub struct ParserConfig2 {
 
     /// Documents with multiple root elements are ill-formed
     pub allow_multiple_root_elements: bool,
+
+    /// Abort if custom entities create a string longer than this
+    pub max_entity_expansion_length: usize,
+    /// Entities can expand into other entities this many times (be careful about exponential cost!)
+    pub max_entity_expansion_depth: u8,
+
+    /// Maximum length of tag name or attribute name
+    pub max_name_length: usize,
+
+    /// Max number of attributes per element
+    pub max_attributes: usize,
+
+    /// Max number of bytes in each attribute
+    pub max_attribute_length: usize,
+
+    /// Maximum length of strings representing characters, comments, and processing instructions
+    pub max_data_length: usize,
 }
 
 impl Default for ParserConfig2 {
@@ -207,6 +230,12 @@ impl Default for ParserConfig2 {
             override_encoding: None,
             ignore_invalid_encoding_declarations: false,
             allow_multiple_root_elements: true,
+            max_entity_expansion_length: DEFAULT_MAX_ENTITY_EXPANSION_LENGTH,
+            max_entity_expansion_depth: DEFAULT_MAX_ENTITY_EXPANSION_DEPTH,
+            max_attributes: 1<<16,
+            max_attribute_length: 1<<30,
+            max_data_length: 1<<30,
+            max_name_length: 1<<18,
         }
     }
 }
@@ -273,15 +302,48 @@ impl From<ParserConfig> for ParserConfig2 {
 }
 
 gen_setters! { ParserConfig2,
+    /// Set if you got one in the HTTP header
     override_encoding: val Option<Encoding>,
+    /// Allows invalid documents. There should be only a single root element in XML.
     allow_multiple_root_elements: val bool,
+    /// Abort if custom entities create a string longer than this
+    max_entity_expansion_length: val usize,
+    /// Entities can expand into other entities this many times (be careful about exponential cost!)
+    max_entity_expansion_depth: val u8,
+    /// Max number of attributes per element
+    max_attributes: val usize,
+    /// Maximum length of tag name or attribute name
+    max_name_length: val usize,
+    /// Max number of bytes in each attribute
+    max_attribute_length: val usize,
+    /// Maximum length of strings representing characters, comments, and processing instructions
+    max_data_length: val usize,
+    /// Allow `<?xml encoding="bogus"?>`
     ignore_invalid_encoding_declarations: val bool
 }
 
 gen_setters! { ParserConfig,
+    /// Set if you got one in the HTTP header (see `content_type`)
     override_encoding: c2 Option<Encoding>,
+    /// Allow `<?xml encoding="bogus"?>`
     ignore_invalid_encoding_declarations: c2 bool,
+    /// Allows invalid documents. There should be only a single root element in XML.
     allow_multiple_root_elements: c2 bool,
+
+    /// Abort if custom entities create a string longer than this
+    max_entity_expansion_length: c2 usize,
+    /// Entities can expand into other entities this many times (be careful about exponential cost!)
+    max_entity_expansion_depth: c2 u8,
+    /// Max number of attributes per element
+    max_attributes: c2 usize,
+    /// Maximum length of tag name or attribute name
+    max_name_length: c2 usize,
+    /// Max number of bytes in each attribute
+    max_attribute_length: c2 usize,
+    /// Maximum length of strings representing characters, comments, and processing instructions
+    max_data_length: c2 usize,
+
+    /// Set encoding from the MIME type. Important for HTTP compatibility.
     content_type: c2 &str
 }
 
@@ -293,14 +355,15 @@ gen_setters! { ParserConfig2,
     coalesce_characters: delegate bool,
     ignore_end_of_stream: delegate bool,
     replace_unknown_entity_references: delegate bool,
+    /// Whether or not whitespace at the root level of the document is ignored. Default is true.
     ignore_root_level_whitespace: delegate bool
 }
 
 #[test]
 fn mime_parse() {
-    let c = ParserConfig2::new().content_type("text/xml;charset=Us-AScii");
+    let c = ParserConfig2::new().content_type("text/xml;charset=Us-AScii").max_entity_expansion_length(1000);
     assert_eq!(c.override_encoding, Some(Encoding::Ascii));
 
-    let c = ParserConfig2::new().content_type("text/xml;charset = \"UTF-16\"");
+    let c = ParserConfig2::new().max_entity_expansion_depth(3).content_type("text/xml;charset = \"UTF-16\"");
     assert_eq!(c.override_encoding, Some(Encoding::Utf16));
 }
diff --git a/src/reader/error.rs b/src/reader/error.rs
index 8af35ae..64210c4 100644
--- a/src/reader/error.rs
+++ b/src/reader/error.rs
@@ -65,6 +65,8 @@ pub(crate) enum SyntaxError {
     UnexpectedXmlVersion(Box<str>),
     ConflictingEncoding(Encoding, Encoding),
     UnexpectedTokenBefore(&'static str, char),
+    /// Document has more stuff than `ParserConfig` allows
+    ExceededConfiguredLimit,
 }
 
 impl fmt::Display for SyntaxError {
@@ -116,6 +118,7 @@ impl SyntaxError {
             Self::UnexpectedXmlVersion(ref version) => format!("Invalid XML version: {version}").into(),
             Self::UnknownMarkupDeclaration(ref v) => format!("Unknown markup declaration: {v}").into(),
             Self::UnsupportedEncoding(ref v) => format!("Unsupported encoding: {v}").into(),
+            Self::ExceededConfiguredLimit => "This document is larger/more complex than allowed by the parser's configuration".into(),
         }
     }
 }
diff --git a/src/reader/events.rs b/src/reader/events.rs
index de2b930..e8eb81e 100644
--- a/src/reader/events.rs
+++ b/src/reader/events.rs
@@ -1,8 +1,6 @@
 //! Contains `XmlEvent` datatype, instances of which are emitted by the parser.
 
-use std::borrow::Cow;
 use std::fmt;
-
 use crate::attribute::OwnedAttribute;
 use crate::common::XmlVersion;
 use crate::name::OwnedName;
@@ -207,7 +205,7 @@ impl XmlEvent {
                 Some(crate::writer::events::XmlEvent::StartElement {
                     name: name.borrow(),
                     attributes: attributes.iter().map(|a| a.borrow()).collect(),
-                    namespace: Cow::Borrowed(namespace)
+                    namespace: namespace.borrow(),
                 }),
             XmlEvent::EndElement { ref name } =>
                 Some(crate::writer::events::XmlEvent::EndElement { name: Some(name.borrow()) }),
diff --git a/src/reader/indexset.rs b/src/reader/indexset.rs
new file mode 100644
index 0000000..3d683a2
--- /dev/null
+++ b/src/reader/indexset.rs
@@ -0,0 +1,116 @@
+use crate::attribute::OwnedAttribute;
+use crate::name::OwnedName;
+
+use std::collections::hash_map::RandomState;
+use std::collections::HashSet;
+use std::hash::BuildHasher;
+use std::hash::Hash;
+use std::hash::Hasher;
+
+/// An insertion-ordered set of attributes that can quickly tell whether a name is already present
+pub(crate) struct AttributesSet {
+    vec: Vec<OwnedAttribute>, // the attributes themselves, in the order they were pushed
+    /// Hashes of stored names, filled by `push` from `HASH_THRESHOLD` items on. Uses a no-op hasher, because these u64s are hashes already
+    may_contain: HashSet<u64, U64HasherBuilder>,
+    /// The real hasher, used to turn an `OwnedName` into the u64 kept in `may_contain`
+    hasher: RandomState,
+}
+
+/// Use linear search and don't allocate `HashSet` if there are few attributes,
+/// because allocation costs more than a few comparisons.
+const HASH_THRESHOLD: usize = 8;
+
+impl AttributesSet {
+    /// Creates an empty set; nothing is added to the hash index until `push` reaches the threshold.
+    pub fn new() -> Self {
+        Self { vec: Vec::new(), hasher: RandomState::new(), may_contain: HashSet::default() }
+    }
+
+    /// Hashes an attribute name with this set's own `RandomState`.
+    fn hash(&self, val: &OwnedName) -> u64 {
+        let mut h = self.hasher.build_hasher();
+        val.hash(&mut h);
+        h.finish()
+    }
+
+    /// Number of attributes stored so far.
+    pub fn len(&self) -> usize { self.vec.len() }
+
+    /// Returns `true` if an attribute with exactly this `name` has already been pushed.
+    pub fn contains(&self, name: &OwnedName) -> bool {
+        // `may_contain` is only filled by the push made at `HASH_THRESHOLD` items, so it is still
+        // empty at that length: scan linearly up to and including it, above it only on a hash hit.
+        (self.vec.len() <= HASH_THRESHOLD || self.may_contain.contains(&self.hash(name))) &&
+            self.vec.iter().any(move |a| &a.name == name)
+    }
+
+    /// Appends `attr` at the end; does not check for duplicates itself.
+    pub fn push(&mut self, attr: OwnedAttribute) {
+        if self.vec.len() >= HASH_THRESHOLD {
+            if self.vec.len() == HASH_THRESHOLD {
+                self.may_contain.reserve(HASH_THRESHOLD * 2);
+                for attr in &self.vec {
+                    self.may_contain.insert(self.hash(&attr.name));
+                }
+            }
+            self.may_contain.insert(self.hash(&attr.name));
+        }
+        self.vec.push(attr);
+    }
+
+    /// Consumes the set, returning the attributes in the order they were pushed.
+    pub fn into_vec(self) -> Vec<OwnedAttribute> { self.vec }
+}
+
+#[test]
+fn indexset() {
+    let mut s = AttributesSet::new();
+    let not_here = OwnedName {
+        local_name: "attr1000".into(),
+        namespace: Some("test".into()),
+        prefix: None,
+    };
+    let first = OwnedName { local_name: "attr0".into(), namespace: None, prefix: None };
+
+    // this test will take a lot of time if the `contains()` is linear, and the loop is quadratic
+    for i in 0..50000 {
+        let name = OwnedName {
+            local_name: format!("attr{i}"), namespace: None, prefix: None,
+        };
+        assert!(!s.contains(&name));
+
+        s.push(OwnedAttribute { name, value: String::new() });
+        assert!(!s.contains(&not_here));
+        // duplicates must be found at every size, including exactly `HASH_THRESHOLD` items,
+        // where the hash index has not been built yet
+        assert!(s.contains(&first));
+    }
+
+    // names pushed first, in the middle and last are all still found
+    for present in ["attr1234", "attr0", "attr49999"] {
+        let name = OwnedName { local_name: present.into(), namespace: None, prefix: None };
+        assert!(s.contains(&name));
+    }
+}
+
+/// Hasher that does nothing except pass a single `u64` through unchanged
+struct U64Hasher(u64);
+
+impl Hasher for U64Hasher {
+    fn finish(&self) -> u64 { self.0 }
+    fn write(&mut self, slice: &[u8]) {
+        for &v in slice { self.0 ^= u64::from(v) } // required by the trait, unused in practice
+    }
+    fn write_u64(&mut self, i: u64) {
+        self.0 ^= i;
+    }
+}
+
+#[derive(Default)]
+struct U64HasherBuilder;
+
+impl BuildHasher for U64HasherBuilder {
+    type Hasher = U64Hasher;
+    fn build_hasher(&self) -> U64Hasher { U64Hasher(0) } // 0 is the XOR identity, so one `write_u64` yields its argument
+}
diff --git a/src/reader/lexer.rs b/src/reader/lexer.rs
index a8345ba..6b59c86 100644
--- a/src/reader/lexer.rs
+++ b/src/reader/lexer.rs
@@ -13,9 +13,7 @@ use crate::common::{is_name_char, is_whitespace_char, Position, TextPosition, is
 use crate::reader::Error;
 use crate::util::{CharReader, Encoding};
 
-/// Limits to defend from billion laughs attack
-const MAX_ENTITY_EXPANSION_LENGTH: usize = 1_000_000;
-const MAX_ENTITY_EXPANSION_DEPTH: u8 = 10;
+use super::ParserConfig2;
 
 /// `Token` represents a single lexeme of an XML document. These lexemes
 /// are used to perform actual parsing.
@@ -229,6 +227,9 @@ pub(crate) struct Lexer {
     reparse_depth: u8,
     #[cfg(test)]
     skip_errors: bool,
+
+    max_entity_expansion_depth: u8,
+    max_entity_expansion_length: usize,
 }
 
 impl Position for Lexer {
@@ -239,7 +240,7 @@ impl Position for Lexer {
 
 impl Lexer {
     /// Returns a new lexer with default state.
-    pub(crate) fn new() -> Lexer {
+    pub(crate) fn new(config: &ParserConfig2) -> Lexer {
         Lexer {
             reader: CharReader::new(),
             pos: TextPosition::new(),
@@ -252,6 +253,9 @@ impl Lexer {
             reparse_depth: 0,
             #[cfg(test)]
             skip_errors: false,
+
+            max_entity_expansion_depth: config.max_entity_expansion_depth,
+            max_entity_expansion_length: config.max_entity_expansion_length,
         }
     }
 
@@ -422,7 +426,7 @@ impl Lexer {
         }
 
         self.reparse_depth += 1;
-        if self.reparse_depth > MAX_ENTITY_EXPANSION_DEPTH || self.char_queue.len() > MAX_ENTITY_EXPANSION_LENGTH {
+        if self.reparse_depth > self.max_entity_expansion_depth || self.char_queue.len() > self.max_entity_expansion_length {
             return Err(self.error(SyntaxError::EntityTooBig))
         }
 
@@ -650,7 +654,7 @@ impl Lexer {
 
 #[cfg(test)]
 mod tests {
-    use crate::common::Position;
+    use crate::{common::Position, reader::ParserConfig2};
     use std::io::{BufReader, Cursor};
 
     use super::{Lexer, Token};
@@ -680,7 +684,7 @@ mod tests {
     );
 
     fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) {
-        (Lexer::new(), BufReader::new(Cursor::new(s.to_owned().into_bytes())))
+        (Lexer::new(&ParserConfig2::default()), BufReader::new(Cursor::new(s.to_owned().into_bytes())))
     }
 
     #[test]
diff --git a/src/reader/parser.rs b/src/reader/parser.rs
index dcdec89..18f073d 100644
--- a/src/reader/parser.rs
+++ b/src/reader/parser.rs
@@ -1,24 +1,19 @@
 //! Contains an implementation of pull-based XML parser.
 
-
-use crate::common::is_xml11_char;
-use crate::common::is_xml10_char;
-use crate::common::is_xml11_char_not_restricted;
-use crate::reader::error::SyntaxError;
-use std::collections::HashMap;
-use std::io::prelude::*;
-
-use crate::attribute::OwnedAttribute;
-use crate::common::{self, is_name_char, is_name_start_char, Position, TextPosition, XmlVersion, is_whitespace_char};
+use crate::common::{is_xml10_char, is_xml11_char, is_xml11_char_not_restricted, is_name_char, is_name_start_char, is_whitespace_char};
+use crate::common::{Position, TextPosition, XmlVersion};
 use crate::name::OwnedName;
 use crate::namespace::NamespaceStack;
-
 use crate::reader::config::ParserConfig2;
+use crate::reader::error::SyntaxError;
 use crate::reader::events::XmlEvent;
+use crate::reader::indexset::AttributesSet;
 use crate::reader::lexer::{Lexer, Token};
-
 use super::{Error, ErrorKind};
 
+use std::collections::HashMap;
+use std::io::Read;
+
 macro_rules! gen_takes(
     ($($field:ident -> $method:ident, $t:ty, $def:expr);+) => (
         $(
@@ -42,7 +37,7 @@ gen_takes!(
     element_name -> take_element_name, Option<OwnedName>, None;
 
     attr_name    -> take_attr_name, Option<OwnedName>, None;
-    attributes   -> take_attributes, Vec<OwnedAttribute>, vec!()
+    attributes   -> take_attributes, AttributesSet, AttributesSet::new()
 );
 
 mod inside_cdata;
@@ -107,7 +102,7 @@ impl PullParser {
 
     #[inline]
     fn new_with_config2(config: ParserConfig2) -> PullParser {
-        let mut lexer = Lexer::new();
+        let mut lexer = Lexer::new(&config);
         if let Some(enc) = config.override_encoding {
             lexer.set_encoding(enc);
         }
@@ -133,7 +128,7 @@ impl PullParser {
                 element_name: None,
                 quote: None,
                 attr_name: None,
-                attributes: Vec::new(),
+                attributes: AttributesSet::new(),
             },
             final_result: None,
             next_event: None,
@@ -299,7 +294,7 @@ struct MarkupData {
     name: String,     // used for processing instruction name
     ref_data: String,  // used for reference content
 
-    version: Option<common::XmlVersion>,  // used for XML declaration version
+    version: Option<XmlVersion>,  // used for XML declaration version
     encoding: Option<String>,  // used for XML declaration encoding
     standalone: Option<bool>,  // used for XML declaration standalone parameter
 
@@ -307,7 +302,7 @@ struct MarkupData {
 
     quote: Option<QuoteToken>,  // used to hold opening quote for attribute value
     attr_name: Option<OwnedName>,  // used to hold attribute name
-    attributes: Vec<OwnedAttribute>   // used to hold all accumulated attributes
+    attributes: AttributesSet,   // used to hold all accumulated attributes
 }
 
 impl PullParser {
@@ -401,7 +396,7 @@ impl PullParser {
     fn next_pos(&mut self) {
         // unfortunately calls to next_pos will never be perfectly balanced with push_pos,
         // at very least because parse errors and EOF can happen unexpectedly without a prior push.
-        if self.pos.len() > 0 {
+        if !self.pos.is_empty() {
             if self.pos.len() > 1 {
                 self.pos.remove(0);
             } else {
@@ -490,7 +485,7 @@ impl PullParser {
             let name = this.take_buf();
             match name.parse() {
                 Ok(name) => on_name(this, t, name),
-                Err(_) => Some(this.error(SyntaxError::InvalidQualifiedName(name.into())))
+                Err(_) => Some(this.error(SyntaxError::InvalidQualifiedName(name.into()))),
             }
         };
 
@@ -504,6 +499,9 @@ impl PullParser {
 
             Token::Character(c) if c != ':' && (self.buf.is_empty() && is_name_start_char(c) ||
                                           self.buf_has_data() && is_name_char(c)) => {
+                if self.buf.len() > self.config.max_name_length {
+                    return Some(self.error(SyntaxError::ExceededConfiguredLimit));
+                }
                 self.buf.push(c);
                 None
             },
@@ -517,7 +515,7 @@ impl PullParser {
 
             Token::Character(c) if is_whitespace_char(c) => invoke_callback(self, t),
 
-            _ => Some(self.error(SyntaxError::UnexpectedQualifiedName(t)))
+            _ => Some(self.error(SyntaxError::UnexpectedQualifiedName(t))),
         }
     }
 
@@ -529,7 +527,7 @@ impl PullParser {
     fn read_attribute_value<F>(&mut self, t: Token, on_value: F) -> Option<Result>
       where F: Fn(&mut PullParser, String) -> Option<Result> {
         match t {
-            Token::Character(c) if self.data.quote.is_none() && is_whitespace_char(c) => None,  // skip leading whitespace
+            Token::Character(c) if self.data.quote.is_none() && is_whitespace_char(c) => None, // skip leading whitespace
 
             Token::DoubleQuote | Token::SingleQuote => match self.data.quote {
                 None => {  // Entered attribute value
@@ -547,6 +545,9 @@ impl PullParser {
                             return Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)));
                         }
                     }
+                    if self.buf.len() > self.config.max_attribute_length {
+                        return Some(self.error(SyntaxError::ExceededConfiguredLimit));
+                    }
                     t.push_to_string(&mut self.buf);
                     None
                 }
@@ -557,8 +558,7 @@ impl PullParser {
                 self.into_state_continue(State::InsideReference)
             },
 
-            Token::OpeningTagStart =>
-                Some(self.error(SyntaxError::UnexpectedOpeningTag)),
+            Token::OpeningTagStart => Some(self.error(SyntaxError::UnexpectedOpeningTag)),
 
             Token::Character(c) if !self.is_valid_xml_char_not_restricted(c) => {
                 Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
@@ -566,6 +566,9 @@ impl PullParser {
 
             // Every character except " and ' and < is okay
             _ if self.data.quote.is_some() => {
+                if self.buf.len() > self.config.max_attribute_length {
+                    return Some(self.error(SyntaxError::ExceededConfiguredLimit));
+                }
                 t.push_to_string(&mut self.buf);
                 None
             }
@@ -576,11 +579,11 @@ impl PullParser {
 
     fn emit_start_element(&mut self, emit_end_element: bool) -> Option<Result> {
         let mut name = self.data.take_element_name()?;
-        let mut attributes = self.data.take_attributes();
+        let mut attributes = self.data.take_attributes().into_vec();
 
         // check whether the name prefix is bound and fix its namespace
         match self.nst.get(name.borrow().prefix_repr()) {
-            Some("") => name.namespace = None,  // default namespace
+            Some("") => name.namespace = None, // default namespace
             Some(ns) => name.namespace = Some(ns.into()),
             None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into())))
         }
diff --git a/src/reader/parser/inside_cdata.rs b/src/reader/parser/inside_cdata.rs
index 4f46f06..f0ca0c8 100644
--- a/src/reader/parser/inside_cdata.rs
+++ b/src/reader/parser/inside_cdata.rs
@@ -9,6 +9,10 @@ impl PullParser {
         match t {
             Token::CDataEnd => {
                 let event = if self.config.c.cdata_to_characters {
+                    // start called push_pos, but there will be no event to pop it
+                    if self.buf.is_empty() {
+                        self.next_pos();
+                    }
                     None
                 } else {
                     let data = self.take_buf();
diff --git a/src/reader/parser/inside_comment.rs b/src/reader/parser/inside_comment.rs
index e4132c5..240ee20 100644
--- a/src/reader/parser/inside_comment.rs
+++ b/src/reader/parser/inside_comment.rs
@@ -23,6 +23,9 @@ impl PullParser {
             _ if self.config.c.ignore_comments => None, // Do not modify buffer if ignoring the comment
 
             _ => {
+                if self.buf.len() > self.config.max_data_length {
+                    return Some(self.error(SyntaxError::ExceededConfiguredLimit));
+                }
                 t.push_to_string(&mut self.buf);
                 None
             }
diff --git a/src/reader/parser/inside_doctype.rs b/src/reader/parser/inside_doctype.rs
index 93ea470..87595d6 100644
--- a/src/reader/parser/inside_doctype.rs
+++ b/src/reader/parser/inside_doctype.rs
@@ -31,8 +31,8 @@ impl PullParser {
                 _ => None,
             },
             DoctypeSubstate::String => match t {
-                Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => { None },
-                Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => { None },
+                Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => None,
+                Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => None,
                 Token::SingleQuote | Token::DoubleQuote => {
                     self.data.quote = None;
                     self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside))
@@ -51,12 +51,12 @@ impl PullParser {
                     None
                 },
                 Token::Character(c) if is_whitespace_char(c) => {
-                    match self.buf.as_str() {
+                    let buf = self.take_buf();
+                    match buf.as_str() {
                         "ENTITY" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityName)),
                         "NOTATION" | "ELEMENT" | "ATTLIST" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)),
-                        s => Some(self.error(SyntaxError::UnknownMarkupDeclaration(s.into()))),
+                        _ => Some(self.error(SyntaxError::UnknownMarkupDeclaration(buf.into()))),
                     }
-
                 },
                 _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
             },
@@ -69,6 +69,9 @@ impl PullParser {
                         self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceDefinitionStart))
                     },
                     Token::Character(c) if is_name_start_char(c) => {
+                        if self.data.name.len() > self.config.max_name_length {
+                            return Some(self.error(SyntaxError::ExceededConfiguredLimit));
+                        }
                         self.data.name.push(c);
                         self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityName))
                     },
@@ -80,6 +83,9 @@ impl PullParser {
                     self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue))
                 },
                 Token::Character(c) if is_name_char(c) => {
+                    if self.data.name.len() > self.config.max_name_length {
+                        return Some(self.error(SyntaxError::ExceededConfiguredLimit));
+                    }
                     self.data.name.push(c);
                     None
                 },
@@ -144,6 +150,9 @@ impl PullParser {
             },
             DoctypeSubstate::PEReferenceDefinition => match t {
                 Token::Character(c) if is_name_char(c) => {
+                    if self.data.name.len() > self.config.max_name_length {
+                        return Some(self.error(SyntaxError::ExceededConfiguredLimit));
+                    }
                     self.data.name.push(c);
                     None
                 },
diff --git a/src/reader/parser/inside_opening_tag.rs b/src/reader/parser/inside_opening_tag.rs
index b7f185a..fb6d001 100644
--- a/src/reader/parser/inside_opening_tag.rs
+++ b/src/reader/parser/inside_opening_tag.rs
@@ -9,6 +9,7 @@ use super::{OpeningTagSubstate, PullParser, QualifiedNameTarget, Result, State};
 
 impl PullParser {
     pub fn inside_opening_tag(&mut self, t: Token, s: OpeningTagSubstate) -> Option<Result> {
+        let max_attrs = self.config.max_attributes;
         match s {
             OpeningTagSubstate::InsideName => self.read_qualified_name(t, QualifiedNameTarget::OpeningTagNameTarget, |this, token, name| {
                 match name.prefix_ref() {
@@ -30,20 +31,29 @@ impl PullParser {
             OpeningTagSubstate::InsideTag => match t {
                 Token::TagEnd => self.emit_start_element(false),
                 Token::EmptyTagEnd => self.emit_start_element(true),
-                Token::Character(c) if is_whitespace_char(c) => None,  // skip whitespace
+                Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace
                 Token::Character(c) if is_name_start_char(c) => {
+                    if self.buf.len() > self.config.max_name_length {
+                        return Some(self.error(SyntaxError::ExceededConfiguredLimit));
+                    }
                     self.buf.push(c);
                     self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeName))
                 }
-                _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t)))
+                _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))),
             },
 
             OpeningTagSubstate::InsideAttributeName => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
+                // check that no attribute with such name is already present
+                // if there is one, XML is not well-formed
+                if this.data.attributes.contains(&name) {
+                    return Some(this.error(SyntaxError::RedefinedAttribute(name.to_string().into())))
+                }
+
                 this.data.attr_name = Some(name);
                 match token {
                     Token::EqualsSign => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)),
                     Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeName)),
-                    _ => unreachable!()
+                    _ => Some(this.error(SyntaxError::UnexpectedTokenInOpeningTag(t))) // likely unreachable
                 }
             }),
 
@@ -55,58 +65,55 @@ impl PullParser {
 
             OpeningTagSubstate::InsideAttributeValue => self.read_attribute_value(t, |this, value| {
                 let name = this.data.take_attr_name()?;  // will always succeed here
-                // check that no attribute with such name is already present
-                // if there is one, XML is not well-formed
-                if this.data.attributes.iter().any(|a| a.name == name) {  // TODO: looks bad
-                    // TODO: ideally this error should point to the beginning of the attribute,
-                    // TODO: not the end of its value
-                    Some(this.error(SyntaxError::RedefinedAttribute(name.to_string().into())))
-                } else {
-                    match name.prefix_ref() {
-                        // declaring a new prefix; it is sufficient to check prefix only
-                        // because "xmlns" prefix is reserved
-                        Some(namespace::NS_XMLNS_PREFIX) => {
-                            let ln = &*name.local_name;
-                            if ln == namespace::NS_XMLNS_PREFIX {
-                                Some(this.error(SyntaxError::CannotRedefineXmlnsPrefix))
-                            } else if ln == namespace::NS_XML_PREFIX && &*value != namespace::NS_XML_URI {
-                                Some(this.error(SyntaxError::CannotRedefineXmlPrefix))
-                            } else if value.is_empty() {
-                                Some(this.error(SyntaxError::CannotUndefinePrefix(ln.into())))
-                            } else {
-                                this.nst.put(name.local_name.clone(), value);
-                                this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
-                            }
+                match name.prefix_ref() {
+                    // declaring a new prefix; it is sufficient to check prefix only
+                    // because "xmlns" prefix is reserved
+                    Some(namespace::NS_XMLNS_PREFIX) => {
+                        let ln = &*name.local_name;
+                        if ln == namespace::NS_XMLNS_PREFIX {
+                            Some(this.error(SyntaxError::CannotRedefineXmlnsPrefix))
+                        } else if ln == namespace::NS_XML_PREFIX && &*value != namespace::NS_XML_URI {
+                            Some(this.error(SyntaxError::CannotRedefineXmlPrefix))
+                        } else if value.is_empty() {
+                            Some(this.error(SyntaxError::CannotUndefinePrefix(ln.into())))
+                        } else {
+                            this.nst.put(name.local_name.clone(), value);
+                            this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
                         }
+                    }
 
-                        // declaring default namespace
-                        None if &*name.local_name == namespace::NS_XMLNS_PREFIX =>
-                            match &*value {
-                                namespace::NS_XMLNS_PREFIX | namespace::NS_XML_PREFIX | namespace::NS_XML_URI | namespace::NS_XMLNS_URI =>
-                                    Some(this.error(SyntaxError::InvalidDefaultNamespace(value.into()))),
-                                _ => {
-                                    this.nst.put(namespace::NS_NO_PREFIX, value.clone());
-                                    this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
-                                }
-                            },
+                    // declaring default namespace
+                    None if &*name.local_name == namespace::NS_XMLNS_PREFIX =>
+                        match &*value {
+                            namespace::NS_XMLNS_PREFIX | namespace::NS_XML_PREFIX | namespace::NS_XML_URI | namespace::NS_XMLNS_URI =>
+                                Some(this.error(SyntaxError::InvalidDefaultNamespace(value.into()))),
+                            _ => {
+                                this.nst.put(namespace::NS_NO_PREFIX, value.clone());
+                                this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
+                            }
+                        },
 
-                        // regular attribute
-                        _ => {
-                            this.data.attributes.push(OwnedAttribute {
-                                name: name.clone(),
-                                value
-                            });
-                            this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
+                    // regular attribute
+                    _ => {
+                        if this.data.attributes.len() >= max_attrs {
+                            return Some(this.error(SyntaxError::ExceededConfiguredLimit));
                         }
+                        this.data.attributes.push(OwnedAttribute {
+                            name,
+                            value
+                        });
+                        this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
                     }
                 }
             }),
 
             OpeningTagSubstate::AfterAttributeValue => match t {
-                Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)),
+                Token::Character(c) if is_whitespace_char(c) => {
+                    self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag))
+                },
                 Token::TagEnd => self.emit_start_element(false),
                 Token::EmptyTagEnd => self.emit_start_element(true),
-                _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t)))
+                _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))),
             },
         }
     }
diff --git a/src/reader/parser/inside_processing_instruction.rs b/src/reader/parser/inside_processing_instruction.rs
index 96f6753..99caf59 100644
--- a/src/reader/parser/inside_processing_instruction.rs
+++ b/src/reader/parser/inside_processing_instruction.rs
@@ -12,6 +12,9 @@ impl PullParser {
             ProcessingInstructionSubstate::PIInsideName => match t {
                 Token::Character(c) if self.buf.is_empty() && is_name_start_char(c) ||
                                  self.buf_has_data() && is_name_char(c) => {
+                    if self.buf.len() > self.config.max_name_length {
+                        return Some(self.error(SyntaxError::ExceededConfiguredLimit));
+                    }
                     self.buf.push(c);
                     None
                 },
@@ -101,6 +104,9 @@ impl PullParser {
 
                 // Any other token should be treated as plain characters
                 _ => {
+                    if self.buf.len() > self.config.max_data_length {
+                        return Some(self.error(SyntaxError::ExceededConfiguredLimit));
+                    }
                     t.push_to_string(&mut self.buf);
                     None
                 }
diff --git a/src/reader/parser/inside_reference.rs b/src/reader/parser/inside_reference.rs
index 9a15e09..eced606 100644
--- a/src/reader/parser/inside_reference.rs
+++ b/src/reader/parser/inside_reference.rs
@@ -68,6 +68,7 @@ impl PullParser {
         };
         match char::from_u32(val) {
             Some(c) if self.is_valid_xml_char(c) => Ok(c),
+            Some(_) if self.config.c.replace_unknown_entity_references => Ok('\u{fffd}'),
             None if self.config.c.replace_unknown_entity_references => {
                 Ok('\u{fffd}')
             },
diff --git a/src/reader/parser/outside_tag.rs b/src/reader/parser/outside_tag.rs
index 8104224..e62f862 100644
--- a/src/reader/parser/outside_tag.rs
+++ b/src/reader/parser/outside_tag.rs
@@ -31,6 +31,8 @@ impl PullParser {
 
                 if self.buf.is_empty() {
                     self.push_pos();
+                } else if self.buf.len() > self.config.max_data_length {
+                    return Some(self.error(SyntaxError::ExceededConfiguredLimit));
                 }
                 self.buf.push(c);
                 None
@@ -47,7 +49,10 @@ impl PullParser {
                 if let Some(s) = t.as_static_str() {
                     if self.buf.is_empty() {
                         self.push_pos();
+                    } else if self.buf.len() > self.config.max_data_length {
+                        return Some(self.error(SyntaxError::ExceededConfiguredLimit));
                     }
+
                     self.buf.push_str(s);
                 }
                 None
@@ -60,6 +65,9 @@ impl PullParser {
 
             Token::ReferenceEnd if self.depth() > 0 => { // Semi-colon in a text outside an entity
                 self.inside_whitespace = false;
+                if self.buf.len() > self.config.max_data_length {
+                    return Some(self.error(SyntaxError::ExceededConfiguredLimit));
+                }
                 Token::ReferenceEnd.push_to_string(&mut self.buf);
                 None
             },
@@ -85,6 +93,7 @@ impl PullParser {
                     if self.inside_whitespace && self.config.c.trim_whitespace {
                         None
                     } else if self.inside_whitespace && !self.config.c.whitespace_to_characters {
+                        debug_assert!(buf.chars().all(|ch| ch.is_whitespace()), "ws={buf:?}");
                         Some(Ok(XmlEvent::Whitespace(buf)))
                     } else if self.config.c.trim_whitespace {
                         Some(Ok(XmlEvent::Characters(buf.trim_matches(is_whitespace_char).into())))
@@ -166,7 +175,7 @@ impl PullParser {
                 self.into_state(State::OutsideTag, next_event)
             },
 
-            Token::CommentStart  => {
+            Token::CommentStart => {
                 let next_event = self.set_encountered(Encountered::Comment);
                 self.into_state(State::InsideComment, next_event)
             }