aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndroid Build Coastguard Worker <android-build-coastguard-worker@google.com>2023-12-05 02:44:02 +0000
committerAndroid Build Coastguard Worker <android-build-coastguard-worker@google.com>2023-12-05 02:44:02 +0000
commitec0a915fc34a21d95357cd580c8458df629ff564 (patch)
tree75f032e079b05caf8e87878d382055f53324811a
parent69640b263330e18465e85e074cbd9792c59b8bdc (diff)
parent6f95508c081d35323b17f0b3c1b5373099824b0b (diff)
downloadxml-rs-android14-qpr2-s2-release.tar.gz
Change-Id: I57bca9a739aadb6553ae8cb371a74040eaaa02d4
-rw-r--r--.cargo_vcs_info.json2
-rw-r--r--Android.bp6
-rw-r--r--Cargo.toml6
-rw-r--r--Cargo.toml.orig6
-rw-r--r--METADATA21
-rw-r--r--README.md15
-rw-r--r--src/attribute.rs2
-rw-r--r--src/common.rs6
-rw-r--r--src/escape.rs17
-rw-r--r--src/lib.rs4
-rw-r--r--src/macros.rs48
-rw-r--r--src/namespace.rs7
-rw-r--r--src/reader.rs7
-rw-r--r--src/reader/config.rs69
-rw-r--r--src/reader/error.rs3
-rw-r--r--src/reader/events.rs4
-rw-r--r--src/reader/indexset.rs116
-rw-r--r--src/reader/lexer.rs18
-rw-r--r--src/reader/parser.rs53
-rw-r--r--src/reader/parser/inside_cdata.rs4
-rw-r--r--src/reader/parser/inside_comment.rs3
-rw-r--r--src/reader/parser/inside_doctype.rs19
-rw-r--r--src/reader/parser/inside_opening_tag.rs95
-rw-r--r--src/reader/parser/inside_processing_instruction.rs6
-rw-r--r--src/reader/parser/inside_reference.rs1
-rw-r--r--src/reader/parser/outside_tag.rs11
-rw-r--r--src/writer/emitter.rs3
27 files changed, 395 insertions, 157 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
index f0a8a38..ea1e63a 100644
--- a/.cargo_vcs_info.json
+++ b/.cargo_vcs_info.json
@@ -1,6 +1,6 @@
{
"git": {
- "sha1": "c4705ddc172950c28f9b229f368ad8f4cba81e3f"
+ "sha1": "bfb185ede18170f7b21f9b17ab65cbb4aba2de22"
},
"path_in_vcs": ""
} \ No newline at end of file
diff --git a/Android.bp b/Android.bp
index 807ac43..aa35d56 100644
--- a/Android.bp
+++ b/Android.bp
@@ -23,7 +23,7 @@ rust_library {
host_supported: true,
crate_name: "xml",
cargo_env_compat: true,
- cargo_pkg_version: "0.8.15-cvss-cries-wolf",
+ cargo_pkg_version: "0.8.19",
srcs: ["src/lib.rs"],
edition: "2021",
apex_available: [
@@ -39,7 +39,7 @@ rust_test {
host_supported: true,
crate_name: "xml",
cargo_env_compat: true,
- cargo_pkg_version: "0.8.15-cvss-cries-wolf",
+ cargo_pkg_version: "0.8.19",
srcs: ["src/lib.rs"],
test_suites: ["general-tests"],
auto_gen_config: true,
@@ -54,7 +54,7 @@ rust_binary {
host_supported: true,
crate_name: "xml_analyze",
cargo_env_compat: true,
- cargo_pkg_version: "0.8.15-cvss-cries-wolf",
+ cargo_pkg_version: "0.8.19",
srcs: ["src/analyze.rs"],
edition: "2021",
rustlibs: ["libxml_rust"],
diff --git a/Cargo.toml b/Cargo.toml
index 3279206..5d8dc0c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,7 @@
edition = "2021"
rust-version = "1.58"
name = "xml-rs"
-version = "0.8.15-cvss-cries-wolf"
+version = "0.8.19"
authors = ["Vladimir Matveev <vmatveev@citrine.cc>"]
include = [
"src/**",
@@ -36,6 +36,7 @@ license = "MIT"
repository = "https://github.com/kornelski/xml-rs"
[package.metadata.docs.rs]
+rustdoc-args = ["--generate-link-to-definition"]
targets = ["x86_64-unknown-linux-gnu"]
[package.metadata.release]
@@ -50,8 +51,5 @@ path = "src/lib.rs"
name = "xml-analyze"
path = "src/analyze.rs"
-[dev-dependencies.doc-comment]
-version = "0.3"
-
[badges.maintenance]
status = "actively-developed"
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
index 0282e7a..264cf00 100644
--- a/Cargo.toml.orig
+++ b/Cargo.toml.orig
@@ -1,6 +1,6 @@
[package]
name = "xml-rs"
-version = "0.8.15-cvss-cries-wolf"
+version = "0.8.19"
authors = ["Vladimir Matveev <vmatveev@citrine.cc>"]
license = "MIT"
description = "An XML library in pure Rust"
@@ -22,14 +22,12 @@ path = "src/lib.rs"
name = "xml-analyze"
path = "src/analyze.rs"
-[dev-dependencies]
-doc-comment = "0.3"
-
[badges]
maintenance = { status = "actively-developed" }
[package.metadata.docs.rs]
targets = ["x86_64-unknown-linux-gnu"]
+rustdoc-args = ["--generate-link-to-definition"]
[package.metadata.release]
tag-name = "{{version}}"
diff --git a/METADATA b/METADATA
index 87bd4b7..629fe15 100644
--- a/METADATA
+++ b/METADATA
@@ -1,23 +1,20 @@
# This project was upgraded with external_updater.
# Usage: tools/external_updater/updater.sh update rust/crates/xml-rs
-# For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md
+# For more info, check https://cs.android.com/android/platform/superproject/+/main:tools/external_updater/README.md
name: "xml-rs"
description: "An XML library in pure Rust"
third_party {
- url {
- type: HOMEPAGE
- value: "https://crates.io/crates/xml-rs"
- }
- url {
- type: ARCHIVE
- value: "https://static.crates.io/crates/xml-rs/xml-rs-0.8.15-cvss-cries-wolf.crate"
- }
- version: "0.8.15-cvss-cries-wolf"
license_type: NOTICE
last_upgrade_date {
year: 2023
- month: 6
- day: 14
+ month: 12
+ day: 4
+ }
+ homepage: "https://crates.io/crates/xml-rs"
+ identifier {
+ type: "Archive"
+ value: "https://static.crates.io/crates/xml-rs/xml-rs-0.8.19.crate"
+ version: "0.8.19"
}
}
diff --git a/README.md b/README.md
index fa4ba7f..07f306c 100644
--- a/README.md
+++ b/README.md
@@ -15,13 +15,13 @@ It supports reading and writing of XML documents in a streaming fashion (without
### Features
-* API based on `Iterator`s and regular `String`s without tricky lifetimes.
-
* XML spec conformance better than other pure-Rust libraries.
+* Easy to use API based on `Iterator`s and regular `String`s without tricky lifetimes.
+
* Support for UTF-16, UTF-8, ISO-8859-1, and ASCII encodings.
-* Written entirely in the safe Rust subset.
+* Written entirely in the safe Rust subset. Designed to safely handle untrusted input.
The API is heavily inspired by Java Streaming API for XML ([StAX][stax]). It contains a pull parser much like StAX event reader. It provides an iterator API, so you can leverage Rust's existing iterators library features.
@@ -53,7 +53,7 @@ xml-rs uses [Cargo](https://crates.io), so add it with `cargo add xml` or modify
```toml
[dependencies]
-xml = "0.8"
+xml = "0.8.16"
```
The package exposes a single crate called `xml`.
@@ -129,6 +129,13 @@ small program (BTW, it is built with `cargo build` and can be run after that) wh
statistics about specified XML document. It can also be used to check for well-formedness of
XML documents - if a document is not well-formed, this program will exit with an error.
+
+## Parsing untrusted inputs
+
+The parser is written in the safe Rust subset, so by Rust's guarantees the worst that it can do is to cause a panic.
+You can use `ParserConfig` to set limits on maximum lengths of names, attributes, text, entities, etc.
+You should also set a maximum document size via `io::Read`'s [`take(max)`](https://doc.rust-lang.org/stable/std/io/trait.Read.html#method.take) method.
+
Writing XML documents
---------------------
diff --git a/src/attribute.rs b/src/attribute.rs
index 112bf24..5d0184e 100644
--- a/src/attribute.rs
+++ b/src/attribute.rs
@@ -3,7 +3,7 @@
use std::fmt;
-use crate::escape::{Escaped, AttributeEscapes};
+use crate::escape::{AttributeEscapes, Escaped};
use crate::name::{Name, OwnedName};
/// A borrowed version of an XML attribute.
diff --git a/src/common.rs b/src/common.rs
index a1bf3ac..0b324f2 100644
--- a/src/common.rs
+++ b/src/common.rs
@@ -112,15 +112,15 @@ pub fn is_whitespace_str(s: &str) -> bool {
s.chars().all(is_whitespace_char)
}
-pub fn is_xml10_char(c: char) -> bool {
+#[must_use] pub fn is_xml10_char(c: char) -> bool {
matches!(c, '\u{09}' | '\u{0A}' | '\u{0D}' | '\u{20}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..)
}
-pub fn is_xml11_char(c: char) -> bool {
+#[must_use] pub fn is_xml11_char(c: char) -> bool {
matches!(c, '\u{01}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..)
}
-pub fn is_xml11_char_not_restricted(c: char) -> bool {
+#[must_use] pub fn is_xml11_char_not_restricted(c: char) -> bool {
is_xml11_char(c) && !matches!(c, '\u{01}'..='\u{08}' | '\u{0B}'..='\u{0C}' | '\u{0E}'..='\u{1F}' | '\u{7F}'..='\u{84}' | '\u{86}'..='\u{9F}')
}
diff --git a/src/escape.rs b/src/escape.rs
index 1fcfd06..ad8ee4a 100644
--- a/src/escape.rs
+++ b/src/escape.rs
@@ -5,11 +5,11 @@ use std::{borrow::Cow, marker::PhantomData, fmt::{Display, Result, Formatter}};
pub(crate) trait Escapes {
fn escape(c: u8) -> Option<&'static str>;
- fn byte_needs_escaping(c: u8) -> bool{
+ fn byte_needs_escaping(c: u8) -> bool {
Self::escape(c).is_some()
}
- fn str_needs_escaping(s: &str) -> bool{
+ fn str_needs_escaping(s: &str) -> bool {
s.bytes().any(|c| Self::escape(c).is_some())
}
}
@@ -22,13 +22,12 @@ pub(crate) struct Escaped<'a, E: Escapes> {
impl<'a, E: Escapes> Escaped<'a, E> {
pub fn new(s: &'a str) -> Self {
Escaped {
- _escape_phantom: PhantomData,
+ _escape_phantom: PhantomData,
to_escape: s,
}
}
}
-
impl<'a, E: Escapes> Display for Escaped<'a, E> {
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
let mut total_remaining = self.to_escape;
@@ -49,7 +48,7 @@ impl<'a, E: Escapes> Display for Escaped<'a, E> {
total_remaining = &remaining[1..];
}
-
+
f.write_str(total_remaining)
}
}
@@ -107,7 +106,7 @@ escapes!(
/// * `"` → `&quot;`
/// * `'` → `&apos;`
/// * `&` → `&amp;`
-///
+///
/// The following characters are escaped so that attributes are printed on
/// a single line:
/// * `\n` → `&#xA;`
@@ -117,7 +116,8 @@ escapes!(
///
/// Does not perform allocations if the given string does not contain escapable characters.
#[inline]
-#[must_use] pub fn escape_str_attribute(s: &str) -> Cow<'_, str> {
+#[must_use]
+pub fn escape_str_attribute(s: &str) -> Cow<'_, str> {
escape_str::<AttributeEscapes>(s)
}
@@ -133,7 +133,8 @@ escapes!(
///
/// Does not perform allocations if the given string does not contain escapable characters.
#[inline]
-#[must_use] pub fn escape_str_pcdata(s: &str) -> Cow<'_, str> {
+#[must_use]
+pub fn escape_str_pcdata(s: &str) -> Cow<'_, str> {
escape_str::<PcDataEscapes>(s)
}
diff --git a/src/lib.rs b/src/lib.rs
index b1486d8..40a4f21 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -8,9 +8,9 @@
//!
//! Please note that functions of this parser may panic.
//! If a panic could cause a Denial Of Service in your codebase, *you're* responsible for wrapping access to this library in `catch_unwind`.
+//!
-#[cfg(doctest)]
-doc_comment::doctest!("../README.md");
+#![cfg_attr(doctest, doc = include_str!("../README.md"))]
pub use crate::reader::EventReader;
pub use crate::reader::ParserConfig;
diff --git a/src/macros.rs b/src/macros.rs
index 25916d3..da1adad 100644
--- a/src/macros.rs
+++ b/src/macros.rs
@@ -3,39 +3,41 @@
//! Contains several macros used in this crate.
macro_rules! gen_setter {
- ($target:ty, $field:ident : into $t:ty) => {
- impl $target {
- /// See [`ParserConfig`][crate::ParserConfig] fields docs for details
+ ($(#[$comments:meta])* $field:ident : into $t:ty) => {
+
+ $(#[$comments])*
+ ///
+ /// <small>See [`ParserConfig`][crate::ParserConfig] fields docs for details</small>
#[inline]
- pub fn $field<T: Into<$t>>(mut self, value: T) -> $target {
+ pub fn $field<T: Into<$t>>(mut self, value: T) -> Self {
self.$field = value.into();
self
}
- }
};
- ($target:ty, $field:ident : val $t:ty) => {
- impl $target {
- /// See [`ParserConfig`][crate::ParserConfig] fields docs for details
+ ($(#[$comments:meta])* $field:ident : val $t:ty) => {
+ $(#[$comments])*
+ ///
+ /// <small>See [`ParserConfig`][crate::ParserConfig] fields docs for details</small>
#[inline]
- pub fn $field(mut self, value: $t) -> $target {
+ #[must_use] pub fn $field(mut self, value: $t) -> Self {
self.$field = value;
self
}
- }
};
- ($target:ty, $field:ident : delegate $t:ty) => {
- impl $target {
- /// See [`ParserConfig`][crate::ParserConfig] fields docs for details
+ ($(#[$comments:meta])* $field:ident : delegate $t:ty) => {
+ $(#[$comments])*
+ ///
+ /// <small>See [`ParserConfig`][crate::ParserConfig] fields docs for details</small>
#[inline]
- pub fn $field(mut self, value: $t) -> $target {
+ #[must_use] pub fn $field(mut self, value: $t) -> Self {
self.c.$field = value;
self
}
- }
};
- ($target:ty, $field:ident : c2 $t:ty) => {
- impl $target {
- /// See [`ParserConfig2`][crate::reader::ParserConfig] fields docs for details
+ ($(#[$comments:meta])* $field:ident : c2 $t:ty) => {
+ $(#[$comments])*
+ ///
+ /// <small>See [`ParserConfig2`][crate::reader::ParserConfig2] fields docs for details</small>
#[inline]
#[must_use]
pub fn $field(self, value: $t) -> ParserConfig2 {
@@ -45,12 +47,14 @@ macro_rules! gen_setter {
}
.$field(value)
}
- }
};
}
macro_rules! gen_setters {
- ($target:ty, $($field:ident : $k:tt $tpe:ty),+) => ($(
- gen_setter! { $target, $field : $k $tpe }
- )+)
+ ($target:ident, $($(#[$comments:meta])* $field:ident : $k:tt $tpe:ty),+) => (
+ impl $target {$(
+
+ gen_setter! { $(#[$comments])* $field : $k $tpe }
+ )+
+ })
}
diff --git a/src/namespace.rs b/src/namespace.rs
index 216a982..02f3b3d 100644
--- a/src/namespace.rs
+++ b/src/namespace.rs
@@ -1,5 +1,6 @@
//! Contains namespace manipulation types and functions.
+use std::borrow::Cow;
use std::collections::btree_map::Iter as Entries;
use std::collections::btree_map::{BTreeMap, Entry};
use std::collections::HashSet;
@@ -165,6 +166,12 @@ impl Namespace {
pub fn get<'a, P: ?Sized + AsRef<str>>(&'a self, prefix: &P) -> Option<&'a str> {
self.0.get(prefix.as_ref()).map(|s| &**s)
}
+
+ /// Borrowed namespace for the writer
+ #[must_use]
+ pub fn borrow(&self) -> Cow<'_, Self> {
+ Cow::Borrowed(self)
+ }
}
/// An alias for iterator type for namespace mappings contained in a namespace.
diff --git a/src/reader.rs b/src/reader.rs
index 71ea79b..20a790c 100644
--- a/src/reader.rs
+++ b/src/reader.rs
@@ -11,17 +11,18 @@ use crate::common::{Position, TextPosition};
pub use self::config::ParserConfig;
pub use self::config::ParserConfig2;
-
+pub use self::error::{Error, ErrorKind};
pub use self::events::XmlEvent;
+
use self::parser::PullParser;
mod config;
mod events;
mod lexer;
mod parser;
-
+mod indexset;
mod error;
-pub use self::error::{Error, ErrorKind};
+
/// A result type yielded by `XmlReader`.
pub type Result<T, E = Error> = result::Result<T, E>;
diff --git a/src/reader/config.rs b/src/reader/config.rs
index 3351997..686d0d4 100644
--- a/src/reader/config.rs
+++ b/src/reader/config.rs
@@ -5,7 +5,11 @@ use std::io::Read;
use crate::reader::EventReader;
use crate::util::Encoding;
-/// Parser configuration structure.
+/// Limits to defend from billion laughs attack
+const DEFAULT_MAX_ENTITY_EXPANSION_LENGTH: usize = 1_000_000;
+const DEFAULT_MAX_ENTITY_EXPANSION_DEPTH: u8 = 10;
+
+/// Parser configuration structure. **There are more config methods than public fields — see methods below**.
///
/// This structure contains various configuration options which affect
/// behavior of the parser.
@@ -88,6 +92,8 @@ pub struct ParserConfig {
///
/// By default any whitespace that is not enclosed within at least one level of elements will be
/// ignored. Setting this value to false will cause root level whitespace events to be emitted.
+ ///
+ /// **There are more configuration options – see methods below**
pub ignore_root_level_whitespace: bool,
}
@@ -198,6 +204,23 @@ pub struct ParserConfig2 {
/// Documents with multiple root elements are ill-formed
pub allow_multiple_root_elements: bool,
+
+ /// Abort if custom entities create a string longer than this
+ pub max_entity_expansion_length: usize,
+ /// Entities can expand into other entities this many times (be careful about exponential cost!)
+ pub max_entity_expansion_depth: u8,
+
+ /// Maximum length of tag name or attribute name
+ pub max_name_length: usize,
+
+ /// Max number of attributes per element
+ pub max_attributes: usize,
+
+ /// Max number of bytes in each attribute
+ pub max_attribute_length: usize,
+
+ /// Maximum length of strings representing characters, comments, and processing instructions
+ pub max_data_length: usize,
}
impl Default for ParserConfig2 {
@@ -207,6 +230,12 @@ impl Default for ParserConfig2 {
override_encoding: None,
ignore_invalid_encoding_declarations: false,
allow_multiple_root_elements: true,
+ max_entity_expansion_length: DEFAULT_MAX_ENTITY_EXPANSION_LENGTH,
+ max_entity_expansion_depth: DEFAULT_MAX_ENTITY_EXPANSION_DEPTH,
+ max_attributes: 1<<16,
+ max_attribute_length: 1<<30,
+ max_data_length: 1<<30,
+ max_name_length: 1<<18,
}
}
}
@@ -273,15 +302,48 @@ impl From<ParserConfig> for ParserConfig2 {
}
gen_setters! { ParserConfig2,
+ /// Set if you got one in the HTTP header
override_encoding: val Option<Encoding>,
+ /// Allows invalid documents. There should be only a single root element in XML.
allow_multiple_root_elements: val bool,
+ /// Abort if custom entities create a string longer than this
+ max_entity_expansion_length: val usize,
+ /// Entities can expand into other entities this many times (be careful about exponential cost!)
+ max_entity_expansion_depth: val u8,
+ /// Max number of attributes per element
+ max_attributes: val usize,
+ /// Maximum length of tag name or attribute name
+ max_name_length: val usize,
+ /// Max number of bytes in each attribute
+ max_attribute_length: val usize,
+ /// Maximum length of strings representing characters, comments, and processing instructions
+ max_data_length: val usize,
+ /// Allow `<?xml encoding="bogus"?>`
ignore_invalid_encoding_declarations: val bool
}
gen_setters! { ParserConfig,
+ /// Set if you got one in the HTTP header (see `content_type`)
override_encoding: c2 Option<Encoding>,
+ /// Allow `<?xml encoding="bogus"?>`
ignore_invalid_encoding_declarations: c2 bool,
+ /// Allows invalid documents. There should be only a single root element in XML.
allow_multiple_root_elements: c2 bool,
+
+ /// Abort if custom entities create a string longer than this
+ max_entity_expansion_length: c2 usize,
+ /// Entities can expand into other entities this many times (be careful about exponential cost!)
+ max_entity_expansion_depth: c2 u8,
+ /// Max number of attributes per element
+ max_attributes: c2 usize,
+ /// Maximum length of tag name or attribute name
+ max_name_length: c2 usize,
+ /// Max number of bytes in each attribute
+ max_attribute_length: c2 usize,
+ /// Maximum length of strings representing characters, comments, and processing instructions
+ max_data_length: c2 usize,
+
+ /// Set encoding from the MIME type. Important for HTTP compatibility.
content_type: c2 &str
}
@@ -293,14 +355,15 @@ gen_setters! { ParserConfig2,
coalesce_characters: delegate bool,
ignore_end_of_stream: delegate bool,
replace_unknown_entity_references: delegate bool,
+ /// Whether or not whitespace at the root level of the document is ignored. Default is true.
ignore_root_level_whitespace: delegate bool
}
#[test]
fn mime_parse() {
- let c = ParserConfig2::new().content_type("text/xml;charset=Us-AScii");
+ let c = ParserConfig2::new().content_type("text/xml;charset=Us-AScii").max_entity_expansion_length(1000);
assert_eq!(c.override_encoding, Some(Encoding::Ascii));
- let c = ParserConfig2::new().content_type("text/xml;charset = \"UTF-16\"");
+ let c = ParserConfig2::new().max_entity_expansion_depth(3).content_type("text/xml;charset = \"UTF-16\"");
assert_eq!(c.override_encoding, Some(Encoding::Utf16));
}
diff --git a/src/reader/error.rs b/src/reader/error.rs
index 8af35ae..64210c4 100644
--- a/src/reader/error.rs
+++ b/src/reader/error.rs
@@ -65,6 +65,8 @@ pub(crate) enum SyntaxError {
UnexpectedXmlVersion(Box<str>),
ConflictingEncoding(Encoding, Encoding),
UnexpectedTokenBefore(&'static str, char),
+ /// Document has more stuff than `ParserConfig` allows
+ ExceededConfiguredLimit,
}
impl fmt::Display for SyntaxError {
@@ -116,6 +118,7 @@ impl SyntaxError {
Self::UnexpectedXmlVersion(ref version) => format!("Invalid XML version: {version}").into(),
Self::UnknownMarkupDeclaration(ref v) => format!("Unknown markup declaration: {v}").into(),
Self::UnsupportedEncoding(ref v) => format!("Unsupported encoding: {v}").into(),
+ Self::ExceededConfiguredLimit => "This document is larger/more complex than allowed by the parser's configuration".into(),
}
}
}
diff --git a/src/reader/events.rs b/src/reader/events.rs
index de2b930..e8eb81e 100644
--- a/src/reader/events.rs
+++ b/src/reader/events.rs
@@ -1,8 +1,6 @@
//! Contains `XmlEvent` datatype, instances of which are emitted by the parser.
-use std::borrow::Cow;
use std::fmt;
-
use crate::attribute::OwnedAttribute;
use crate::common::XmlVersion;
use crate::name::OwnedName;
@@ -207,7 +205,7 @@ impl XmlEvent {
Some(crate::writer::events::XmlEvent::StartElement {
name: name.borrow(),
attributes: attributes.iter().map(|a| a.borrow()).collect(),
- namespace: Cow::Borrowed(namespace)
+ namespace: namespace.borrow(),
}),
XmlEvent::EndElement { ref name } =>
Some(crate::writer::events::XmlEvent::EndElement { name: Some(name.borrow()) }),
diff --git a/src/reader/indexset.rs b/src/reader/indexset.rs
new file mode 100644
index 0000000..3d683a2
--- /dev/null
+++ b/src/reader/indexset.rs
@@ -0,0 +1,116 @@
+use crate::attribute::OwnedAttribute;
+use crate::name::OwnedName;
+
+use std::collections::hash_map::RandomState;
+use std::collections::HashSet;
+use std::hash::BuildHasher;
+use std::hash::Hash;
+use std::hash::Hasher;
+
+/// An ordered set
+pub(crate) struct AttributesSet {
+ vec: Vec<OwnedAttribute>,
+ /// Uses a no-op hasher, because these u64s are hashes already
+ may_contain: HashSet<u64, U64HasherBuilder>,
+ /// This is real hasher for the `OwnedName`
+ hasher: RandomState,
+}
+
+/// Use linear search and don't allocate `HashSet` if there are few attributes,
+/// because allocation costs more than a few comparisons.
+const HASH_THRESHOLD: usize = 8;
+
+impl AttributesSet {
+ pub fn new() -> Self {
+ Self {
+ vec: Vec::new(),
+ hasher: RandomState::new(),
+ may_contain: HashSet::default(),
+ }
+ }
+
+ fn hash(&self, val: &OwnedName) -> u64 {
+ let mut h = self.hasher.build_hasher();
+ val.hash(&mut h);
+ h.finish()
+ }
+
+ pub fn len(&self) -> usize {
+ self.vec.len()
+ }
+
+ pub fn contains(&self, name: &OwnedName) -> bool {
+ // fall back to linear search only on duplicate or hash collision
+ (self.vec.len() < HASH_THRESHOLD || self.may_contain.contains(&self.hash(name))) &&
+ self.vec.iter().any(move |a| &a.name == name)
+ }
+
+ pub fn push(&mut self, attr: OwnedAttribute) {
+ if self.vec.len() >= HASH_THRESHOLD {
+ if self.vec.len() == HASH_THRESHOLD {
+ self.may_contain.reserve(HASH_THRESHOLD * 2);
+ for attr in &self.vec {
+ self.may_contain.insert(self.hash(&attr.name));
+ }
+ }
+ self.may_contain.insert(self.hash(&attr.name));
+ }
+ self.vec.push(attr);
+ }
+
+ pub fn into_vec(self) -> Vec<OwnedAttribute> {
+ self.vec
+ }
+}
+
+#[test]
+fn indexset() {
+ let mut s = AttributesSet::new();
+ let not_here = OwnedName {
+ local_name: "attr1000".into(),
+ namespace: Some("test".into()),
+ prefix: None,
+ };
+
+ // this test will take a lot of time if the `contains()` is linear, and the loop is quadratic
+ for i in 0..50000 {
+ let name = OwnedName {
+ local_name: format!("attr{i}"), namespace: None, prefix: None,
+ };
+ assert!(!s.contains(&name));
+
+ s.push(OwnedAttribute { name, value: String::new() });
+ assert!(!s.contains(&not_here));
+ }
+
+ assert!(s.contains(&OwnedName {
+ local_name: "attr1234".into(), namespace: None, prefix: None,
+ }));
+ assert!(s.contains(&OwnedName {
+ local_name: "attr0".into(), namespace: None, prefix: None,
+ }));
+ assert!(s.contains(&OwnedName {
+ local_name: "attr49999".into(), namespace: None, prefix: None,
+ }));
+}
+
+/// Hasher that does nothing except passing u64 through
+struct U64Hasher(u64);
+
+impl Hasher for U64Hasher {
+ fn finish(&self) -> u64 { self.0 }
+ fn write(&mut self, slice: &[u8]) {
+ for &v in slice { self.0 ^= u64::from(v) } // unused in practice
+ }
+ fn write_u64(&mut self, i: u64) {
+ self.0 ^= i;
+ }
+}
+
+#[derive(Default)]
+struct U64HasherBuilder;
+
+impl BuildHasher for U64HasherBuilder {
+ type Hasher = U64Hasher;
+ fn build_hasher(&self) -> U64Hasher { U64Hasher(0) }
+}
diff --git a/src/reader/lexer.rs b/src/reader/lexer.rs
index a8345ba..6b59c86 100644
--- a/src/reader/lexer.rs
+++ b/src/reader/lexer.rs
@@ -13,9 +13,7 @@ use crate::common::{is_name_char, is_whitespace_char, Position, TextPosition, is
use crate::reader::Error;
use crate::util::{CharReader, Encoding};
-/// Limits to defend from billion laughs attack
-const MAX_ENTITY_EXPANSION_LENGTH: usize = 1_000_000;
-const MAX_ENTITY_EXPANSION_DEPTH: u8 = 10;
+use super::ParserConfig2;
/// `Token` represents a single lexeme of an XML document. These lexemes
/// are used to perform actual parsing.
@@ -229,6 +227,9 @@ pub(crate) struct Lexer {
reparse_depth: u8,
#[cfg(test)]
skip_errors: bool,
+
+ max_entity_expansion_depth: u8,
+ max_entity_expansion_length: usize,
}
impl Position for Lexer {
@@ -239,7 +240,7 @@ impl Position for Lexer {
impl Lexer {
/// Returns a new lexer with default state.
- pub(crate) fn new() -> Lexer {
+ pub(crate) fn new(config: &ParserConfig2) -> Lexer {
Lexer {
reader: CharReader::new(),
pos: TextPosition::new(),
@@ -252,6 +253,9 @@ impl Lexer {
reparse_depth: 0,
#[cfg(test)]
skip_errors: false,
+
+ max_entity_expansion_depth: config.max_entity_expansion_depth,
+ max_entity_expansion_length: config.max_entity_expansion_length,
}
}
@@ -422,7 +426,7 @@ impl Lexer {
}
self.reparse_depth += 1;
- if self.reparse_depth > MAX_ENTITY_EXPANSION_DEPTH || self.char_queue.len() > MAX_ENTITY_EXPANSION_LENGTH {
+ if self.reparse_depth > self.max_entity_expansion_depth || self.char_queue.len() > self.max_entity_expansion_length {
return Err(self.error(SyntaxError::EntityTooBig))
}
@@ -650,7 +654,7 @@ impl Lexer {
#[cfg(test)]
mod tests {
- use crate::common::Position;
+ use crate::{common::Position, reader::ParserConfig2};
use std::io::{BufReader, Cursor};
use super::{Lexer, Token};
@@ -680,7 +684,7 @@ mod tests {
);
fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) {
- (Lexer::new(), BufReader::new(Cursor::new(s.to_owned().into_bytes())))
+ (Lexer::new(&ParserConfig2::default()), BufReader::new(Cursor::new(s.to_owned().into_bytes())))
}
#[test]
diff --git a/src/reader/parser.rs b/src/reader/parser.rs
index dcdec89..18f073d 100644
--- a/src/reader/parser.rs
+++ b/src/reader/parser.rs
@@ -1,24 +1,19 @@
//! Contains an implementation of pull-based XML parser.
-
-use crate::common::is_xml11_char;
-use crate::common::is_xml10_char;
-use crate::common::is_xml11_char_not_restricted;
-use crate::reader::error::SyntaxError;
-use std::collections::HashMap;
-use std::io::prelude::*;
-
-use crate::attribute::OwnedAttribute;
-use crate::common::{self, is_name_char, is_name_start_char, Position, TextPosition, XmlVersion, is_whitespace_char};
+use crate::common::{is_xml10_char, is_xml11_char, is_xml11_char_not_restricted, is_name_char, is_name_start_char, is_whitespace_char};
+use crate::common::{Position, TextPosition, XmlVersion};
use crate::name::OwnedName;
use crate::namespace::NamespaceStack;
-
use crate::reader::config::ParserConfig2;
+use crate::reader::error::SyntaxError;
use crate::reader::events::XmlEvent;
+use crate::reader::indexset::AttributesSet;
use crate::reader::lexer::{Lexer, Token};
-
use super::{Error, ErrorKind};
+use std::collections::HashMap;
+use std::io::Read;
+
macro_rules! gen_takes(
($($field:ident -> $method:ident, $t:ty, $def:expr);+) => (
$(
@@ -42,7 +37,7 @@ gen_takes!(
element_name -> take_element_name, Option<OwnedName>, None;
attr_name -> take_attr_name, Option<OwnedName>, None;
- attributes -> take_attributes, Vec<OwnedAttribute>, vec!()
+ attributes -> take_attributes, AttributesSet, AttributesSet::new()
);
mod inside_cdata;
@@ -107,7 +102,7 @@ impl PullParser {
#[inline]
fn new_with_config2(config: ParserConfig2) -> PullParser {
- let mut lexer = Lexer::new();
+ let mut lexer = Lexer::new(&config);
if let Some(enc) = config.override_encoding {
lexer.set_encoding(enc);
}
@@ -133,7 +128,7 @@ impl PullParser {
element_name: None,
quote: None,
attr_name: None,
- attributes: Vec::new(),
+ attributes: AttributesSet::new(),
},
final_result: None,
next_event: None,
@@ -299,7 +294,7 @@ struct MarkupData {
name: String, // used for processing instruction name
ref_data: String, // used for reference content
- version: Option<common::XmlVersion>, // used for XML declaration version
+ version: Option<XmlVersion>, // used for XML declaration version
encoding: Option<String>, // used for XML declaration encoding
standalone: Option<bool>, // used for XML declaration standalone parameter
@@ -307,7 +302,7 @@ struct MarkupData {
quote: Option<QuoteToken>, // used to hold opening quote for attribute value
attr_name: Option<OwnedName>, // used to hold attribute name
- attributes: Vec<OwnedAttribute> // used to hold all accumulated attributes
+ attributes: AttributesSet, // used to hold all accumulated attributes
}
impl PullParser {
@@ -401,7 +396,7 @@ impl PullParser {
fn next_pos(&mut self) {
// unfortunately calls to next_pos will never be perfectly balanced with push_pos,
// at very least because parse errors and EOF can happen unexpectedly without a prior push.
- if self.pos.len() > 0 {
+ if !self.pos.is_empty() {
if self.pos.len() > 1 {
self.pos.remove(0);
} else {
@@ -490,7 +485,7 @@ impl PullParser {
let name = this.take_buf();
match name.parse() {
Ok(name) => on_name(this, t, name),
- Err(_) => Some(this.error(SyntaxError::InvalidQualifiedName(name.into())))
+ Err(_) => Some(this.error(SyntaxError::InvalidQualifiedName(name.into()))),
}
};
@@ -504,6 +499,9 @@ impl PullParser {
Token::Character(c) if c != ':' && (self.buf.is_empty() && is_name_start_char(c) ||
self.buf_has_data() && is_name_char(c)) => {
+ if self.buf.len() > self.config.max_name_length {
+ return Some(self.error(SyntaxError::ExceededConfiguredLimit));
+ }
self.buf.push(c);
None
},
@@ -517,7 +515,7 @@ impl PullParser {
Token::Character(c) if is_whitespace_char(c) => invoke_callback(self, t),
- _ => Some(self.error(SyntaxError::UnexpectedQualifiedName(t)))
+ _ => Some(self.error(SyntaxError::UnexpectedQualifiedName(t))),
}
}
@@ -529,7 +527,7 @@ impl PullParser {
fn read_attribute_value<F>(&mut self, t: Token, on_value: F) -> Option<Result>
where F: Fn(&mut PullParser, String) -> Option<Result> {
match t {
- Token::Character(c) if self.data.quote.is_none() && is_whitespace_char(c) => None, // skip leading whitespace
+ Token::Character(c) if self.data.quote.is_none() && is_whitespace_char(c) => None, // skip leading whitespace
Token::DoubleQuote | Token::SingleQuote => match self.data.quote {
None => { // Entered attribute value
@@ -547,6 +545,9 @@ impl PullParser {
return Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)));
}
}
+ if self.buf.len() > self.config.max_attribute_length {
+ return Some(self.error(SyntaxError::ExceededConfiguredLimit));
+ }
t.push_to_string(&mut self.buf);
None
}
@@ -557,8 +558,7 @@ impl PullParser {
self.into_state_continue(State::InsideReference)
},
- Token::OpeningTagStart =>
- Some(self.error(SyntaxError::UnexpectedOpeningTag)),
+ Token::OpeningTagStart => Some(self.error(SyntaxError::UnexpectedOpeningTag)),
Token::Character(c) if !self.is_valid_xml_char_not_restricted(c) => {
Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
@@ -566,6 +566,9 @@ impl PullParser {
// Every character except " and ' and < is okay
_ if self.data.quote.is_some() => {
+ if self.buf.len() > self.config.max_attribute_length {
+ return Some(self.error(SyntaxError::ExceededConfiguredLimit));
+ }
t.push_to_string(&mut self.buf);
None
}
@@ -576,11 +579,11 @@ impl PullParser {
fn emit_start_element(&mut self, emit_end_element: bool) -> Option<Result> {
let mut name = self.data.take_element_name()?;
- let mut attributes = self.data.take_attributes();
+ let mut attributes = self.data.take_attributes().into_vec();
// check whether the name prefix is bound and fix its namespace
match self.nst.get(name.borrow().prefix_repr()) {
- Some("") => name.namespace = None, // default namespace
+ Some("") => name.namespace = None, // default namespace
Some(ns) => name.namespace = Some(ns.into()),
None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into())))
}
diff --git a/src/reader/parser/inside_cdata.rs b/src/reader/parser/inside_cdata.rs
index 4f46f06..f0ca0c8 100644
--- a/src/reader/parser/inside_cdata.rs
+++ b/src/reader/parser/inside_cdata.rs
@@ -9,6 +9,10 @@ impl PullParser {
match t {
Token::CDataEnd => {
let event = if self.config.c.cdata_to_characters {
+ // start called push_pos, but there will be no event to pop it
+ if self.buf.is_empty() {
+ self.next_pos();
+ }
None
} else {
let data = self.take_buf();
diff --git a/src/reader/parser/inside_comment.rs b/src/reader/parser/inside_comment.rs
index e4132c5..240ee20 100644
--- a/src/reader/parser/inside_comment.rs
+++ b/src/reader/parser/inside_comment.rs
@@ -23,6 +23,9 @@ impl PullParser {
_ if self.config.c.ignore_comments => None, // Do not modify buffer if ignoring the comment
_ => {
+ if self.buf.len() > self.config.max_data_length {
+ return Some(self.error(SyntaxError::ExceededConfiguredLimit));
+ }
t.push_to_string(&mut self.buf);
None
}
diff --git a/src/reader/parser/inside_doctype.rs b/src/reader/parser/inside_doctype.rs
index 93ea470..87595d6 100644
--- a/src/reader/parser/inside_doctype.rs
+++ b/src/reader/parser/inside_doctype.rs
@@ -31,8 +31,8 @@ impl PullParser {
_ => None,
},
DoctypeSubstate::String => match t {
- Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => { None },
- Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => { None },
+ Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => None,
+ Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => None,
Token::SingleQuote | Token::DoubleQuote => {
self.data.quote = None;
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside))
@@ -51,12 +51,12 @@ impl PullParser {
None
},
Token::Character(c) if is_whitespace_char(c) => {
- match self.buf.as_str() {
+ let buf = self.take_buf();
+ match buf.as_str() {
"ENTITY" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityName)),
"NOTATION" | "ELEMENT" | "ATTLIST" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)),
- s => Some(self.error(SyntaxError::UnknownMarkupDeclaration(s.into()))),
+ _ => Some(self.error(SyntaxError::UnknownMarkupDeclaration(buf.into()))),
}
-
},
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
@@ -69,6 +69,9 @@ impl PullParser {
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceDefinitionStart))
},
Token::Character(c) if is_name_start_char(c) => {
+ if self.data.name.len() > self.config.max_name_length {
+ return Some(self.error(SyntaxError::ExceededConfiguredLimit));
+ }
self.data.name.push(c);
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityName))
},
@@ -80,6 +83,9 @@ impl PullParser {
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue))
},
Token::Character(c) if is_name_char(c) => {
+ if self.data.name.len() > self.config.max_name_length {
+ return Some(self.error(SyntaxError::ExceededConfiguredLimit));
+ }
self.data.name.push(c);
None
},
@@ -144,6 +150,9 @@ impl PullParser {
},
DoctypeSubstate::PEReferenceDefinition => match t {
Token::Character(c) if is_name_char(c) => {
+ if self.data.name.len() > self.config.max_name_length {
+ return Some(self.error(SyntaxError::ExceededConfiguredLimit));
+ }
self.data.name.push(c);
None
},
diff --git a/src/reader/parser/inside_opening_tag.rs b/src/reader/parser/inside_opening_tag.rs
index b7f185a..fb6d001 100644
--- a/src/reader/parser/inside_opening_tag.rs
+++ b/src/reader/parser/inside_opening_tag.rs
@@ -9,6 +9,7 @@ use super::{OpeningTagSubstate, PullParser, QualifiedNameTarget, Result, State};
impl PullParser {
pub fn inside_opening_tag(&mut self, t: Token, s: OpeningTagSubstate) -> Option<Result> {
+ let max_attrs = self.config.max_attributes;
match s {
OpeningTagSubstate::InsideName => self.read_qualified_name(t, QualifiedNameTarget::OpeningTagNameTarget, |this, token, name| {
match name.prefix_ref() {
@@ -30,20 +31,29 @@ impl PullParser {
OpeningTagSubstate::InsideTag => match t {
Token::TagEnd => self.emit_start_element(false),
Token::EmptyTagEnd => self.emit_start_element(true),
- Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace
+ Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace
Token::Character(c) if is_name_start_char(c) => {
+ if self.buf.len() > self.config.max_name_length {
+ return Some(self.error(SyntaxError::ExceededConfiguredLimit));
+ }
self.buf.push(c);
self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeName))
}
- _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t)))
+ _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))),
},
OpeningTagSubstate::InsideAttributeName => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
+ // check that no attribute with such name is already present
+ // if there is one, XML is not well-formed
+ if this.data.attributes.contains(&name) {
+ return Some(this.error(SyntaxError::RedefinedAttribute(name.to_string().into())))
+ }
+
this.data.attr_name = Some(name);
match token {
Token::EqualsSign => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)),
Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeName)),
- _ => unreachable!()
+ _ => Some(this.error(SyntaxError::UnexpectedTokenInOpeningTag(t))) // likely unreachable
}
}),
@@ -55,58 +65,55 @@ impl PullParser {
OpeningTagSubstate::InsideAttributeValue => self.read_attribute_value(t, |this, value| {
let name = this.data.take_attr_name()?; // will always succeed here
- // check that no attribute with such name is already present
- // if there is one, XML is not well-formed
- if this.data.attributes.iter().any(|a| a.name == name) { // TODO: looks bad
- // TODO: ideally this error should point to the beginning of the attribute,
- // TODO: not the end of its value
- Some(this.error(SyntaxError::RedefinedAttribute(name.to_string().into())))
- } else {
- match name.prefix_ref() {
- // declaring a new prefix; it is sufficient to check prefix only
- // because "xmlns" prefix is reserved
- Some(namespace::NS_XMLNS_PREFIX) => {
- let ln = &*name.local_name;
- if ln == namespace::NS_XMLNS_PREFIX {
- Some(this.error(SyntaxError::CannotRedefineXmlnsPrefix))
- } else if ln == namespace::NS_XML_PREFIX && &*value != namespace::NS_XML_URI {
- Some(this.error(SyntaxError::CannotRedefineXmlPrefix))
- } else if value.is_empty() {
- Some(this.error(SyntaxError::CannotUndefinePrefix(ln.into())))
- } else {
- this.nst.put(name.local_name.clone(), value);
- this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
- }
+ match name.prefix_ref() {
+ // declaring a new prefix; it is sufficient to check prefix only
+ // because "xmlns" prefix is reserved
+ Some(namespace::NS_XMLNS_PREFIX) => {
+ let ln = &*name.local_name;
+ if ln == namespace::NS_XMLNS_PREFIX {
+ Some(this.error(SyntaxError::CannotRedefineXmlnsPrefix))
+ } else if ln == namespace::NS_XML_PREFIX && &*value != namespace::NS_XML_URI {
+ Some(this.error(SyntaxError::CannotRedefineXmlPrefix))
+ } else if value.is_empty() {
+ Some(this.error(SyntaxError::CannotUndefinePrefix(ln.into())))
+ } else {
+ this.nst.put(name.local_name.clone(), value);
+ this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
}
+ }
- // declaring default namespace
- None if &*name.local_name == namespace::NS_XMLNS_PREFIX =>
- match &*value {
- namespace::NS_XMLNS_PREFIX | namespace::NS_XML_PREFIX | namespace::NS_XML_URI | namespace::NS_XMLNS_URI =>
- Some(this.error(SyntaxError::InvalidDefaultNamespace(value.into()))),
- _ => {
- this.nst.put(namespace::NS_NO_PREFIX, value.clone());
- this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
- }
- },
+ // declaring default namespace
+ None if &*name.local_name == namespace::NS_XMLNS_PREFIX =>
+ match &*value {
+ namespace::NS_XMLNS_PREFIX | namespace::NS_XML_PREFIX | namespace::NS_XML_URI | namespace::NS_XMLNS_URI =>
+ Some(this.error(SyntaxError::InvalidDefaultNamespace(value.into()))),
+ _ => {
+ this.nst.put(namespace::NS_NO_PREFIX, value.clone());
+ this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
+ }
+ },
- // regular attribute
- _ => {
- this.data.attributes.push(OwnedAttribute {
- name: name.clone(),
- value
- });
- this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
+ // regular attribute
+ _ => {
+ if this.data.attributes.len() >= max_attrs {
+ return Some(this.error(SyntaxError::ExceededConfiguredLimit));
}
+ this.data.attributes.push(OwnedAttribute {
+ name,
+ value
+ });
+ this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
}
}
}),
OpeningTagSubstate::AfterAttributeValue => match t {
- Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)),
+ Token::Character(c) if is_whitespace_char(c) => {
+ self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag))
+ },
Token::TagEnd => self.emit_start_element(false),
Token::EmptyTagEnd => self.emit_start_element(true),
- _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t)))
+ _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))),
},
}
}
diff --git a/src/reader/parser/inside_processing_instruction.rs b/src/reader/parser/inside_processing_instruction.rs
index 96f6753..99caf59 100644
--- a/src/reader/parser/inside_processing_instruction.rs
+++ b/src/reader/parser/inside_processing_instruction.rs
@@ -12,6 +12,9 @@ impl PullParser {
ProcessingInstructionSubstate::PIInsideName => match t {
Token::Character(c) if self.buf.is_empty() && is_name_start_char(c) ||
self.buf_has_data() && is_name_char(c) => {
+ if self.buf.len() > self.config.max_name_length {
+ return Some(self.error(SyntaxError::ExceededConfiguredLimit));
+ }
self.buf.push(c);
None
},
@@ -101,6 +104,9 @@ impl PullParser {
// Any other token should be treated as plain characters
_ => {
+ if self.buf.len() > self.config.max_data_length {
+ return Some(self.error(SyntaxError::ExceededConfiguredLimit));
+ }
t.push_to_string(&mut self.buf);
None
}
diff --git a/src/reader/parser/inside_reference.rs b/src/reader/parser/inside_reference.rs
index 9a15e09..eced606 100644
--- a/src/reader/parser/inside_reference.rs
+++ b/src/reader/parser/inside_reference.rs
@@ -68,6 +68,7 @@ impl PullParser {
};
match char::from_u32(val) {
Some(c) if self.is_valid_xml_char(c) => Ok(c),
+ Some(_) if self.config.c.replace_unknown_entity_references => Ok('\u{fffd}'),
None if self.config.c.replace_unknown_entity_references => {
Ok('\u{fffd}')
},
diff --git a/src/reader/parser/outside_tag.rs b/src/reader/parser/outside_tag.rs
index 8104224..e62f862 100644
--- a/src/reader/parser/outside_tag.rs
+++ b/src/reader/parser/outside_tag.rs
@@ -31,6 +31,8 @@ impl PullParser {
if self.buf.is_empty() {
self.push_pos();
+ } else if self.buf.len() > self.config.max_data_length {
+ return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
self.buf.push(c);
None
@@ -47,7 +49,10 @@ impl PullParser {
if let Some(s) = t.as_static_str() {
if self.buf.is_empty() {
self.push_pos();
+ } else if self.buf.len() > self.config.max_data_length {
+ return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
+
self.buf.push_str(s);
}
None
@@ -60,6 +65,9 @@ impl PullParser {
Token::ReferenceEnd if self.depth() > 0 => { // Semi-colon in a text outside an entity
self.inside_whitespace = false;
+ if self.buf.len() > self.config.max_data_length {
+ return Some(self.error(SyntaxError::ExceededConfiguredLimit));
+ }
Token::ReferenceEnd.push_to_string(&mut self.buf);
None
},
@@ -85,6 +93,7 @@ impl PullParser {
if self.inside_whitespace && self.config.c.trim_whitespace {
None
} else if self.inside_whitespace && !self.config.c.whitespace_to_characters {
+ debug_assert!(buf.chars().all(|ch| ch.is_whitespace()), "ws={buf:?}");
Some(Ok(XmlEvent::Whitespace(buf)))
} else if self.config.c.trim_whitespace {
Some(Ok(XmlEvent::Characters(buf.trim_matches(is_whitespace_char).into())))
@@ -166,7 +175,7 @@ impl PullParser {
self.into_state(State::OutsideTag, next_event)
},
- Token::CommentStart => {
+ Token::CommentStart => {
let next_event = self.set_encountered(Encountered::Comment);
self.into_state(State::InsideComment, next_event)
}
diff --git a/src/writer/emitter.rs b/src/writer/emitter.rs
index 8e74b5f..431d48c 100644
--- a/src/writer/emitter.rs
+++ b/src/writer/emitter.rs
@@ -390,8 +390,7 @@ impl Emitter {
}
}
- pub fn emit_characters<W: Write>(&mut self, target: &mut W,
- content: &str) -> Result<()> {
+ pub fn emit_characters<W: Write>(&mut self, target: &mut W, content: &str) -> Result<()> {
self.check_document_started(target)?;
self.fix_non_empty_element(target)?;