aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthew Maurer <mmaurer@google.com>2023-06-16 19:33:00 +0000
committerAutomerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>2023-06-16 19:33:00 +0000
commit89fe311d3f22761f7d1b593e45216427f301a87d (patch)
treed45b1c9a0f6e40a99e34d574e097bfb0acf72c21
parentb19242f4c2316fc7acc06ccf2f7fd72e324c7261 (diff)
parentbec0e9a523cf8d6db5ea2c92c99f13d2014b7f80 (diff)
downloadxml-rs-89fe311d3f22761f7d1b593e45216427f301a87d.tar.gz
Upgrade xml-rs to 0.8.15-cvss-cries-wolf am: bec0e9a523
Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/xml-rs/+/2626331 Change-Id: Idb21e2ca0e03e7b1f518878b49877713d8425f53 Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
-rw-r--r--.cargo_vcs_info.json7
-rw-r--r--.github/workflows/main.yml31
-rw-r--r--Android.bp58
-rw-r--r--Cargo.toml47
-rw-r--r--Cargo.toml.orig27
-rw-r--r--Changelog.md126
-rw-r--r--METADATA14
-rw-r--r--README.md209
-rw-r--r--Readme.md236
-rw-r--r--design.md37
-rw-r--r--src/analyze.rs110
-rw-r--r--src/attribute.rs34
-rw-r--r--src/common.rs69
-rw-r--r--src/escape.rs159
-rw-r--r--src/lib.rs29
-rw-r--r--src/macros.rs32
-rw-r--r--src/name.rs63
-rw-r--r--src/namespace.rs60
-rw-r--r--src/reader.rs (renamed from src/reader/mod.rs)59
-rw-r--r--src/reader/config.rs129
-rw-r--r--src/reader/error.rs197
-rw-r--r--src/reader/events.rs66
-rw-r--r--src/reader/lexer.rs664
-rw-r--r--src/reader/parser.rs (renamed from src/reader/parser/mod.rs)482
-rw-r--r--src/reader/parser/inside_cdata.rs26
-rw-r--r--src/reader/parser/inside_closing_tag_name.rs21
-rw-r--r--src/reader/parser/inside_comment.rs21
-rw-r--r--src/reader/parser/inside_declaration.rs137
-rw-r--r--src/reader/parser/inside_doctype.rs237
-rw-r--r--src/reader/parser/inside_opening_tag.rs73
-rw-r--r--src/reader/parser/inside_processing_instruction.rs82
-rw-r--r--src/reader/parser/inside_reference.rs122
-rw-r--r--src/reader/parser/outside_tag.rs208
-rw-r--r--src/util.rs276
-rw-r--r--src/writer.rs (renamed from src/writer/mod.rs)19
-rw-r--r--src/writer/config.rs10
-rw-r--r--src/writer/emitter.rs135
-rw-r--r--src/writer/events.rs51
-rw-r--r--tests/documents/sample_1.xml34
-rw-r--r--tests/documents/sample_1_full.txt58
-rw-r--r--tests/documents/sample_1_short.txt37
-rw-r--r--tests/documents/sample_2.xml15
-rw-r--r--tests/documents/sample_2_full.txt41
-rw-r--r--tests/documents/sample_2_short.txt30
-rw-r--r--tests/documents/sample_3.xml13
-rw-r--r--tests/documents/sample_3_full.txt23
-rw-r--r--tests/documents/sample_3_short.txt14
-rw-r--r--tests/documents/sample_4.xml15
-rw-r--r--tests/documents/sample_4_full.txt23
-rw-r--r--tests/documents/sample_4_short.txt14
-rw-r--r--tests/documents/sample_5.xml7
-rw-r--r--tests/documents/sample_5_short.txt7
-rw-r--r--tests/documents/sample_6.xml4
-rw-r--r--tests/documents/sample_6_full.txt8
-rw-r--r--tests/event_reader.rs587
-rw-r--r--tests/event_writer.rs269
-rw-r--r--tests/streaming.rs103
57 files changed, 2748 insertions, 2917 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
index 6e0c55d..f0a8a38 100644
--- a/.cargo_vcs_info.json
+++ b/.cargo_vcs_info.json
@@ -1,5 +1,6 @@
{
"git": {
- "sha1": "7cd06954fd6e22b7dbf9ea02ff4e22f9ff6309fd"
- }
-}
+ "sha1": "c4705ddc172950c28f9b229f368ad8f4cba81e3f"
+ },
+ "path_in_vcs": ""
+} \ No newline at end of file
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
deleted file mode 100644
index daca69f..0000000
--- a/.github/workflows/main.yml
+++ /dev/null
@@ -1,31 +0,0 @@
-name: CI
-
-on:
- push:
- branches: [ master ]
- pull_request:
- branches: [ master ]
-
-jobs:
- test:
- runs-on: ubuntu-latest
- strategy:
- matrix:
- rust: [stable, beta, nightly]
-
- steps:
- - uses: actions/checkout@v2
-
- - uses: actions-rs/toolchain@v1
- with:
- profile: minimal
- toolchain: ${{ matrix.rust }}
- override: true
-
- - uses: actions-rs/cargo@v1
- with:
- command: build
-
- - uses: actions-rs/cargo@v1
- with:
- command: test
diff --git a/Android.bp b/Android.bp
index 38ba911..16b2b9a 100644
--- a/Android.bp
+++ b/Android.bp
@@ -21,13 +21,12 @@ license {
rust_library {
name: "libxml_rust",
stem: "libxml",
- // has rustc warnings
host_supported: true,
crate_name: "xml",
cargo_env_compat: true,
- cargo_pkg_version: "0.8.4",
+ cargo_pkg_version: "0.8.15-cvss-cries-wolf",
srcs: ["src/lib.rs"],
- edition: "2015",
+ edition: "2021",
apex_available: [
"//apex_available:platform",
"com.android.virt",
@@ -36,63 +35,14 @@ rust_library {
vendor_available: true,
}
-rust_defaults {
- name: "xml-rs_test_defaults",
- crate_name: "xml_rs",
- cargo_env_compat: true,
- cargo_pkg_version: "0.8.4",
- test_suites: ["general-tests"],
- auto_gen_config: true,
- edition: "2015",
- rustlibs: [
- "liblazy_static",
- "libxml_rust",
- ],
-}
-
-rust_test {
- name: "xml-rs_test_tests_event_reader",
- defaults: ["xml-rs_test_defaults"],
- // has rustc warnings
- host_supported: true,
- srcs: ["tests/event_reader.rs"],
- test_options: {
- unit_test: true,
- },
-}
-
-rust_test {
- name: "xml-rs_test_tests_event_writer",
- defaults: ["xml-rs_test_defaults"],
- // has rustc warnings
- host_supported: true,
- srcs: ["tests/event_writer.rs"],
- test_options: {
- unit_test: true,
- },
- data: ["tests/documents/*"],
-}
-
-rust_test {
- name: "xml-rs_test_tests_streaming",
- defaults: ["xml-rs_test_defaults"],
- // has rustc warnings
- host_supported: true,
- srcs: ["tests/streaming.rs"],
- test_options: {
- unit_test: true,
- },
-}
-
rust_binary {
name: "xml_analyze",
- // has rustc warnings
host_supported: true,
crate_name: "xml_analyze",
cargo_env_compat: true,
- cargo_pkg_version: "0.8.4",
+ cargo_pkg_version: "0.8.15-cvss-cries-wolf",
srcs: ["src/analyze.rs"],
- edition: "2015",
+ edition: "2021",
rustlibs: [
"libxml_rust",
],
diff --git a/Cargo.toml b/Cargo.toml
index e704337..3279206 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,24 +3,44 @@
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
-# to registry (e.g., crates.io) dependencies
+# to registry (e.g., crates.io) dependencies.
#
-# If you believe there's an error in this file please file an
-# issue against the rust-lang/cargo repository. If you're
-# editing this file be aware that the upstream Cargo.toml
-# will likely look very different (and much more reasonable)
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
[package]
+edition = "2021"
+rust-version = "1.58"
name = "xml-rs"
-version = "0.8.4"
+version = "0.8.15-cvss-cries-wolf"
authors = ["Vladimir Matveev <vmatveev@citrine.cc>"]
+include = [
+ "src/**",
+ "LICENSE",
+ "README.md",
+]
description = "An XML library in pure Rust"
-documentation = "http://docs.rs/xml-rs/"
-readme = "Readme.md"
-keywords = ["xml", "parsing", "parser"]
-categories = ["parsing"]
+homepage = "https://lib.rs/crates/xml-rs"
+documentation = "https://docs.rs/xml-rs/"
+readme = "README.md"
+keywords = [
+ "xml",
+ "parser",
+ "sax",
+ "parsing",
+ "writer",
+]
+categories = ["parser-implementations"]
license = "MIT"
-repository = "https://github.com/netvl/xml-rs"
+repository = "https://github.com/kornelski/xml-rs"
+
+[package.metadata.docs.rs]
+targets = ["x86_64-unknown-linux-gnu"]
+
+[package.metadata.release]
+tag-message = ""
+tag-name = "{{version}}"
[lib]
name = "xml"
@@ -29,8 +49,9 @@ path = "src/lib.rs"
[[bin]]
name = "xml-analyze"
path = "src/analyze.rs"
+
[dev-dependencies.doc-comment]
version = "0.3"
-[dev-dependencies.lazy_static]
-version = "1.2.0"
+[badges.maintenance]
+status = "actively-developed"
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
index c8df8e6..0282e7a 100644
--- a/Cargo.toml.orig
+++ b/Cargo.toml.orig
@@ -1,14 +1,18 @@
[package]
name = "xml-rs"
-version = "0.8.4"
+version = "0.8.15-cvss-cries-wolf"
authors = ["Vladimir Matveev <vmatveev@citrine.cc>"]
license = "MIT"
description = "An XML library in pure Rust"
-repository = "https://github.com/netvl/xml-rs"
-documentation = "http://docs.rs/xml-rs/"
-readme = "Readme.md"
-keywords = ["xml", "parsing", "parser"]
-categories = ["parsing"]
+repository = "https://github.com/kornelski/xml-rs"
+homepage = "https://lib.rs/crates/xml-rs"
+documentation = "https://docs.rs/xml-rs/"
+readme = "README.md"
+keywords = ["xml", "parser", "sax", "parsing", "writer"]
+categories = ["parser-implementations"]
+edition = "2021"
+rust-version = "1.58"
+include = ["src/**", "LICENSE", "README.md"]
[lib]
name = "xml"
@@ -20,4 +24,13 @@ path = "src/analyze.rs"
[dev-dependencies]
doc-comment = "0.3"
-lazy_static = "1.2.0"
+
+[badges]
+maintenance = { status = "actively-developed" }
+
+[package.metadata.docs.rs]
+targets = ["x86_64-unknown-linux-gnu"]
+
+[package.metadata.release]
+tag-name = "{{version}}"
+tag-message = ""
diff --git a/Changelog.md b/Changelog.md
deleted file mode 100644
index 3cca8b8..0000000
--- a/Changelog.md
+++ /dev/null
@@ -1,126 +0,0 @@
-## Version 0.8.4
-
-* Fixed recognition of `?>`, `]]>` and `/>` tokens as characters.
-* Fixed writer output operations to use `write_all` to ensure that the data
- is written fully.
-* The document declaration is now written before any characters automatically.
-
-## Version 0.8.3
-
-* Added a new parser option, `ignore_root_level_whitespace`, which makes the parser
- skip emitting whitespace events outside of the root element when set to `true`.
- This helps with certain tasks like canonicalization.
-
-## Version 0.8.2
-
-* Added a new parser option, `replace_unknown_entity_references`, which allows to ignore
- invalid Unicode code points and replace them with a Unicode "replacement character"
- during parsing. This can be helpful to deal with e.g. UTF-16 surrogate pairs.
-* Added a new emitter option, `pad_self_closing`, which determines the style of the self-closing
- elements when they are emitted: `<a />` (`true`) vs `<a/>` (`false`).
-
-## Version 0.8.1
-
-* Fixed various issues with tests introduced by updates in Rust.
-* Adjusted the lexer to ignore contents of the `<!DOCTYPE>` tag.
-* Removed unnecessary unsafety in tests.
-* Added tests for doc comments in the readme file.
-* Switched to GitHub Actions from Travis CI.
-
-## Version 0.8.0
-
-* Same as 0.7.1, with 0.7.1 being yanked because of the incorrect semver bump.
-
-## Version 0.7.1
-
-* Removed dependency on bitflags.
-* Added the `XmlWriter::inner_mut()` method.
-* Fixed some rustdoc warnings.
-
-## Version 0.7.0
-
-* Same as 0.6.2, with 0.6.2 being yanked because of the incompatible bump of minimum required version of rustc.
-
-## Version 0.6.2
-
-* Bumped `bitflags` to 1.0.
-
-## Version 0.6.1
-
-* Fixed the writer to escape some special characters when writing attribute values.
-
-## Version 0.6.0
-
-* Changed the target type of extra entities from `char` to `String`. This is an incompatible
- change.
-
-## Version 0.5.0
-
-* Added support for ignoring EOF errors in order to read documents from streams incrementally.
-* Bumped `bitflags` to 0.9.
-
-## Version 0.4.1
-
-* Added missing `Debug` implementation to `xml::writer::XmlEvent`.
-
-## Version 0.4.0
-
-* Bumped version number, since changes introduced in 0.3.7 break backwards compatibility.
-
-## Version 0.3.8
-
-* Fixed a problem introduced in 0.3.7 with entities in attributes causing parsing errors.
-
-## Version 0.3.7
-
-* Fixed the problem with parsing non-whitespace character entities as whitespace (issue #140).
-* Added support for configuring custom entities in the parser configuration.
-
-## Version 0.3.6
-
-* Added an `Error` implementation for `EmitterError`.
-* Fixed escaping of strings with multi-byte code points.
-
-## Version 0.3.5
-
-* Added `Debug` implementation for `XmlVersion`.
-* Fixed some failing tests.
-
-## Version 0.3.3
-
-* Updated `bitflags` to 0.7.
-
-## Version 0.3.2
-
-* Added `From<io::Error>` for `xml::reader::Error`, which improves usability of working with parsing errors.
-
-## Version 0.3.1
-
-* Bumped `bitflags` dependency to 0.4, some internal warning fixes.
-
-## Version 0.3.0
-
-* Changed error handling in `EventReader` - now I/O errors are properly bubbled up from the lexer.
-
-## Version 0.2.4
-
-* Fixed #112 - incorrect handling of namespace redefinitions when writing a document.
-
-## Version 0.2.3
-
-* Added `into_inner()` methods to `EventReader` and `EventWriter`.
-
-## Version 0.2.2
-
-* Using `join` instead of the deprecated `connect`.
-* Added a simple XML analyzer program which demonstrates library usage and can be used to check XML documents for well-formedness.
-* Fixed incorrect handling of unqualified attribute names (#107).
-* Added this changelog.
-
-## Version 0.2.1
-
-* Fixed #105 - incorrect handling of double dashes.
-
-## Version 0.2.0
-
-* Major update, includes proper document writing support and significant architecture changes.
diff --git a/METADATA b/METADATA
index 17fbefc..87bd4b7 100644
--- a/METADATA
+++ b/METADATA
@@ -1,3 +1,7 @@
+# This project was upgraded with external_updater.
+# Usage: tools/external_updater/updater.sh update rust/crates/xml-rs
+# For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md
+
name: "xml-rs"
description: "An XML library in pure Rust"
third_party {
@@ -7,13 +11,13 @@ third_party {
}
url {
type: ARCHIVE
- value: "https://static.crates.io/crates/xml-rs/xml-rs-0.8.4.crate"
+ value: "https://static.crates.io/crates/xml-rs/xml-rs-0.8.15-cvss-cries-wolf.crate"
}
- version: "0.8.4"
+ version: "0.8.15-cvss-cries-wolf"
license_type: NOTICE
last_upgrade_date {
- year: 2021
- month: 8
- day: 9
+ year: 2023
+ month: 6
+ day: 14
}
}
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..fa4ba7f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,209 @@
+xml-rs, an XML library for Rust
+===============================
+
+[![CI](https://github.com/kornelski/xml-rs/actions/workflows/main.yml/badge.svg)](https://github.com/kornelski/xml-rs/actions/workflows/main.yml)
+[![crates.io][crates-io-img]](https://lib.rs/crates/xml-rs)
+[![docs][docs-img]](https://docs.rs/xml-rs/)
+
+[Documentation](https://docs.rs/xml-rs/)
+
+ [crates-io-img]: https://img.shields.io/crates/v/xml-rs.svg
+ [docs-img]: https://img.shields.io/badge/docs-latest%20release-6495ed.svg
+
+xml-rs is an XML library for the [Rust](https://www.rust-lang.org/) programming language.
+It supports reading and writing of XML documents in a streaming fashion (without DOM).
+
+### Features
+
+* API based on `Iterator`s and regular `String`s without tricky lifetimes.
+
+* XML spec conformance better than other pure-Rust libraries.
+
+* Support for UTF-16, UTF-8, ISO-8859-1, and ASCII encodings.
+
+* Written entirely in the safe Rust subset.
+
+
+The API is heavily inspired by Java Streaming API for XML ([StAX][stax]). It contains a pull parser much like StAX event reader. It provides an iterator API, so you can leverage Rust's existing iterators library features.
+
+ [stax]: https://en.wikipedia.org/wiki/StAX
+
+It also provides a streaming document writer much like StAX event writer.
+This writer consumes its own set of events, but reader events can be converted to
+writer events easily, and so it is possible to write XML transformation chains in a pretty
+clean manner.
+
+This parser is mostly full-featured, however, there are limitations:
+* Legacy code pages and non-Unicode encodings are not supported;
+* DTD validation is not supported (but entities defined in the internal subset are supported);
+* attribute value normalization is not performed, and end-of-line characters are not normalized either.
+
+Other than that the parser tries to be mostly XML-1.1-compliant.
+
+Writer is also mostly full-featured with the following limitations:
+* no support for encodings other than UTF-8;
+* no support for emitting `<!DOCTYPE>` declarations;
+* more validations of input are needed, for example, checking that namespace prefixes are bound
+ or comments are well-formed.
+
+Building and using
+------------------
+
+xml-rs uses [Cargo](https://crates.io), so add it with `cargo add xml-rs` or modify `Cargo.toml`:
+
+```toml
+[dependencies]
+xml-rs = "0.8"
+```
+
+The package exposes a single crate called `xml`.
+
+Reading XML documents
+---------------------
+
+[`xml::reader::EventReader`][EventReader] requires a [`Read`][stdread] instance to read from. It can be a `File` wrapped in `BufReader`, or a `Vec<u8>`, or a `&[u8]` slice.
+
+[EventReader]: https://docs.rs/xml-rs/latest/xml/reader/struct.EventReader.html
+[stdread]: https://doc.rust-lang.org/stable/std/io/trait.Read.html
+
+`EventReader` implements `IntoIterator` trait, so you can use it in a `for` loop directly:
+
+```rust,no_run
+use std::fs::File;
+use std::io::BufReader;
+
+use xml::reader::{EventReader, XmlEvent};
+
+fn main() -> std::io::Result<()> {
+ let file = File::open("file.xml")?;
+ let file = BufReader::new(file); // Buffering is important for performance
+
+ let parser = EventReader::new(file);
+ let mut depth = 0;
+ for e in parser {
+ match e {
+ Ok(XmlEvent::StartElement { name, .. }) => {
+ println!("{:spaces$}+{name}", "", spaces = depth * 2);
+ depth += 1;
+ }
+ Ok(XmlEvent::EndElement { name }) => {
+ depth -= 1;
+ println!("{:spaces$}-{name}", "", spaces = depth * 2);
+ }
+ Err(e) => {
+ eprintln!("Error: {e}");
+ break;
+ }
+ // There's more: https://docs.rs/xml-rs/latest/xml/reader/enum.XmlEvent.html
+ _ => {}
+ }
+ }
+
+ Ok(())
+}
+```
+
+Document parsing can end normally or with an error. Regardless of exact cause, the parsing
+process will be stopped, and the iterator will terminate normally.
+
+You can also have finer control over when to pull the next event from the parser using its own
+`next()` method:
+
+```rust,ignore
+match parser.next() {
+ ...
+}
+```
+
+Upon the end of the document or an error, the parser will remember the last event and will always
+return it in the result of `next()` call afterwards. If iterator is used, then it will yield
+error or end-of-document event once and will produce `None` afterwards.
+
+It is also possible to tweak parsing process a little using [`xml::reader::ParserConfig`][ParserConfig] structure.
+See its documentation for more information and examples.
+
+[ParserConfig]: https://docs.rs/xml-rs/latest/xml/reader/struct.ParserConfig.html
+
+You can find a more extensive example of using `EventReader` in `src/analyze.rs`, which is a
+small program (BTW, it is built with `cargo build` and can be run after that) which shows various
+statistics about specified XML document. It can also be used to check for well-formedness of
+XML documents - if a document is not well-formed, this program will exit with an error.
+
+Writing XML documents
+---------------------
+
+xml-rs also provides a streaming writer much like StAX event writer. With it you can write an
+XML document to any `Write` implementor.
+
+```rust,no_run
+use std::io;
+use xml::writer::{EmitterConfig, XmlEvent};
+
+/// A simple demo syntax where "+foo" makes `<foo>`, "-foo" makes `</foo>`
+fn make_event_from_line(line: &str) -> XmlEvent {
+ let line = line.trim();
+ if let Some(name) = line.strip_prefix("+") {
+ XmlEvent::start_element(name).into()
+ } else if line.starts_with("-") {
+ XmlEvent::end_element().into()
+ } else {
+ XmlEvent::characters(line).into()
+ }
+}
+
+fn main() -> io::Result<()> {
+ let input = io::stdin();
+ let output = io::stdout();
+ let mut writer = EmitterConfig::new()
+ .perform_indent(true)
+ .create_writer(output);
+
+ let mut line = String::new();
+ loop {
+ line.clear();
+ let bytes_read = input.read_line(&mut line)?;
+ if bytes_read == 0 {
+ break; // EOF
+ }
+
+ let event = make_event_from_line(&line);
+ if let Err(e) = writer.write(event) {
+ panic!("Write error: {e}")
+ }
+ }
+ Ok(())
+}
+```
+
+The code example above also demonstrates how to create a writer out of its configuration.
+Similar thing also works with `EventReader`.
+
+The library provides an XML event building DSL which helps to construct complex events,
+e.g. ones having namespace definitions. Some examples:
+
+```rust,ignore
+// <a:hello a:param="value" xmlns:a="urn:some:document">
+XmlEvent::start_element("a:hello").attr("a:param", "value").ns("a", "urn:some:document")
+
+// <hello b:config="value" xmlns="urn:default:uri">
+XmlEvent::start_element("hello").attr("b:config", "value").default_ns("urn:default:uri")
+
+// <![CDATA[some unescaped text]]>
+XmlEvent::cdata("some unescaped text")
+```
+
+Of course, one can create `XmlEvent` enum variants directly instead of using the builder DSL.
+There are more examples in [`xml::writer::XmlEvent`][XmlEvent] documentation.
+
+[XmlEvent]: https://docs.rs/xml-rs/latest/xml/writer/enum.XmlEvent.html
+
+The writer has multiple configuration options; see [`EmitterConfig`][EmitterConfig] documentation for more
+information.
+
+[EmitterConfig]: https://docs.rs/xml-rs/latest/xml/writer/struct.EmitterConfig.html
+
+Bug reports
+------------
+
+Please report issues at: <https://github.com/kornelski/xml-rs/issues>.
+
diff --git a/Readme.md b/Readme.md
deleted file mode 100644
index 5ab88f8..0000000
--- a/Readme.md
+++ /dev/null
@@ -1,236 +0,0 @@
-xml-rs, an XML library for Rust
-===============================
-
-[![Build Status][build-status-img]](https://github.com/netvl/xml-rs/actions?query=workflow%3ACI)
-[![crates.io][crates-io-img]](https://crates.io/crates/xml-rs)
-[![docs][docs-img]](https://docs.rs/xml-rs/)
-
-[Documentation](https://docs.rs/xml-rs/)
-
- [build-status-img]: https://img.shields.io/github/workflow/status/netvl/xml-rs/CI/master?style=flat-square
- [crates-io-img]: https://img.shields.io/crates/v/xml-rs.svg?style=flat-square
- [docs-img]: https://img.shields.io/badge/docs-latest%20release-6495ed.svg?style=flat-square
-
-xml-rs is an XML library for [Rust](http://www.rust-lang.org/) programming language.
-It is heavily inspired by Java [Streaming API for XML (StAX)][stax].
-
- [stax]: https://en.wikipedia.org/wiki/StAX
-
-This library currently contains pull parser much like [StAX event reader][stax-reader].
-It provides iterator API, so you can leverage Rust's existing iterators library features.
-
- [stax-reader]: http://docs.oracle.com/javase/8/docs/api/javax/xml/stream/XMLEventReader.html
-
-It also provides a streaming document writer much like [StAX event writer][stax-writer].
-This writer consumes its own set of events, but reader events can be converted to
-writer events easily, and so it is possible to write XML transformation chains in a pretty
-clean manner.
-
- [stax-writer]: http://docs.oracle.com/javase/8/docs/api/javax/xml/stream/XMLEventWriter.html
-
-This parser is mostly full-featured, however, there are limitations:
-* no other encodings but UTF-8 are supported yet, because no stream-based encoding library
- is available now; when (or if) one will be available, I'll try to make use of it;
-* DTD validation is not supported, `<!DOCTYPE>` declarations are completely ignored; thus no
- support for custom entities too; internal DTD declarations are likely to cause parsing errors;
-* attribute value normalization is not performed, and end-of-line characters are not normalized too.
-
-Other than that the parser tries to be mostly XML-1.0-compliant.
-
-Writer is also mostly full-featured with the following limitations:
-* no support for encodings other than UTF-8, for the same reason as above;
-* no support for emitting `<!DOCTYPE>` declarations;
-* more validations of input are needed, for example, checking that namespace prefixes are bounded
- or comments are well-formed.
-
-What is planned (highest priority first, approximately):
-
-0. missing features required by XML standard (e.g. aforementioned normalization and
- proper DTD parsing);
-1. miscellaneous features of the writer;
-2. parsing into a DOM tree and its serialization back to XML text;
-3. SAX-like callback-based parser (fairly easy to implement over pull parser);
-4. DTD validation;
-5. (let's dream a bit) XML Schema validation.
-
-Building and using
-------------------
-
-xml-rs uses [Cargo](http://crates.io), so just add a dependency section in your project's manifest:
-
-```toml
-[dependencies]
-xml-rs = "0.8"
-```
-
-The package exposes a single crate called `xml`:
-
-```rust
-extern crate xml;
-```
-
-Reading XML documents
----------------------
-
-`xml::reader::EventReader` requires a `Read` instance to read from. When a proper stream-based encoding
-library is available, it is likely that xml-rs will be switched to use whatever character stream structure
-this library would provide, but currently it is a `Read`.
-
-Using `EventReader` is very straightforward. Just provide a `Read` instance to obtain an iterator
-over events:
-
-```rust,no_run
-extern crate xml;
-
-use std::fs::File;
-use std::io::BufReader;
-
-use xml::reader::{EventReader, XmlEvent};
-
-fn indent(size: usize) -> String {
- const INDENT: &'static str = " ";
- (0..size).map(|_| INDENT)
- .fold(String::with_capacity(size*INDENT.len()), |r, s| r + s)
-}
-
-fn main() {
- let file = File::open("file.xml").unwrap();
- let file = BufReader::new(file);
-
- let parser = EventReader::new(file);
- let mut depth = 0;
- for e in parser {
- match e {
- Ok(XmlEvent::StartElement { name, .. }) => {
- println!("{}+{}", indent(depth), name);
- depth += 1;
- }
- Ok(XmlEvent::EndElement { name }) => {
- depth -= 1;
- println!("{}-{}", indent(depth), name);
- }
- Err(e) => {
- println!("Error: {}", e);
- break;
- }
- _ => {}
- }
- }
-}
-```
-
-`EventReader` implements `IntoIterator` trait, so you can just use it in a `for` loop directly.
-Document parsing can end normally or with an error. Regardless of exact cause, the parsing
-process will be stopped, and iterator will terminate normally.
-
-You can also have finer control over when to pull the next event from the parser using its own
-`next()` method:
-
-```rust,ignore
-match parser.next() {
- ...
-}
-```
-
-Upon the end of the document or an error the parser will remember that last event and will always
-return it in the result of `next()` call afterwards. If iterator is used, then it will yield
-error or end-of-document event once and will produce `None` afterwards.
-
-It is also possible to tweak parsing process a little using `xml::reader::ParserConfig` structure.
-See its documentation for more information and examples.
-
-You can find a more extensive example of using `EventReader` in `src/analyze.rs`, which is a
-small program (BTW, it is built with `cargo build` and can be run after that) which shows various
-statistics about specified XML document. It can also be used to check for well-formedness of
-XML documents - if a document is not well-formed, this program will exit with an error.
-
-Writing XML documents
----------------------
-
-xml-rs also provides a streaming writer much like StAX event writer. With it you can write an
-XML document to any `Write` implementor.
-
-```rust,no_run
-extern crate xml;
-
-use std::fs::File;
-use std::io::{self, Write};
-
-use xml::writer::{EventWriter, EmitterConfig, XmlEvent, Result};
-
-fn handle_event<W: Write>(w: &mut EventWriter<W>, line: String) -> Result<()> {
- let line = line.trim();
- let event: XmlEvent = if line.starts_with("+") && line.len() > 1 {
- XmlEvent::start_element(&line[1..]).into()
- } else if line.starts_with("-") {
- XmlEvent::end_element().into()
- } else {
- XmlEvent::characters(&line).into()
- };
- w.write(event)
-}
-
-fn main() {
- let mut file = File::create("output.xml").unwrap();
-
- let mut input = io::stdin();
- let mut output = io::stdout();
- let mut writer = EmitterConfig::new().perform_indent(true).create_writer(&mut file);
- loop {
- print!("> "); output.flush().unwrap();
- let mut line = String::new();
- match input.read_line(&mut line) {
- Ok(0) => break,
- Ok(_) => match handle_event(&mut writer, line) {
- Ok(_) => {}
- Err(e) => panic!("Write error: {}", e)
- },
- Err(e) => panic!("Input error: {}", e)
- }
- }
-}
-```
-
-The code example above also demonstrates how to create a writer out of its configuration.
-Similar thing also works with `EventReader`.
-
-The library provides an XML event building DSL which helps to construct complex events,
-e.g. ones having namespace definitions. Some examples:
-
-```rust,ignore
-// <a:hello a:param="value" xmlns:a="urn:some:document">
-XmlEvent::start_element("a:hello").attr("a:param", "value").ns("a", "urn:some:document")
-
-// <hello b:config="name" xmlns="urn:default:uri">
-XmlEvent::start_element("hello").attr("b:config", "value").default_ns("urn:defaul:uri")
-
-// <![CDATA[some unescaped text]]>
-XmlEvent::cdata("some unescaped text")
-```
-
-Of course, one can create `XmlEvent` enum variants directly instead of using the builder DSL.
-There are more examples in `xml::writer::XmlEvent` documentation.
-
-The writer has multiple configuration options; see `EmitterConfig` documentation for more
-information.
-
-Other things
-------------
-
-No performance tests or measurements are done. The implementation is rather naive, and no specific
-optimizations are made. Hopefully the library is sufficiently fast to process documents of common size.
-I intend to add benchmarks in future, but not until more important features are added.
-
-Known issues
-------------
-
-All known issues are present on GitHub issue tracker: <http://github.com/netvl/xml-rs/issues>.
-Feel free to post any found problems there.
-
-License
--------
-
-This library is licensed under MIT license.
-
----
-Copyright (C) Vladimir Matveev, 2014-2020
diff --git a/design.md b/design.md
deleted file mode 100644
index da67c7b..0000000
--- a/design.md
+++ /dev/null
@@ -1,37 +0,0 @@
-# Reader
-
-Basic features:
- * [x] Parsing XML 1.0 documents and returning a stream of events
- - [ ] Support reading embedded DTD schemas
- - [ ] Support for embedded entities
- * [x] Support for namespaces and emitting namespace information in events
- * [ ] \[maybe\] push-based wrapper
- * Missing XML features
- - [ ] Support for different encodings
- - [ ] Attribute values normalization
- - [ ] EOL characters normalization
-
-Advanced features:
- * [ ] DTD schema validation
- * [ ] XSD schema validation
-
-# Writer
-
-Basic features:
- * [x] Writing basic XML 1.0 documents in UTF-8
- * [x] Writing XML 1.0 documents with namespace support
- * [x] Support for writing elements with empty body as empty elements
- * [x] Pretty-printed and compact output
- * [ ] Writing XML document with embedded DTDs and DTD references
- * Misc features:
- - [ ] Support for different encodings
- - [x] Support for writing CDATA as characters
- - [ ] Checking events for invalid characters (e.g. `--` in comments)
- - [ ] Check for namespaces more correctly, i.e. check both for prefix and namespace URI
- - [ ] Support checking namespace prefix presence in the current namespace for events with prefix but without namespace
- - [ ] Support checking namespace prefix for events with both prefix and namespace URI
-
-# Other
-
-DOM-based API:
- * [ ] Basic support for DOM-based API
diff --git a/src/analyze.rs b/src/analyze.rs
index d369d2f..d50b2d9 100644
--- a/src/analyze.rs
+++ b/src/analyze.rs
@@ -1,37 +1,23 @@
#![forbid(unsafe_code)]
-extern crate xml;
-
use std::cmp;
+use std::collections::HashSet;
use std::env;
-use std::io::{self, Read, Write, BufReader};
use std::fs::File;
-use std::collections::HashSet;
+use std::io::{self, BufReader, Read};
-use xml::ParserConfig;
use xml::reader::XmlEvent;
+use xml::ParserConfig;
-macro_rules! abort {
- ($code:expr) => {::std::process::exit($code)};
- ($code:expr, $($args:tt)+) => {{
- writeln!(&mut ::std::io::stderr(), $($args)+).unwrap();
- ::std::process::exit($code);
- }}
-}
-
-fn main() {
+fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut file;
let mut stdin;
- let source: &mut Read = match env::args().nth(1) {
- Some(file_name) => {
- file = File::open(file_name)
- .unwrap_or_else(|e| abort!(1, "Cannot open input file: {}", e));
- &mut file
- }
- None => {
- stdin = io::stdin();
- &mut stdin
- }
+ let source: &mut dyn Read = if let Some(file_name) = env::args().nth(1) {
+ file = File::open(file_name).map_err(|e| format!("Cannot open input file: {e}"))?;
+ &mut file
+ } else {
+ stdin = io::stdin();
+ &mut stdin
};
let reader = ParserConfig::new()
@@ -51,49 +37,49 @@ fn main() {
let mut max_depth = 0;
for e in reader {
+ let e = e.map_err(|e| format!("Error parsing XML document: {e}"))?;
match e {
- Ok(e) => match e {
- XmlEvent::StartDocument { version, encoding, standalone } =>
- println!(
- "XML document version {}, encoded in {}, {}standalone",
- version, encoding, if standalone.unwrap_or(false) { "" } else { "not " }
- ),
- XmlEvent::EndDocument => println!("Document finished"),
- XmlEvent::ProcessingInstruction { .. } => processing_instructions += 1,
- XmlEvent::Whitespace(_) => {} // can't happen due to configuration
- XmlEvent::Characters(s) => {
- character_blocks += 1;
- characters += s.len();
- }
- XmlEvent::CData(s) => {
- cdata_blocks += 1;
- characters += s.len();
- }
- XmlEvent::Comment(s) => {
- comment_blocks += 1;
- comment_characters += s.len();
- }
- XmlEvent::StartElement { namespace, .. } => {
- depth += 1;
- max_depth = cmp::max(max_depth, depth);
- elements += 1;
- namespaces.extend(namespace.0.into_iter().map(|(_, ns_uri)| ns_uri));
- }
- XmlEvent::EndElement { .. } => {
- depth -= 1;
- }
- },
- Err(e) => abort!(1, "Error parsing XML document: {}", e)
- }
+ XmlEvent::StartDocument { version, encoding, standalone } =>
+ println!(
+ "XML document version {}, encoded in {}, {}standalone",
+ version, encoding, if standalone.unwrap_or(false) { "" } else { "not " }
+ ),
+ XmlEvent::EndDocument => println!("Document finished"),
+ XmlEvent::ProcessingInstruction { .. } => processing_instructions += 1,
+ XmlEvent::Whitespace(_) => {} // can't happen due to configuration
+ XmlEvent::Characters(s) => {
+ character_blocks += 1;
+ characters += s.len();
+ }
+ XmlEvent::CData(s) => {
+ cdata_blocks += 1;
+ characters += s.len();
+ }
+ XmlEvent::Comment(s) => {
+ comment_blocks += 1;
+ comment_characters += s.len();
+ }
+ XmlEvent::StartElement { namespace, .. } => {
+ depth += 1;
+ max_depth = cmp::max(max_depth, depth);
+ elements += 1;
+ namespaces.extend(namespace.0.into_values());
+ }
+ XmlEvent::EndElement { .. } => {
+ depth -= 1;
+ }
+ };
}
+
namespaces.remove(xml::namespace::NS_EMPTY_URI);
namespaces.remove(xml::namespace::NS_XMLNS_URI);
namespaces.remove(xml::namespace::NS_XML_URI);
- println!("Elements: {}, maximum depth: {}", elements, max_depth);
+ println!("Elements: {elements}, maximum depth: {max_depth}");
println!("Namespaces (excluding built-in): {}", namespaces.len());
- println!("Characters: {}, characters blocks: {}, CDATA blocks: {}",
- characters, character_blocks, cdata_blocks);
- println!("Comment blocks: {}, comment characters: {}", comment_blocks, comment_characters);
- println!("Processing instructions (excluding built-in): {}", processing_instructions);
+ println!("Characters: {characters}, characters blocks: {character_blocks}, CDATA blocks: {cdata_blocks}");
+ println!("Comment blocks: {comment_blocks}, comment characters: {comment_characters}");
+ println!("Processing instructions (excluding built-in): {processing_instructions}");
+
+ Ok(())
}
diff --git a/src/attribute.rs b/src/attribute.rs
index 8728f49..112bf24 100644
--- a/src/attribute.rs
+++ b/src/attribute.rs
@@ -3,8 +3,8 @@
use std::fmt;
-use name::{Name, OwnedName};
-use escape::escape_str_attribute;
+use crate::escape::{Escaped, AttributeEscapes};
+use crate::name::{Name, OwnedName};
/// A borrowed version of an XML attribute.
///
@@ -15,18 +15,19 @@ pub struct Attribute<'a> {
pub name: Name<'a>,
/// Attribute value.
- pub value: &'a str
+ pub value: &'a str,
}
impl<'a> fmt::Display for Attribute<'a> {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- write!(f, "{}=\"{}\"", self.name, escape_str_attribute(self.value))
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "{}=\"{}\"", self.name, Escaped::<AttributeEscapes>::new(self.value))
}
}
impl<'a> Attribute<'a> {
/// Creates an owned attribute out of this borrowed one.
#[inline]
+ #[must_use]
pub fn to_owned(&self) -> OwnedAttribute {
OwnedAttribute {
name: self.name.into(),
@@ -36,8 +37,9 @@ impl<'a> Attribute<'a> {
/// Creates a borrowed attribute using the provided borrowed name and a borrowed string value.
#[inline]
+ #[must_use]
pub fn new(name: Name<'a>, value: &'a str) -> Attribute<'a> {
- Attribute { name, value, }
+ Attribute { name, value }
}
}
@@ -50,15 +52,17 @@ pub struct OwnedAttribute {
pub name: OwnedName,
/// Attribute value.
- pub value: String
+ pub value: String,
}
impl OwnedAttribute {
/// Returns a borrowed `Attribute` out of this owned one.
- pub fn borrow(&self) -> Attribute {
+ #[must_use]
+ #[inline]
+ pub fn borrow(&self) -> Attribute<'_> {
Attribute {
name: self.name.borrow(),
- value: &*self.value,
+ value: &self.value,
}
}
@@ -73,27 +77,27 @@ impl OwnedAttribute {
}
impl fmt::Display for OwnedAttribute {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- write!(f, "{}=\"{}\"", self.name, escape_str_attribute(&*self.value))
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "{}=\"{}\"", self.name, Escaped::<AttributeEscapes>::new(&self.value))
}
}
#[cfg(test)]
mod tests {
- use super::{Attribute};
+ use super::Attribute;
- use name::Name;
+ use crate::name::Name;
#[test]
fn attribute_display() {
let attr = Attribute::new(
Name::qualified("attribute", "urn:namespace", Some("n")),
- "its value with > & \" ' < weird symbols"
+ "its value with > & \" ' < weird symbols",
);
assert_eq!(
&*attr.to_string(),
"{urn:namespace}n:attribute=\"its value with &gt; &amp; &quot; &apos; &lt; weird symbols\""
- )
+ );
}
}
diff --git a/src/common.rs b/src/common.rs
index 029e851..a1bf3ac 100644
--- a/src/common.rs
+++ b/src/common.rs
@@ -14,6 +14,7 @@ pub struct TextPosition {
impl TextPosition {
/// Creates a new position initialized to the beginning of the document
#[inline]
+ #[must_use]
pub fn new() -> TextPosition {
TextPosition { row: 0, column: 0 }
}
@@ -21,14 +22,14 @@ impl TextPosition {
/// Advances the position in a line
#[inline]
pub fn advance(&mut self, count: u8) {
- self.column += count as u64;
+ self.column += u64::from(count);
}
/// Advances the position in a line to the next tab position
#[inline]
pub fn advance_to_tab(&mut self, width: u8) {
- let width = width as u64;
- self.column += width - self.column % width
+ let width = u64::from(width);
+ self.column += width - self.column % width;
}
/// Advances the position to the beginning of the next line
@@ -40,15 +41,15 @@ impl TextPosition {
}
impl fmt::Debug for TextPosition {
- #[inline]
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ #[cold]
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}:{}", self.row + 1, self.column + 1)
}
}
impl fmt::Display for TextPosition {
#[inline]
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}:{}", self.row + 1, self.column + 1)
}
}
@@ -69,26 +70,27 @@ impl Position for TextPosition {
}
/// XML version enumeration.
-#[derive(Copy, Clone, PartialEq, Eq)]
+#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum XmlVersion {
/// XML version 1.0.
Version10,
/// XML version 1.1.
- Version11
+ Version11,
}
impl fmt::Display for XmlVersion {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match *self {
- XmlVersion::Version10 => write!(f, "1.0"),
- XmlVersion::Version11 => write!(f, "1.1")
- }
+ XmlVersion::Version10 => "1.0",
+ XmlVersion::Version11 => "1.1",
+ }.fmt(f)
}
}
impl fmt::Debug for XmlVersion {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ #[cold]
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Display::fmt(self, f)
}
}
@@ -97,33 +99,45 @@ impl fmt::Debug for XmlVersion {
/// as is defined by XML 1.1 specification, [section 2.3][1].
///
/// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn
+#[must_use]
+#[inline]
pub fn is_whitespace_char(c: char) -> bool {
- match c {
- '\x20' | '\x09' | '\x0d' | '\x0a' => true,
- _ => false
- }
+ matches!(c, '\x20' | '\x0a' | '\x09' | '\x0d')
}
/// Checks whether the given string is compound only by white space
-/// characters (`S`) using the previous is_whitespace_char to check
+/// characters (`S`) using the previous `is_whitespace_char` to check
/// all characters of this string
pub fn is_whitespace_str(s: &str) -> bool {
s.chars().all(is_whitespace_char)
}
+pub fn is_xml10_char(c: char) -> bool {
+ matches!(c, '\u{09}' | '\u{0A}' | '\u{0D}' | '\u{20}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..)
+}
+
+pub fn is_xml11_char(c: char) -> bool {
+ matches!(c, '\u{01}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..)
+}
+
+pub fn is_xml11_char_not_restricted(c: char) -> bool {
+ is_xml11_char(c) && !matches!(c, '\u{01}'..='\u{08}' | '\u{0B}'..='\u{0C}' | '\u{0E}'..='\u{1F}' | '\u{7F}'..='\u{84}' | '\u{86}'..='\u{9F}')
+}
+
/// Checks whether the given character is a name start character (`NameStartChar`)
/// as is defined by XML 1.1 specification, [section 2.3][1].
///
/// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn
+#[must_use]
pub fn is_name_start_char(c: char) -> bool {
match c {
- ':' | 'A'...'Z' | '_' | 'a'...'z' |
- '\u{C0}'...'\u{D6}' | '\u{D8}'...'\u{F6}' | '\u{F8}'...'\u{2FF}' |
- '\u{370}'...'\u{37D}' | '\u{37F}'...'\u{1FFF}' |
- '\u{200C}'...'\u{200D}' | '\u{2070}'...'\u{218F}' |
- '\u{2C00}'...'\u{2FEF}' | '\u{3001}'...'\u{D7FF}' |
- '\u{F900}'...'\u{FDCF}' | '\u{FDF0}'...'\u{FFFD}' |
- '\u{10000}'...'\u{EFFFF}' => true,
+ ':' | 'A'..='Z' | '_' | 'a'..='z' |
+ '\u{C0}'..='\u{D6}' | '\u{D8}'..='\u{F6}' | '\u{F8}'..='\u{2FF}' |
+ '\u{370}'..='\u{37D}' | '\u{37F}'..='\u{1FFF}' |
+ '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' |
+ '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' |
+ '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' |
+ '\u{10000}'..='\u{EFFFF}' => true,
_ => false
}
}
@@ -132,11 +146,12 @@ pub fn is_name_start_char(c: char) -> bool {
/// as is defined by XML 1.1 specification, [section 2.3][1].
///
/// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn
+#[must_use]
pub fn is_name_char(c: char) -> bool {
match c {
_ if is_name_start_char(c) => true,
- '-' | '.' | '0'...'9' | '\u{B7}' |
- '\u{300}'...'\u{36F}' | '\u{203F}'...'\u{2040}' => true,
+ '-' | '.' | '0'..='9' | '\u{B7}' |
+ '\u{300}'..='\u{36F}' | '\u{203F}'..='\u{2040}' => true,
_ => false
}
}
diff --git a/src/escape.rs b/src/escape.rs
index 18298b9..1fcfd06 100644
--- a/src/escape.rs
+++ b/src/escape.rs
@@ -1,81 +1,102 @@
//! Contains functions for performing XML special characters escaping.
-use std::borrow::Cow;
+use std::{borrow::Cow, marker::PhantomData, fmt::{Display, Result, Formatter}};
-enum Value {
- Char(char),
- Str(&'static str)
-}
+pub(crate) trait Escapes {
+ fn escape(c: u8) -> Option<&'static str>;
-impl Value {
- fn dispatch_for_attribute(c: char) -> Value {
- match c {
- '<' => Value::Str("&lt;"),
- '>' => Value::Str("&gt;"),
- '"' => Value::Str("&quot;"),
- '\'' => Value::Str("&apos;"),
- '&' => Value::Str("&amp;"),
- '\n' => Value::Str("&#xA;"),
- '\r' => Value::Str("&#xD;"),
- _ => Value::Char(c)
- }
+ fn byte_needs_escaping(c: u8) -> bool{
+ Self::escape(c).is_some()
}
- fn dispatch_for_pcdata(c: char) -> Value {
- match c {
- '<' => Value::Str("&lt;"),
- '&' => Value::Str("&amp;"),
- _ => Value::Char(c)
- }
+ fn str_needs_escaping(s: &str) -> bool{
+ s.bytes().any(|c| Self::escape(c).is_some())
}
}
-enum Process<'a> {
- Borrowed(&'a str),
- Owned(String)
+pub(crate) struct Escaped<'a, E: Escapes> {
+ _escape_phantom: PhantomData<E>,
+ to_escape: &'a str,
}
-impl<'a> Process<'a> {
- fn process(&mut self, (i, next): (usize, Value)) {
- match next {
- Value::Str(s) => match *self {
- Process::Owned(ref mut o) => o.push_str(s),
- Process::Borrowed(b) => {
- let mut r = String::with_capacity(b.len() + s.len());
- r.push_str(&b[..i]);
- r.push_str(s);
- *self = Process::Owned(r);
- }
- },
- Value::Char(c) => match *self {
- Process::Borrowed(_) => {}
- Process::Owned(ref mut o) => o.push(c)
- }
+impl<'a, E: Escapes> Escaped<'a, E> {
+ pub fn new(s: &'a str) -> Self {
+ Escaped {
+ _escape_phantom: PhantomData,
+ to_escape: s,
}
}
+}
- fn into_result(self) -> Cow<'a, str> {
- match self {
- Process::Borrowed(b) => Cow::Borrowed(b),
- Process::Owned(o) => Cow::Owned(o)
+
+impl<'a, E: Escapes> Display for Escaped<'a, E> {
+ fn fmt(&self, f: &mut Formatter<'_>) -> Result {
+ let mut total_remaining = self.to_escape;
+
+ // find the next occurence
+ while let Some(n) = total_remaining
+ .bytes()
+ .position(E::byte_needs_escaping)
+ {
+ let (start, remaining) = total_remaining.split_at(n);
+
+ f.write_str(start)?;
+
+ // unwrap is safe because we checked is_some for position n earlier
+ let next_byte = remaining.bytes().next().unwrap();
+ let replacement = E::escape(next_byte).unwrap();
+ f.write_str(replacement)?;
+
+ total_remaining = &remaining[1..];
}
+
+ f.write_str(total_remaining)
}
}
-impl<'a> Extend<(usize, Value)> for Process<'a> {
- fn extend<I: IntoIterator<Item=(usize, Value)>>(&mut self, it: I) {
- for v in it.into_iter() {
- self.process(v);
- }
+fn escape_str<E: Escapes>(s: &str) -> Cow<'_, str> {
+ if E::str_needs_escaping(s) {
+ Cow::Owned(format!("{}", Escaped::<E>::new(s)))
+ } else {
+ Cow::Borrowed(s)
}
}
-fn escape_str(s: &str, dispatch: fn(char) -> Value) -> Cow<str> {
- let mut p = Process::Borrowed(s);
- p.extend(s.char_indices().map(|(ind, c)| (ind, dispatch(c))));
- p.into_result()
+macro_rules! escapes {
+ {
+ $name: ident,
+ $($k: expr => $v: expr),* $(,)?
+ } => {
+ pub(crate) struct $name;
+
+ impl Escapes for $name {
+ fn escape(c: u8) -> Option<&'static str> {
+ match c {
+ $( $k => Some($v),)*
+ _ => None
+ }
+ }
+ }
+ };
}
+escapes!(
+ AttributeEscapes,
+ b'<' => "&lt;",
+ b'>' => "&gt;",
+ b'"' => "&quot;",
+ b'\'' => "&apos;",
+ b'&' => "&amp;",
+ b'\n' => "&#xA;",
+ b'\r' => "&#xD;",
+);
+
+escapes!(
+ PcDataEscapes,
+ b'<' => "&lt;",
+ b'&' => "&amp;",
+);
+
/// Performs escaping of common XML characters inside an attribute value.
///
/// This function replaces several important markup characters with their
@@ -86,13 +107,18 @@ fn escape_str(s: &str, dispatch: fn(char) -> Value) -> Cow<str> {
/// * `"` → `&quot;`
/// * `'` → `&apos;`
/// * `&` → `&amp;`
+///
+/// The following characters are escaped so that attributes are printed on
+/// a single line:
+/// * `\n` → `&#xA;`
+/// * `\r` → `&#xD;`
///
/// The resulting string is safe to use inside XML attribute values or in PCDATA sections.
///
/// Does not perform allocations if the given string does not contain escapable characters.
#[inline]
-pub fn escape_str_attribute(s: &str) -> Cow<str> {
- escape_str(s, Value::dispatch_for_attribute)
+#[must_use] pub fn escape_str_attribute(s: &str) -> Cow<'_, str> {
+ escape_str::<AttributeEscapes>(s)
}
/// Performs escaping of common XML characters inside PCDATA.
@@ -107,15 +133,25 @@ pub fn escape_str_attribute(s: &str) -> Cow<str> {
///
/// Does not perform allocations if the given string does not contain escapable characters.
#[inline]
-pub fn escape_str_pcdata(s: &str) -> Cow<str> {
- escape_str(s, Value::dispatch_for_pcdata)
+#[must_use] pub fn escape_str_pcdata(s: &str) -> Cow<'_, str> {
+ escape_str::<PcDataEscapes>(s)
}
#[cfg(test)]
mod tests {
- use super::{escape_str_pcdata, escape_str_attribute};
+ use super::{escape_str_attribute, escape_str_pcdata};
- // TODO: add more tests
+ #[test]
+ fn test_escape_str_attribute() {
+ assert_eq!(escape_str_attribute("<>'\"&\n\r"), "&lt;&gt;&apos;&quot;&amp;&#xA;&#xD;");
+ assert_eq!(escape_str_attribute("no_escapes"), "no_escapes");
+ }
+
+ #[test]
+ fn test_escape_str_pcdata() {
+ assert_eq!(escape_str_pcdata("<&"), "&lt;&amp;");
+ assert_eq!(escape_str_pcdata("no_escapes"), "no_escapes");
+ }
#[test]
fn test_escape_multibyte_code_points() {
@@ -123,4 +159,3 @@ mod tests {
assert_eq!(escape_str_pcdata("☃<"), "☃&lt;");
}
}
-
diff --git a/src/lib.rs b/src/lib.rs
index fb672ef..b1486d8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,29 +1,30 @@
//#![warn(missing_doc)]
-#![allow(dead_code)]
-#![allow(unused_variables)]
#![forbid(non_camel_case_types)]
#![forbid(unsafe_code)]
+#![allow(clippy::redundant_closure_for_method_calls)]
+#![allow(clippy::module_name_repetitions)]
//! This crate currently provides an almost XML 1.0/1.1-compliant pull parser.
+//!
+//! Please note that functions of this parser may panic.
+//! If a panic could cause a Denial Of Service in your codebase, *you're* responsible for wrapping access to this library in `catch_unwind`.
#[cfg(doctest)]
-#[macro_use]
-extern crate doc_comment;
+doc_comment::doctest!("../README.md");
-#[cfg(doctest)]
-doctest!("../Readme.md");
-
-pub use reader::EventReader;
-pub use reader::ParserConfig;
-pub use writer::EventWriter;
-pub use writer::EmitterConfig;
+pub use crate::reader::EventReader;
+pub use crate::reader::ParserConfig;
+pub use crate::util::Encoding;
+pub use crate::writer::EmitterConfig;
+pub use crate::writer::EventWriter;
-pub mod macros;
-pub mod name;
pub mod attribute;
pub mod common;
pub mod escape;
+#[doc(hidden)] // FIXME: not supposed to be public
+pub mod macros;
+pub mod name;
pub mod namespace;
pub mod reader;
-pub mod writer;
mod util;
+pub mod writer;
diff --git a/src/macros.rs b/src/macros.rs
index 1cce3d6..25916d3 100644
--- a/src/macros.rs
+++ b/src/macros.rs
@@ -5,7 +5,8 @@
macro_rules! gen_setter {
($target:ty, $field:ident : into $t:ty) => {
impl $target {
- /// Sets the field to the provided value and returns updated config object.
+ /// See [`ParserConfig`][crate::ParserConfig] fields docs for details
+ #[inline]
pub fn $field<T: Into<$t>>(mut self, value: T) -> $target {
self.$field = value.into();
self
@@ -14,13 +15,38 @@ macro_rules! gen_setter {
};
($target:ty, $field:ident : val $t:ty) => {
impl $target {
- /// Sets the field to the provided value and returns updated config object.
+ /// See [`ParserConfig`][crate::ParserConfig] fields docs for details
+ #[inline]
pub fn $field(mut self, value: $t) -> $target {
self.$field = value;
self
}
}
- }
+ };
+ ($target:ty, $field:ident : delegate $t:ty) => {
+ impl $target {
+ /// See [`ParserConfig`][crate::ParserConfig] fields docs for details
+ #[inline]
+ pub fn $field(mut self, value: $t) -> $target {
+ self.c.$field = value;
+ self
+ }
+ }
+ };
+ ($target:ty, $field:ident : c2 $t:ty) => {
+ impl $target {
+ /// See [`ParserConfig2`][crate::reader::ParserConfig] fields docs for details
+ #[inline]
+ #[must_use]
+ pub fn $field(self, value: $t) -> ParserConfig2 {
+ ParserConfig2 {
+ c: self,
+ ..Default::default()
+ }
+ .$field(value)
+ }
+ }
+ };
}
macro_rules! gen_setters {
diff --git a/src/name.rs b/src/name.rs
index a20eae2..fc11981 100644
--- a/src/name.rs
+++ b/src/name.rs
@@ -4,7 +4,7 @@
use std::fmt;
use std::str::FromStr;
-use namespace::NS_NO_PREFIX;
+use crate::namespace::NS_NO_PREFIX;
/// Represents a qualified XML name.
///
@@ -53,16 +53,16 @@ pub struct Name<'a> {
pub namespace: Option<&'a str>,
/// A name prefix, e.g. `xsi` in `xsi:string`.
- pub prefix: Option<&'a str>
+ pub prefix: Option<&'a str>,
}
impl<'a> From<&'a str> for Name<'a> {
fn from(s: &'a str) -> Name<'a> {
- let mut parts = s.splitn(2, ":").fuse();
+ let mut parts = s.splitn(2, ':').fuse();
match (parts.next(), parts.next()) {
(Some(name), None) => Name::local(name),
(Some(prefix), Some(name)) => Name::prefixed(name, prefix),
- _ => unreachable!()
+ _ => unreachable!(),
}
}
}
@@ -74,52 +74,56 @@ impl<'a> From<(&'a str, &'a str)> for Name<'a> {
}
impl<'a> fmt::Display for Name<'a> {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if let Some(namespace) = self.namespace {
- write!(f, "{{{}}}", namespace)?;
+ write!(f, "{{{namespace}}}")?;
}
if let Some(prefix) = self.prefix {
- write!(f, "{}:", prefix)?;
+ write!(f, "{prefix}:")?;
}
- write!(f, "{}", self.local_name)
+ f.write_str(self.local_name)
}
}
impl<'a> Name<'a> {
/// Returns an owned variant of the qualified name.
+ #[must_use]
pub fn to_owned(&self) -> OwnedName {
OwnedName {
local_name: self.local_name.into(),
- namespace: self.namespace.map(|s| s.into()),
- prefix: self.prefix.map(|s| s.into())
+ namespace: self.namespace.map(std::convert::Into::into),
+ prefix: self.prefix.map(std::convert::Into::into),
}
}
/// Returns a new `Name` instance representing plain local name.
#[inline]
- pub fn local(local_name: &str) -> Name {
+ #[must_use]
+ pub fn local(local_name: &str) -> Name<'_> {
Name {
local_name,
prefix: None,
- namespace: None
+ namespace: None,
}
}
/// Returns a new `Name` instance with the given local name and prefix.
#[inline]
+ #[must_use]
pub fn prefixed(local_name: &'a str, prefix: &'a str) -> Name<'a> {
Name {
local_name,
namespace: None,
- prefix: Some(prefix)
+ prefix: Some(prefix),
}
}
/// Returns a new `Name` instance representing a qualified name with or without a prefix and
/// with a namespace URI.
#[inline]
+ #[must_use]
pub fn qualified(local_name: &'a str, namespace: &'a str, prefix: Option<&'a str>) -> Name<'a> {
Name {
local_name,
@@ -132,6 +136,7 @@ impl<'a> Name<'a> {
///
/// This method is different from the autoimplemented `to_string()` because it does not
/// include namespace URI in the result.
+ #[must_use]
pub fn to_repr(&self) -> String {
self.repr_display().to_string()
}
@@ -142,12 +147,14 @@ impl<'a> Name<'a> {
/// This method is needed for efficiency purposes in order not to create unnecessary
/// allocations.
#[inline]
- pub fn repr_display(&self) -> ReprDisplay {
+ #[must_use]
+ pub fn repr_display(&self) -> ReprDisplay<'_, '_> {
ReprDisplay(self)
}
/// Returns either a prefix of this name or `namespace::NS_NO_PREFIX` constant.
#[inline]
+ #[must_use]
pub fn prefix_repr(&self) -> &str {
self.prefix.unwrap_or(NS_NO_PREFIX)
}
@@ -155,13 +162,13 @@ impl<'a> Name<'a> {
/// A wrapper around `Name` whose `Display` implementation prints the wrapped name as it is
/// displayed in an XML document.
-pub struct ReprDisplay<'a, 'b:'a>(&'a Name<'b>);
+pub struct ReprDisplay<'a, 'b>(&'a Name<'b>);
-impl<'a, 'b:'a> fmt::Display for ReprDisplay<'a, 'b> {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+impl<'a, 'b: 'a> fmt::Display for ReprDisplay<'a, 'b> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self.0.prefix {
Some(prefix) => write!(f, "{}:{}", prefix, self.0.local_name),
- None => write!(f, "{}", self.0.local_name)
+ None => self.0.local_name.fmt(f),
}
}
}
@@ -183,18 +190,20 @@ pub struct OwnedName {
impl fmt::Display for OwnedName {
#[inline]
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Display::fmt(&self.borrow(), f)
}
}
impl OwnedName {
/// Constructs a borrowed `Name` based on this owned name.
- pub fn borrow(&self) -> Name {
+ #[must_use]
+ #[inline]
+ pub fn borrow(&self) -> Name<'_> {
Name {
- local_name: &*self.local_name,
- namespace: self.namespace.as_ref().map(|s| &**s),
- prefix: self.prefix.as_ref().map(|s| &**s),
+ local_name: &self.local_name,
+ namespace: self.namespace.as_deref(),
+ prefix: self.prefix.as_deref(),
}
}
@@ -217,22 +226,24 @@ impl OwnedName {
OwnedName {
local_name: local_name.into(),
namespace: Some(namespace.into()),
- prefix: prefix.map(|v| v.into())
+ prefix: prefix.map(std::convert::Into::into),
}
}
/// Returns an optional prefix by reference, equivalent to `self.borrow().prefix`
/// but avoids extra work.
#[inline]
+ #[must_use]
pub fn prefix_ref(&self) -> Option<&str> {
- self.prefix.as_ref().map(|s| &**s)
+ self.prefix.as_deref()
}
/// Returns an optional namespace by reference, equivalen to `self.borrow().namespace`
/// but avoids extra work.
#[inline]
+ #[must_use]
pub fn namespace_ref(&self) -> Option<&str> {
- self.namespace.as_ref().map(|s| &**s)
+ self.namespace.as_deref()
}
}
diff --git a/src/namespace.rs b/src/namespace.rs
index 1ab4a5c..216a982 100644
--- a/src/namespace.rs
+++ b/src/namespace.rs
@@ -1,9 +1,9 @@
//! Contains namespace manipulation types and functions.
-use std::iter::{Map, Rev};
-use std::collections::btree_map::{BTreeMap, Entry};
use std::collections::btree_map::Iter as Entries;
+use std::collections::btree_map::{BTreeMap, Entry};
use std::collections::HashSet;
+use std::iter::{Map, Rev};
use std::slice::Iter;
/// Designates prefix for namespace definitions.
@@ -11,14 +11,14 @@ use std::slice::Iter;
/// See [Namespaces in XML][namespace] spec for more information.
///
/// [namespace]: http://www.w3.org/TR/xml-names/#ns-decl
-pub const NS_XMLNS_PREFIX: &'static str = "xmlns";
+pub const NS_XMLNS_PREFIX: &str = "xmlns";
/// Designates the standard URI for `xmlns` prefix.
///
-/// See [A Namespace Name for xmlns Attributes][1] for more information.
+/// See [A Namespace Name for xmlns Attributes][namespace] for more information.
///
/// [namespace]: http://www.w3.org/2000/xmlns/
-pub const NS_XMLNS_URI: &'static str = "http://www.w3.org/2000/xmlns/";
+pub const NS_XMLNS_URI: &str = "http://www.w3.org/2000/xmlns/";
/// Designates prefix for a namespace containing several special predefined attributes.
///
@@ -29,12 +29,12 @@ pub const NS_XMLNS_URI: &'static str = "http://www.w3.org/2000/xmlns/";
/// [2]: http://www.w3.org/TR/REC-xml/#sec-lang-tag
/// [3]: http://www.w3.org/TR/xmlbase/
/// [4]: http://www.w3.org/TR/xml-id/
-pub const NS_XML_PREFIX: &'static str = "xml";
+pub const NS_XML_PREFIX: &str = "xml";
/// Designates the standard URI for `xml` prefix.
///
/// See `NS_XML_PREFIX` documentation for more information.
-pub const NS_XML_URI: &'static str = "http://www.w3.org/XML/1998/namespace";
+pub const NS_XML_URI: &str = "http://www.w3.org/XML/1998/namespace";
/// Designates the absence of prefix in a qualified name.
///
@@ -52,7 +52,7 @@ pub const NS_XML_URI: &'static str = "http://www.w3.org/XML/1998/namespace"
/// By default empty prefix corresponds to absence of namespace, but this can change either
/// when writing an XML document (manually) or when reading an XML document (based on namespace
/// declarations).
-pub const NS_NO_PREFIX: &'static str = "";
+pub const NS_NO_PREFIX: &str = "";
/// Designates an empty namespace URI, which is equivalent to absence of namespace.
///
@@ -60,7 +60,7 @@ pub const NS_NO_PREFIX: &'static str = "";
/// empty prefix corresponds to absent namespace in `NamespaceStack` instances created with
/// `NamespaceStack::default()`. Therefore, it can be used to restore `NS_NO_PREFIX` mapping
/// in a namespace back to its default value.
-pub const NS_EMPTY_URI: &'static str = "";
+pub const NS_EMPTY_URI: &str = "";
/// Namespace is a map from prefixes to namespace URIs.
///
@@ -71,16 +71,21 @@ pub struct Namespace(pub BTreeMap<String, String>);
impl Namespace {
/// Returns an empty namespace.
#[inline]
- pub fn empty() -> Namespace { Namespace(BTreeMap::new()) }
+ #[must_use]
+ pub fn empty() -> Namespace {
+ Namespace(BTreeMap::new())
+ }
/// Checks whether this namespace is empty.
#[inline]
+ #[must_use]
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
/// Checks whether this namespace is essentially empty, that is, it does not contain
/// anything but default mappings.
+ #[must_use]
pub fn is_essentially_empty(&self) -> bool {
// a shortcut for a namespace which is definitely not empty
if self.0.len() > 3 { return false; }
@@ -101,7 +106,7 @@ impl Namespace {
/// # Return value
/// `true` if this namespace contains the given prefix, `false` otherwise.
#[inline]
- pub fn contains<P: ?Sized+AsRef<str>>(&self, prefix: &P) -> bool {
+ pub fn contains<P: ?Sized + AsRef<str>>(&self, prefix: &P) -> bool {
self.0.contains_key(prefix.as_ref())
}
@@ -157,7 +162,7 @@ impl Namespace {
///
/// # Return value
/// Namespace URI corresponding to the given prefix, if it is present.
- pub fn get<'a, P: ?Sized+AsRef<str>>(&'a self, prefix: &P) -> Option<&'a str> {
+ pub fn get<'a, P: ?Sized + AsRef<str>>(&'a self, prefix: &P) -> Option<&'a str> {
self.0.get(prefix.as_ref()).map(|s| &**s)
}
}
@@ -174,7 +179,7 @@ impl<'a> IntoIterator for &'a Namespace {
fn into_iter(self) -> Self::IntoIter {
fn mapper<'a>((prefix, uri): (&'a String, &'a String)) -> UriMapping<'a> {
- (&*prefix, &*uri)
+ (prefix, uri)
}
self.0.iter().map(mapper)
}
@@ -190,7 +195,10 @@ pub struct NamespaceStack(pub Vec<Namespace>);
impl NamespaceStack {
/// Returns an empty namespace stack.
#[inline]
- pub fn empty() -> NamespaceStack { NamespaceStack(Vec::with_capacity(2)) }
+ #[must_use]
+ pub fn empty() -> NamespaceStack {
+ NamespaceStack(Vec::with_capacity(2))
+ }
/// Returns a namespace stack with default items in it.
///
@@ -199,6 +207,7 @@ impl NamespaceStack {
/// * `xml` → `http://www.w3.org/XML/1998/namespace`;
/// * `xmlns` → `http://www.w3.org/2000/xmlns/`.
#[inline]
+ #[must_use]
pub fn default() -> NamespaceStack {
let mut nst = NamespaceStack::empty();
nst.push_empty();
@@ -246,6 +255,7 @@ impl NamespaceStack {
///
/// Panics if the stack is empty.
#[inline]
+ #[must_use]
pub fn peek(&self) -> &Namespace {
self.0.last().unwrap()
}
@@ -294,7 +304,11 @@ impl NamespaceStack {
pub fn put<P, U>(&mut self, prefix: P, uri: U) -> bool
where P: Into<String>, U: Into<String>
{
- self.0.last_mut().unwrap().put(prefix, uri)
+ if let Some(ns) = self.0.last_mut() {
+ ns.put(prefix, uri)
+ } else {
+ false
+ }
}
/// Performs a search for the given prefix in the whole stack.
@@ -306,7 +320,7 @@ impl NamespaceStack {
/// # Parameters
/// * `prefix` --- namespace prefix.
#[inline]
- pub fn get<'a, P: ?Sized+AsRef<str>>(&'a self, prefix: &P) -> Option<&'a str> {
+ pub fn get<'a, P: ?Sized + AsRef<str>>(&'a self, prefix: &P) -> Option<&'a str> {
let prefix = prefix.as_ref();
for ns in self.0.iter().rev() {
match ns.get(prefix) {
@@ -321,9 +335,10 @@ impl NamespaceStack {
///
/// Namespaces are combined in left-to-right order, that is, rightmost namespace
/// elements take priority over leftmost ones.
+ #[must_use]
pub fn squash(&self) -> Namespace {
let mut result = BTreeMap::new();
- for ns in self.0.iter() {
+ for ns in &self.0 {
result.extend(ns.0.iter().map(|(k, v)| (k.clone(), v.clone())));
}
Namespace(result)
@@ -333,13 +348,14 @@ impl NamespaceStack {
///
/// See `CheckedTarget` for more information.
#[inline]
- pub fn checked_target(&mut self) -> CheckedTarget {
+ pub fn checked_target(&mut self) -> CheckedTarget<'_> {
CheckedTarget(self)
}
/// Returns an iterator over all mappings in this namespace stack.
#[inline]
- pub fn iter(&self) -> NamespaceStackMappings {
+ #[must_use]
+ pub fn iter(&self) -> NamespaceStackMappings<'_> {
self.into_iter()
}
}
@@ -361,7 +377,7 @@ impl NamespaceStack {
pub struct NamespaceStackMappings<'a> {
namespaces: Rev<Iter<'a, Namespace>>,
current_namespace: Option<NamespaceMappings<'a>>,
- used_keys: HashSet<&'a str>
+ used_keys: HashSet<&'a str>,
}
impl<'a> NamespaceStackMappings<'a> {
@@ -379,7 +395,7 @@ impl<'a> Iterator for NamespaceStackMappings<'a> {
if self.current_namespace.is_none() && !self.go_to_next_namespace() {
return None;
}
- let next_item = self.current_namespace.as_mut().unwrap().next();
+ let next_item = self.current_namespace.as_mut()?.next();
match next_item {
// There is an element in the current namespace
@@ -412,7 +428,7 @@ impl<'a> IntoIterator for &'a NamespaceStack {
NamespaceStackMappings {
namespaces: self.0.iter().rev(),
current_namespace: None,
- used_keys: HashSet::new()
+ used_keys: HashSet::new(),
}
}
}
diff --git a/src/reader/mod.rs b/src/reader.rs
index 90f5b52..71ea79b 100644
--- a/src/reader/mod.rs
+++ b/src/reader.rs
@@ -3,44 +3,46 @@
//! The most important type in this module is `EventReader`, which provides an iterator
//! view for events in XML document.
-use std::io::{Read};
+use std::io::Read;
+use std::iter::FusedIterator;
use std::result;
-use common::{Position, TextPosition};
+use crate::common::{Position, TextPosition};
pub use self::config::ParserConfig;
-pub use self::events::XmlEvent;
+pub use self::config::ParserConfig2;
+pub use self::events::XmlEvent;
use self::parser::PullParser;
-mod lexer;
-mod parser;
mod config;
mod events;
+mod lexer;
+mod parser;
mod error;
pub use self::error::{Error, ErrorKind};
/// A result type yielded by `EventReader`.
-pub type Result<T> = result::Result<T, Error>;
+pub type Result<T, E = Error> = result::Result<T, E>;
/// A wrapper around an `std::io::Read` instance which provides pull-based XML parsing.
pub struct EventReader<R: Read> {
source: R,
- parser: PullParser
+ parser: PullParser,
}
impl<R: Read> EventReader<R> {
/// Creates a new reader, consuming the given stream.
#[inline]
pub fn new(source: R) -> EventReader<R> {
- EventReader::new_with_config(source, ParserConfig::new())
+ EventReader::new_with_config(source, ParserConfig2::new())
}
/// Creates a new reader with the provided configuration, consuming the given stream.
#[inline]
- pub fn new_with_config(source: R, config: ParserConfig) -> EventReader<R> {
- EventReader { source: source, parser: PullParser::new(config) }
+ pub fn new_with_config(source: R, config: impl Into<ParserConfig2>) -> EventReader<R> {
+ EventReader { source, parser: PullParser::new(config) }
}
/// Pulls and returns next XML event from the stream.
@@ -52,6 +54,27 @@ impl<R: Read> EventReader<R> {
self.parser.next(&mut self.source)
}
+ /// Skips all XML events until the next end tag at the current level.
+ ///
+ /// Convenience function that is useful for the case where you have
+ /// encountered a start tag that is of no interest and want to
+ /// skip the entire XML subtree until the corresponding end tag.
+ #[inline]
+ pub fn skip(&mut self) -> Result<()> {
+ let mut depth = 1;
+
+ while depth > 0 {
+ match self.next()? {
+ XmlEvent::StartElement { .. } => depth += 1,
+ XmlEvent::EndElement { .. } => depth -= 1,
+ XmlEvent::EndDocument => unreachable!(),
+ _ => {}
+ }
+ }
+
+ Ok(())
+ }
+
pub fn source(&self) -> &R { &self.source }
pub fn source_mut(&mut self) -> &mut R { &mut self.source }
@@ -88,7 +111,7 @@ impl<R: Read> IntoIterator for EventReader<R> {
/// it will be returned by the iterator once, and then it will stop producing events.
pub struct Events<R: Read> {
reader: EventReader<R>,
- finished: bool
+ finished: bool,
}
impl<R: Read> Events<R> {
@@ -103,17 +126,20 @@ impl<R: Read> Events<R> {
}
+impl<R: Read> FusedIterator for Events<R> {
+}
+
impl<R: Read> Iterator for Events<R> {
type Item = Result<XmlEvent>;
#[inline]
fn next(&mut self) -> Option<Result<XmlEvent>> {
- if self.finished && !self.reader.parser.is_ignoring_end_of_stream() { None }
- else {
+ if self.finished && !self.reader.parser.is_ignoring_end_of_stream() {
+ None
+ } else {
let ev = self.reader.next();
- match ev {
- Ok(XmlEvent::EndDocument) | Err(_) => self.finished = true,
- _ => {}
+ if let Ok(XmlEvent::EndDocument) | Err(_) = ev {
+ self.finished = true;
}
Some(ev)
}
@@ -123,6 +149,7 @@ impl<R: Read> Iterator for Events<R> {
impl<'r> EventReader<&'r [u8]> {
/// A convenience method to create an `EventReader` from a string slice.
#[inline]
+ #[must_use]
pub fn from_str(source: &'r str) -> EventReader<&'r [u8]> {
EventReader::new(source.as_bytes())
}
diff --git a/src/reader/config.rs b/src/reader/config.rs
index 0abb165..3351997 100644
--- a/src/reader/config.rs
+++ b/src/reader/config.rs
@@ -1,8 +1,9 @@
//! Contains parser configuration structure.
-use std::io::Read;
use std::collections::HashMap;
+use std::io::Read;
-use reader::EventReader;
+use crate::reader::EventReader;
+use crate::util::Encoding;
/// Parser configuration structure.
///
@@ -103,6 +104,8 @@ impl ParserConfig {
/// .ignore_comments(true)
/// .coalesce_characters(false);
/// ```
+ #[must_use]
+ #[inline]
pub fn new() -> ParserConfig {
ParserConfig {
trim_whitespace: false,
@@ -179,3 +182,125 @@ gen_setters! { ParserConfig,
replace_unknown_entity_references: val bool,
ignore_root_level_whitespace: val bool
}
+
+/// Backwards-compatible extension of `ParserConfig`, which will eventually be merged into the original `ParserConfig` struct
+#[derive(Clone, PartialEq, Eq, Debug)]
+#[non_exhaustive]
+pub struct ParserConfig2 {
+ pub(crate) c: ParserConfig,
+
+ /// Use this encoding as the default. Necessary for UTF-16 files without BOM.
+ pub override_encoding: Option<Encoding>,
+
+ /// Allow `<?xml encoding="…">` to contain unsupported encoding names,
+ /// and interpret them as Latin1 instead. This will mangle non-ASCII characters, but usually it won't fail parsing.
+ pub ignore_invalid_encoding_declarations: bool,
+
+ /// Whether to allow documents with multiple root elements (such documents are ill-formed XML). Defaults to `true`.
+ pub allow_multiple_root_elements: bool,
+}
+
+impl Default for ParserConfig2 {
+ fn default() -> Self {
+ ParserConfig2 {
+ c: Default::default(),
+ override_encoding: None,
+ ignore_invalid_encoding_declarations: false,
+ allow_multiple_root_elements: true,
+ }
+ }
+}
+
+impl ParserConfig2 {
+ #[inline]
+ #[must_use]
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Read character encoding from `Content-Type` header.
+ /// Set this when parsing XML documents fetched over HTTP.
+ ///
+ /// `text/*` MIME types do *not* imply latin1. UTF-8 is always the default fallback.
+ #[must_use] pub fn content_type(mut self, mime_type: &str) -> Self {
+ let charset = mime_type.split_once(';')
+ .and_then(|(_, args)| args.split_once("charset"))
+ .and_then(|(_, args)| args.split_once('='));
+ if let Some((_, charset)) = charset {
+ let name = charset.trim().trim_matches('"');
+ match name.parse() {
+ Ok(enc) => {
+ self.override_encoding = Some(enc);
+ },
+ Err(_) => {},
+ }
+ }
+ self
+ }
+
+ /// Creates an XML reader with this configuration.
+ ///
+ /// This is a convenience method for configuring and creating a reader at the same time:
+ ///
+ /// ```rust
+ /// use xml::reader::ParserConfig;
+ ///
+ /// let mut source: &[u8] = b"...";
+ ///
+ /// let reader = ParserConfig::new()
+ /// .trim_whitespace(true)
+ /// .ignore_comments(true)
+ /// .coalesce_characters(false)
+ /// .create_reader(&mut source);
+ /// ```
+ ///
+ /// This method is exactly equivalent to calling `EventReader::new_with_config()` with
+ /// this configuration object.
+ #[inline]
+ pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> {
+ EventReader::new_with_config(source, self)
+ }
+}
+
+impl From<ParserConfig> for ParserConfig2 {
+ #[inline]
+ fn from(c: ParserConfig) -> Self {
+ Self {
+ c,
+ ..Default::default()
+ }
+ }
+}
+
+gen_setters! { ParserConfig2,
+ override_encoding: val Option<Encoding>,
+ allow_multiple_root_elements: val bool,
+ ignore_invalid_encoding_declarations: val bool
+}
+
+gen_setters! { ParserConfig,
+ override_encoding: c2 Option<Encoding>,
+ ignore_invalid_encoding_declarations: c2 bool,
+ allow_multiple_root_elements: c2 bool,
+ content_type: c2 &str
+}
+
+gen_setters! { ParserConfig2,
+ trim_whitespace: delegate bool,
+ whitespace_to_characters: delegate bool,
+ cdata_to_characters: delegate bool,
+ ignore_comments: delegate bool,
+ coalesce_characters: delegate bool,
+ ignore_end_of_stream: delegate bool,
+ replace_unknown_entity_references: delegate bool,
+ ignore_root_level_whitespace: delegate bool
+}
+
+#[test]
+fn mime_parse() {
+ let c = ParserConfig2::new().content_type("text/xml;charset=Us-AScii");
+ assert_eq!(c.override_encoding, Some(Encoding::Ascii));
+
+ let c = ParserConfig2::new().content_type("text/xml;charset = \"UTF-16\"");
+ assert_eq!(c.override_encoding, Some(Encoding::Utf16));
+}
diff --git a/src/reader/error.rs b/src/reader/error.rs
index 92378e6..8af35ae 100644
--- a/src/reader/error.rs
+++ b/src/reader/error.rs
@@ -1,12 +1,15 @@
+use crate::Encoding;
+use crate::reader::lexer::Token;
-use std::io;
use std::borrow::Cow;
-use std::fmt;
use std::error;
+use std::error::Error as _;
+use std::fmt;
+use std::io;
use std::str;
-use util;
-use common::{Position, TextPosition};
+use crate::common::{Position, TextPosition};
+use crate::util;
#[derive(Debug)]
pub enum ErrorKind {
@@ -16,18 +19,127 @@ pub enum ErrorKind {
UnexpectedEof,
}
+#[derive(Debug, Clone, PartialEq)]
+#[non_exhaustive]
+pub(crate) enum SyntaxError {
+ CannotRedefineXmlnsPrefix,
+ CannotRedefineXmlPrefix,
+ /// Recursive custom entity expanded to too many chars; it could be a DoS attempt
+ EntityTooBig,
+ EmptyEntity,
+ NoRootElement,
+ ProcessingInstructionWithoutName,
+ UnbalancedRootElement,
+ UnexpectedEof,
+ UnexpectedOpeningTag,
+ /// Missing `]]>`
+ UnclosedCdata,
+ UnexpectedQualifiedName(Token),
+ UnexpectedTokenOutsideRoot(Token),
+ UnexpectedToken(Token),
+ UnexpectedTokenInEntity(Token),
+ UnexpectedTokenInClosingTag(Token),
+ UnexpectedTokenInOpeningTag(Token),
+ InvalidQualifiedName(Box<str>),
+ UnboundAttribute(Box<str>),
+ UnboundElementPrefix(Box<str>),
+ UnexpectedClosingTag(Box<str>),
+ UnexpectedName(Box<str>),
+ /// Found <?xml-like PI not at the beginning of a document,
+ /// which is an error, see section 2.6 of XML 1.1 spec
+ UnexpectedProcessingInstruction(Box<str>, Token),
+ CannotUndefinePrefix(Box<str>),
+ InvalidCharacterEntity(u32),
+ InvalidDefaultNamespace(Box<str>),
+ InvalidNamePrefix(Box<str>),
+ InvalidNumericEntity(Box<str>),
+ InvalidStandaloneDeclaration(Box<str>),
+ InvalidXmlProcessingInstruction(Box<str>),
+ RedefinedAttribute(Box<str>),
+ UndefinedEntity(Box<str>),
+ UnexpectedEntity(Box<str>),
+ UnexpectedNameInsideXml(Box<str>),
+ UnsupportedEncoding(Box<str>),
+ /// In DTD
+ UnknownMarkupDeclaration(Box<str>),
+ UnexpectedXmlVersion(Box<str>),
+ ConflictingEncoding(Encoding, Encoding),
+ UnexpectedTokenBefore(&'static str, char),
+}
+
+impl fmt::Display for SyntaxError {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ self.to_cow().fmt(f)
+ }
+}
+
+impl SyntaxError {
+ #[inline(never)]
+ #[cold]
+ pub(crate) fn to_cow(&self) -> Cow<'static, str> {
+ match *self {
+ Self::CannotRedefineXmlnsPrefix => "Cannot redefine XMLNS prefix".into(),
+ Self::CannotRedefineXmlPrefix => "Default XMLNS prefix cannot be rebound to another value".into(),
+ Self::EmptyEntity => "Encountered empty entity".into(),
+ Self::EntityTooBig => "Entity too big".into(),
+ Self::NoRootElement => "Unexpected end of stream: no root element found".into(),
+ Self::ProcessingInstructionWithoutName => "Encountered processing instruction without a name".into(),
+ Self::UnbalancedRootElement => "Unexpected end of stream: still inside the root element".into(),
+ Self::UnclosedCdata => "Unclosed <![CDATA[".into(),
+ Self::UnexpectedEof => "Unexpected end of stream".into(),
+ Self::UnexpectedOpeningTag => "'<' is not allowed in attributes".into(),
+ Self::CannotUndefinePrefix(ref ln) => format!("Cannot undefine prefix '{ln}'").into(),
+ Self::ConflictingEncoding(a, b) => format!("Declared encoding {a}, but uses {b}").into(),
+ Self::InvalidCharacterEntity(num) => format!("Invalid character U+{num:04X}").into(),
+ Self::InvalidDefaultNamespace(ref name) => format!( "Namespace '{name}' cannot be default").into(),
+ Self::InvalidNamePrefix(ref prefix) => format!("'{prefix}' cannot be an element name prefix").into(),
+ Self::InvalidNumericEntity(ref v) => format!("Invalid numeric entity: {v}").into(),
+ Self::InvalidQualifiedName(ref e) => format!("Qualified name is invalid: {e}").into(),
+ Self::InvalidStandaloneDeclaration(ref value) => format!("Invalid standalone declaration value: {value}").into(),
+ Self::InvalidXmlProcessingInstruction(ref name) => format!("Invalid processing instruction: <?{name} - \"<?xml\"-like PI is only valid at the beginning of the document").into(),
+ Self::RedefinedAttribute(ref name) => format!("Attribute '{name}' is redefined").into(),
+ Self::UnboundAttribute(ref name) => format!("Attribute {name} prefix is unbound").into(),
+ Self::UnboundElementPrefix(ref name) => format!("Element {name} prefix is unbound").into(),
+ Self::UndefinedEntity(ref v) => format!("Undefined entity: {v}").into(),
+ Self::UnexpectedClosingTag(ref expected_got) => format!("Unexpected closing tag: {expected_got}").into(),
+ Self::UnexpectedEntity(ref name) => format!("Unexpected entity: {name}").into(),
+ Self::UnexpectedName(ref name) => format!("Unexpected name: {name}").into(),
+ Self::UnexpectedNameInsideXml(ref name) => format!("Unexpected name inside XML declaration: {name}").into(),
+ Self::UnexpectedProcessingInstruction(ref buf, token) => format!("Unexpected token inside processing instruction: <?{buf}{token}").into(),
+ Self::UnexpectedQualifiedName(e) => format!("Unexpected token inside qualified name: {e}").into(),
+ Self::UnexpectedToken(token) => format!("Unexpected token: {token}").into(),
+ Self::UnexpectedTokenBefore(before, c) => format!("Unexpected token '{before}' before '{c}'").into(),
+ Self::UnexpectedTokenInClosingTag(token) => format!("Unexpected token inside closing tag: {token}").into(),
+ Self::UnexpectedTokenInEntity(token) => format!("Unexpected token inside entity: {token}").into(),
+ Self::UnexpectedTokenInOpeningTag(token) => format!("Unexpected token inside opening tag: {token}").into(),
+ Self::UnexpectedTokenOutsideRoot(token) => format!("Unexpected characters outside the root element: {token}").into(),
+ Self::UnexpectedXmlVersion(ref version) => format!("Invalid XML version: {version}").into(),
+ Self::UnknownMarkupDeclaration(ref v) => format!("Unknown markup declaration: {v}").into(),
+ Self::UnsupportedEncoding(ref v) => format!("Unsupported encoding: {v}").into(),
+ }
+ }
+}
+
/// An XML parsing error.
///
/// Consists of a 2D position in a document and a textual message describing the error.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct Error {
- pos: TextPosition,
- kind: ErrorKind,
+ pub(crate) pos: TextPosition,
+ pub(crate) kind: ErrorKind,
}
impl fmt::Display for Error {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- write!(f, "{} {}", self.pos, self.msg())
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ use self::ErrorKind::{Io, Syntax, UnexpectedEof, Utf8};
+
+ write!(f, "{} ", self.pos)?;
+ match &self.kind {
+ Io(io_error) => io_error.fmt(f),
+ Utf8(reason) => reason.fmt(f),
+ Syntax(msg) => f.write_str(msg),
+ UnexpectedEof => f.write_str("Unexpected EOF"),
+ }
}
}
@@ -38,49 +150,59 @@ impl Position for Error {
impl Error {
/// Returns a reference to a message which is contained inside this error.
- #[inline]
- pub fn msg(&self) -> &str {
- use self::ErrorKind::*;
- match self.kind {
- UnexpectedEof => &"Unexpected EOF",
- Utf8(ref reason) => error_description(reason),
- Io(ref io_error) => error_description(io_error),
- Syntax(ref msg) => msg.as_ref(),
+ #[cold]
+ #[doc(hidden)]
+ #[allow(deprecated)]
+ #[must_use] pub fn msg(&self) -> &str {
+ use self::ErrorKind::{Io, Syntax, UnexpectedEof, Utf8};
+ match &self.kind {
+ Io(io_error) => io_error.description(),
+ Utf8(reason) => reason.description(),
+ Syntax(msg) => msg.as_ref(),
+ UnexpectedEof => "Unexpected EOF",
}
}
- pub fn kind(&self) -> &ErrorKind { &self.kind }
+ #[must_use]
+ #[inline]
+ pub fn kind(&self) -> &ErrorKind {
+ &self.kind
+ }
}
impl error::Error for Error {
- #[inline]
+ #[allow(deprecated)]
+ #[cold]
fn description(&self) -> &str { self.msg() }
}
impl<'a, P, M> From<(&'a P, M)> for Error where P: Position, M: Into<Cow<'static, str>> {
+ #[cold]
fn from(orig: (&'a P, M)) -> Self {
- Error{
+ Error {
pos: orig.0.position(),
- kind: ErrorKind::Syntax(orig.1.into())
+ kind: ErrorKind::Syntax(orig.1.into()),
}
}
}
impl From<util::CharReadError> for Error {
+ #[cold]
fn from(e: util::CharReadError) -> Self {
- use util::CharReadError::*;
- Error{
+ use crate::util::CharReadError::{Io, UnexpectedEof, Utf8};
+ Error {
pos: TextPosition::new(),
kind: match e {
UnexpectedEof => ErrorKind::UnexpectedEof,
Utf8(reason) => ErrorKind::Utf8(reason),
Io(io_error) => ErrorKind::Io(io_error),
- }
+ },
}
}
}
impl From<io::Error> for Error {
+ #[cold]
fn from(e: io::Error) -> Self {
Error {
pos: TextPosition::new(),
@@ -90,26 +212,28 @@ impl From<io::Error> for Error {
}
impl Clone for ErrorKind {
+ #[cold]
fn clone(&self) -> Self {
- use self::ErrorKind::*;
- match *self {
+ use self::ErrorKind::{Io, Syntax, UnexpectedEof, Utf8};
+ match self {
UnexpectedEof => UnexpectedEof,
- Utf8(ref reason) => Utf8(reason.clone()),
- Io(ref io_error) => Io(io::Error::new(io_error.kind(), error_description(io_error))),
- Syntax(ref msg) => Syntax(msg.clone()),
+ Utf8(reason) => Utf8(*reason),
+ Io(io_error) => Io(io::Error::new(io_error.kind(), io_error.to_string())),
+ Syntax(msg) => Syntax(msg.clone()),
}
}
}
impl PartialEq for ErrorKind {
+ #[allow(deprecated)]
fn eq(&self, other: &ErrorKind) -> bool {
- use self::ErrorKind::*;
+ use self::ErrorKind::{Io, Syntax, UnexpectedEof, Utf8};
match (self, other) {
- (&UnexpectedEof, &UnexpectedEof) => true,
- (&Utf8(ref left), &Utf8(ref right)) => left == right,
- (&Io(ref left), &Io(ref right)) =>
+ (UnexpectedEof, UnexpectedEof) => true,
+ (Utf8(left), Utf8(right)) => left == right,
+ (Io(left), Io(right)) =>
left.kind() == right.kind() &&
- error_description(left) == error_description(right),
- (&Syntax(ref left), &Syntax(ref right)) =>
+ left.description() == right.description(),
+ (Syntax(left), Syntax(right)) =>
left == right,
(_, _) => false,
@@ -118,4 +242,7 @@ impl PartialEq for ErrorKind {
}
impl Eq for ErrorKind {}
-fn error_description(e: &error::Error) -> &str { e.description() }
+#[test]
+fn err_size() {
+ assert!(std::mem::size_of::<SyntaxError>() <= 24);
+}
diff --git a/src/reader/events.rs b/src/reader/events.rs
index 46d7621..de2b930 100644
--- a/src/reader/events.rs
+++ b/src/reader/events.rs
@@ -1,12 +1,12 @@
//! Contains `XmlEvent` datatype, instances of which are emitted by the parser.
-use std::fmt;
use std::borrow::Cow;
+use std::fmt;
-use name::OwnedName;
-use attribute::OwnedAttribute;
-use common::XmlVersion;
-use namespace::Namespace;
+use crate::attribute::OwnedAttribute;
+use crate::common::XmlVersion;
+use crate::name::OwnedName;
+use crate::namespace::Namespace;
/// An element of an XML input stream.
///
@@ -36,7 +36,7 @@ pub enum XmlEvent {
/// If XML document is not present or does not contain `standalone` attribute,
/// defaults to `None`. This field is currently used for no other purpose than
/// informational.
- standalone: Option<bool>
+ standalone: Option<bool>,
},
/// Denotes the end of the document stream.
@@ -54,7 +54,7 @@ pub enum XmlEvent {
name: String,
/// Processing instruction content.
- data: Option<String>
+ data: Option<String>,
},
/// Denotes a beginning of an XML element.
@@ -80,7 +80,7 @@ pub enum XmlEvent {
/// latter case it is emitted immediately after corresponding `StartElement` event.
EndElement {
/// Qualified name of the element.
- name: OwnedName
+ name: OwnedName,
},
/// Denotes CDATA content.
@@ -111,19 +111,20 @@ pub enum XmlEvent {
/// It is possible to configure a parser to emit `Characters` event instead of `Whitespace`.
/// See `pull::ParserConfiguration` structure for more information. When combined with whitespace
/// trimming, it will eliminate standalone whitespace from the event stream completely.
- Whitespace(String)
+ Whitespace(String),
}
impl fmt::Debug for XmlEvent {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ #[cold]
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match *self {
- XmlEvent::StartDocument { ref version, ref encoding, ref standalone } =>
- write!(f, "StartDocument({}, {}, {:?})", version, *encoding, *standalone),
+ XmlEvent::StartDocument { ref version, ref encoding, standalone } =>
+ write!(f, "StartDocument({}, {}, {:?})", version, *encoding, standalone),
XmlEvent::EndDocument =>
write!(f, "EndDocument"),
XmlEvent::ProcessingInstruction { ref name, ref data } =>
write!(f, "ProcessingInstruction({}{})", *name, match *data {
- Some(ref data) => format!(", {}", data),
+ Some(ref data) => format!(", {data}"),
None => String::new()
}),
XmlEvent::StartElement { ref name, ref attributes, namespace: Namespace(ref namespace) } =>
@@ -136,15 +137,15 @@ impl fmt::Debug for XmlEvent {
format!(", [{}]", attributes.join(", "))
}),
XmlEvent::EndElement { ref name } =>
- write!(f, "EndElement({})", name),
+ write!(f, "EndElement({name})"),
XmlEvent::Comment(ref data) =>
- write!(f, "Comment({})", data),
+ write!(f, "Comment({data})"),
XmlEvent::CData(ref data) =>
- write!(f, "CData({})", data),
+ write!(f, "CData({data})"),
XmlEvent::Characters(ref data) =>
- write!(f, "Characters({})", data),
+ write!(f, "Characters({data})"),
XmlEvent::Whitespace(ref data) =>
- write!(f, "Whitespace({})", data)
+ write!(f, "Whitespace({data})")
}
}
}
@@ -188,32 +189,33 @@ impl XmlEvent {
/// ```
///
/// Note that this API may change or get additions in future to improve its ergonomics.
- pub fn as_writer_event<'a>(&'a self) -> Option<::writer::events::XmlEvent<'a>> {
+ #[must_use]
+ pub fn as_writer_event(&self) -> Option<crate::writer::events::XmlEvent<'_>> {
match *self {
XmlEvent::StartDocument { version, ref encoding, standalone } =>
- Some(::writer::events::XmlEvent::StartDocument {
- version: version,
+ Some(crate::writer::events::XmlEvent::StartDocument {
+ version,
encoding: Some(encoding),
- standalone: standalone
+ standalone
}),
XmlEvent::ProcessingInstruction { ref name, ref data } =>
- Some(::writer::events::XmlEvent::ProcessingInstruction {
- name: name,
- data: data.as_ref().map(|s| &s[..])
+ Some(crate::writer::events::XmlEvent::ProcessingInstruction {
+ name,
+ data: data.as_ref().map(|s| &**s)
}),
XmlEvent::StartElement { ref name, ref attributes, ref namespace } =>
- Some(::writer::events::XmlEvent::StartElement {
+ Some(crate::writer::events::XmlEvent::StartElement {
name: name.borrow(),
attributes: attributes.iter().map(|a| a.borrow()).collect(),
namespace: Cow::Borrowed(namespace)
}),
XmlEvent::EndElement { ref name } =>
- Some(::writer::events::XmlEvent::EndElement { name: Some(name.borrow()) }),
- XmlEvent::Comment(ref data) => Some(::writer::events::XmlEvent::Comment(data)),
- XmlEvent::CData(ref data) => Some(::writer::events::XmlEvent::CData(data)),
- XmlEvent::Characters(ref data) => Some(::writer::events::XmlEvent::Characters(data)),
- XmlEvent::Whitespace(ref data) => Some(::writer::events::XmlEvent::Characters(data)),
- _ => None
+ Some(crate::writer::events::XmlEvent::EndElement { name: Some(name.borrow()) }),
+ XmlEvent::Comment(ref data) => Some(crate::writer::events::XmlEvent::Comment(data)),
+ XmlEvent::CData(ref data) => Some(crate::writer::events::XmlEvent::CData(data)),
+ XmlEvent::Characters(ref data) |
+ XmlEvent::Whitespace(ref data) => Some(crate::writer::events::XmlEvent::Characters(data)),
+ XmlEvent::EndDocument => None,
}
}
}
diff --git a/src/reader/lexer.rs b/src/reader/lexer.rs
index c466db9..a8345ba 100644
--- a/src/reader/lexer.rs
+++ b/src/reader/lexer.rs
@@ -2,20 +2,25 @@
//!
//! This module is for internal use. Use `xml::pull` module to do parsing.
-use std::fmt;
+
+use crate::reader::ErrorKind;
+use crate::reader::error::SyntaxError;
use std::collections::VecDeque;
+use std::fmt;
use std::io::Read;
use std::result;
-use std::borrow::Cow;
+use crate::common::{is_name_char, is_whitespace_char, Position, TextPosition, is_xml10_char, is_xml11_char};
+use crate::reader::Error;
+use crate::util::{CharReader, Encoding};
-use common::{Position, TextPosition, is_whitespace_char, is_name_char};
-use reader::Error;
-use util;
+/// Limits to defend from billion laughs attack
+const MAX_ENTITY_EXPANSION_LENGTH: usize = 1_000_000;
+const MAX_ENTITY_EXPANSION_DEPTH: u8 = 10;
/// `Token` represents a single lexeme of an XML document. These lexemes
/// are used to perform actual parsing.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
-pub enum Token {
+pub(crate) enum Token {
/// `<?`
ProcessingInstructionStart,
/// `?>`
@@ -34,12 +39,8 @@ pub enum Token {
CommentStart,
/// `-->`
CommentEnd,
- /// A chunk of characters, used for errors recovery.
- Chunk(&'static str),
/// Any non-special character except whitespace.
Character(char),
- /// Whitespace character.
- Whitespace(char),
/// `=`
EqualsSign,
/// `'`
@@ -54,14 +55,16 @@ pub enum Token {
ReferenceStart,
/// `;`
ReferenceEnd,
+ /// `<!` of `ENTITY`
+ MarkupDeclarationStart,
}
impl fmt::Display for Token {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ #[cold]
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match *self {
- Token::Chunk(s) => write!(f, "{}", s),
- Token::Character(c) | Token::Whitespace(c) => write!(f, "{}", c),
- other => write!(f, "{}", match other {
+ Token::Character(c) => c.fmt(f),
+ other => match other {
Token::OpeningTagStart => "<",
Token::ProcessingInstructionStart => "<?",
Token::DoctypeStart => "<!DOCTYPE",
@@ -78,8 +81,9 @@ impl fmt::Display for Token {
Token::EqualsSign => "=",
Token::SingleQuote => "'",
Token::DoubleQuote => "\"",
+ Token::MarkupDeclarationStart => "<!",
_ => unreachable!()
- })
+ }.fmt(f),
}
}
}
@@ -103,47 +107,28 @@ impl Token {
Token::EqualsSign => Some("="),
Token::SingleQuote => Some("'"),
Token::DoubleQuote => Some("\""),
- Token::Chunk(s) => Some(s),
_ => None
}
}
// using String.push_str(token.to_string()) is simply way too slow
pub fn push_to_string(&self, target: &mut String) {
- match self.as_static_str() {
- Some(s) => { target.push_str(s); }
- None => {
- match *self {
- Token::Character(c) | Token::Whitespace(c) => target.push(c),
- _ => unreachable!()
- }
- }
- }
- }
-
- /// Returns `true` if this token contains data that can be interpreted
- /// as a part of the text. Surprisingly, this also means '>' and '=' and '"' and "'" and '-->'.
- #[inline]
- pub fn contains_char_data(&self) -> bool {
- match *self {
- Token::Whitespace(_) | Token::Chunk(_) | Token::Character(_) | Token::CommentEnd |
- Token::TagEnd | Token::EqualsSign | Token::DoubleQuote | Token::SingleQuote | Token::CDataEnd |
- Token::ProcessingInstructionEnd | Token::EmptyTagEnd => true,
- _ => false
- }
- }
-
- /// Returns `true` if this token corresponds to a white space character.
- #[inline]
- pub fn is_whitespace(&self) -> bool {
match *self {
- Token::Whitespace(_) => true,
- _ => false
+ Token::Character(c) => {
+ debug_assert!(is_xml10_char(c) || is_xml11_char(c));
+ target.push(c)
+ },
+ _ => if let Some(s) = self.as_static_str() {
+ target.push_str(s);
+ }
}
}
}
+#[derive(Copy, Clone)]
enum State {
+ /// Default state
+ Normal,
/// Triggered on '<'
TagStarted,
/// Triggered on '<!'
@@ -152,8 +137,10 @@ enum State {
CommentStarted,
/// Triggered on '<!D' up to '<!DOCTYPE'
DoctypeStarted(DoctypeStartedSubstate),
+ /// Other items like `<!ELEMENT` in DTD
+ InsideMarkupDeclaration,
/// Triggered after DoctypeStarted to handle sub elements
- DoctypeFinishing(u8),
+ InsideDoctype,
/// Triggered on '<![' up to '<![CDATA'
CDataStarted(CDataStartedSubstate),
/// Triggered on '?'
@@ -162,10 +149,23 @@ enum State {
EmptyTagClosing,
/// Triggered on '-' up to '--'
CommentClosing(ClosingSubstate),
- /// Triggered on ']' up to ']]'
+ /// Triggered on ']' up to ']]' inside CDATA
CDataClosing(ClosingSubstate),
- /// Default state
- Normal
+ /// Triggered on ']' up to ']]' outside CDATA
+ InvalidCDataClosing(ClosingSubstate),
+ /// After `<!--`
+ InsideComment,
+ /// After `<![CDATA[`
+ InsideCdata,
+ /// After `<?`
+ InsideProcessingInstruction,
+ /// `<!ENTITY "here">`
+ InsideMarkupDeclarationQuotedString(QuoteStyle),
+}
+
+#[derive(Copy, Clone, Eq, PartialEq)]
+enum QuoteStyle {
+ Single, Double
}
#[derive(Copy, Clone)]
@@ -184,7 +184,7 @@ enum CDataStartedSubstate {
}
/// `Result` represents lexing result. It is either a token or an error message.
-pub type Result = result::Result<Option<Token>, Error>;
+pub(crate) type Result<T = Option<Token>, E = Error> = result::Result<T, E>;
/// Helps to set up a dispatch table for lexing large unambiguous tokens like
/// `<![CDATA[` or `<!DOCTYPE `.
@@ -216,15 +216,19 @@ macro_rules! dispatch_on_enum_state(
/// When it is not set, errors will be reported as `Err` objects with a string message.
/// By default this flag is not set. It is only available in test builds, where the
/// `disable_errors` method sets it.
-pub struct Lexer {
+pub(crate) struct Lexer {
+ st: State,
+ reader: CharReader,
pos: TextPosition,
head_pos: TextPosition,
char_queue: VecDeque<char>,
- st: State,
- skip_errors: bool,
- inside_comment: bool,
+ /// Default state to go back to after a tag end (may be `InsideDoctype`)
+ normal_state: State,
inside_token: bool,
- eof_handled: bool
+ eof_handled: bool,
+ reparse_depth: u8,
+ #[cfg(test)]
+ skip_errors: bool,
}
impl Position for Lexer {
@@ -235,37 +239,33 @@ impl Position for Lexer {
impl Lexer {
/// Returns a new lexer with default state.
- pub fn new() -> Lexer {
+ pub(crate) fn new() -> Lexer {
Lexer {
+ reader: CharReader::new(),
pos: TextPosition::new(),
head_pos: TextPosition::new(),
char_queue: VecDeque::with_capacity(4), // TODO: check size
st: State::Normal,
- skip_errors: false,
- inside_comment: false,
+ normal_state: State::Normal,
inside_token: false,
- eof_handled: false
+ eof_handled: false,
+ reparse_depth: 0,
+ #[cfg(test)]
+ skip_errors: false,
}
}
- /// Enables error handling so `next_token` will return `Some(Err(..))`
- /// upon invalid lexeme.
- #[inline]
- pub fn enable_errors(&mut self) { self.skip_errors = false; }
+ pub(crate) fn encoding(&mut self) -> Encoding {
+ self.reader.encoding
+ }
+
+ pub(crate) fn set_encoding(&mut self, encoding: Encoding) {
+ self.reader.encoding = encoding;
+ }
/// Disables error handling so `next_token` will return `Some(Chunk(..))`
/// upon invalid lexeme with this lexeme content.
- #[inline]
- pub fn disable_errors(&mut self) { self.skip_errors = true; }
-
- /// Enables special handling of some lexemes which should be done when we're parsing comment
- /// internals.
- #[inline]
- pub fn inside_comment(&mut self) { self.inside_comment = true; }
-
- /// Disables the effect of `inside_comment()` method.
- #[inline]
- pub fn outside_comment(&mut self) { self.inside_comment = false; }
+ #[cfg(test)] fn disable_errors(&mut self) { self.skip_errors = true; }
/// Reset the eof handled flag of the lexer.
#[inline]
@@ -293,23 +293,29 @@ impl Lexer {
// Check if we have saved a char or two for ourselves
while let Some(c) = self.char_queue.pop_front() {
- match try!(self.read_next_token(c)) {
+ match self.dispatch_char(c)? {
Some(t) => {
self.inside_token = false;
return Ok(Some(t));
}
- None => {} // continue
+ None => {} // continue
}
}
-
+ // if char_queue is empty, all circular reparsing is done
+ self.reparse_depth = 0;
loop {
- // TODO: this should handle multiple encodings
- let c = match try!(util::next_char_from(b)) {
- Some(c) => c, // got next char
- None => break, // nothing to read left
+ let c = match self.reader.next_char_from(b)? {
+ Some(c) => c, // got next char
+ None => break, // nothing to read left
};
- match try!(self.read_next_token(c)) {
+ if c == '\n' {
+ self.head_pos.new_line();
+ } else {
+ self.head_pos.advance(1);
+ }
+
+ match self.dispatch_char(c)? {
Some(t) => {
self.inside_token = false;
return Ok(Some(t));
@@ -320,61 +326,67 @@ impl Lexer {
}
}
+ self.end_of_stream()
+ }
+
+ #[inline(never)]
+ fn end_of_stream(&mut self) -> Result {
// Handle end of stream
self.eof_handled = true;
self.pos = self.head_pos;
match self.st {
+ State::InsideCdata | State::CDataClosing(_) => Err(self.error(SyntaxError::UnclosedCdata)),
State::TagStarted | State::CommentOrCDataOrDoctypeStarted |
State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) |
State::CommentClosing(ClosingSubstate::Second) |
- State::DoctypeFinishing(_) =>
- Err(self.error("Unexpected end of stream")),
- State::ProcessingInstructionClosing =>
- Ok(Some(Token::Character('?'))),
+ State::InsideComment | State::InsideMarkupDeclaration |
+ State::InsideProcessingInstruction | State::ProcessingInstructionClosing |
+ State::InsideDoctype | State::InsideMarkupDeclarationQuotedString(_) =>
+ Err(self.error(SyntaxError::UnexpectedEof)),
State::EmptyTagClosing =>
Ok(Some(Token::Character('/'))),
State::CommentClosing(ClosingSubstate::First) =>
Ok(Some(Token::Character('-'))),
- State::CDataClosing(ClosingSubstate::First) =>
+ State::InvalidCDataClosing(ClosingSubstate::First) =>
Ok(Some(Token::Character(']'))),
- State::CDataClosing(ClosingSubstate::Second) =>
- Ok(Some(Token::Chunk("]]"))),
+ State::InvalidCDataClosing(ClosingSubstate::Second) => {
+ self.eof_handled = false;
+ self.move_to_with_unread(State::Normal, &[']'], Token::Character(']'))
+ },
State::Normal =>
- Ok(None)
+ Ok(None),
}
}
- #[inline]
- fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Error {
- (self, msg).into()
- }
-
- #[inline]
- fn read_next_token(&mut self, c: char) -> Result {
- let res = self.dispatch_char(c);
- if self.char_queue.is_empty() {
- if c == '\n' {
- self.head_pos.new_line();
- } else {
- self.head_pos.advance(1);
- }
+ #[cold]
+ fn error(&self, e: SyntaxError) -> Error {
+ Error {
+ pos: self.position(),
+ kind: ErrorKind::Syntax(e.to_cow()),
}
- res
}
+
+ #[inline(never)]
fn dispatch_char(&mut self, c: char) -> Result {
match self.st {
State::Normal => self.normal(c),
State::TagStarted => self.tag_opened(c),
+ State::EmptyTagClosing => self.empty_element_closing(c),
State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c),
- State::CommentStarted => self.comment_started(c),
+ State::InsideCdata => self.inside_cdata(c),
State::CDataStarted(s) => self.cdata_started(c, s),
- State::DoctypeStarted(s) => self.doctype_started(c, s),
- State::DoctypeFinishing(d) => self.doctype_finishing(c, d),
+ State::InsideComment => self.inside_comment_state(c),
+ State::CommentStarted => self.comment_started(c),
+ State::InsideProcessingInstruction => self.inside_processing_instruction(c),
State::ProcessingInstructionClosing => self.processing_instruction_closing(c),
- State::EmptyTagClosing => self.empty_element_closing(c),
State::CommentClosing(s) => self.comment_closing(c, s),
- State::CDataClosing(s) => self.cdata_closing(c, s)
+ State::CDataClosing(s) => self.cdata_closing(c, s),
+ State::InsideDoctype => self.inside_doctype(c),
+ State::DoctypeStarted(s) => self.doctype_started(c, s),
+ State::InvalidCDataClosing(s) => self.invalid_cdata_closing(c, s),
+ State::InsideMarkupDeclaration => self.markup_declaration(c),
+ State::InsideMarkupDeclarationQuotedString(q) => self.markup_declaration_string(c, q),
}
}
@@ -391,18 +403,50 @@ impl Lexer {
}
#[inline]
+ fn move_to_and_reset_normal(&mut self, st: State, token: Token) -> Result {
+ self.normal_state = st;
+ self.st = st;
+ Ok(Some(token))
+ }
+
fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result {
- self.char_queue.extend(cs.iter().cloned());
+ for c in cs.iter().rev().copied() {
+ self.char_queue.push_front(c);
+ }
self.move_to_with(st, token)
}
+ pub(crate) fn reparse(&mut self, markup: &str) -> Result<()> {
+ if markup.is_empty() {
+ return Ok(());
+ }
+
+ self.reparse_depth += 1;
+ if self.reparse_depth > MAX_ENTITY_EXPANSION_DEPTH || self.char_queue.len() > MAX_ENTITY_EXPANSION_LENGTH {
+ return Err(self.error(SyntaxError::EntityTooBig))
+ }
+
+ self.eof_handled = false;
+ self.char_queue.reserve(markup.len());
+ for c in markup.chars().rev() {
+ self.char_queue.push_front(c);
+ }
+
+ Ok(())
+ }
+
fn handle_error(&mut self, chunk: &'static str, c: char) -> Result {
- self.char_queue.push_back(c);
- if self.skip_errors || (self.inside_comment && chunk != "--") { // FIXME: looks hacky
- self.move_to_with(State::Normal, Token::Chunk(chunk))
- } else {
- Err(self.error(format!("Unexpected token '{}' before '{}'", chunk, c)))
+ debug_assert!(!chunk.is_empty());
+
+ #[cfg(test)]
+ if self.skip_errors {
+ let mut chars = chunk.chars();
+ let first = chars.next().unwrap_or('\0');
+ self.char_queue.extend(chars);
+ self.char_queue.push_back(c);
+ return self.move_to_with(State::Normal, Token::Character(first));
}
+ Err(self.error(SyntaxError::UnexpectedTokenBefore(chunk, c)))
}
/// Encountered a char
@@ -414,12 +458,39 @@ impl Lexer {
'=' => Ok(Some(Token::EqualsSign)),
'"' => Ok(Some(Token::DoubleQuote)),
'\'' => Ok(Some(Token::SingleQuote)),
- '?' => self.move_to(State::ProcessingInstructionClosing),
- '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)),
+ ']' => self.move_to(State::InvalidCDataClosing(ClosingSubstate::First)),
+ '&' => Ok(Some(Token::ReferenceStart)),
+ ';' => Ok(Some(Token::ReferenceEnd)),
+ _ => Ok(Some(Token::Character(c)))
+ }
+ }
+
+ fn inside_cdata(&mut self, c: char) -> Result {
+ match c {
']' => self.move_to(State::CDataClosing(ClosingSubstate::First)),
+ _ => Ok(Some(Token::Character(c)))
+ }
+ }
+
+ fn inside_processing_instruction(&mut self, c: char) -> Result {
+ // These tokens are used by `<?xml?>` parser
+ match c {
+ '?' => self.move_to(State::ProcessingInstructionClosing),
+ '<' => Ok(Some(Token::OpeningTagStart)),
+ '>' => Ok(Some(Token::TagEnd)),
+ '/' => Ok(Some(Token::ClosingTagStart)),
+ '=' => Ok(Some(Token::EqualsSign)),
+ '"' => Ok(Some(Token::DoubleQuote)),
+ '\'' => Ok(Some(Token::SingleQuote)),
'&' => Ok(Some(Token::ReferenceStart)),
';' => Ok(Some(Token::ReferenceEnd)),
- _ if is_whitespace_char(c) => Ok(Some(Token::Whitespace(c))),
+ _ => Ok(Some(Token::Character(c)))
+ }
+ }
+
+ fn inside_comment_state(&mut self, c: char) -> Result {
+ match c {
+ '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)),
_ => Ok(Some(Token::Character(c)))
}
}
@@ -427,11 +498,11 @@ impl Lexer {
/// Encountered '<'
fn tag_opened(&mut self, c: char) -> Result {
match c {
- '?' => self.move_to_with(State::Normal, Token::ProcessingInstructionStart),
- '/' => self.move_to_with(State::Normal, Token::ClosingTagStart),
+ '?' => self.move_to_with(State::InsideProcessingInstruction, Token::ProcessingInstructionStart),
+ '/' => self.move_to_with(self.normal_state, Token::ClosingTagStart),
'!' => self.move_to(State::CommentOrCDataOrDoctypeStarted),
- _ if is_whitespace_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
- _ if is_name_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
+ _ if is_whitespace_char(c) => self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart),
+ _ if is_name_char(c) => self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart),
_ => self.handle_error("<", c)
}
}
@@ -442,31 +513,55 @@ impl Lexer {
'-' => self.move_to(State::CommentStarted),
'[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)),
'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)),
- _ => self.handle_error("<!", c)
+ 'E' | 'A' | 'N' if matches!(self.normal_state, State::InsideDoctype) => {
+ self.move_to_with_unread(State::InsideMarkupDeclaration, &[c], Token::MarkupDeclarationStart)
+ },
+ _ => self.handle_error("<!", c),
}
}
/// Encountered '<!-'
fn comment_started(&mut self, c: char) -> Result {
match c {
- '-' => self.move_to_with(State::Normal, Token::CommentStart),
- _ => self.handle_error("<!-", c)
+ '-' => self.move_to_with(State::InsideComment, Token::CommentStart),
+ _ => self.handle_error("<!-", c),
}
}
/// Encountered '<!['
fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result {
- use self::CDataStartedSubstate::{E, C, CD, CDA, CDAT, CDATA};
+ use self::CDataStartedSubstate::{C, CD, CDA, CDAT, CDATA, E};
dispatch_on_enum_state!(self, s, c, State::CDataStarted,
E ; 'C' ; C ; "<![",
C ; 'D' ; CD ; "<![C",
CD ; 'A' ; CDA ; "<![CD",
CDA ; 'T' ; CDAT ; "<![CDA",
CDAT ; 'A' ; CDATA ; "<![CDAT";
- CDATA ; '[' ; "<![CDATA" ; self.move_to_with(State::Normal, Token::CDataStart)
+ CDATA ; '[' ; "<![CDATA" ; self.move_to_with(State::InsideCdata, Token::CDataStart)
)
}
+ /// Encountered '<!…' that isn't DOCTYPE or CDATA
+ fn markup_declaration(&mut self, c: char) -> Result {
+ match c {
+ '<' => self.handle_error("<!", c),
+ '>' => self.move_to_with(self.normal_state, Token::TagEnd),
+ '&' => Ok(Some(Token::ReferenceStart)),
+ ';' => Ok(Some(Token::ReferenceEnd)),
+ '"' => self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Double), Token::DoubleQuote),
+ '\'' => self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Single), Token::SingleQuote),
+ _ => Ok(Some(Token::Character(c))),
+ }
+ }
+
+ fn markup_declaration_string(&mut self, c: char, q: QuoteStyle) -> Result {
+ match c {
+ '"' if q == QuoteStyle::Double => self.move_to_with(State::InsideMarkupDeclaration, Token::DoubleQuote),
+ '\'' if q == QuoteStyle::Single => self.move_to_with(State::InsideMarkupDeclaration, Token::SingleQuote),
+ _ => Ok(Some(Token::Character(c))),
+ }
+ }
+
/// Encountered '<!D'
fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result {
use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP};
@@ -476,33 +571,36 @@ impl Lexer {
DOC ; 'T' ; DOCT ; "<!DOC",
DOCT ; 'Y' ; DOCTY ; "<!DOCT",
DOCTY ; 'P' ; DOCTYP ; "<!DOCTY";
- DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_with(State::DoctypeFinishing(1), Token::DoctypeStart)
+ DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_and_reset_normal(State::InsideDoctype, Token::DoctypeStart)
)
}
/// State used while awaiting the closing bracket for the <!DOCTYPE tag
- fn doctype_finishing(&mut self, c: char, d: u8) -> Result {
+ fn inside_doctype(&mut self, c: char) -> Result {
match c {
- '<' => self.move_to(State::DoctypeFinishing(d + 1)),
- '>' if d == 1 => self.move_to_with(State::Normal, Token::TagEnd),
- '>' => self.move_to(State::DoctypeFinishing(d - 1)),
- _ => Ok(None),
+ '>' => self.move_to_and_reset_normal(State::Normal, Token::TagEnd),
+ '<' => self.move_to(State::TagStarted),
+ '&' => Ok(Some(Token::ReferenceStart)),
+ ';' => Ok(Some(Token::ReferenceEnd)),
+ '"' => Ok(Some(Token::DoubleQuote)),
+ '\'' => Ok(Some(Token::SingleQuote)),
+ _ => Ok(Some(Token::Character(c))),
}
}
/// Encountered '?'
fn processing_instruction_closing(&mut self, c: char) -> Result {
match c {
- '>' => self.move_to_with(State::Normal, Token::ProcessingInstructionEnd),
- _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('?')),
+ '>' => self.move_to_with(self.normal_state, Token::ProcessingInstructionEnd),
+ _ => self.move_to_with_unread(State::InsideProcessingInstruction, &[c], Token::Character('?')),
}
}
/// Encountered '/'
fn empty_element_closing(&mut self, c: char) -> Result {
match c {
- '>' => self.move_to_with(State::Normal, Token::EmptyTagEnd),
- _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('/')),
+ '>' => self.move_to_with(self.normal_state, Token::EmptyTagEnd),
+ _ => self.move_to_with_unread(self.normal_state, &[c], Token::Character('/')),
}
}
@@ -511,18 +609,13 @@ impl Lexer {
match s {
ClosingSubstate::First => match c {
'-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)),
- _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('-'))
+ _ => self.move_to_with_unread(State::InsideComment, &[c], Token::Character('-')),
},
ClosingSubstate::Second => match c {
- '>' => self.move_to_with(State::Normal, Token::CommentEnd),
+ '>' => self.move_to_with(self.normal_state, Token::CommentEnd),
// double dash not followed by a greater-than is a hard error inside comment
- _ if self.inside_comment => self.handle_error("--", c),
- // nothing else except comment closing starts with a double dash, and comment
- // closing can never be after another dash, and also we're outside of a comment,
- // therefore it is safe to push only the last read character to the list of unread
- // characters and pass the double dash directly to the output
- _ => self.move_to_with_unread(State::Normal, &[c], Token::Chunk("--"))
- }
+ _ => self.handle_error("--", c),
+ },
}
}
@@ -531,19 +624,33 @@ impl Lexer {
match s {
ClosingSubstate::First => match c {
']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)),
- _ => self.move_to_with_unread(State::Normal, &[c], Token::Character(']'))
+ _ => self.move_to_with_unread(State::InsideCdata, &[c], Token::Character(']')),
},
ClosingSubstate::Second => match c {
'>' => self.move_to_with(State::Normal, Token::CDataEnd),
- _ => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']'))
- }
+ _ => self.move_to_with_unread(State::InsideCdata, &[']', c], Token::Character(']')),
+ },
+ }
+ }
+
+ /// Encountered ']'
+ fn invalid_cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
+ match s {
+ ClosingSubstate::First => match c {
+ ']' => self.move_to(State::InvalidCDataClosing(ClosingSubstate::Second)),
+ _ => self.move_to_with_unread(State::Normal, &[c], Token::Character(']')),
+ },
+ ClosingSubstate::Second => match c {
+ '>' => self.move_to_with(self.normal_state, Token::CDataEnd),
+ _ => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']')),
+ },
}
}
}
#[cfg(test)]
mod tests {
- use common::{Position};
+ use crate::common::Position;
use std::io::{BufReader, Cursor};
use super::{Lexer, Token};
@@ -563,13 +670,12 @@ mod tests {
let err = err.unwrap_err();
assert_eq!($r as u64, err.position().row);
assert_eq!($c as u64, err.position().column);
- assert_eq!($s, err.msg());
})
);
macro_rules! assert_none(
(for $lex:ident and $buf:ident) => (
- assert_eq!(Ok(None), $lex.next_token(&mut $buf));
+ assert_eq!(Ok(None), $lex.next_token(&mut $buf))
)
);
@@ -578,6 +684,47 @@ mod tests {
}
#[test]
+ fn tricky_pi() {
+ let (mut lex, mut buf) = make_lex_and_buf(r#"<?x<!-- &??><x>"#);
+
+ assert_oks!(for lex and buf ;
+ Token::ProcessingInstructionStart
+ Token::Character('x')
+ Token::OpeningTagStart // processing of <?xml?> relies on the extra tokens
+ Token::Character('!')
+ Token::Character('-')
+ Token::Character('-')
+ Token::Character(' ')
+ Token::ReferenceStart
+ Token::Character('?')
+ Token::ProcessingInstructionEnd
+ Token::OpeningTagStart
+ Token::Character('x')
+ Token::TagEnd
+ );
+ assert_none!(for lex and buf);
+ }
+
+ #[test]
+ fn reparser() {
+ let (mut lex, mut buf) = make_lex_and_buf(r#"&a;"#);
+
+ assert_oks!(for lex and buf ;
+ Token::ReferenceStart
+ Token::Character('a')
+ Token::ReferenceEnd
+ );
+ lex.reparse("<hi/>").unwrap();
+ assert_oks!(for lex and buf ;
+ Token::OpeningTagStart
+ Token::Character('h')
+ Token::Character('i')
+ Token::EmptyTagEnd
+ );
+ assert_none!(for lex and buf);
+ }
+
+ #[test]
fn simple_lexer_test() {
let (mut lex, mut buf) = make_lex_and_buf(
r#"<a p='q'> x<b z="y">d </b></a><p/> <?nm ?> <!-- a c --> &nbsp;"#
@@ -586,18 +733,18 @@ mod tests {
assert_oks!(for lex and buf ;
Token::OpeningTagStart
Token::Character('a')
- Token::Whitespace(' ')
+ Token::Character(' ')
Token::Character('p')
Token::EqualsSign
Token::SingleQuote
Token::Character('q')
Token::SingleQuote
Token::TagEnd
- Token::Whitespace(' ')
+ Token::Character(' ')
Token::Character('x')
Token::OpeningTagStart
Token::Character('b')
- Token::Whitespace(' ')
+ Token::Character(' ')
Token::Character('z')
Token::EqualsSign
Token::DoubleQuote
@@ -605,7 +752,7 @@ mod tests {
Token::DoubleQuote
Token::TagEnd
Token::Character('d')
- Token::Whitespace('\t')
+ Token::Character('\t')
Token::ClosingTagStart
Token::Character('b')
Token::TagEnd
@@ -615,21 +762,21 @@ mod tests {
Token::OpeningTagStart
Token::Character('p')
Token::EmptyTagEnd
- Token::Whitespace(' ')
+ Token::Character(' ')
Token::ProcessingInstructionStart
Token::Character('n')
Token::Character('m')
- Token::Whitespace(' ')
+ Token::Character(' ')
Token::ProcessingInstructionEnd
- Token::Whitespace(' ')
+ Token::Character(' ')
Token::CommentStart
- Token::Whitespace(' ')
+ Token::Character(' ')
Token::Character('a')
- Token::Whitespace(' ')
+ Token::Character(' ')
Token::Character('c')
- Token::Whitespace(' ')
+ Token::Character(' ')
Token::CommentEnd
- Token::Whitespace(' ')
+ Token::Character(' ')
Token::ReferenceStart
Token::Character('n')
Token::Character('b')
@@ -651,16 +798,17 @@ mod tests {
Token::Character('x')
Token::Character('!')
Token::Character('+')
- Token::Whitespace(' ')
+ Token::Character(' ')
Token::Character('/')
Token::Character('/')
- Token::Whitespace(' ')
+ Token::Character(' ')
Token::Character('-')
Token::Character('|')
- Token::Whitespace(' ')
+ Token::Character(' ')
Token::Character(']')
Token::Character('z')
- Token::Chunk("]]")
+ Token::Character(']')
+ Token::Character(']')
);
assert_none!(for lex and buf);
}
@@ -677,12 +825,12 @@ mod tests {
Token::TagEnd
Token::CDataStart
Token::Character('x')
- Token::Whitespace(' ')
+ Token::Character(' ')
Token::Character('y')
- Token::Whitespace(' ')
+ Token::Character(' ')
Token::Character('?')
Token::CDataEnd
- Token::Whitespace(' ')
+ Token::Character(' ')
Token::ClosingTagStart
Token::Character('a')
Token::TagEnd
@@ -691,6 +839,33 @@ mod tests {
}
#[test]
+ fn cdata_closers_test() {
+ let (mut lex, mut buf) = make_lex_and_buf(
+ r#"<![CDATA[] > ]> ]]><!---->]]<a>"#
+ );
+
+ assert_oks!(for lex and buf ;
+ Token::CDataStart
+ Token::Character(']')
+ Token::Character(' ')
+ Token::Character('>')
+ Token::Character(' ')
+ Token::Character(']')
+ Token::Character('>')
+ Token::Character(' ')
+ Token::CDataEnd
+ Token::CommentStart
+ Token::CommentEnd
+ Token::Character(']')
+ Token::Character(']')
+ Token::OpeningTagStart
+ Token::Character('a')
+ Token::TagEnd
+ );
+ assert_none!(for lex and buf);
+ }
+
+ #[test]
fn doctype_test() {
let (mut lex, mut buf) = make_lex_and_buf(
r#"<a><!DOCTYPE ab xx z> "#
@@ -700,26 +875,135 @@ mod tests {
Token::Character('a')
Token::TagEnd
Token::DoctypeStart
+ Token::Character(' ')
+ Token::Character('a')
+ Token::Character('b')
+ Token::Character(' ')
+ Token::Character('x')
+ Token::Character('x')
+ Token::Character(' ')
+ Token::Character('z')
Token::TagEnd
- Token::Whitespace(' ')
+ Token::Character(' ')
);
- assert_none!(for lex and buf)
+ assert_none!(for lex and buf);
+ }
+
+ #[test]
+ fn tricky_comments() {
+ let (mut lex, mut buf) = make_lex_and_buf(
+ r#"<a><!-- C ->--></a>"#
+ );
+ assert_oks!(for lex and buf ;
+ Token::OpeningTagStart
+ Token::Character('a')
+ Token::TagEnd
+ Token::CommentStart
+ Token::Character(' ')
+ Token::Character('C')
+ Token::Character(' ')
+ Token::Character('-')
+ Token::Character('>')
+ Token::CommentEnd
+ Token::ClosingTagStart
+ Token::Character('a')
+ Token::TagEnd
+ );
+ assert_none!(for lex and buf);
}
#[test]
fn doctype_with_internal_subset_test() {
let (mut lex, mut buf) = make_lex_and_buf(
- r#"<a><!DOCTYPE ab[<!ELEMENT ba> ]> "#
+ r#"<a><!DOCTYPE ab[<!ELEMENT ba ">>>"> ]> "#
);
assert_oks!(for lex and buf ;
Token::OpeningTagStart
Token::Character('a')
Token::TagEnd
Token::DoctypeStart
+ Token::Character(' ')
+ Token::Character('a')
+ Token::Character('b')
+ Token::Character('[')
+ Token::MarkupDeclarationStart
+ Token::Character('E')
+ Token::Character('L')
+ Token::Character('E')
+ Token::Character('M')
+ Token::Character('E')
+ Token::Character('N')
+ Token::Character('T')
+ Token::Character(' ')
+ Token::Character('b')
+ Token::Character('a')
+ Token::Character(' ')
+ Token::DoubleQuote
+ Token::Character('>')
+ Token::Character('>')
+ Token::Character('>')
+ Token::DoubleQuote
+ Token::TagEnd
+ Token::Character(' ')
+ Token::Character(']')
+ Token::TagEnd
+ Token::Character(' ')
+ );
+ assert_none!(for lex and buf);
+ }
+
+ #[test]
+ fn doctype_internal_pi_comment() {
+ let (mut lex, mut buf) = make_lex_and_buf(
+ "<!DOCTYPE a [\n<!ELEMENT l ANY> <!-- <?non?>--> <?pi > ?> \n]>"
+ );
+ assert_oks!(for lex and buf ;
+ Token::DoctypeStart
+ Token::Character(' ')
+ Token::Character('a')
+ Token::Character(' ')
+ Token::Character('[')
+ Token::Character('\n')
+ Token::MarkupDeclarationStart
+ Token::Character('E')
+ Token::Character('L')
+ Token::Character('E')
+ Token::Character('M')
+ Token::Character('E')
+ Token::Character('N')
+ Token::Character('T')
+ Token::Character(' ')
+ Token::Character('l')
+ Token::Character(' ')
+ Token::Character('A')
+ Token::Character('N')
+ Token::Character('Y')
Token::TagEnd
- Token::Whitespace(' ')
+ Token::Character(' ')
+ Token::CommentStart
+ Token::Character(' ')
+ Token::Character('<')
+ Token::Character('?')
+ Token::Character('n')
+ Token::Character('o')
+ Token::Character('n')
+ Token::Character('?')
+ Token::Character('>')
+ Token::CommentEnd
+ Token::Character(' ')
+ Token::ProcessingInstructionStart
+ Token::Character('p')
+ Token::Character('i')
+ Token::Character(' ')
+ Token::TagEnd // not really
+ Token::Character(' ')
+ Token::ProcessingInstructionEnd
+ Token::Character(' ')
+ Token::Character('\n')
+ Token::Character(']')
+ Token::TagEnd // DTD
);
- assert_none!(for lex and buf)
+ assert_none!(for lex and buf);
}
#[test]
@@ -735,7 +1019,8 @@ mod tests {
eof_check!("/" ; Token::Character('/'));
eof_check!("-" ; Token::Character('-'));
eof_check!("]" ; Token::Character(']'));
- eof_check!("]]" ; Token::Chunk("]]"));
+ eof_check!("]" ; Token::Character(']'));
+ eof_check!("]" ; Token::Character(']'));
}
#[test]
@@ -756,7 +1041,6 @@ mod tests {
eof_check!("<![CDA" ; 0, 6);
eof_check!("<![CDAT" ; 0, 7);
eof_check!("<![CDATA" ; 0, 8);
- eof_check!("--" ; 0, 2);
}
#[test]
@@ -769,7 +1053,8 @@ mod tests {
let (mut lex, mut buf) = make_lex_and_buf("<!x");
lex.disable_errors();
assert_oks!(for lex and buf ;
- Token::Chunk("<!")
+ Token::Character('<')
+ Token::Character('!')
Token::Character('x')
);
assert_none!(for lex and buf);
@@ -785,8 +1070,10 @@ mod tests {
let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
lex.disable_errors();
assert_oks!(for lex and buf ;
- Token::Chunk("<!-")
- Token::Whitespace('\t')
+ Token::Character('<')
+ Token::Character('!')
+ Token::Character('-')
+ Token::Character('\t')
);
assert_none!(for lex and buf);
}
@@ -794,14 +1081,15 @@ mod tests {
#[test]
fn error_in_comment_two_dashes_not_at_end() {
let (mut lex, mut buf) = make_lex_and_buf("--x");
- lex.inside_comment();
+ lex.st = super::State::InsideComment;
assert_err!(for lex and buf expect row 0; 0,
"Unexpected token '--' before 'x'"
);
let (mut lex, mut buf) = make_lex_and_buf("--x");
assert_oks!(for lex and buf ;
- Token::Chunk("--")
+ Token::Character('-')
+ Token::Character('-')
Token::Character('x')
);
}
@@ -813,8 +1101,10 @@ mod tests {
let (mut lex, mut buf) = make_lex_and_buf($data);
lex.disable_errors();
+ for c in $chunk.chars() {
+ assert_eq!(Ok(Some(Token::Character(c))), lex.next_token(&mut buf));
+ }
assert_oks!(for lex and buf ;
- Token::Chunk($chunk)
Token::Character($app)
);
assert_none!(for lex and buf);
@@ -822,6 +1112,12 @@ mod tests {
);
#[test]
+ fn token_size() {
+ assert_eq!(4, std::mem::size_of::<Token>());
+ assert_eq!(2, std::mem::size_of::<super::State>());
+ }
+
+ #[test]
fn error_in_cdata_started() {
check_case!("<![", '['; "<![[" ; 0, 0, "Unexpected token '<![' before '['");
check_case!("<![C", '['; "<![C[" ; 0, 0, "Unexpected token '<![C' before '['");
@@ -854,7 +1150,7 @@ mod tests {
Token::Character('F')
Token::Character('o')
Token::Character('o')
- Token::Whitespace(' ')
+ Token::Character(' ')
Token::Character('[')
Token::Character('B')
Token::Character('a')
diff --git a/src/reader/parser/mod.rs b/src/reader/parser.rs
index 58ca3a6..dcdec89 100644
--- a/src/reader/parser/mod.rs
+++ b/src/reader/parser.rs
@@ -1,29 +1,32 @@
//! Contains an implementation of pull-based XML parser.
-use std::mem;
-use std::borrow::Cow;
+
+use crate::common::is_xml11_char;
+use crate::common::is_xml10_char;
+use crate::common::is_xml11_char_not_restricted;
+use crate::reader::error::SyntaxError;
+use std::collections::HashMap;
use std::io::prelude::*;
-use common::{
- self,
- XmlVersion, Position, TextPosition,
- is_name_start_char, is_name_char,
-};
-use name::OwnedName;
-use attribute::OwnedAttribute;
-use namespace::NamespaceStack;
+use crate::attribute::OwnedAttribute;
+use crate::common::{self, is_name_char, is_name_start_char, Position, TextPosition, XmlVersion, is_whitespace_char};
+use crate::name::OwnedName;
+use crate::namespace::NamespaceStack;
+
+use crate::reader::config::ParserConfig2;
+use crate::reader::events::XmlEvent;
+use crate::reader::lexer::{Lexer, Token};
-use reader::events::XmlEvent;
-use reader::config::ParserConfig;
-use reader::lexer::{Lexer, Token};
+use super::{Error, ErrorKind};
macro_rules! gen_takes(
($($field:ident -> $method:ident, $t:ty, $def:expr);+) => (
$(
impl MarkupData {
#[inline]
+ #[allow(clippy::mem_replace_option_with_none)]
fn $method(&mut self) -> $t {
- mem::replace(&mut self.$field, $def)
+ std::mem::replace(&mut self.$field, $def)
}
}
)+
@@ -34,9 +37,7 @@ gen_takes!(
name -> take_name, String, String::new();
ref_data -> take_ref_data, String, String::new();
- version -> take_version, Option<common::XmlVersion>, None;
encoding -> take_encoding, Option<String>, None;
- standalone -> take_standalone, Option<bool>, None;
element_name -> take_element_name, Option<OwnedName>, None;
@@ -44,34 +45,33 @@ gen_takes!(
attributes -> take_attributes, Vec<OwnedAttribute>, vec!()
);
-macro_rules! self_error(
- ($this:ident; $msg:expr) => ($this.error($msg));
- ($this:ident; $fmt:expr, $($arg:expr),+) => ($this.error(format!($fmt, $($arg),+)))
-);
-
-mod outside_tag;
-mod inside_processing_instruction;
+mod inside_cdata;
+mod inside_closing_tag_name;
+mod inside_comment;
mod inside_declaration;
mod inside_doctype;
mod inside_opening_tag;
-mod inside_closing_tag_name;
-mod inside_comment;
-mod inside_cdata;
+mod inside_processing_instruction;
mod inside_reference;
+mod outside_tag;
-static DEFAULT_VERSION: XmlVersion = XmlVersion::Version10;
-static DEFAULT_ENCODING: &'static str = "UTF-8";
+static DEFAULT_VERSION: XmlVersion = XmlVersion::Version10;
static DEFAULT_STANDALONE: Option<bool> = None;
type ElementStack = Vec<OwnedName>;
pub type Result = super::Result<XmlEvent>;
/// Pull-based XML parser.
-pub struct PullParser {
- config: ParserConfig,
+pub(crate) struct PullParser {
+ config: ParserConfig2,
lexer: Lexer,
st: State,
+ state_after_reference: State,
buf: String,
+
+ /// From DTD internal subset
+ entities: HashMap<String, String>,
+
nst: NamespaceStack,
data: MarkupData,
@@ -80,21 +80,48 @@ pub struct PullParser {
est: ElementStack,
pos: Vec<TextPosition>,
- encountered_element: bool,
- parsed_declaration: bool,
+ encountered: Encountered,
inside_whitespace: bool,
read_prefix_separator: bool,
- pop_namespace: bool
+ pop_namespace: bool,
+}
+
+// Keeps track when XML declaration can happen
+#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
+enum Encountered {
+ None = 0,
+ AnyChars, // whitespace before <?xml is not allowed
+ Declaration,
+ Comment,
+ Doctype,
+ Element,
}
impl PullParser {
/// Returns a new parser using the given config.
- pub fn new(config: ParserConfig) -> PullParser {
+ #[inline]
+ pub fn new(config: impl Into<ParserConfig2>) -> PullParser {
+ let config = config.into();
+ Self::new_with_config2(config)
+ }
+
+ #[inline]
+ fn new_with_config2(config: ParserConfig2) -> PullParser {
+ let mut lexer = Lexer::new();
+ if let Some(enc) = config.override_encoding {
+ lexer.set_encoding(enc);
+ }
+
+ let mut pos = Vec::with_capacity(16);
+ pos.push(TextPosition::new());
+
PullParser {
- config: config,
- lexer: Lexer::new(),
- st: State::OutsideTag,
+ config,
+ lexer,
+ st: State::DocumentStart,
+ state_after_reference: State::OutsideTag,
buf: String::new(),
+ entities: HashMap::new(),
nst: NamespaceStack::default(),
data: MarkupData {
@@ -106,23 +133,44 @@ impl PullParser {
element_name: None,
quote: None,
attr_name: None,
- attributes: Vec::new()
+ attributes: Vec::new(),
},
final_result: None,
next_event: None,
est: Vec::new(),
- pos: vec![TextPosition::new()],
+ pos,
- encountered_element: false,
- parsed_declaration: false,
+ encountered: Encountered::None,
inside_whitespace: true,
read_prefix_separator: false,
- pop_namespace: false
+ pop_namespace: false,
}
}
/// Checks if this parser ignores the end of stream errors.
- pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.ignore_end_of_stream }
+ pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.c.ignore_end_of_stream }
+
+ #[inline(never)]
+ fn set_encountered(&mut self, new_encounter: Encountered) -> Option<Result> {
+ if new_encounter <= self.encountered {
+ return None;
+ }
+ let prev_enc = self.encountered;
+ self.encountered = new_encounter;
+
+ // If declaration was not parsed and we have encountered an element,
+ // emit this declaration as the next event.
+ if prev_enc == Encountered::None {
+ self.push_pos();
+ Some(Ok(XmlEvent::StartDocument {
+ version: DEFAULT_VERSION,
+ encoding: self.lexer.encoding().to_string(),
+ standalone: DEFAULT_STANDALONE,
+ }))
+ } else {
+ None
+ }
+ }
}
impl Position for PullParser {
@@ -133,7 +181,7 @@ impl Position for PullParser {
}
}
-#[derive(Clone, PartialEq)]
+#[derive(Copy, Clone, PartialEq)]
pub enum State {
OutsideTag,
InsideOpeningTag(OpeningTagSubstate),
@@ -142,11 +190,33 @@ pub enum State {
InsideComment,
InsideCData,
InsideDeclaration(DeclarationSubstate),
- InsideDoctype,
- InsideReference(Box<State>)
+ InsideDoctype(DoctypeSubstate),
+ InsideReference,
+ DocumentStart,
}
-#[derive(Clone, PartialEq)]
+#[derive(Copy, Clone, PartialEq)]
+pub enum DoctypeSubstate {
+ Outside,
+ String,
+ InsideName,
+ BeforeEntityName,
+ EntityName,
+ BeforeEntityValue,
+ EntityValue,
+ NumericReferenceStart,
+ NumericReference,
+ /// expansion
+ PEReferenceInValue,
+ PEReferenceInDtd,
+ /// name definition
+ PEReferenceDefinitionStart,
+ PEReferenceDefinition,
+ SkipDeclaration,
+ Comment,
+}
+
+#[derive(Copy, Clone, PartialEq)]
pub enum OpeningTagSubstate {
InsideName,
@@ -156,21 +226,22 @@ pub enum OpeningTagSubstate {
AfterAttributeName,
InsideAttributeValue,
+ AfterAttributeValue,
}
-#[derive(Clone, PartialEq)]
+#[derive(Copy, Clone, PartialEq)]
pub enum ClosingTagSubstate {
CTInsideName,
- CTAfterName
+ CTAfterName,
}
-#[derive(Clone, PartialEq)]
+#[derive(Copy, Clone, PartialEq)]
pub enum ProcessingInstructionSubstate {
PIInsideName,
- PIInsideData
+ PIInsideData,
}
-#[derive(Clone, PartialEq)]
+#[derive(Copy, Clone, PartialEq)]
pub enum DeclarationSubstate {
BeforeVersion,
InsideVersion,
@@ -179,30 +250,32 @@ pub enum DeclarationSubstate {
InsideVersionValue,
AfterVersionValue,
+ BeforeEncoding,
InsideEncoding,
AfterEncoding,
InsideEncodingValue,
+ AfterEncodingValue,
BeforeStandaloneDecl,
InsideStandaloneDecl,
AfterStandaloneDecl,
InsideStandaloneDeclValue,
- AfterStandaloneDeclValue
+ AfterStandaloneDeclValue,
}
#[derive(PartialEq)]
enum QualifiedNameTarget {
AttributeNameTarget,
OpeningTagNameTarget,
- ClosingTagNameTarget
+ ClosingTagNameTarget,
}
#[derive(Copy, Clone, PartialEq, Eq)]
enum QuoteToken {
SingleQuoteToken,
- DoubleQuoteToken
+ DoubleQuoteToken,
}
impl QuoteToken {
@@ -210,14 +283,14 @@ impl QuoteToken {
match *t {
Token::SingleQuote => QuoteToken::SingleQuoteToken,
Token::DoubleQuote => QuoteToken::DoubleQuoteToken,
- _ => panic!("Unexpected token: {}", t)
+ _ => panic!("Unexpected token: {t}"),
}
}
fn as_token(self) -> Token {
match self {
QuoteToken::SingleQuoteToken => Token::SingleQuote,
- QuoteToken::DoubleQuoteToken => Token::DoubleQuote
+ QuoteToken::DoubleQuoteToken => Token::DoubleQuote,
}
}
}
@@ -257,97 +330,114 @@ impl PullParser {
}
loop {
+ debug_assert!(self.next_event.is_none());
+ debug_assert!(!self.pop_namespace);
+
// While lexer gives us Ok(maybe_token) -- we loop.
// Upon having a complete XML-event -- we return from the whole function.
match self.lexer.next_token(r) {
- Ok(maybe_token) =>
- match maybe_token {
- None => break,
- Some(token) =>
- match self.dispatch_token(token) {
- None => {} // continue
- Some(Ok(XmlEvent::EndDocument)) =>
- return {
- self.next_pos();
- self.set_final_result(Ok(XmlEvent::EndDocument))
- },
- Some(Ok(xml_event)) =>
- return {
- self.next_pos();
- Ok(xml_event)
- },
- Some(Err(xml_error)) =>
- return {
- self.next_pos();
- self.set_final_result(Err(xml_error))
- },
- }
- },
- Err(lexer_error) =>
- return self.set_final_result(Err(lexer_error)),
+ Ok(Some(token)) => {
+ match self.dispatch_token(token) {
+ None => {} // continue
+ Some(Ok(xml_event)) => {
+ self.next_pos();
+ return Ok(xml_event)
+ },
+ Some(Err(xml_error)) => {
+ self.next_pos();
+ return self.set_final_result(Err(xml_error))
+ },
+ }
+ },
+ Ok(None) => break,
+ Err(lexer_error) => {
+ return self.set_final_result(Err(lexer_error))
+ },
}
}
- // Handle end of stream
+ self.handle_eof()
+ }
+
+ /// Handle end of stream
+ fn handle_eof(&mut self) -> std::result::Result<XmlEvent, super::Error> {
// Forward pos to the lexer head
self.next_pos();
let ev = if self.depth() == 0 {
- if self.encountered_element && self.st == State::OutsideTag { // all is ok
+ if self.encountered == Encountered::Element && self.st == State::OutsideTag { // all is ok
Ok(XmlEvent::EndDocument)
- } else if !self.encountered_element {
- self_error!(self; "Unexpected end of stream: no root element found")
+ } else if self.encountered < Encountered::Element {
+ self.error(SyntaxError::NoRootElement)
} else { // self.st != State::OutsideTag
- self_error!(self; "Unexpected end of stream") // TODO: add expected hint?
+ self.error(SyntaxError::UnexpectedEof) // TODO: add expected hint?
}
+ } else if self.config.c.ignore_end_of_stream {
+ self.final_result = None;
+ self.lexer.reset_eof_handled();
+ return self.error(SyntaxError::UnbalancedRootElement);
} else {
- if self.config.ignore_end_of_stream {
- self.final_result = None;
- self.lexer.reset_eof_handled();
- return self_error!(self; "Unexpected end of stream: still inside the root element");
- } else {
- self_error!(self; "Unexpected end of stream: still inside the root element")
- }
+ self.error(SyntaxError::UnbalancedRootElement)
};
self.set_final_result(ev)
}
// This function is to be called when a terminal event is reached.
// The function sets up the `self.final_result` into `Some(result)` and return `result`.
+ #[inline]
fn set_final_result(&mut self, result: Result) -> Result {
self.final_result = Some(result.clone());
result
}
- #[inline]
- fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Result {
- Err((&self.lexer, msg).into())
+ #[cold]
+ fn error(&self, e: SyntaxError) -> Result {
+ Err(Error {
+ pos: self.lexer.position(),
+ kind: ErrorKind::Syntax(e.to_cow()),
+ })
}
#[inline]
fn next_pos(&mut self) {
- if self.pos.len() > 1 {
- self.pos.remove(0);
- } else {
- self.pos[0] = self.lexer.position();
+ // unfortunately calls to next_pos will never be perfectly balanced with push_pos,
+ // at very least because parse errors and EOF can happen unexpectedly without a prior push.
+ if self.pos.len() > 0 {
+ if self.pos.len() > 1 {
+ self.pos.remove(0);
+ } else {
+ self.pos[0] = self.lexer.position();
+ }
}
}
#[inline]
+ #[track_caller]
fn push_pos(&mut self) {
- self.pos.push(self.lexer.position());
+ debug_assert!(self.pos.len() != self.pos.capacity(), "You've found a bug in xml-rs, caused by calls to push_pos() in states that don't end up emitting events.
+ This case is ignored in release mode, and merely causes document positions to be out of sync.
+ Please file a bug and include the XML document that triggers this assert.");
+
+ // it has capacity preallocated for more than it ever needs, so this reduces code size
+ if self.pos.len() != self.pos.capacity() {
+ self.pos.push(self.lexer.position());
+ } else if self.pos.len() > 1 {
+ self.pos.remove(0); // this mitigates the excessive push_pos() call
+ }
}
+ #[inline(never)]
fn dispatch_token(&mut self, t: Token) -> Option<Result> {
- match self.st.clone() {
+ match self.st {
State::OutsideTag => self.outside_tag(t),
- State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s),
- State::InsideDeclaration(s) => self.inside_declaration(t, s),
- State::InsideDoctype => self.inside_doctype(t),
State::InsideOpeningTag(s) => self.inside_opening_tag(t, s),
State::InsideClosingTag(s) => self.inside_closing_tag_name(t, s),
+ State::InsideReference => self.inside_reference(t),
State::InsideComment => self.inside_comment(t),
State::InsideCData => self.inside_cdata(t),
- State::InsideReference(s) => self.inside_reference(t, *s)
+ State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s),
+ State::InsideDoctype(s) => self.inside_doctype(t, s),
+ State::InsideDeclaration(s) => self.inside_declaration(t, s),
+ State::DocumentStart => self.document_start(t),
}
}
@@ -358,18 +448,12 @@ impl PullParser {
#[inline]
fn buf_has_data(&self) -> bool {
- self.buf.len() > 0
+ !self.buf.is_empty()
}
#[inline]
fn take_buf(&mut self) -> String {
- mem::replace(&mut self.buf, String::new())
- }
-
- #[inline]
- fn append_char_continue(&mut self, c: char) -> Option<Result> {
- self.buf.push(c);
- None
+ std::mem::take(&mut self.buf)
}
#[inline]
@@ -402,11 +486,11 @@ impl PullParser {
self.read_prefix_separator = false;
}
- let invoke_callback = |this: &mut PullParser, t| {
+ let invoke_callback = move |this: &mut PullParser, t| {
let name = this.take_buf();
match name.parse() {
Ok(name) => on_name(this, t, name),
- Err(_) => Some(self_error!(this; "Qualified name is invalid: {}", name))
+ Err(_) => Some(this.error(SyntaxError::InvalidQualifiedName(name.into())))
}
};
@@ -418,9 +502,11 @@ impl PullParser {
None
}
- Token::Character(c) if c != ':' && (!self.buf_has_data() && is_name_start_char(c) ||
- self.buf_has_data() && is_name_char(c)) =>
- self.append_char_continue(c),
+ Token::Character(c) if c != ':' && (self.buf.is_empty() && is_name_start_char(c) ||
+ self.buf_has_data() && is_name_char(c)) => {
+ self.buf.push(c);
+ None
+ },
Token::EqualsSign if target == QualifiedNameTarget::AttributeNameTarget => invoke_callback(self, t),
@@ -429,9 +515,9 @@ impl PullParser {
Token::TagEnd if target == QualifiedNameTarget::OpeningTagNameTarget ||
target == QualifiedNameTarget::ClosingTagNameTarget => invoke_callback(self, t),
- Token::Whitespace(_) => invoke_callback(self, t),
+ Token::Character(c) if is_whitespace_char(c) => invoke_callback(self, t),
- _ => Some(self_error!(self; "Unexpected token inside qualified name: {}", t))
+ _ => Some(self.error(SyntaxError::UnexpectedQualifiedName(t)))
}
}
@@ -443,7 +529,7 @@ impl PullParser {
fn read_attribute_value<F>(&mut self, t: Token, on_value: F) -> Option<Result>
where F: Fn(&mut PullParser, String) -> Option<Result> {
match t {
- Token::Whitespace(_) if self.data.quote.is_none() => None, // skip leading whitespace
+ Token::Character(c) if self.data.quote.is_none() && is_whitespace_char(c) => None, // skip leading whitespace
Token::DoubleQuote | Token::SingleQuote => match self.data.quote {
None => { // Entered attribute value
@@ -456,45 +542,56 @@ impl PullParser {
on_value(self, value)
}
_ => {
+ if let Token::Character(c) = t {
+ if !self.is_valid_xml_char_not_restricted(c) {
+ return Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)));
+ }
+ }
t.push_to_string(&mut self.buf);
None
}
},
- Token::ReferenceStart => {
- let st = Box::new(self.st.clone());
- self.into_state_continue(State::InsideReference(st))
- }
+ Token::ReferenceStart if self.data.quote.is_some() => {
+ self.state_after_reference = self.st;
+ self.into_state_continue(State::InsideReference)
+ },
Token::OpeningTagStart =>
- Some(self_error!(self; "Unexpected token inside attribute value: <")),
+ Some(self.error(SyntaxError::UnexpectedOpeningTag)),
+
+ Token::Character(c) if !self.is_valid_xml_char_not_restricted(c) => {
+ Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
+ },
// Every character except " and ' and < is okay
- _ => {
+ _ if self.data.quote.is_some() => {
t.push_to_string(&mut self.buf);
None
}
+
+ _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
}
}
fn emit_start_element(&mut self, emit_end_element: bool) -> Option<Result> {
- let mut name = self.data.take_element_name().unwrap();
+ let mut name = self.data.take_element_name()?;
let mut attributes = self.data.take_attributes();
// check whether the name prefix is bound and fix its namespace
match self.nst.get(name.borrow().prefix_repr()) {
Some("") => name.namespace = None, // default namespace
Some(ns) => name.namespace = Some(ns.into()),
- None => return Some(self_error!(self; "Element {} prefix is unbound", name))
+ None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into())))
}
// check and fix accumulated attributes prefixes
- for attr in attributes.iter_mut() {
+ for attr in &mut attributes {
if let Some(ref pfx) = attr.name.prefix {
let new_ns = match self.nst.get(pfx) {
- Some("") => None, // default namespace
+ Some("") => None, // default namespace
Some(ns) => Some(ns.into()),
- None => return Some(self_error!(self; "Attribute {} prefix is unbound", attr.name))
+ None => return Some(self.error(SyntaxError::UnboundAttribute(attr.name.to_string().into())))
};
attr.name.namespace = new_ns;
}
@@ -510,44 +607,60 @@ impl PullParser {
}
let namespace = self.nst.squash();
self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartElement {
- name: name,
- attributes: attributes,
- namespace: namespace
+ name,
+ attributes,
+ namespace
}))
}
fn emit_end_element(&mut self) -> Option<Result> {
- let mut name = self.data.take_element_name().unwrap();
+ let mut name = self.data.take_element_name()?;
// check whether the name prefix is bound and fix its namespace
match self.nst.get(name.borrow().prefix_repr()) {
- Some("") => name.namespace = None, // default namespace
+ Some("") => name.namespace = None, // default namespace
Some(ns) => name.namespace = Some(ns.into()),
- None => return Some(self_error!(self; "Element {} prefix is unbound", name))
+ None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into())))
}
- let op_name = self.est.pop().unwrap();
+ let op_name = self.est.pop()?;
if name == op_name {
self.pop_namespace = true;
- self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name: name }))
+ self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name }))
} else {
- Some(self_error!(self; "Unexpected closing tag: {}, expected {}", name, op_name))
+ Some(self.error(SyntaxError::UnexpectedClosingTag(format!("{name} != {op_name}").into())))
}
}
+ #[inline]
+ fn is_valid_xml_char(&self, c: char) -> bool {
+ if Some(XmlVersion::Version11) == self.data.version {
+ is_xml11_char(c)
+ } else {
+ is_xml10_char(c)
+ }
+ }
+
+ #[inline]
+ fn is_valid_xml_char_not_restricted(&self, c: char) -> bool {
+ if Some(XmlVersion::Version11) == self.data.version {
+ is_xml11_char_not_restricted(c)
+ } else {
+ is_xml10_char(c)
+ }
+ }
}
#[cfg(test)]
mod tests {
use std::io::BufReader;
-
- use common::{Position, TextPosition};
- use name::OwnedName;
- use attribute::OwnedAttribute;
- use reader::parser::PullParser;
- use reader::ParserConfig;
- use reader::events::XmlEvent;
+ use crate::attribute::OwnedAttribute;
+ use crate::common::TextPosition;
+ use crate::name::OwnedName;
+ use crate::reader::events::XmlEvent;
+ use crate::reader::parser::PullParser;
+ use crate::reader::ParserConfig;
fn new_parser() -> PullParser {
PullParser::new(ParserConfig::new())
@@ -557,13 +670,13 @@ mod tests {
($r:expr, $p:expr, $t:pat) => (
match $p.next(&mut $r) {
$t => {}
- e => panic!("Unexpected event: {:?}", e)
+ e => panic!("Unexpected event: {e:?}\nExpected: {}", stringify!($t))
}
);
($r:expr, $p:expr, $t:pat => $c:expr ) => (
match $p.next(&mut $r) {
$t if $c => {}
- e => panic!("Unexpected event: {:?}", e)
+ e => panic!("Unexpected event: {e:?}\nExpected: {} if {}", stringify!($t), stringify!($c))
}
)
);
@@ -608,15 +721,76 @@ mod tests {
}
#[test]
+ fn issue_220_comment() {
+ let (mut r, mut p) = test_data!(r#"<x><!-- <!--></x>"#);
+ expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
+ expect_event!(r, p, Ok(XmlEvent::StartElement { .. }));
+ expect_event!(r, p, Ok(XmlEvent::EndElement { .. }));
+ expect_event!(r, p, Ok(XmlEvent::EndDocument));
+
+ let (mut r, mut p) = test_data!(r#"<x><!-- <!---></x>"#);
+ expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
+ expect_event!(r, p, Ok(XmlEvent::StartElement { .. }));
+ expect_event!(r, p, Err(_)); // ---> is forbidden in comments
+
+ let (mut r, mut p) = test_data!(r#"<x><!--<text&x;> <!--></x>"#);
+ p.config.c.ignore_comments = false;
+ expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
+ expect_event!(r, p, Ok(XmlEvent::StartElement { .. }));
+ expect_event!(r, p, Ok(XmlEvent::Comment(s)) => s == "<text&x;> <!");
+ expect_event!(r, p, Ok(XmlEvent::EndElement { .. }));
+ expect_event!(r, p, Ok(XmlEvent::EndDocument));
+ }
+
+ #[test]
+ fn malformed_declaration_attrs() {
+ let (mut r, mut p) = test_data!(r#"<?xml version x="1.0"?>"#);
+ expect_event!(r, p, Err(_));
+
+ let (mut r, mut p) = test_data!(r#"<?xml version="1.0" version="1.0"?>"#);
+ expect_event!(r, p, Err(_));
+
+ let (mut r, mut p) = test_data!(r#"<?xml version="1.0"encoding="utf-8"?>"#);
+ expect_event!(r, p, Err(_));
+
+ let (mut r, mut p) = test_data!(r#"<?xml version="1.0"standalone="yes"?>"#);
+ expect_event!(r, p, Err(_));
+
+ let (mut r, mut p) = test_data!(r#"<?xml version="1.0" encoding="utf-8"standalone="yes"?>"#);
+ expect_event!(r, p, Err(_));
+ }
+
+ #[test]
fn opening_tag_in_attribute_value() {
+ use crate::reader::error::{SyntaxError, Error, ErrorKind};
+
let (mut r, mut p) = test_data!(r#"
<a attr="zzz<zzz" />
"#);
expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
expect_event!(r, p, Err(ref e) =>
- e.msg() == "Unexpected token inside attribute value: <" &&
- e.position() == TextPosition { row: 1, column: 24 }
+ *e == Error {
+ kind: ErrorKind::Syntax(SyntaxError::UnexpectedOpeningTag.to_cow()),
+ pos: TextPosition { row: 1, column: 24 }
+ }
);
}
+
+ #[test]
+ fn reference_err() {
+ let (mut r, mut p) = test_data!(r#"
+ <a>&&amp;</a>
+ "#);
+
+ expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
+ expect_event!(r, p, Ok(XmlEvent::StartElement { .. }));
+ expect_event!(r, p, Err(_));
+ }
+
+ #[test]
+ fn state_size() {
+ assert_eq!(2, std::mem::size_of::<super::State>());
+ assert_eq!(1, std::mem::size_of::<super::DoctypeSubstate>());
+ }
}
diff --git a/src/reader/parser/inside_cdata.rs b/src/reader/parser/inside_cdata.rs
index 3269fb4..4f46f06 100644
--- a/src/reader/parser/inside_cdata.rs
+++ b/src/reader/parser/inside_cdata.rs
@@ -1,14 +1,14 @@
-use reader::events::XmlEvent;
-use reader::lexer::Token;
+use crate::reader::error::SyntaxError;
+use crate::reader::lexer::Token;
+use crate::{common::is_whitespace_char, reader::events::XmlEvent};
-use super::{Result, PullParser, State};
+use super::{PullParser, Result, State};
impl PullParser {
pub fn inside_cdata(&mut self, t: Token) -> Option<Result> {
match t {
Token::CDataEnd => {
- self.lexer.enable_errors();
- let event = if self.config.cdata_to_characters {
+ let event = if self.config.c.cdata_to_characters {
None
} else {
let data = self.take_buf();
@@ -17,16 +17,18 @@ impl PullParser {
self.into_state(State::OutsideTag, event)
}
- Token::Whitespace(_) => {
- t.push_to_string(&mut self.buf);
+ Token::Character(c) if !self.is_valid_xml_char(c) => {
+ Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
+ },
+ Token::Character(c) => {
+ if !is_whitespace_char(c) {
+ self.inside_whitespace = false;
+ }
+ self.buf.push(c);
None
}
- _ => {
- self.inside_whitespace = false;
- t.push_to_string(&mut self.buf);
- None
- }
+ _ => unreachable!(),
}
}
}
diff --git a/src/reader/parser/inside_closing_tag_name.rs b/src/reader/parser/inside_closing_tag_name.rs
index 1d8074a..6d86808 100644
--- a/src/reader/parser/inside_closing_tag_name.rs
+++ b/src/reader/parser/inside_closing_tag_name.rs
@@ -1,8 +1,7 @@
-use namespace;
-
-use reader::lexer::Token;
-
-use super::{Result, PullParser, State, QualifiedNameTarget, ClosingTagSubstate};
+use crate::reader::error::SyntaxError;
+use crate::{common::is_whitespace_char, namespace};
+use crate::reader::lexer::Token;
+use super::{ClosingTagSubstate, PullParser, QualifiedNameTarget, Result, State};
impl PullParser {
pub fn inside_closing_tag_name(&mut self, t: Token, s: ClosingTagSubstate) -> Option<Result> {
@@ -11,24 +10,22 @@ impl PullParser {
match name.prefix_ref() {
Some(prefix) if prefix == namespace::NS_XML_PREFIX ||
prefix == namespace::NS_XMLNS_PREFIX =>
- // TODO: {:?} is bad, need something better
- Some(self_error!(this; "'{:?}' cannot be an element name prefix", name.prefix)),
+ Some(this.error(SyntaxError::InvalidNamePrefix(prefix.into()))),
_ => {
this.data.element_name = Some(name.clone());
match token {
- Token::Whitespace(_) => this.into_state_continue(State::InsideClosingTag(ClosingTagSubstate::CTAfterName)),
Token::TagEnd => this.emit_end_element(),
- _ => Some(self_error!(this; "Unexpected token inside closing tag: {}", token))
+ Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideClosingTag(ClosingTagSubstate::CTAfterName)),
+ _ => Some(this.error(SyntaxError::UnexpectedTokenInClosingTag(token)))
}
}
}
}),
ClosingTagSubstate::CTAfterName => match t {
- Token::Whitespace(_) => None, // Skip whitespace
Token::TagEnd => self.emit_end_element(),
- _ => Some(self_error!(self; "Unexpected token inside closing tag: {}", t))
+ Token::Character(c) if is_whitespace_char(c) => None, // Skip whitespace
+ _ => Some(self.error(SyntaxError::UnexpectedTokenInClosingTag(t)))
}
}
}
-
}
diff --git a/src/reader/parser/inside_comment.rs b/src/reader/parser/inside_comment.rs
index fc98320..e4132c5 100644
--- a/src/reader/parser/inside_comment.rs
+++ b/src/reader/parser/inside_comment.rs
@@ -1,26 +1,26 @@
-use reader::events::XmlEvent;
-use reader::lexer::Token;
+use crate::reader::error::SyntaxError;
+use crate::reader::events::XmlEvent;
+use crate::reader::lexer::Token;
-use super::{Result, PullParser, State};
+use super::{PullParser, Result, State};
impl PullParser {
pub fn inside_comment(&mut self, t: Token) -> Option<Result> {
match t {
- // Double dash is illegal inside a comment
- Token::Chunk(ref s) if &s[..] == "--" => Some(self_error!(self; "Unexpected token inside a comment: --")),
-
- Token::CommentEnd if self.config.ignore_comments => {
- self.lexer.outside_comment();
+ Token::CommentEnd if self.config.c.ignore_comments => {
self.into_state_continue(State::OutsideTag)
}
Token::CommentEnd => {
- self.lexer.outside_comment();
let data = self.take_buf();
self.into_state_emit(State::OutsideTag, Ok(XmlEvent::Comment(data)))
}
- _ if self.config.ignore_comments => None, // Do not modify buffer if ignoring the comment
+ Token::Character(c) if !self.is_valid_xml_char(c) => {
+ Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
+ },
+
+ _ if self.config.c.ignore_comments => None, // Do not modify buffer if ignoring the comment
_ => {
t.push_to_string(&mut self.buf);
@@ -28,5 +28,4 @@ impl PullParser {
}
}
}
-
}
diff --git a/src/reader/parser/inside_declaration.rs b/src/reader/parser/inside_declaration.rs
index af39d10..4ff1427 100644
--- a/src/reader/parser/inside_declaration.rs
+++ b/src/reader/parser/inside_declaration.rs
@@ -1,44 +1,62 @@
-
-use common::XmlVersion;
-
-use reader::events::XmlEvent;
-use reader::lexer::Token;
+use crate::common::{is_whitespace_char, XmlVersion};
+use crate::reader::error::SyntaxError;
+use crate::reader::events::XmlEvent;
+use crate::reader::lexer::Token;
+use crate::util::Encoding;
use super::{
- Result, PullParser, State, DeclarationSubstate, QualifiedNameTarget,
- DEFAULT_VERSION, DEFAULT_ENCODING
+ DeclarationSubstate, Encountered, PullParser, QualifiedNameTarget, Result, State,
+ DEFAULT_VERSION,
};
impl PullParser {
+ #[inline(never)]
+ fn emit_start_document(&mut self) -> Option<Result> {
+ debug_assert!(self.encountered == Encountered::None);
+ self.encountered = Encountered::Declaration;
+
+ let version = self.data.version;
+ let encoding = self.data.take_encoding();
+ let standalone = self.data.standalone;
+
+ if let Some(new_encoding) = encoding.as_deref() {
+ let new_encoding = match new_encoding.parse() {
+ Ok(e) => e,
+ Err(_) if self.config.ignore_invalid_encoding_declarations => Encoding::Latin1,
+ Err(_) => return Some(self.error(SyntaxError::UnsupportedEncoding(new_encoding.into()))),
+ };
+ let current_encoding = self.lexer.encoding();
+ if current_encoding != new_encoding {
+ let set = match (current_encoding, new_encoding) {
+ (Encoding::Unknown | Encoding::Default, new) if new != Encoding::Utf16 => new,
+ (Encoding::Utf16Be | Encoding::Utf16Le, Encoding::Utf16) => current_encoding,
+ _ if self.config.ignore_invalid_encoding_declarations => current_encoding,
+ _ => return Some(self.error(SyntaxError::ConflictingEncoding(new_encoding, current_encoding))),
+ };
+ self.lexer.set_encoding(set);
+ }
+ }
+
+ let current_encoding = self.lexer.encoding();
+ self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartDocument {
+ version: version.unwrap_or(DEFAULT_VERSION),
+ encoding: encoding.unwrap_or_else(move || current_encoding.to_string()),
+ standalone
+ }))
+ }
+
// TODO: remove redundancy via macros or extra methods
pub fn inside_declaration(&mut self, t: Token, s: DeclarationSubstate) -> Option<Result> {
- macro_rules! unexpected_token(
- ($this:expr; $t:expr) => (Some($this.error(format!("Unexpected token inside XML declaration: {}", $t))));
- ($t:expr) => (unexpected_token!(self; $t));
- );
-
- #[inline]
- fn emit_start_document(this: &mut PullParser) -> Option<Result> {
- this.parsed_declaration = true;
- let version = this.data.take_version();
- let encoding = this.data.take_encoding();
- let standalone = this.data.take_standalone();
- this.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartDocument {
- version: version.unwrap_or(DEFAULT_VERSION),
- encoding: encoding.unwrap_or(DEFAULT_ENCODING.into()),
- standalone: standalone
- }))
- }
match s {
DeclarationSubstate::BeforeVersion => match t {
- Token::Whitespace(_) => None, // continue
Token::Character('v') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersion)),
- _ => unexpected_token!(t)
+ Token::Character(c) if is_whitespace_char(c) => None, // continue
+ _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
DeclarationSubstate::InsideVersion => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
- match &name.local_name[..] {
+ match &*name.local_name {
"ersion" if name.namespace.is_none() =>
this.into_state_continue(State::InsideDeclaration(
if token == Token::EqualsSign {
@@ -47,18 +65,18 @@ impl PullParser {
DeclarationSubstate::AfterVersion
}
)),
- _ => unexpected_token!(this; name)
+ _ => Some(this.error(SyntaxError::UnexpectedNameInsideXml(name.to_string().into()))),
}
}),
DeclarationSubstate::AfterVersion => match t {
- Token::Whitespace(_) => None,
Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersionValue)),
- _ => unexpected_token!(t)
+ Token::Character(c) if is_whitespace_char(c) => None,
+ _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
DeclarationSubstate::InsideVersionValue => self.read_attribute_value(t, |this, value| {
- this.data.version = match &value[..] {
+ this.data.version = match &*value {
"1.0" => Some(XmlVersion::Version10),
"1.1" => Some(XmlVersion::Version11),
_ => None
@@ -66,48 +84,60 @@ impl PullParser {
if this.data.version.is_some() {
this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterVersionValue))
} else {
- Some(self_error!(this; "Unexpected XML version value: {}", value))
+ Some(this.error(SyntaxError::UnexpectedXmlVersion(value.into())))
}
}),
DeclarationSubstate::AfterVersionValue => match t {
- Token::Whitespace(_) => None, // skip whitespace
+ Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeEncoding)),
+ Token::ProcessingInstructionEnd => self.emit_start_document(),
+ _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
+ },
+
+ DeclarationSubstate::BeforeEncoding => match t {
Token::Character('e') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncoding)),
Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)),
- Token::ProcessingInstructionEnd => emit_start_document(self),
- _ => unexpected_token!(t)
+ Token::ProcessingInstructionEnd => self.emit_start_document(),
+ Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace
+ _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
DeclarationSubstate::InsideEncoding => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
- match &name.local_name[..] {
+ match &*name.local_name {
"ncoding" if name.namespace.is_none() =>
this.into_state_continue(State::InsideDeclaration(
if token == Token::EqualsSign { DeclarationSubstate::InsideEncodingValue } else { DeclarationSubstate::AfterEncoding }
)),
- _ => unexpected_token!(this; name)
+ _ => Some(this.error(SyntaxError::UnexpectedName(name.to_string().into())))
}
}),
DeclarationSubstate::AfterEncoding => match t {
- Token::Whitespace(_) => None,
Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncodingValue)),
- _ => unexpected_token!(t)
+ Token::Character(c) if is_whitespace_char(c) => None,
+ _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
DeclarationSubstate::InsideEncodingValue => self.read_attribute_value(t, |this, value| {
this.data.encoding = Some(value);
- this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeStandaloneDecl))
+ this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterEncodingValue))
}),
+ DeclarationSubstate::AfterEncodingValue => match t {
+ Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeStandaloneDecl)),
+ Token::ProcessingInstructionEnd => self.emit_start_document(),
+ _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
+ },
+
DeclarationSubstate::BeforeStandaloneDecl => match t {
- Token::Whitespace(_) => None, // skip whitespace
Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)),
- Token::ProcessingInstructionEnd => emit_start_document(self),
- _ => unexpected_token!(t)
+ Token::ProcessingInstructionEnd => self.emit_start_document(),
+ Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace
+ _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
DeclarationSubstate::InsideStandaloneDecl => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
- match &name.local_name[..] {
+ match &*name.local_name {
"tandalone" if name.namespace.is_none() =>
this.into_state_continue(State::InsideDeclaration(
if token == Token::EqualsSign {
@@ -116,18 +146,18 @@ impl PullParser {
DeclarationSubstate::AfterStandaloneDecl
}
)),
- _ => unexpected_token!(this; name)
+ _ => Some(this.error(SyntaxError::UnexpectedName(name.to_string().into()))),
}
}),
DeclarationSubstate::AfterStandaloneDecl => match t {
- Token::Whitespace(_) => None,
Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDeclValue)),
- _ => unexpected_token!(t)
+ Token::Character(c) if is_whitespace_char(c) => None,
+ _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
DeclarationSubstate::InsideStandaloneDeclValue => self.read_attribute_value(t, |this, value| {
- let standalone = match &value[..] {
+ let standalone = match &*value {
"yes" => Some(true),
"no" => Some(false),
_ => None
@@ -136,16 +166,15 @@ impl PullParser {
this.data.standalone = standalone;
this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterStandaloneDeclValue))
} else {
- Some(self_error!(this; "Invalid standalone declaration value: {}", value))
+ Some(this.error(SyntaxError::InvalidStandaloneDeclaration(value.into())))
}
}),
DeclarationSubstate::AfterStandaloneDeclValue => match t {
- Token::Whitespace(_) => None, // skip whitespace
- Token::ProcessingInstructionEnd => emit_start_document(self),
- _ => unexpected_token!(t)
- }
+ Token::ProcessingInstructionEnd => self.emit_start_document(),
+ Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace
+ _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
+ },
}
}
-
}
diff --git a/src/reader/parser/inside_doctype.rs b/src/reader/parser/inside_doctype.rs
index 8dcf367..93ea470 100644
--- a/src/reader/parser/inside_doctype.rs
+++ b/src/reader/parser/inside_doctype.rs
@@ -1,16 +1,235 @@
-use reader::lexer::Token;
+use crate::reader::error::SyntaxError;
+use crate::common::{is_name_char, is_name_start_char, is_whitespace_char};
+use crate::reader::lexer::Token;
-use super::{Result, PullParser, State};
+use super::{DoctypeSubstate, PullParser, QuoteToken, Result, State};
impl PullParser {
- pub fn inside_doctype(&mut self, t: Token) -> Option<Result> {
- match t {
- Token::TagEnd => {
- self.lexer.enable_errors();
- self.into_state_continue(State::OutsideTag)
- }
+ pub fn inside_doctype(&mut self, t: Token, substate: DoctypeSubstate) -> Option<Result> {
+ match substate {
+ DoctypeSubstate::Outside => match t {
+ Token::TagEnd => self.into_state_continue(State::OutsideTag),
+ Token::MarkupDeclarationStart => {
+ self.buf.clear();
+ self.into_state_continue(State::InsideDoctype(DoctypeSubstate::InsideName))
+ },
+ Token::Character('%') => {
+ self.data.ref_data.clear();
+ self.data.ref_data.push('%');
+ self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceInDtd))
+ },
+ Token::CommentStart => {
+ self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Comment))
+ },
+ Token::SingleQuote | Token::DoubleQuote => {
+ // just discard string literals
+ self.data.quote = Some(super::QuoteToken::from_token(&t));
+ self.into_state_continue(State::InsideDoctype(DoctypeSubstate::String))
+ },
+ Token::CDataEnd | Token::CDataStart => Some(self.error(SyntaxError::UnexpectedToken(t))),
+ // TODO: parse SYSTEM, and [
+ _ => None,
+ },
+ DoctypeSubstate::String => match t {
+ Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => { None },
+ Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => { None },
+ Token::SingleQuote | Token::DoubleQuote => {
+ self.data.quote = None;
+ self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside))
+ },
+ _ => None,
+ },
+ DoctypeSubstate::Comment => match t {
+ Token::CommentEnd => {
+ self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside))
+ },
+ _ => None,
+ },
+ DoctypeSubstate::InsideName => match t {
+ Token::Character(c @ 'A'..='Z') => {
+ self.buf.push(c);
+ None
+ },
+ Token::Character(c) if is_whitespace_char(c) => {
+ match self.buf.as_str() {
+ "ENTITY" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityName)),
+ "NOTATION" | "ELEMENT" | "ATTLIST" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)),
+ s => Some(self.error(SyntaxError::UnknownMarkupDeclaration(s.into()))),
+ }
- _ => None
+ },
+ _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
+ },
+ DoctypeSubstate::BeforeEntityName => {
+ self.data.name.clear();
+ match t {
+ Token::Character(c) if is_whitespace_char(c) => None,
+ Token::Character('%') => { // % is for PEDecl
+ self.data.name.push('%');
+ self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceDefinitionStart))
+ },
+ Token::Character(c) if is_name_start_char(c) => {
+ self.data.name.push(c);
+ self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityName))
+ },
+ _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
+ }
+ },
+ DoctypeSubstate::EntityName => match t {
+ Token::Character(c) if is_whitespace_char(c) => {
+ self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue))
+ },
+ Token::Character(c) if is_name_char(c) => {
+ self.data.name.push(c);
+ None
+ },
+ _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
+ },
+ DoctypeSubstate::BeforeEntityValue => {
+ self.buf.clear();
+ match t {
+ Token::Character(c) if is_whitespace_char(c) => None,
+ // SYSTEM/PUBLIC not supported
+ Token::Character('S' | 'P') => {
+ let name = self.data.take_name();
+ self.entities.entry(name).or_insert_with(String::new); // Dummy value, but at least the name is recognized
+
+ self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration))
+ },
+ Token::SingleQuote | Token::DoubleQuote => {
+ self.data.quote = Some(super::QuoteToken::from_token(&t));
+ self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue))
+ },
+ _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
+ }
+ },
+ DoctypeSubstate::EntityValue => match t {
+ Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => { self.buf.push('\''); None },
+ Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => { self.buf.push('"'); None },
+ Token::SingleQuote | Token::DoubleQuote => {
+ self.data.quote = None;
+ let name = self.data.take_name();
+ let val = self.take_buf();
+ self.entities.entry(name).or_insert(val); // First wins
+ self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)) // FIXME
+ },
+ Token::ReferenceStart | Token::Character('&') => {
+ self.data.ref_data.clear();
+ self.into_state_continue(State::InsideDoctype(DoctypeSubstate::NumericReferenceStart))
+ },
+ Token::Character('%') => {
+ self.data.ref_data.clear();
+ self.data.ref_data.push('%'); // include literal % in the name to distinguish from regular entities
+ self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceInValue))
+ },
+ Token::Character(c) if !self.is_valid_xml_char(c) => {
+ Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
+ },
+ Token::Character(c) => {
+ self.buf.push(c);
+ None
+ },
+ _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
+ },
+ DoctypeSubstate::PEReferenceDefinitionStart => match t {
+ Token::Character(c) if is_whitespace_char(c) => {
+ None
+ },
+ Token::Character(c) if is_name_start_char(c) => {
+ debug_assert_eq!(self.data.name, "%");
+ self.data.name.push(c);
+ self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceDefinition))
+ },
+ _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
+ },
+ DoctypeSubstate::PEReferenceDefinition => match t {
+ Token::Character(c) if is_name_char(c) => {
+ self.data.name.push(c);
+ None
+ },
+ Token::Character(c) if is_whitespace_char(c) => {
+ self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue))
+ },
+ _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
+ },
+ DoctypeSubstate::PEReferenceInDtd => match t {
+ Token::Character(c) if is_name_char(c) => {
+ self.data.ref_data.push(c);
+ None
+ },
+ Token::ReferenceEnd | Token::Character(';') => {
+ let name = self.data.take_ref_data();
+ match self.entities.get(&name) {
+ Some(ent) => {
+ if let Err(e) = self.lexer.reparse(ent) {
+ return Some(Err(e));
+ }
+ self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside))
+ },
+ None => Some(self.error(SyntaxError::UndefinedEntity(name.into()))),
+ }
+ },
+ _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
+ },
+ DoctypeSubstate::PEReferenceInValue => match t {
+ Token::Character(c) if is_name_char(c) => {
+ self.data.ref_data.push(c);
+ None
+ },
+ Token::ReferenceEnd | Token::Character(';') => {
+ let name = self.data.take_ref_data();
+ match self.entities.get(&name) {
+ Some(ent) => {
+ self.buf.push_str(ent);
+ self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue))
+ },
+ None => Some(self.error(SyntaxError::UndefinedEntity(name.into()))),
+ }
+ },
+ _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
+ },
+ DoctypeSubstate::NumericReferenceStart => match t {
+ Token::Character('#') => {
+ self.into_state_continue(State::InsideDoctype(DoctypeSubstate::NumericReference))
+ },
+ Token::Character(c) if !self.is_valid_xml_char(c) => {
+ Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
+ },
+ Token::Character(c) => {
+ self.buf.push('&');
+ self.buf.push(c);
+ // named entities are not expanded inside doctype
+ self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue))
+ },
+ _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
+ },
+ DoctypeSubstate::NumericReference => match t {
+ Token::ReferenceEnd | Token::Character(';') => {
+ let r = self.data.take_ref_data();
+ // https://www.w3.org/TR/xml/#sec-entexpand
+ match self.numeric_reference_from_str(&r) {
+ Ok(c) => {
+ self.buf.push(c);
+ self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue))
+ }
+ Err(e) => Some(self.error(e)),
+ }
+ },
+ Token::Character(c) if !self.is_valid_xml_char(c) => {
+ Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
+ },
+ Token::Character(c) => {
+ self.data.ref_data.push(c);
+ None
+ },
+ _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
+ },
+ DoctypeSubstate::SkipDeclaration => match t {
+ Token::TagEnd => {
+ self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside))
+ },
+ _ => None,
+ },
}
}
}
diff --git a/src/reader/parser/inside_opening_tag.rs b/src/reader/parser/inside_opening_tag.rs
index 533874f..b7f185a 100644
--- a/src/reader/parser/inside_opening_tag.rs
+++ b/src/reader/parser/inside_opening_tag.rs
@@ -1,26 +1,26 @@
-use common::is_name_start_char;
-use attribute::OwnedAttribute;
-use namespace;
+use crate::reader::error::SyntaxError;
+use crate::common::is_name_start_char;
+use crate::namespace;
+use crate::{attribute::OwnedAttribute, common::is_whitespace_char};
-use reader::lexer::Token;
+use crate::reader::lexer::Token;
-use super::{Result, PullParser, State, OpeningTagSubstate, QualifiedNameTarget};
+use super::{OpeningTagSubstate, PullParser, QualifiedNameTarget, Result, State};
impl PullParser {
pub fn inside_opening_tag(&mut self, t: Token, s: OpeningTagSubstate) -> Option<Result> {
- macro_rules! unexpected_token(($t:expr) => (Some(self_error!(self; "Unexpected token inside opening tag: {}", $t))));
match s {
OpeningTagSubstate::InsideName => self.read_qualified_name(t, QualifiedNameTarget::OpeningTagNameTarget, |this, token, name| {
match name.prefix_ref() {
Some(prefix) if prefix == namespace::NS_XML_PREFIX ||
prefix == namespace::NS_XMLNS_PREFIX =>
- Some(self_error!(this; "'{:?}' cannot be an element name prefix", name.prefix)),
+ Some(this.error(SyntaxError::InvalidNamePrefix(prefix.into()))),
_ => {
this.data.element_name = Some(name.clone());
match token {
Token::TagEnd => this.emit_start_element(false),
Token::EmptyTagEnd => this.emit_start_element(true),
- Token::Whitespace(_) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)),
+ Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)),
_ => unreachable!()
}
}
@@ -28,66 +28,65 @@ impl PullParser {
}),
OpeningTagSubstate::InsideTag => match t {
- Token::Whitespace(_) => None, // skip whitespace
+ Token::TagEnd => self.emit_start_element(false),
+ Token::EmptyTagEnd => self.emit_start_element(true),
+ Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace
Token::Character(c) if is_name_start_char(c) => {
self.buf.push(c);
self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeName))
}
- Token::TagEnd => self.emit_start_element(false),
- Token::EmptyTagEnd => self.emit_start_element(true),
- _ => unexpected_token!(t)
+ _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t)))
},
OpeningTagSubstate::InsideAttributeName => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
this.data.attr_name = Some(name);
match token {
- Token::Whitespace(_) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeName)),
Token::EqualsSign => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)),
+ Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeName)),
_ => unreachable!()
}
}),
OpeningTagSubstate::AfterAttributeName => match t {
- Token::Whitespace(_) => None,
Token::EqualsSign => self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)),
- _ => unexpected_token!(t)
+ Token::Character(c) if is_whitespace_char(c) => None,
+ _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t)))
},
OpeningTagSubstate::InsideAttributeValue => self.read_attribute_value(t, |this, value| {
- let name = this.data.take_attr_name().unwrap(); // unwrap() will always succeed here
-
+ let name = this.data.take_attr_name()?; // will always succeed here
// check that no attribute with such name is already present
// if there is one, XML is not well-formed
- if this.data.attributes.iter().find(|a| a.name == name).is_some() { // TODO: looks bad
+ if this.data.attributes.iter().any(|a| a.name == name) { // TODO: looks bad
// TODO: ideally this error should point to the beginning of the attribute,
// TODO: not the end of its value
- Some(self_error!(this; "Attribute '{}' is redefined", name))
+ Some(this.error(SyntaxError::RedefinedAttribute(name.to_string().into())))
} else {
match name.prefix_ref() {
// declaring a new prefix; it is sufficient to check prefix only
// because "xmlns" prefix is reserved
Some(namespace::NS_XMLNS_PREFIX) => {
- let ln = &name.local_name[..];
+ let ln = &*name.local_name;
if ln == namespace::NS_XMLNS_PREFIX {
- Some(self_error!(this; "Cannot redefine prefix '{}'", namespace::NS_XMLNS_PREFIX))
- } else if ln == namespace::NS_XML_PREFIX && &value[..] != namespace::NS_XML_URI {
- Some(self_error!(this; "Prefix '{}' cannot be rebound to another value", namespace::NS_XML_PREFIX))
+ Some(this.error(SyntaxError::CannotRedefineXmlnsPrefix))
+ } else if ln == namespace::NS_XML_PREFIX && &*value != namespace::NS_XML_URI {
+ Some(this.error(SyntaxError::CannotRedefineXmlPrefix))
} else if value.is_empty() {
- Some(self_error!(this; "Cannot undefine prefix '{}'", ln))
+ Some(this.error(SyntaxError::CannotUndefinePrefix(ln.into())))
} else {
this.nst.put(name.local_name.clone(), value);
- this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag))
+ this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
}
}
// declaring default namespace
- None if &name.local_name[..] == namespace::NS_XMLNS_PREFIX =>
- match &value[..] {
- namespace::NS_XMLNS_PREFIX | namespace::NS_XML_PREFIX =>
- Some(self_error!(this; "Namespace '{}' cannot be default", value)),
+ None if &*name.local_name == namespace::NS_XMLNS_PREFIX =>
+ match &*value {
+ namespace::NS_XMLNS_PREFIX | namespace::NS_XML_PREFIX | namespace::NS_XML_URI | namespace::NS_XMLNS_URI =>
+ Some(this.error(SyntaxError::InvalidDefaultNamespace(value.into()))),
_ => {
this.nst.put(namespace::NS_NO_PREFIX, value.clone());
- this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag))
+ this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
}
},
@@ -95,14 +94,20 @@ impl PullParser {
_ => {
this.data.attributes.push(OwnedAttribute {
name: name.clone(),
- value: value
+ value
});
- this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag))
+ this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
}
}
}
- })
+ }),
+
+ OpeningTagSubstate::AfterAttributeValue => match t {
+ Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)),
+ Token::TagEnd => self.emit_start_element(false),
+ Token::EmptyTagEnd => self.emit_start_element(true),
+ _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t)))
+ },
}
}
-
}
diff --git a/src/reader/parser/inside_processing_instruction.rs b/src/reader/parser/inside_processing_instruction.rs
index 8ddf6b8..96f6753 100644
--- a/src/reader/parser/inside_processing_instruction.rs
+++ b/src/reader/parser/inside_processing_instruction.rs
@@ -1,18 +1,20 @@
-use common::{
- is_name_start_char, is_name_char,
-};
+use crate::reader::error::SyntaxError;
+use crate::common::{is_name_char, is_name_start_char, is_whitespace_char};
-use reader::events::XmlEvent;
-use reader::lexer::Token;
+use crate::reader::events::XmlEvent;
+use crate::reader::lexer::Token;
-use super::{Result, PullParser, State, ProcessingInstructionSubstate, DeclarationSubstate};
+use super::{DeclarationSubstate, ProcessingInstructionSubstate, PullParser, Result, State, Encountered};
impl PullParser {
pub fn inside_processing_instruction(&mut self, t: Token, s: ProcessingInstructionSubstate) -> Option<Result> {
match s {
ProcessingInstructionSubstate::PIInsideName => match t {
- Token::Character(c) if !self.buf_has_data() && is_name_start_char(c) ||
- self.buf_has_data() && is_name_char(c) => self.append_char_continue(c),
+ Token::Character(c) if self.buf.is_empty() && is_name_start_char(c) ||
+ self.buf_has_data() && is_name_char(c) => {
+ self.buf.push(c);
+ None
+ },
Token::ProcessingInstructionEnd => {
// self.buf contains PI name
@@ -20,70 +22,83 @@ impl PullParser {
// Don't need to check for declaration because it has mandatory attributes
// but there is none
- match &name[..] {
+ match &*name {
// Name is empty, it is an error
- "" => Some(self_error!(self; "Encountered processing instruction without name")),
+ "" => Some(self.error(SyntaxError::ProcessingInstructionWithoutName)),
// Found <?xml-like PI not at the beginning of a document,
// it is an error - see section 2.6 of XML 1.1 spec
- "xml"|"xmL"|"xMl"|"xML"|"Xml"|"XmL"|"XMl"|"XML" =>
- Some(self_error!(self; "Invalid processing instruction: <?{}", name)),
+ n if "xml".eq_ignore_ascii_case(n) =>
+ Some(self.error(SyntaxError::InvalidXmlProcessingInstruction(name.into()))),
// All is ok, emitting event
_ => {
- self.into_state_emit(
- State::OutsideTag,
- Ok(XmlEvent::ProcessingInstruction {
- name: name,
- data: None
- })
- )
+ debug_assert!(self.next_event.is_none(), "{:?}", self.next_event);
+ // can't have a PI before `<?xml`
+ let event1 = self.set_encountered(Encountered::Declaration);
+ let event2 = Some(Ok(XmlEvent::ProcessingInstruction {
+ name,
+ data: None
+ }));
+ // emitting two events at once is cumbersome
+ let event1 = if event1.is_some() {
+ self.next_event = event2;
+ event1
+ } else {
+ event2
+ };
+ self.into_state(State::OutsideTag, event1)
}
}
}
- Token::Whitespace(_) => {
+ Token::Character(c) if is_whitespace_char(c) => {
// self.buf contains PI name
let name = self.take_buf();
- match &name[..] {
+ match &*name {
// We have not ever encountered an element and have not parsed XML declaration
- "xml" if !self.encountered_element && !self.parsed_declaration =>
+ "xml" if self.encountered == Encountered::None =>
self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeVersion)),
// Found <?xml-like PI after the beginning of a document,
// it is an error - see section 2.6 of XML 1.1 spec
- "xml"|"xmL"|"xMl"|"xML"|"Xml"|"XmL"|"XMl"|"XML"
- if self.encountered_element || self.parsed_declaration =>
- Some(self_error!(self; "Invalid processing instruction: <?{}", name)),
+ n if "xml".eq_ignore_ascii_case(n) =>
+ Some(self.error(SyntaxError::InvalidXmlProcessingInstruction(name.into()))),
// All is ok, starting parsing PI data
_ => {
- self.lexer.disable_errors(); // data is arbitrary, so disable errors
self.data.name = name;
- self.into_state_continue(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideData))
+ // can't have a PI before `<?xml`
+ let next_event = self.set_encountered(Encountered::Declaration);
+ self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideData), next_event)
}
-
}
}
- _ => Some(self_error!(self; "Unexpected token: <?{}{}", self.buf, t))
+ _ => {
+ let buf = self.take_buf();
+ Some(self.error(SyntaxError::UnexpectedProcessingInstruction(buf.into(), t)))
+ }
},
ProcessingInstructionSubstate::PIInsideData => match t {
Token::ProcessingInstructionEnd => {
- self.lexer.enable_errors();
let name = self.data.take_name();
let data = self.take_buf();
self.into_state_emit(
State::OutsideTag,
Ok(XmlEvent::ProcessingInstruction {
- name: name,
- data: Some(data)
- })
+ name,
+ data: Some(data),
+ }),
)
},
+ Token::Character(c) if !self.is_valid_xml_char(c) => {
+ Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
+ },
+
// Any other token should be treated as plain characters
_ => {
t.push_to_string(&mut self.buf);
@@ -92,5 +107,4 @@ impl PullParser {
},
}
}
-
}
diff --git a/src/reader/parser/inside_reference.rs b/src/reader/parser/inside_reference.rs
index 60026d5..9a15e09 100644
--- a/src/reader/parser/inside_reference.rs
+++ b/src/reader/parser/inside_reference.rs
@@ -1,13 +1,11 @@
+use crate::reader::error::SyntaxError;
use std::char;
-
-use common::{is_name_start_char, is_name_char, is_whitespace_str};
-
-use reader::lexer::Token;
-
-use super::{Result, PullParser, State};
+use crate::common::{is_name_char, is_name_start_char, is_whitespace_char};
+use crate::reader::lexer::Token;
+use super::{PullParser, Result, State};
impl PullParser {
- pub fn inside_reference(&mut self, t: Token, prev_st: State) -> Option<Result> {
+ pub fn inside_reference(&mut self, t: Token) -> Option<Result> {
match t {
Token::Character(c) if !self.data.ref_data.is_empty() && is_name_char(c) ||
self.data.ref_data.is_empty() && (is_name_start_char(c) || c == '#') => {
@@ -16,74 +14,64 @@ impl PullParser {
}
Token::ReferenceEnd => {
- // TODO: check for unicode correctness
let name = self.data.take_ref_data();
- let name_len = name.len(); // compute once
- let c = match &name[..] {
- "lt" => Ok('<'.to_string()),
- "gt" => Ok('>'.to_string()),
- "amp" => Ok('&'.to_string()),
- "apos" => Ok('\''.to_string()),
- "quot" => Ok('"'.to_string()),
- "" => Err(self_error!(self; "Encountered empty entity")),
- _ if name_len > 2 && name.starts_with("#x") => {
- let num_str = &name[2..name_len];
- if num_str == "0" {
- Err(self_error!(self; "Null character entity is not allowed"))
- } else {
- if self.config.replace_unknown_entity_references {
- match u32::from_str_radix(num_str, 16).ok().map(|i| char::from_u32(i).unwrap_or('\u{fffd}')) {
- Some(c) => Ok(c.to_string()),
- None => Err(self_error!(self; "Invalid hexadecimal character number in an entity: {}", name))
- }
- } else {
- match u32::from_str_radix(num_str, 16).ok().and_then(char::from_u32) {
- Some(c) => Ok(c.to_string()),
- None => Err(self_error!(self; "Invalid hexadecimal character number in an entity: {}", name))
- }
- }
- }
- }
- _ if name_len > 1 && name.starts_with('#') => {
- let num_str = &name[1..name_len];
- if num_str == "0" {
- Err(self_error!(self; "Null character entity is not allowed"))
- } else {
- if self.config.replace_unknown_entity_references {
- match u32::from_str_radix(num_str, 10).ok().map(|i| char::from_u32(i).unwrap_or('\u{fffd}')) {
- Some(c) => Ok(c.to_string()),
- None => Err(self_error!(self; "Invalid decimal character number in an entity: {}", name))
- }
- }
- else {
- match u32::from_str_radix(num_str, 10).ok().and_then(char::from_u32) {
- Some(c) => Ok(c.to_string()),
- None => Err(self_error!(self; "Invalid decimal character number in an entity: {}", name))
- }
- }
- }
+ if name.is_empty() {
+ return Some(self.error(SyntaxError::EmptyEntity));
+ }
+
+ let c = match &*name {
+ "lt" => Some('<'),
+ "gt" => Some('>'),
+ "amp" => Some('&'),
+ "apos" => Some('\''),
+ "quot" => Some('"'),
+ _ if name.starts_with('#') => match self.numeric_reference_from_str(&name[1..]) {
+ Ok(c) => Some(c),
+ Err(e) => return Some(self.error(e))
},
- _ => {
- if let Some(v) = self.config.extra_entities.get(&name) {
- Ok(v.clone())
- } else {
- Err(self_error!(self; "Unexpected entity: {}", name))
- }
- }
+ _ => None,
};
- match c {
- Ok(c) => {
- self.buf.push_str(&c);
- if prev_st == State::OutsideTag && !is_whitespace_str(&c) {
- self.inside_whitespace = false;
+ if let Some(c) = c {
+ self.buf.push(c);
+ } else if let Some(v) = self.config.c.extra_entities.get(&name) {
+ self.buf.push_str(v);
+ } else if let Some(v) = self.entities.get(&name) {
+ if self.state_after_reference == State::OutsideTag {
+ // an entity can expand to *elements*, so outside of a tag it needs a full reparse
+ if let Err(e) = self.lexer.reparse(v) {
+ return Some(Err(e));
}
- self.into_state_continue(prev_st)
+ } else {
+ // however, inside attributes it's not allowed to affect attribute quoting,
+ // so it can't be fed to the lexer
+ self.buf.push_str(v);
}
- Err(e) => Some(e)
+ } else {
+ return Some(self.error(SyntaxError::UnexpectedEntity(name.into())));
+ }
+ let prev_st = self.state_after_reference;
+ if prev_st == State::OutsideTag && !is_whitespace_char(self.buf.chars().last().unwrap_or('\0')) {
+ self.inside_whitespace = false;
}
+ self.into_state_continue(prev_st)
}
- _ => Some(self_error!(self; "Unexpected token inside an entity: {}", t))
+ _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
+ }
+ }
+
+ pub(crate) fn numeric_reference_from_str(&self, num_str: &str) -> std::result::Result<char, SyntaxError> {
+ let val = if let Some(hex) = num_str.strip_prefix('x') {
+ u32::from_str_radix(hex, 16).map_err(move |_| SyntaxError::InvalidNumericEntity(num_str.into()))?
+ } else {
+ u32::from_str_radix(num_str, 10).map_err(move |_| SyntaxError::InvalidNumericEntity(num_str.into()))?
+ };
+ match char::from_u32(val) {
+ Some(c) if self.is_valid_xml_char(c) => Ok(c),
+ None if self.config.c.replace_unknown_entity_references => {
+ Ok('\u{fffd}')
+ },
+ _ => Err(SyntaxError::InvalidCharacterEntity(val)),
}
}
}
diff --git a/src/reader/parser/outside_tag.rs b/src/reader/parser/outside_tag.rs
index d3f7598..8104224 100644
--- a/src/reader/parser/outside_tag.rs
+++ b/src/reader/parser/outside_tag.rs
@@ -1,130 +1,196 @@
-use common::is_whitespace_char;
-
-use reader::events::XmlEvent;
-use reader::lexer::Token;
+use crate::reader::error::SyntaxError;
+use crate::common::is_whitespace_char;
+use crate::reader::events::XmlEvent;
+use crate::reader::lexer::Token;
use super::{
- Result, PullParser, State, ClosingTagSubstate, OpeningTagSubstate,
- ProcessingInstructionSubstate, DEFAULT_VERSION, DEFAULT_ENCODING, DEFAULT_STANDALONE
+ ClosingTagSubstate, DoctypeSubstate, Encountered, OpeningTagSubstate,
+ ProcessingInstructionSubstate, PullParser, Result, State,
};
impl PullParser {
pub fn outside_tag(&mut self, t: Token) -> Option<Result> {
match t {
- Token::ReferenceStart =>
- self.into_state_continue(State::InsideReference(Box::new(State::OutsideTag))),
-
- Token::Whitespace(_) if self.depth() == 0 && self.config.ignore_root_level_whitespace => None, // skip whitespace outside of the root element
+ Token::Character(c) => {
+ if is_whitespace_char(c) {
+ // skip whitespace outside of the root element
+ if (self.config.c.trim_whitespace && self.buf.is_empty()) ||
+ (self.depth() == 0 && self.config.c.ignore_root_level_whitespace) {
+ return None;
+ }
+ } else {
+ self.inside_whitespace = false;
+ if self.depth() == 0 {
+ return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t)));
+ }
+ }
- Token::Whitespace(_) if self.config.trim_whitespace && !self.buf_has_data() => None,
+ if !self.is_valid_xml_char_not_restricted(c) {
+ return Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)));
+ }
- Token::Whitespace(c) => {
- if !self.buf_has_data() {
+ if self.buf.is_empty() {
self.push_pos();
}
- self.append_char_continue(c)
- }
-
- _ if t.contains_char_data() && self.depth() == 0 =>
- Some(self_error!(self; "Unexpected characters outside the root element: {}", t)),
+ self.buf.push(c);
+ None
+ },
- _ if t.contains_char_data() => { // Non-whitespace char data
- if !self.buf_has_data() {
- self.push_pos();
+ Token::CommentEnd | Token::TagEnd | Token::EqualsSign |
+ Token::DoubleQuote | Token::SingleQuote |
+ Token::ProcessingInstructionEnd | Token::EmptyTagEnd => {
+ if self.depth() == 0 {
+ return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t)));
}
self.inside_whitespace = false;
- t.push_to_string(&mut self.buf);
+
+ if let Some(s) = t.as_static_str() {
+ if self.buf.is_empty() {
+ self.push_pos();
+ }
+ self.buf.push_str(s);
+ }
None
- }
+ },
+
+ Token::ReferenceStart if self.depth() > 0 => {
+ self.state_after_reference = State::OutsideTag;
+ self.into_state_continue(State::InsideReference)
+ },
- Token::ReferenceEnd => { // Semi-colon in a text outside an entity
+ Token::ReferenceEnd if self.depth() > 0 => { // Semi-colon in a text outside an entity
self.inside_whitespace = false;
Token::ReferenceEnd.push_to_string(&mut self.buf);
None
- }
+ },
- Token::CommentStart if self.config.coalesce_characters && self.config.ignore_comments => {
+ Token::CommentStart if self.config.c.coalesce_characters && self.config.c.ignore_comments => {
+ let next_event = self.set_encountered(Encountered::Comment);
// We need to switch the lexer into a comment mode inside comments
- self.lexer.inside_comment();
- self.into_state_continue(State::InsideComment)
+ self.into_state(State::InsideComment, next_event)
}
- Token::CDataStart if self.config.coalesce_characters && self.config.cdata_to_characters => {
- if !self.buf_has_data() {
+ Token::CDataStart if self.depth() > 0 && self.config.c.coalesce_characters && self.config.c.cdata_to_characters => {
+ if self.buf.is_empty() {
self.push_pos();
}
- // We need to disable lexing errors inside CDATA
- self.lexer.disable_errors();
self.into_state_continue(State::InsideCData)
- }
+ },
_ => {
// Encountered some markup event, flush the buffer as characters
// or a whitespace
let mut next_event = if self.buf_has_data() {
let buf = self.take_buf();
- if self.inside_whitespace && self.config.trim_whitespace {
+ if self.inside_whitespace && self.config.c.trim_whitespace {
None
- } else if self.inside_whitespace && !self.config.whitespace_to_characters {
+ } else if self.inside_whitespace && !self.config.c.whitespace_to_characters {
Some(Ok(XmlEvent::Whitespace(buf)))
- } else if self.config.trim_whitespace {
+ } else if self.config.c.trim_whitespace {
Some(Ok(XmlEvent::Characters(buf.trim_matches(is_whitespace_char).into())))
} else {
Some(Ok(XmlEvent::Characters(buf)))
}
} else { None };
self.inside_whitespace = true; // Reset inside_whitespace flag
- self.push_pos();
- match t {
- Token::ProcessingInstructionStart =>
- self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName), next_event),
-
- Token::DoctypeStart if !self.encountered_element => {
- // We don't have a doctype event so skip this position
- // FIXME: update when we have a doctype event
- self.next_pos();
- self.lexer.disable_errors();
- self.into_state(State::InsideDoctype, next_event)
- }
- Token::OpeningTagStart => {
- // If declaration was not parsed and we have encountered an element,
- // emit this declaration as the next event.
- if !self.parsed_declaration {
- self.parsed_declaration = true;
- let sd_event = XmlEvent::StartDocument {
- version: DEFAULT_VERSION,
- encoding: DEFAULT_ENCODING.into(),
- standalone: DEFAULT_STANDALONE
- };
- // next_event is always none here because we're outside of
- // the root element
- next_event = Some(Ok(sd_event));
- self.push_pos();
+ // pos is popped whenever an event is emitted, so pushes must happen only if there will be an event to balance it
+ // and ignored comments don't pop
+ if t != Token::CommentStart || !self.config.c.ignore_comments {
+ self.push_pos();
+ }
+ match t {
+ Token::OpeningTagStart if self.depth() > 0 || self.encountered < Encountered::Element || self.config.allow_multiple_root_elements => {
+ if let Some(e) = self.set_encountered(Encountered::Element) {
+ next_event = Some(e);
}
- self.encountered_element = true;
self.nst.push_empty();
self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event)
- }
+ },
Token::ClosingTagStart if self.depth() > 0 =>
self.into_state(State::InsideClosingTag(ClosingTagSubstate::CTInsideName), next_event),
Token::CommentStart => {
+ if let Some(e) = self.set_encountered(Encountered::Comment) {
+ next_event = Some(e);
+ }
// We need to switch the lexer into a comment mode inside comments
- self.lexer.inside_comment();
self.into_state(State::InsideComment, next_event)
- }
+ },
+
+ Token::DoctypeStart if self.encountered < Encountered::Doctype => {
+ if let Some(e) = self.set_encountered(Encountered::Doctype) {
+ next_event = Some(e);
+ }
- Token::CDataStart => {
- // We need to disable lexing errors inside CDATA
- self.lexer.disable_errors();
+ // We don't have a doctype event so skip this position
+ // FIXME: update when we have a doctype event
+ self.next_pos();
+ self.into_state(State::InsideDoctype(DoctypeSubstate::Outside), next_event)
+ },
+
+ Token::ProcessingInstructionStart =>
+ self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName), next_event),
+
+ Token::CDataStart if self.depth() > 0 => {
self.into_state(State::InsideCData, next_event)
- }
+ },
- _ => Some(self_error!(self; "Unexpected token: {}", t))
+ _ => Some(self.error(SyntaxError::UnexpectedToken(t)))
}
}
}
}
+
+ /// Handles token `t` while nothing at or beyond `Encountered::Declaration` has been
+ /// recorded yet (checked by the `debug_assert!` below).
+ ///
+ /// Every arm except the processing-instruction one first calls `set_encountered` and,
+ /// unless it bails out with an error, forwards the event it returned to `into_state`:
+ ///
+ /// * character: non-whitespace is rejected with `SyntaxError::UnexpectedTokenOutsideRoot`;
+ /// whitespace is either skipped or pushed onto `self.buf`, then the state becomes
+ /// `State::OutsideTag`;
+ /// * comment start: the state becomes `State::InsideComment`;
+ /// * opening tag start: calls `self.nst.push_empty()` and enters `State::InsideOpeningTag`;
+ /// * doctype start: calls `self.next_pos()` and enters `State::InsideDoctype`;
+ /// * processing instruction start: calls `self.push_pos()` and enters
+ /// `State::InsideProcessingInstruction` through `into_state_continue`;
+ /// * any other token: `SyntaxError::UnexpectedToken`.
+ ///
+ /// NOTE(review): presumably this is only invoked for the very first tokens of a document;
+ /// confirm against the caller.
+ pub fn document_start(&mut self, t: Token) -> Option<Result> {
+ debug_assert!(self.encountered < Encountered::Declaration);
+
+ match t {
+ Token::Character(c) => {
+ let next_event = self.set_encountered(Encountered::AnyChars);
+
+ // NOTE(review): `next_event` is dropped on this error path; confirm that is intended
+ if !is_whitespace_char(c) {
+ return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t)));
+ }
+ self.inside_whitespace = true;
+
+ // skip whitespace outside of the root element
+ if (self.config.c.trim_whitespace && self.buf.is_empty()) ||
+ (self.depth() == 0 && self.config.c.ignore_root_level_whitespace) {
+ return self.into_state(State::OutsideTag, next_event);
+ }
+
+ // whitespace is kept: remember where it starts and buffer it
+ self.push_pos();
+ self.buf.push(c);
+ self.into_state(State::OutsideTag, next_event)
+ },
+
+ Token::CommentStart => {
+ let next_event = self.set_encountered(Encountered::Comment);
+ self.into_state(State::InsideComment, next_event)
+ }
+
+ Token::OpeningTagStart => {
+ let next_event = self.set_encountered(Encountered::Element);
+ self.nst.push_empty();
+ self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event)
+ },
+
+ Token::DoctypeStart => {
+ let next_event = self.set_encountered(Encountered::Doctype);
+ // We don't have a doctype event so skip this position
+ // FIXME: update when we have a doctype event
+ self.next_pos();
+ self.into_state(State::InsideDoctype(DoctypeSubstate::Outside), next_event)
+ },
+
+ Token::ProcessingInstructionStart => {
+ // no `set_encountered` call here, unlike the other arms
+ self.push_pos();
+ self.into_state_continue(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName))
+ },
+
+ _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
+ }
+ }
}
diff --git a/src/util.rs b/src/util.rs
index 23fee04..07d0336 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -1,107 +1,305 @@
-use std::io::{self, Read};
-use std::str;
use std::fmt;
+use std::io::{self, Read};
+use std::str::{self, FromStr};
#[derive(Debug)]
pub enum CharReadError {
UnexpectedEof,
Utf8(str::Utf8Error),
- Io(io::Error)
+ Io(io::Error),
}
impl From<str::Utf8Error> for CharReadError {
+ #[cold]
fn from(e: str::Utf8Error) -> CharReadError {
CharReadError::Utf8(e)
}
}
impl From<io::Error> for CharReadError {
+ #[cold]
fn from(e: io::Error) -> CharReadError {
CharReadError::Io(e)
}
}
impl fmt::Display for CharReadError {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- use self::CharReadError::*;
+ #[cold]
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ use self::CharReadError::{Io, UnexpectedEof, Utf8};
match *self {
UnexpectedEof => write!(f, "unexpected end of stream"),
- Utf8(ref e) => write!(f, "UTF-8 decoding error: {}", e),
- Io(ref e) => write!(f, "I/O error: {}", e)
+ Utf8(ref e) => write!(f, "UTF-8 decoding error: {e}"),
+ Io(ref e) => write!(f, "I/O error: {e}"),
}
}
}
-pub fn next_char_from<R: Read>(source: &mut R) -> Result<Option<char>, CharReadError> {
- const MAX_CODEPOINT_LEN: usize = 4;
+/// Character encoding used for parsing.
+///
+/// `Utf16` and `Unknown` are provisional: the character reader replaces them with a
+/// concrete variant once it has sniffed the first bytes of the input.
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+#[non_exhaustive]
+pub enum Encoding {
+ /// Explicitly UTF-8 only
+ Utf8,
+ /// UTF-8 fallback, but can be any 8-bit encoding (decoded as UTF-8 by the reader)
+ Default,
+ /// ISO-8859-1: every byte is mapped to the code point of the same value
+ Latin1,
+ /// US-ASCII: bytes outside the ASCII range are rejected by the reader
+ Ascii,
+ /// UTF-16, big-endian byte order
+ Utf16Be,
+ /// UTF-16, little-endian byte order
+ Utf16Le,
+ /// UTF-16 whose endianness is not known yet, will be sniffed
+ Utf16,
+ /// Not determined yet, may be sniffed to be anything
+ Unknown,
+}
- let mut bytes = source.bytes();
- let mut buf = [0u8; MAX_CODEPOINT_LEN];
- let mut pos = 0;
+/// Returns `true` when `varcase` is equal to `lower`, ignoring ASCII case.
+///
+/// `lower` must already be ASCII-lowercase; only `varcase` is case-folded.
+// Rustc inlines eq_ignore_ascii_case and creates kilobytes of code!
+#[inline(never)]
+fn icmp(lower: &str, varcase: &str) -> bool {
+    // `zip` stops at the shorter input, so without the length check every prefix of
+    // `lower` (including the empty string) would compare equal.
+    lower.len() == varcase.len()
+        && lower.bytes().zip(varcase.bytes()).all(|(l, v)| l == v.to_ascii_lowercase())
+}
- loop {
- let next = match bytes.next() {
- Some(Ok(b)) => b,
- Some(Err(e)) => return Err(e.into()),
- None if pos == 0 => return Ok(None),
- None => return Err(CharReadError::UnexpectedEof)
- };
- buf[pos] = next;
- pos += 1;
+/// Parses an encoding label (for example `utf-8` or `latin1`), ignoring ASCII case.
+impl FromStr for Encoding {
+    type Err = &'static str;
+
+    fn from_str(val: &str) -> Result<Self, Self::Err> {
+        // Labels are tried in this order; the first one accepted by `icmp` wins.
+        const LABELS: [(&str, Encoding); 8] = [
+            ("utf-8", Encoding::Utf8),
+            ("utf8", Encoding::Utf8),
+            ("iso-8859-1", Encoding::Latin1),
+            ("latin1", Encoding::Latin1),
+            ("utf-16", Encoding::Utf16),
+            ("utf16", Encoding::Utf16),
+            ("ascii", Encoding::Ascii),
+            ("us-ascii", Encoding::Ascii),
+        ];
+
+        LABELS
+            .iter()
+            .find(|(label, _)| icmp(label, val))
+            .map(|&(_, encoding)| encoding)
+            .ok_or("unknown encoding name")
+    }
+}
+
+impl fmt::Display for Encoding {
+    /// Writes the label of the encoding; variants that share a label share an arm.
+    #[cold]
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let label = match self {
+            Encoding::Utf8 | Encoding::Default => "UTF-8",
+            Encoding::Latin1 => "ISO-8859-1",
+            Encoding::Ascii => "US-ASCII",
+            Encoding::Utf16Be | Encoding::Utf16Le | Encoding::Utf16 => "UTF-16",
+            Encoding::Unknown => "(unknown)",
+        };
+        f.write_str(label)
+    }
+}
+
+/// Decoder state that persists between calls to `next_char_from`.
+pub(crate) struct CharReader {
+ /// Encoding used to decode the next bytes. `new()` starts with `Encoding::Unknown`;
+ /// `next_char_from` overwrites it once it has sniffed a BOM or the first bytes.
+ pub encoding: Encoding,
+}
+
+impl CharReader {
+ /// Creates a reader whose encoding is still `Encoding::Unknown`, to be sniffed from the input.
+ pub fn new() -> Self {
+ Self {
+ encoding: Encoding::Unknown,
+ }
+ }
+
+ /// Reads and decodes one character from `source` according to `self.encoding`.
+ ///
+ /// While the encoding is `Unknown` or `Utf16`, the leading bytes are sniffed for a
+ /// UTF-8 or UTF-16 byte order mark; a recognised BOM is consumed and `self.encoding`
+ /// is updated. Input without a BOM falls back to `Encoding::Default` (decoded as UTF-8).
+ ///
+ /// Returns `Ok(None)` when the source ends before any byte of a new character was read.
+ ///
+ /// # Errors
+ ///
+ /// * `CharReadError::UnexpectedEof` if the source ends in the middle of a character;
+ /// * `CharReadError::Utf8` for an invalid UTF-8 sequence;
+ /// * `CharReadError::Io` for read errors, for non-ASCII bytes in `Encoding::Ascii`,
+ /// and for an invalid UTF-16 surrogate pair.
+ ///
+ /// The source is read one byte at a time through `Read::bytes`.
+ pub fn next_char_from<R: Read>(&mut self, source: &mut R) -> Result<Option<char>, CharReadError> {
+ let mut bytes = source.bytes();
+ const MAX_CODEPOINT_LEN: usize = 4;
+
+ let mut buf = [0u8; MAX_CODEPOINT_LEN];
+ let mut pos = 0;
+ loop {
+ let next = match bytes.next() {
+ Some(Ok(b)) => b,
+ Some(Err(e)) => return Err(e.into()),
+ None if pos == 0 => return Ok(None),
+ None => return Err(CharReadError::UnexpectedEof),
+ };
+
+ match self.encoding {
+ Encoding::Utf8 | Encoding::Default => {
+ // fast path for ASCII subset
+ if pos == 0 && next.is_ascii() {
+ return Ok(Some(next.into()));
+ }
- match str::from_utf8(&buf[..pos]) {
- Ok(s) => return Ok(s.chars().next()), // always Some(..)
- Err(_) if pos < MAX_CODEPOINT_LEN => {},
- Err(e) => return Err(e.into())
+ buf[pos] = next;
+ pos += 1;
+
+ match str::from_utf8(&buf[..pos]) {
+ Ok(s) => return Ok(s.chars().next()), // always Some(..)
+ Err(_) if pos < MAX_CODEPOINT_LEN => continue,
+ Err(e) => return Err(e.into()),
+ }
+ },
+ Encoding::Latin1 => {
+ return Ok(Some(next.into()));
+ },
+ Encoding::Ascii => {
+ if next.is_ascii() {
+ return Ok(Some(next.into()));
+ } else {
+ return Err(CharReadError::Io(io::Error::new(io::ErrorKind::InvalidData, "char is not ASCII")));
+ }
+ },
+ Encoding::Unknown | Encoding::Utf16 => {
+ buf[pos] = next;
+ pos += 1;
+
+ // sniff BOM
+ if pos <= 3 && buf[..pos] == [0xEF, 0xBB, 0xBF][..pos] {
+ if pos == 3 && self.encoding != Encoding::Utf16 {
+ pos = 0;
+ self.encoding = Encoding::Utf8;
+ }
+ } else if pos <= 2 && buf[..pos] == [0xFE, 0xFF][..pos] {
+ if pos == 2 {
+ pos = 0;
+ self.encoding = Encoding::Utf16Be;
+ }
+ } else if pos <= 2 && buf[..pos] == [0xFF, 0xFE][..pos] {
+ if pos == 2 {
+ pos = 0;
+ self.encoding = Encoding::Utf16Le;
+ }
+ } else if pos == 1 && self.encoding == Encoding::Utf16 {
+ // sniff ASCII char in UTF-16
+ self.encoding = if next == 0 { Encoding::Utf16Be } else { Encoding::Utf16Le };
+ } else {
+ // UTF-8 is the default, but XML decl can change it to other 8-bit encoding
+ self.encoding = Encoding::Default;
+ if pos == 1 && next.is_ascii() {
+ return Ok(Some(next.into()));
+ }
+ // The bytes buffered while sniffing are UTF-8 from here on. Decode them now:
+ // a sequence that merely resembled a BOM prefix (EF BB xx) may already be a
+ // complete character, and a full buffer must be rejected here because the
+ // UTF-8 arm would otherwise write past the end of `buf`.
+ match str::from_utf8(&buf[..pos]) {
+ Ok(s) => return Ok(s.chars().next()), // always Some(..)
+ Err(_) if pos < MAX_CODEPOINT_LEN => {},
+ Err(e) => return Err(e.into()),
+ }
+ }
+ },
+ Encoding::Utf16Be => {
+ buf[pos] = next;
+ pos += 1;
+ if pos == 2 {
+ if let Some(Ok(c)) = char::decode_utf16([u16::from_be_bytes(buf[..2].try_into().unwrap())]).next() {
+ return Ok(Some(c));
+ }
+ } else if pos == 4 { // surrogate
+ return char::decode_utf16([u16::from_be_bytes(buf[..2].try_into().unwrap()), u16::from_be_bytes(buf[2..4].try_into().unwrap())])
+ .next().transpose()
+ .map_err(|e| CharReadError::Io(io::Error::new(io::ErrorKind::InvalidData, e)));
+ }
+ },
+ Encoding::Utf16Le => {
+ buf[pos] = next;
+ pos += 1;
+ if pos == 2 {
+ if let Some(Ok(c)) = char::decode_utf16([u16::from_le_bytes(buf[..2].try_into().unwrap())]).next() {
+ return Ok(Some(c));
+ }
+ } else if pos == 4 { // surrogate
+ return char::decode_utf16([u16::from_le_bytes(buf[..2].try_into().unwrap()), u16::from_le_bytes(buf[2..4].try_into().unwrap())])
+ .next().transpose()
+ .map_err(|e| CharReadError::Io(io::Error::new(io::ErrorKind::InvalidData, e)));
+ }
+ },
+ }
}
}
}
#[cfg(test)]
mod tests {
+ use super::{CharReadError, CharReader, Encoding};
+
#[test]
fn test_next_char_from() {
use std::io;
- use std::error::Error;
let mut bytes: &[u8] = "correct".as_bytes(); // correct ASCII
- assert_eq!(super::next_char_from(&mut bytes).unwrap(), Some('c'));
+ assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('c'));
+
+ let mut bytes: &[u8] = b"\xEF\xBB\xBF\xE2\x80\xA2!"; // BOM
+ assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('•'));
+
+ let mut bytes: &[u8] = b"\xEF\xBB\xBFx123"; // BOM
+ assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('x'));
+
+ let mut bytes: &[u8] = b"\xEF\xBB\xBF"; // Nothing after BOM
+ assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), None);
+
+ let mut bytes: &[u8] = b"\xEF\xBB"; // Nothing after BO
+ assert!(matches!(CharReader::new().next_char_from(&mut bytes), Err(CharReadError::UnexpectedEof)));
+
+ let mut bytes: &[u8] = b"\xEF\xBB\x42"; // Nothing after BO
+ assert!(matches!(CharReader::new().next_char_from(&mut bytes), Err(_)));
+
+ let mut bytes: &[u8] = b"\xFE\xFF\x00\x42"; // UTF-16
+ assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('B'));
+
+ let mut bytes: &[u8] = b"\xFF\xFE\x42\x00"; // UTF-16
+ assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('B'));
+
+ let mut bytes: &[u8] = b"\xFF\xFE"; // UTF-16
+ assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), None);
+
+ let mut bytes: &[u8] = b"\xFF\xFE\x00"; // UTF-16
+ assert!(matches!(CharReader::new().next_char_from(&mut bytes), Err(CharReadError::UnexpectedEof)));
let mut bytes: &[u8] = "правильно".as_bytes(); // correct BMP
- assert_eq!(super::next_char_from(&mut bytes).unwrap(), Some('п'));
+ assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('п'));
+
+ let mut bytes: &[u8] = "правильно".as_bytes();
+ assert_eq!(CharReader { encoding: Encoding::Utf16Be }.next_char_from(&mut bytes).unwrap(), Some('킿'));
+
+ let mut bytes: &[u8] = "правильно".as_bytes();
+ assert_eq!(CharReader { encoding: Encoding::Utf16Le }.next_char_from(&mut bytes).unwrap(), Some('뿐'));
+
+ let mut bytes: &[u8] = b"\xD8\xD8\x80";
+ assert!(matches!(CharReader { encoding: Encoding::Utf16 }.next_char_from(&mut bytes), Err(_)));
+
+ let mut bytes: &[u8] = b"\x00\x42";
+ assert_eq!(CharReader { encoding: Encoding::Utf16 }.next_char_from(&mut bytes).unwrap(), Some('B'));
+
+ let mut bytes: &[u8] = b"\x42\x00";
+ assert_eq!(CharReader { encoding: Encoding::Utf16 }.next_char_from(&mut bytes).unwrap(), Some('B'));
+
+ let mut bytes: &[u8] = b"\x00";
+ assert!(matches!(CharReader { encoding: Encoding::Utf16Be }.next_char_from(&mut bytes), Err(_)));
let mut bytes: &[u8] = "😊".as_bytes(); // correct non-BMP
- assert_eq!(super::next_char_from(&mut bytes).unwrap(), Some('😊'));
+ assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('😊'));
let mut bytes: &[u8] = b""; // empty
- assert_eq!(super::next_char_from(&mut bytes).unwrap(), None);
+ assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), None);
let mut bytes: &[u8] = b"\xf0\x9f\x98"; // incomplete code point
- match super::next_char_from(&mut bytes).unwrap_err() {
+ match CharReader::new().next_char_from(&mut bytes).unwrap_err() {
super::CharReadError::UnexpectedEof => {},
- e => panic!("Unexpected result: {:?}", e)
+ e => panic!("Unexpected result: {e:?}")
};
let mut bytes: &[u8] = b"\xff\x9f\x98\x32"; // invalid code point
- match super::next_char_from(&mut bytes).unwrap_err() {
+ match CharReader::new().next_char_from(&mut bytes).unwrap_err() {
super::CharReadError::Utf8(_) => {},
- e => panic!("Unexpected result: {:?}", e)
+ e => panic!("Unexpected result: {e:?}")
};
-
// error during read
struct ErrorReader;
impl io::Read for ErrorReader {
- fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+ fn read(&mut self, _: &mut [u8]) -> io::Result<usize> {
Err(io::Error::new(io::ErrorKind::Other, "test error"))
}
}
let mut r = ErrorReader;
- match super::next_char_from(&mut r).unwrap_err() {
+ match CharReader::new().next_char_from(&mut r).unwrap_err() {
super::CharReadError::Io(ref e) if e.kind() == io::ErrorKind::Other &&
- e.description() == "test error" => {},
- e => panic!("Unexpected result: {:?}", e)
+ e.to_string().contains("test error") => {},
+ e => panic!("Unexpected result: {e:?}")
}
}
}
diff --git a/src/writer/mod.rs b/src/writer.rs
index ea1b242..e2b70ec 100644
--- a/src/writer/mod.rs
+++ b/src/writer.rs
@@ -3,24 +3,24 @@
//! The most important type in this module is `EventWriter` which allows writing an XML document
//! to some output stream.
-pub use self::emitter::Result;
-pub use self::emitter::EmitterError as Error;
pub use self::config::EmitterConfig;
+pub use self::emitter::EmitterError as Error;
+pub use self::emitter::Result;
pub use self::events::XmlEvent;
use self::emitter::Emitter;
use std::io::prelude::*;
-mod emitter;
mod config;
+mod emitter;
pub mod events;
/// A wrapper around an `std::io::Write` instance which emits XML document according to provided
/// events.
pub struct EventWriter<W> {
sink: W,
- emitter: Emitter
+ emitter: Emitter,
}
impl<W: Write> EventWriter<W> {
@@ -37,7 +37,7 @@ impl<W: Write> EventWriter<W> {
pub fn new_with_config(sink: W, config: EmitterConfig) -> EventWriter<W> {
EventWriter {
sink,
- emitter: Emitter::new(config)
+ emitter: Emitter::new(config),
}
}
@@ -63,12 +63,9 @@ impl<W: Write> EventWriter<W> {
self.emitter.namespace_stack_mut().try_pop();
r
}
- XmlEvent::Comment(content) =>
- self.emitter.emit_comment(&mut self.sink, content),
- XmlEvent::CData(content) =>
- self.emitter.emit_cdata(&mut self.sink, content),
- XmlEvent::Characters(content) =>
- self.emitter.emit_characters(&mut self.sink, content)
+ XmlEvent::Comment(content) => self.emitter.emit_comment(&mut self.sink, content),
+ XmlEvent::CData(content) => self.emitter.emit_cdata(&mut self.sink, content),
+ XmlEvent::Characters(content) => self.emitter.emit_characters(&mut self.sink, content),
}
}
diff --git a/src/writer/config.rs b/src/writer/config.rs
index ebabf18..c7841bc 100644
--- a/src/writer/config.rs
+++ b/src/writer/config.rs
@@ -1,9 +1,8 @@
//! Contains emitter configuration structure.
-use std::io::Write;
use std::borrow::Cow;
-
-use writer::EventWriter;
+use std::io::Write;
+use crate::writer::EventWriter;
/// Emitter configuration structure.
///
@@ -98,10 +97,11 @@ impl EmitterConfig {
/// .normalize_empty_elements(false);
/// ```
#[inline]
+ #[must_use]
pub fn new() -> EmitterConfig {
EmitterConfig {
line_separator: "\n".into(),
- indent_string: " ".into(), // two spaces
+ indent_string: " ".into(), // two spaces
perform_indent: false,
perform_escaping: true,
write_document_declaration: true,
@@ -109,7 +109,7 @@ impl EmitterConfig {
cdata_to_characters: false,
keep_element_names_stack: true,
autopad_comments: true,
- pad_self_closing: true
+ pad_self_closing: true,
}
}
diff --git a/src/writer/emitter.rs b/src/writer/emitter.rs
index ba80f66..8e74b5f 100644
--- a/src/writer/emitter.rs
+++ b/src/writer/emitter.rs
@@ -1,18 +1,17 @@
+use std::error::Error;
+use std::fmt;
use std::io;
use std::io::prelude::*;
-use std::fmt;
use std::result;
-use std::borrow::Cow;
-use std::error::Error;
-use common;
-use name::{Name, OwnedName};
-use attribute::Attribute;
-use escape::{escape_str_attribute, escape_str_pcdata};
-use common::XmlVersion;
-use namespace::{NamespaceStack, NS_NO_PREFIX, NS_EMPTY_URI, NS_XMLNS_PREFIX, NS_XML_PREFIX};
+use crate::attribute::Attribute;
+use crate::common;
+use crate::common::XmlVersion;
+use crate::escape::{AttributeEscapes, Escaped, PcDataEscapes};
+use crate::name::{Name, OwnedName};
+use crate::namespace::{NamespaceStack, NS_EMPTY_URI, NS_NO_PREFIX, NS_XMLNS_PREFIX, NS_XML_PREFIX};
-use writer::config::EmitterConfig;
+use crate::writer::config::EmitterConfig;
/// An error which may be returned by `XmlWriter` when writing XML events.
#[derive(Debug)]
@@ -32,47 +31,35 @@ pub enum EmitterError {
/// End element name is not specified when it is needed, for example, when automatic
/// closing is not enabled in configuration.
- EndElementNameIsNotSpecified
+ EndElementNameIsNotSpecified,
}
impl From<io::Error> for EmitterError {
+ #[cold]
fn from(err: io::Error) -> EmitterError {
EmitterError::Io(err)
}
}
impl fmt::Display for EmitterError {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-
- write!(f, "emitter error: ")?;
- match *self {
- EmitterError::Io(ref e) =>
- write!(f, "I/O error: {}", e),
- ref other =>
- write!(f, "{}", other.description()),
+ #[cold]
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ f.write_str("emitter error: ")?;
+ match self {
+ EmitterError::Io(e) => write!(f, "I/O error: {e}"),
+ EmitterError::DocumentStartAlreadyEmitted => f.write_str("document start event has already been emitted"),
+ EmitterError::LastElementNameNotAvailable => f.write_str("last element name is not available"),
+ EmitterError::EndElementNameIsNotEqualToLastStartElementName => f.write_str("end element name is not equal to last start element name"),
+ EmitterError::EndElementNameIsNotSpecified => f.write_str("end element name is not specified and can't be inferred"),
}
}
}
impl Error for EmitterError {
- fn description(&self) -> &str {
- match *self {
- EmitterError::Io(_) =>
- "I/O error",
- EmitterError::DocumentStartAlreadyEmitted =>
- "document start event has already been emitted",
- EmitterError::LastElementNameNotAvailable =>
- "last element name is not available",
- EmitterError::EndElementNameIsNotEqualToLastStartElementName =>
- "end element name is not equal to last start element name",
- EmitterError::EndElementNameIsNotSpecified =>
- "end element name is not specified and can't be inferred",
- }
- }
}
/// A result type yielded by `XmlWriter`.
-pub type Result<T> = result::Result<T, EmitterError>;
+pub type Result<T, E = EmitterError> = result::Result<T, E>;
// TODO: split into a low-level fast writer without any checks and formatting logic and a
// high-level indenting validating writer
@@ -87,23 +74,26 @@ pub struct Emitter {
element_names: Vec<OwnedName>,
start_document_emitted: bool,
- just_wrote_start_element: bool
+ just_wrote_start_element: bool,
}
impl Emitter {
pub fn new(config: EmitterConfig) -> Emitter {
+ let mut indent_stack = Vec::with_capacity(16);
+ indent_stack.push(IndentFlags::WroteNothing);
+
Emitter {
config,
nst: NamespaceStack::empty(),
indent_level: 0,
- indent_stack: vec![IndentFlags::WroteNothing],
+ indent_stack,
element_names: Vec::new(),
start_document_emitted: false,
- just_wrote_start_element: false
+ just_wrote_start_element: false,
}
}
}
@@ -124,27 +114,26 @@ impl Emitter {
#[inline]
fn wrote_text(&self) -> bool {
- *self.indent_stack.last().unwrap() == IndentFlags::WroteText
+ self.indent_stack.last().map_or(false, |&e| e == IndentFlags::WroteText)
}
#[inline]
fn wrote_markup(&self) -> bool {
- *self.indent_stack.last().unwrap() == IndentFlags::WroteMarkup
+ self.indent_stack.last().map_or(false, |&e| e == IndentFlags::WroteMarkup)
}
#[inline]
fn set_wrote_text(&mut self) {
- *self.indent_stack.last_mut().unwrap() = IndentFlags::WroteText;
+ if let Some(e) = self.indent_stack.last_mut() {
+ *e = IndentFlags::WroteText;
+ }
}
#[inline]
fn set_wrote_markup(&mut self) {
- *self.indent_stack.last_mut().unwrap() = IndentFlags::WroteMarkup;
- }
-
- #[inline]
- fn reset_state(&mut self) {
- *self.indent_stack.last_mut().unwrap() = IndentFlags::WroteNothing;
+ if let Some(e) = self.indent_stack.last_mut() {
+ *e = IndentFlags::WroteMarkup;
+ }
}
fn write_newline<W: Write>(&mut self, target: &mut W, level: usize) -> Result<()> {
@@ -216,7 +205,7 @@ impl Emitter {
self.before_markup(target)?;
let result = {
let mut write = move || {
- write!(target, "<?xml version=\"{}\" encoding=\"{}\"", version, encoding)?;
+ write!(target, "<?xml version=\"{version}\" encoding=\"{encoding}\"")?;
if let Some(standalone) = standalone {
write!(target, " standalone=\"{}\"", if standalone { "yes" } else { "no" })?;
@@ -260,11 +249,11 @@ impl Emitter {
self.before_markup(target)?;
let result = {
- let mut write = || {
- write!(target, "<?{}", name)?;
+ let mut write = move || {
+ write!(target, "<?{name}")?;
if let Some(data) = data {
- write!(target, " {}", data)?;
+ write!(target, " {data}")?;
}
write!(target, "?>")?;
@@ -280,8 +269,8 @@ impl Emitter {
}
fn emit_start_element_initial<W>(&mut self, target: &mut W,
- name: Name,
- attributes: &[Attribute]) -> Result<()>
+ name: Name<'_>,
+ attributes: &[Attribute<'_>]) -> Result<()>
where W: Write
{
self.check_document_started(target)?;
@@ -295,8 +284,8 @@ impl Emitter {
}
pub fn emit_start_element<W>(&mut self, target: &mut W,
- name: Name,
- attributes: &[Attribute]) -> Result<()>
+ name: Name<'_>,
+ attributes: &[Attribute<'_>]) -> Result<()>
where W: Write
{
if self.config.keep_element_names_stack {
@@ -324,29 +313,31 @@ impl Emitter {
//prefix if self.nst.get(prefix) == Some(uri) => Ok(()),
// emit xmlns only if it is overridden
NS_NO_PREFIX => if uri != NS_EMPTY_URI {
- write!(target, " xmlns=\"{}\"", uri)
+ write!(target, " xmlns=\"{uri}\"")
} else { Ok(()) },
// everything else
- prefix => write!(target, " xmlns:{}=\"{}\"", prefix, uri)
+ prefix => write!(target, " xmlns:{prefix}=\"{uri}\"")
}?;
}
Ok(())
}
pub fn emit_attributes<W: Write>(&mut self, target: &mut W,
- attributes: &[Attribute]) -> Result<()> {
- for attr in attributes.iter() {
- write!(
- target, " {}=\"{}\"",
- attr.name.repr_display(),
- if self.config.perform_escaping { escape_str_attribute(attr.value) } else { Cow::Borrowed(attr.value) }
- )?
+ attributes: &[Attribute<'_>]) -> Result<()> {
+ for attr in attributes.iter() {
+ write!(target, " {}=\"", attr.name.repr_display())?;
+ if self.config.perform_escaping {
+ write!(target, "{}", Escaped::<AttributeEscapes>::new(attr.value))?;
+ } else {
+ write!(target, "{}", attr.value)?;
+ }
+ write!(target, "\"")?;
}
Ok(())
}
pub fn emit_end_element<W: Write>(&mut self, target: &mut W,
- name: Option<Name>) -> Result<()> {
+ name: Option<Name<'_>>) -> Result<()> {
let owned_name = if self.config.keep_element_names_stack {
Some(self.element_names.pop().ok_or(EmitterError::LastElementNameNotAvailable)?)
} else {
@@ -403,13 +394,13 @@ impl Emitter {
content: &str) -> Result<()> {
self.check_document_started(target)?;
self.fix_non_empty_element(target)?;
- target.write_all(
- (if self.config.perform_escaping {
- escape_str_pcdata(content)
- } else {
- Cow::Borrowed(content)
- }).as_bytes()
- )?;
+
+ if self.config.perform_escaping {
+ write!(target, "{}", Escaped::<PcDataEscapes>::new(content))?;
+ } else {
+ target.write_all(content.as_bytes())?;
+ }
+
self.after_text();
Ok(())
}
@@ -420,7 +411,7 @@ impl Emitter {
// TODO: add escaping dashes at the end of the comment
let autopad_comments = self.config.autopad_comments;
- let write = |target: &mut W| -> Result<()> {
+ let write = move |target: &mut W| -> Result<()> {
target.write_all(b"<!--")?;
if autopad_comments && !content.starts_with(char::is_whitespace) {
diff --git a/src/writer/events.rs b/src/writer/events.rs
index 1f7040f..af9f37c 100644
--- a/src/writer/events.rs
+++ b/src/writer/events.rs
@@ -2,16 +2,16 @@
use std::borrow::Cow;
-use name::Name;
-use attribute::Attribute;
-use common::XmlVersion;
-use namespace::{Namespace, NS_NO_PREFIX};
+use crate::attribute::Attribute;
+use crate::common::XmlVersion;
+use crate::name::Name;
+use crate::namespace::{Namespace, NS_NO_PREFIX};
/// A part of an XML output stream.
///
/// Objects of this enum are consumed by `EventWriter`. They correspond to different parts of
/// an XML document.
-#[derive(Debug)]
+#[derive(Debug, Clone)]
pub enum XmlEvent<'a> {
/// Corresponds to XML document declaration.
///
@@ -32,7 +32,7 @@ pub enum XmlEvent<'a> {
/// XML standalone declaration.
///
/// Defaults to `None`.
- standalone: Option<bool>
+ standalone: Option<bool>,
},
/// Denotes an XML processing instruction.
@@ -41,7 +41,7 @@ pub enum XmlEvent<'a> {
name: &'a str,
/// Processing instruction content.
- data: Option<&'a str>
+ data: Option<&'a str>,
},
/// Denotes a beginning of an XML element.
@@ -71,7 +71,7 @@ pub enum XmlEvent<'a> {
/// If `None`, then it is assumed that the element name should be the last valid one.
/// If `Some` and element names tracking is enabled, then the writer will check it for
/// correctness.
- name: Option<Name<'a>>
+ name: Option<Name<'a>>,
},
/// Denotes CDATA content.
@@ -90,14 +90,15 @@ pub enum XmlEvent<'a> {
///
/// Contents of this event will be escaped if `perform_escaping` option is enabled,
/// that is, every character invalid for PCDATA will appear as a character entity.
- Characters(&'a str)
+ Characters(&'a str),
}
impl<'a> XmlEvent<'a> {
/// Returns an writer event for a processing instruction.
#[inline]
+ #[must_use]
pub fn processing_instruction(name: &'a str, data: Option<&'a str>) -> XmlEvent<'a> {
- XmlEvent::ProcessingInstruction { name: name, data: data }
+ XmlEvent::ProcessingInstruction { name, data }
}
/// Returns a builder for a starting element.
@@ -109,7 +110,7 @@ impl<'a> XmlEvent<'a> {
StartElementBuilder {
name: name.into(),
attributes: Vec::new(),
- namespace: Namespace::empty().into()
+ namespace: Namespace::empty(),
}
}
@@ -119,6 +120,7 @@ impl<'a> XmlEvent<'a> {
/// the writer is able to determine it automatically. However, when this functionality
/// is disabled, it is possible to specify the name with `name()` method on the builder.
#[inline]
+ #[must_use]
pub fn end_element() -> EndElementBuilder<'a> {
EndElementBuilder { name: None }
}
@@ -128,26 +130,37 @@ impl<'a> XmlEvent<'a> {
/// Naturally, the provided string won't be escaped, except for closing CDATA token `]]>`
/// (depending on the configuration).
#[inline]
- pub fn cdata(data: &'a str) -> XmlEvent<'a> { XmlEvent::CData(data) }
+ #[must_use]
+ pub fn cdata(data: &'a str) -> XmlEvent<'a> {
+ XmlEvent::CData(data)
+ }
/// Returns a regular characters (PCDATA) event.
///
/// All offending symbols, in particular, `&` and `<`, will be escaped by the writer.
#[inline]
- pub fn characters(data: &'a str) -> XmlEvent<'a> { XmlEvent::Characters(data) }
+ #[must_use]
+ pub fn characters(data: &'a str) -> XmlEvent<'a> {
+ XmlEvent::Characters(data)
+ }
/// Returns a comment event.
#[inline]
- pub fn comment(data: &'a str) -> XmlEvent<'a> { XmlEvent::Comment(data) }
+ #[must_use]
+ pub fn comment(data: &'a str) -> XmlEvent<'a> {
+ XmlEvent::Comment(data)
+ }
}
impl<'a> From<&'a str> for XmlEvent<'a> {
#[inline]
- fn from(s: &'a str) -> XmlEvent<'a> { XmlEvent::Characters(s) }
+ fn from(s: &'a str) -> XmlEvent<'a> {
+ XmlEvent::Characters(s)
+ }
}
pub struct EndElementBuilder<'a> {
- name: Option<Name<'a>>
+ name: Option<Name<'a>>,
}
/// A builder for a closing element event.
@@ -175,7 +188,7 @@ impl<'a> From<EndElementBuilder<'a>> for XmlEvent<'a> {
pub struct StartElementBuilder<'a> {
name: Name<'a>,
attributes: Vec<Attribute<'a>>,
- namespace: Namespace
+ namespace: Namespace,
}
impl<'a> StartElementBuilder<'a> {
@@ -210,6 +223,7 @@ impl<'a> StartElementBuilder<'a> {
/// then another binding will be added as a part of this element attribute set, shadowing
/// the outer binding.
#[inline]
+ #[must_use]
pub fn ns<S1, S2>(mut self, prefix: S1, uri: S2) -> StartElementBuilder<'a>
where S1: Into<String>, S2: Into<String>
{
@@ -221,6 +235,7 @@ impl<'a> StartElementBuilder<'a> {
///
/// Same rules as for `ns()` are also valid for the default namespace mapping.
#[inline]
+ #[must_use]
pub fn default_ns<S>(mut self, uri: S) -> StartElementBuilder<'a>
where S: Into<String>
{
@@ -235,7 +250,7 @@ impl<'a> From<StartElementBuilder<'a>> for XmlEvent<'a> {
XmlEvent::StartElement {
name: b.name,
attributes: Cow::Owned(b.attributes),
- namespace: Cow::Owned(b.namespace)
+ namespace: Cow::Owned(b.namespace),
}
}
}
diff --git a/tests/documents/sample_1.xml b/tests/documents/sample_1.xml
deleted file mode 100644
index 4d1cbc0..0000000
--- a/tests/documents/sample_1.xml
+++ /dev/null
@@ -1,34 +0,0 @@
-<?xml version="1.0" encoding="utf-8" standalone="yes"?>
-<project name="project-name">
- <libraries>
- <library groupId="org.example" artifactId="&lt;name&gt;" version="0.1"/>
- <library groupId="com.example" artifactId="&quot;cool-lib&amp;" version="999"/>
- </libraries>
- <module name="module-1">
- <files>
- <file name="somefile.java" type="java">
- Some &lt;java&gt; class
- </file>
- <file name="another_file.java" type="java">
- Another &quot;java&quot; class
- </file>
- <file name="config.xml" type="xml">
- Weird &apos;XML&apos; config
- </file>
- </files>
- <libraries>
- <library groupId="junit" artifactId="junit" version="1.9.5"/>
- </libraries>
- </module>
- <module name="module-2">
- <files>
- <file name="program.js" type="javascript">
- JavaScript &amp; program
- </file>
- <file name="style.css" type="css">
- Cascading style sheet: &#xA9; - &#1161;
- </file>
- </files>
- </module>
-</project>
-
diff --git a/tests/documents/sample_1_full.txt b/tests/documents/sample_1_full.txt
deleted file mode 100644
index a8d64d0..0000000
--- a/tests/documents/sample_1_full.txt
+++ /dev/null
@@ -1,58 +0,0 @@
-StartDocument(1.0, utf-8)
-StartElement(project [name="project-name"])
-Whitespace("\n ")
-StartElement(libraries)
-Whitespace("\n ")
-StartElement(library [groupId="org.example", artifactId="<name>", version="0.1"])
-EndElement(library)
-Whitespace("\n ")
-StartElement(library [groupId="com.example", artifactId="\"cool-lib&", version="999"])
-EndElement(library)
-Whitespace("\n ")
-EndElement(libraries)
-Whitespace("\n ")
-StartElement(module [name="module-1"])
-Whitespace("\n ")
-StartElement(files)
-Whitespace("\n ")
-StartElement(file [name="somefile.java", type="java"])
-Characters("\n Some <java> class\n ")
-EndElement(file)
-Whitespace("\n ")
-StartElement(file [name="another_file.java", type="java"])
-Characters("\n Another \"java\" class\n ")
-EndElement(file)
-Whitespace("\n ")
-StartElement(file [name="config.xml", type="xml"])
-Characters("\n Weird \'XML\' config\n ")
-EndElement(file)
-Whitespace("\n ")
-EndElement(files)
-Whitespace("\n ")
-StartElement(libraries)
-Whitespace("\n ")
-StartElement(library [groupId="junit", artifactId="junit", version="1.9.5"])
-EndElement(library)
-Whitespace("\n ")
-EndElement(libraries)
-Whitespace("\n ")
-EndElement(module)
-Whitespace("\n ")
-StartElement(module [name="module-2"])
-Whitespace("\n ")
-StartElement(files)
-Whitespace("\n ")
-StartElement(file [name="program.js", type="javascript"])
-Characters("\n JavaScript & program\n ")
-EndElement(file)
-Whitespace("\n ")
-StartElement(file [name="style.css", type="css"])
-Characters("\n Cascading style sheet: © - ҉\n ")
-EndElement(file)
-Whitespace("\n ")
-EndElement(files)
-Whitespace("\n ")
-EndElement(module)
-Whitespace("\n")
-EndElement(project)
-EndDocument
diff --git a/tests/documents/sample_1_short.txt b/tests/documents/sample_1_short.txt
deleted file mode 100644
index 4dbe285..0000000
--- a/tests/documents/sample_1_short.txt
+++ /dev/null
@@ -1,37 +0,0 @@
-StartDocument(1.0, utf-8)
-StartElement(project [name="project-name"])
-StartElement(libraries)
-StartElement(library [groupId="org.example", artifactId="<name>", version="0.1"])
-EndElement(library)
-StartElement(library [groupId="com.example", artifactId="\"cool-lib&", version="999"])
-EndElement(library)
-EndElement(libraries)
-StartElement(module [name="module-1"])
-StartElement(files)
-StartElement(file [name="somefile.java", type="java"])
-Characters("Some <java> class")
-EndElement(file)
-StartElement(file [name="another_file.java", type="java"])
-Characters("Another \"java\" class")
-EndElement(file)
-StartElement(file [name="config.xml", type="xml"])
-Characters("Weird \'XML\' config")
-EndElement(file)
-EndElement(files)
-StartElement(libraries)
-StartElement(library [groupId="junit", artifactId="junit", version="1.9.5"])
-EndElement(library)
-EndElement(libraries)
-EndElement(module)
-StartElement(module [name="module-2"])
-StartElement(files)
-StartElement(file [name="program.js", type="javascript"])
-Characters("JavaScript & program")
-EndElement(file)
-StartElement(file [name="style.css", type="css"])
-Characters("Cascading style sheet: © - ҉")
-EndElement(file)
-EndElement(files)
-EndElement(module)
-EndElement(project)
-EndDocument
diff --git a/tests/documents/sample_2.xml b/tests/documents/sample_2.xml
deleted file mode 100644
index f9543ac..0000000
--- a/tests/documents/sample_2.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<p:data xmlns:d="urn:example:double" xmlns:h="urn:example:header" xmlns:p="urn:example:namespace">
- <p:datum id="34">
- <p:name>Name</p:name>
- <d:name>Another name</d:name>
- <d:arg>0.3</d:arg>
- <d:arg>0.2</d:arg>
- <p:arg>0.1</p:arg>
- <p:arg>0.01</p:arg>
- <h:header name="Header-1">header 1 value</h:header>
- <h:header name="Header-2">
- Some bigger value
- </h:header>
- </p:datum>
-</p:data>
diff --git a/tests/documents/sample_2_full.txt b/tests/documents/sample_2_full.txt
deleted file mode 100644
index 75075cd..0000000
--- a/tests/documents/sample_2_full.txt
+++ /dev/null
@@ -1,41 +0,0 @@
-StartDocument(1.0, utf-8)
-StartElement({urn:example:namespace}p:data)
-Whitespace("\n ")
-StartElement({urn:example:namespace}p:datum [id="34"])
-Whitespace("\n ")
-StartElement({urn:example:namespace}p:name)
-Characters("Name")
-EndElement({urn:example:namespace}p:name)
-Whitespace("\n ")
-StartElement({urn:example:double}d:name)
-Characters("Another name")
-EndElement({urn:example:double}d:name)
-Whitespace("\n ")
-StartElement({urn:example:double}d:arg)
-Characters("0.3")
-EndElement({urn:example:double}d:arg)
-Whitespace("\n ")
-StartElement({urn:example:double}d:arg)
-Characters("0.2")
-EndElement({urn:example:double}d:arg)
-Whitespace("\n ")
-StartElement({urn:example:namespace}p:arg)
-Characters("0.1")
-EndElement({urn:example:namespace}p:arg)
-Whitespace("\n ")
-StartElement({urn:example:namespace}p:arg)
-Characters("0.01")
-EndElement({urn:example:namespace}p:arg)
-Whitespace("\n ")
-StartElement({urn:example:header}h:header [name="Header-1"])
-Characters("header 1 value")
-EndElement({urn:example:header}h:header)
-Whitespace("\n ")
-StartElement({urn:example:header}h:header [name="Header-2"])
-Characters("\n Some bigger value\n ")
-EndElement({urn:example:header}h:header)
-Whitespace("\n ")
-EndElement({urn:example:namespace}p:datum)
-Whitespace("\n")
-EndElement({urn:example:namespace}p:data)
-EndDocument
diff --git a/tests/documents/sample_2_short.txt b/tests/documents/sample_2_short.txt
deleted file mode 100644
index 2368025..0000000
--- a/tests/documents/sample_2_short.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-StartDocument(1.0, utf-8)
-StartElement({urn:example:namespace}p:data)
-StartElement({urn:example:namespace}p:datum [id="34"])
-StartElement({urn:example:namespace}p:name)
-Characters("Name")
-EndElement({urn:example:namespace}p:name)
-StartElement({urn:example:double}d:name)
-Characters("Another name")
-EndElement({urn:example:double}d:name)
-StartElement({urn:example:double}d:arg)
-Characters("0.3")
-EndElement({urn:example:double}d:arg)
-StartElement({urn:example:double}d:arg)
-Characters("0.2")
-EndElement({urn:example:double}d:arg)
-StartElement({urn:example:namespace}p:arg)
-Characters("0.1")
-EndElement({urn:example:namespace}p:arg)
-StartElement({urn:example:namespace}p:arg)
-Characters("0.01")
-EndElement({urn:example:namespace}p:arg)
-StartElement({urn:example:header}h:header [name="Header-1"])
-Characters("header 1 value")
-EndElement({urn:example:header}h:header)
-StartElement({urn:example:header}h:header [name="Header-2"])
-Characters("Some bigger value")
-EndElement({urn:example:header}h:header)
-EndElement({urn:example:namespace}p:datum)
-EndElement({urn:example:namespace}p:data)
-EndDocument
diff --git a/tests/documents/sample_3.xml b/tests/documents/sample_3.xml
deleted file mode 100644
index 657e37d..0000000
--- a/tests/documents/sample_3.xml
+++ /dev/null
@@ -1,13 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<p:data xmlns:p="urn:x" z=">">
- <!-- abcd &lt; &gt; &amp; -->
- <a>test</a>
- <b>kkss" = ddd' ></b>
- <![CDATA[
- <a>ddddd</b>!e3--><!-- ddckx
- ]]>
- <c/>
- <![CDATA[
- <![CDATA[zzzz]]]]><![CDATA[>]]>
-</p:data>
-
diff --git a/tests/documents/sample_3_full.txt b/tests/documents/sample_3_full.txt
deleted file mode 100644
index e9a0f7e..0000000
--- a/tests/documents/sample_3_full.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-1:1 StartDocument(1.0, utf-8)
-2:1 StartElement({urn:x}p:data [z=">"])
-2:31 Whitespace("\n ")
-3:5 Comment(" abcd &lt; &gt; &amp; ")
-3:34 Whitespace("\n ")
-4:5 StartElement(a)
-4:8 Characters("test")
-4:12 EndElement(a)
-4:16 Whitespace("\n ")
-5:5 StartElement(b)
-5:8 Characters("kkss\" = ddd\' >")
-5:22 EndElement(b)
-5:26 Whitespace("\n ")
-6:5 CData("\n <a>ddddd</b>!e3--><!-- ddckx\n ")
-8:8 Characters("\n ")
-9:5 StartElement(c)
-9:5 EndElement(c)
-9:9 Whitespace("\n ")
-10:5 CData("\n <![CDATA[zzzz]]")
-11:23 CData(">")
-11:36 Characters("\n")
-12:1 EndElement({urn:x}p:data)
-14:1 EndDocument
diff --git a/tests/documents/sample_3_short.txt b/tests/documents/sample_3_short.txt
deleted file mode 100644
index 2582f33..0000000
--- a/tests/documents/sample_3_short.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-1:1 StartDocument(1.0, utf-8)
-2:1 StartElement({urn:x}p:data [z=">"])
-4:5 StartElement(a)
-4:8 Characters("test")
-4:12 EndElement(a)
-5:5 StartElement(b)
-5:8 Characters("kkss\" = ddd\' >")
-5:22 EndElement(b)
-6:5 Characters("<a>ddddd</b>!e3--><!-- ddckx")
-9:5 StartElement(c)
-9:5 EndElement(c)
-10:5 Characters("<![CDATA[zzzz]]>")
-12:1 EndElement({urn:x}p:data)
-14:1 EndDocument
diff --git a/tests/documents/sample_4.xml b/tests/documents/sample_4.xml
deleted file mode 100644
index fb915ff..0000000
--- a/tests/documents/sample_4.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<!DOCTYPE data SYSTEM "abcd.dtd">
-<p:data xmlns:p="urn:x" z=">">
- <!-- abcd &lt; &gt; &amp; -->
- <a>test</a>
- <b>kkss" = ddd' ></b>
- <![CDATA[
- <a>ddddd</b>!e3--><!-- ddckx
- ]]>
- <c/>
- <![CDATA[
- <![CDATA[zzzz]]]]><![CDATA[>]]>
-</p:data>
-
-
diff --git a/tests/documents/sample_4_full.txt b/tests/documents/sample_4_full.txt
deleted file mode 100644
index 4bdadfb..0000000
--- a/tests/documents/sample_4_full.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-StartDocument(1.0, utf-8)
-StartElement({urn:x}p:data [z=">"])
-Whitespace("\n ")
-Comment(" abcd &lt; &gt; &amp; ")
-Whitespace("\n ")
-StartElement(a)
-Characters("test")
-EndElement(a)
-Whitespace("\n ")
-StartElement(b)
-Characters("kkss\" = ddd\' >")
-EndElement(b)
-Whitespace("\n ")
-CData("\n <a>ddddd</b>!e3--><!-- ddckx\n ")
-Characters("\n ")
-StartElement(c)
-EndElement(c)
-Whitespace("\n ")
-CData("\n <![CDATA[zzzz]]")
-CData(">")
-Characters("\n")
-EndElement({urn:x}p:data)
-EndDocument
diff --git a/tests/documents/sample_4_short.txt b/tests/documents/sample_4_short.txt
deleted file mode 100644
index 52e4b83..0000000
--- a/tests/documents/sample_4_short.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-StartDocument(1.0, utf-8)
-StartElement({urn:x}p:data [z=">"])
-StartElement(a)
-Characters("test")
-EndElement(a)
-StartElement(b)
-Characters("kkss\" = ddd\' >")
-EndElement(b)
-Characters("<a>ddddd</b>!e3--><!-- ddckx")
-StartElement(c)
-EndElement(c)
-Characters("<![CDATA[zzzz]]>")
-EndElement({urn:x}p:data)
-EndDocument
diff --git a/tests/documents/sample_5.xml b/tests/documents/sample_5.xml
deleted file mode 100644
index 92aa31d..0000000
--- a/tests/documents/sample_5.xml
+++ /dev/null
@@ -1,7 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<!DOCTYPE data SYSTEM "abcd.dtd">
-<p>
- <a>test&nbsp;&copy;&NotEqualTilde;</a>
-</p>
-
-
diff --git a/tests/documents/sample_5_short.txt b/tests/documents/sample_5_short.txt
deleted file mode 100644
index 3079811..0000000
--- a/tests/documents/sample_5_short.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-StartDocument(1.0, utf-8)
-StartElement(p)
-StartElement(a)
-Characters("test ©≂̸")
-EndElement(a)
-EndElement(p)
-EndDocument
diff --git a/tests/documents/sample_6.xml b/tests/documents/sample_6.xml
deleted file mode 100644
index 943c02d..0000000
--- a/tests/documents/sample_6.xml
+++ /dev/null
@@ -1,4 +0,0 @@
-<?xml version="1.0"?>
-<?xml-stylesheet href="doc.xsl"?>
-
-<doc>Hello</doc>
diff --git a/tests/documents/sample_6_full.txt b/tests/documents/sample_6_full.txt
deleted file mode 100644
index debb366..0000000
--- a/tests/documents/sample_6_full.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-StartDocument(1.0, UTF-8)
-Whitespace("\n")
-ProcessingInstruction(xml-stylesheet="href=\"doc.xsl\"")
-Whitespace("\n\n")
-StartElement(doc)
-Characters("Hello")
-EndElement(doc)
-EndDocument
diff --git a/tests/event_reader.rs b/tests/event_reader.rs
deleted file mode 100644
index 750dcc4..0000000
--- a/tests/event_reader.rs
+++ /dev/null
@@ -1,587 +0,0 @@
-#![forbid(unsafe_code)]
-
-extern crate xml;
-#[macro_use]
-extern crate lazy_static;
-
-use std::env;
-use std::fmt;
-use std::fs::File;
-use std::io::{BufRead, BufReader, Write, stderr};
-use std::path::Path;
-
-use xml::name::OwnedName;
-use xml::common::Position;
-use xml::reader::{Result, XmlEvent, ParserConfig, EventReader};
-
-/// Dummy function that opens a file, parses it, and returns a `Result`.
-/// There can be IO errors (from `File::open`) and XML errors (from the parser).
-/// Having `impl From<std::io::Error> for xml::reader::Error` allows the user to
-/// do this without defining their own error type.
-#[allow(dead_code)]
-fn count_event_in_file(name: &Path) -> Result<usize> {
- let mut event_count = 0;
- for event in EventReader::new(BufReader::new(try!(File::open(name)))) {
- try!(event);
- event_count += 1;
- }
- Ok(event_count)
-}
-
-#[test]
-fn sample_1_short() {
- test(
- include_bytes!("documents/sample_1.xml"),
- include_bytes!("documents/sample_1_short.txt"),
- ParserConfig::new()
- .ignore_comments(true)
- .whitespace_to_characters(true)
- .cdata_to_characters(true)
- .trim_whitespace(true)
- .coalesce_characters(true),
- false
- );
-}
-
-#[test]
-fn sample_1_full() {
- test(
- include_bytes!("documents/sample_1.xml"),
- include_bytes!("documents/sample_1_full.txt"),
- ParserConfig::new()
- .ignore_comments(false)
- .whitespace_to_characters(false)
- .cdata_to_characters(false)
- .trim_whitespace(false)
- .coalesce_characters(false),
- false
- );
-}
-
-#[test]
-fn sample_2_short() {
- test(
- include_bytes!("documents/sample_2.xml"),
- include_bytes!("documents/sample_2_short.txt"),
- ParserConfig::new()
- .ignore_comments(true)
- .whitespace_to_characters(true)
- .cdata_to_characters(true)
- .trim_whitespace(true)
- .coalesce_characters(true),
- false
- );
-}
-
-#[test]
-fn sample_2_full() {
- test(
- include_bytes!("documents/sample_2.xml"),
- include_bytes!("documents/sample_2_full.txt"),
- ParserConfig::new()
- .ignore_comments(false)
- .whitespace_to_characters(false)
- .cdata_to_characters(false)
- .trim_whitespace(false)
- .coalesce_characters(false),
- false
- );
-}
-
-#[test]
-fn sample_3_short() {
- test(
- include_bytes!("documents/sample_3.xml"),
- include_bytes!("documents/sample_3_short.txt"),
- ParserConfig::new()
- .ignore_comments(true)
- .whitespace_to_characters(true)
- .cdata_to_characters(true)
- .trim_whitespace(true)
- .coalesce_characters(true),
- true
- );
-}
-
-#[test]
-fn sample_3_full() {
- test(
- include_bytes!("documents/sample_3.xml"),
- include_bytes!("documents/sample_3_full.txt"),
- ParserConfig::new()
- .ignore_comments(false)
- .whitespace_to_characters(false)
- .cdata_to_characters(false)
- .trim_whitespace(false)
- .coalesce_characters(false),
- true
- );
-}
-
-#[test]
-fn sample_4_short() {
- test(
- include_bytes!("documents/sample_4.xml"),
- include_bytes!("documents/sample_4_short.txt"),
- ParserConfig::new()
- .ignore_comments(true)
- .whitespace_to_characters(true)
- .cdata_to_characters(true)
- .trim_whitespace(true)
- .coalesce_characters(true),
- false
- );
-}
-
-#[test]
-fn sample_4_full() {
- test(
- include_bytes!("documents/sample_4.xml"),
- include_bytes!("documents/sample_4_full.txt"),
- ParserConfig::new()
- .ignore_comments(false)
- .whitespace_to_characters(false)
- .cdata_to_characters(false)
- .trim_whitespace(false)
- .coalesce_characters(false),
- false
- );
-
-}
-
-#[test]
-fn sample_5_short() {
- test(
- include_bytes!("documents/sample_5.xml"),
- include_bytes!("documents/sample_5_short.txt"),
- ParserConfig::new()
- .ignore_comments(true)
- .whitespace_to_characters(true)
- .cdata_to_characters(true)
- .trim_whitespace(true)
- .coalesce_characters(true)
- .add_entity("nbsp", " ")
- .add_entity("copy", "©")
- .add_entity("NotEqualTilde", "≂̸"),
- false
- );
-}
-
-#[test]
-fn sample_6_full() {
- test(
- include_bytes!("documents/sample_6.xml"),
- include_bytes!("documents/sample_6_full.txt"),
- ParserConfig::new()
- .ignore_root_level_whitespace(false)
- .ignore_comments(false)
- .whitespace_to_characters(false)
- .cdata_to_characters(false)
- .trim_whitespace(false)
- .coalesce_characters(false),
- false
- );
-}
-
-#[test]
-fn eof_1() {
- test(
- br#"<?xml"#,
- br#"1:6 Unexpected end of stream: no root element found"#,
- ParserConfig::new(),
- false
- );
-}
-
-#[test]
-fn bad_1() {
- test(
- br#"<?xml&.,"#,
- br#"1:6 Unexpected token: <?xml&"#,
- ParserConfig::new(),
- false
- );
-}
-
-#[test]
-fn dashes_in_comments() {
- test(
- br#"<!-- comment -- --><hello/>"#,
- br#"
- |1:14 Unexpected token '--' before ' '
- "#,
- ParserConfig::new(),
- false
- );
-
- test(
- br#"<!-- comment ---><hello/>"#,
- br#"
- |1:14 Unexpected token '--' before '-'
- "#,
- ParserConfig::new(),
- false
- );
-}
-
-#[test]
-fn tabs_1() {
- test(
- b"\t<a>\t<b/></a>",
- br#"
- |1:2 StartDocument(1.0, UTF-8)
- |1:2 StartElement(a)
- |1:6 StartElement(b)
- |1:6 EndElement(b)
- |1:10 EndElement(a)
- |1:14 EndDocument
- "#,
- ParserConfig::new()
- .trim_whitespace(true),
- true
- );
-}
-
-#[test]
-fn issue_32_unescaped_cdata_end() {
- test(
- br#"<hello>]]></hello>"#,
- br#"
- |StartDocument(1.0, UTF-8)
- |StartElement(hello)
- |Characters("]]>")
- |EndElement(hello)
- |EndDocument
- "#,
- ParserConfig::new(),
- false
- );
-}
-
-#[test]
-fn issue_unescaped_processing_instruction_end() {
- test(
- br#"<hello>?></hello>"#,
- br#"
- |StartDocument(1.0, UTF-8)
- |StartElement(hello)
- |Characters("?>")
- |EndElement(hello)
- |EndDocument
- "#,
- ParserConfig::new(),
- false
- );
-}
-
-#[test]
-fn issue_unescaped_empty_tag_end() {
- test(
- br#"<hello>/></hello>"#,
- br#"
- |StartDocument(1.0, UTF-8)
- |StartElement(hello)
- |Characters("/>")
- |EndElement(hello)
- |EndDocument
- "#,
- ParserConfig::new(),
- false
- );
-}
-
-#[test]
-fn issue_83_duplicate_attributes() {
- test(
- br#"<hello><some-tag a='10' a="20"></hello>"#,
- br#"
- |StartDocument(1.0, UTF-8)
- |StartElement(hello)
- |1:30 Attribute 'a' is redefined
- "#,
- ParserConfig::new(),
- false
- );
-}
-
-#[test]
-fn issue_93_large_characters_in_entity_references() {
- test(
- r#"<hello>&𤶼;</hello>"#.as_bytes(),
- r#"
- |StartDocument(1.0, UTF-8)
- |StartElement(hello)
- |1:10 Unexpected entity: 𤶼
- "#.as_bytes(), // FIXME: it shouldn't be 10, looks like indices are off slightly
- ParserConfig::new(),
- false
- )
-}
-
-#[test]
-fn issue_98_cdata_ending_with_right_bracket() {
- test(
- br#"<hello><![CDATA[Foo [Bar]]]></hello>"#,
- br#"
- |StartDocument(1.0, UTF-8)
- |StartElement(hello)
- |CData("Foo [Bar]")
- |EndElement(hello)
- |EndDocument
- "#,
- ParserConfig::new(),
- false
- )
-}
-
-#[test]
-fn issue_105_unexpected_double_dash() {
- test(
- br#"<hello>-- </hello>"#,
- br#"
- |StartDocument(1.0, UTF-8)
- |StartElement(hello)
- |Characters("-- ")
- |EndElement(hello)
- |EndDocument
- "#,
- ParserConfig::new(),
- false
- );
-
- test(
- br#"<hello>--</hello>"#,
- br#"
- |StartDocument(1.0, UTF-8)
- |StartElement(hello)
- |Characters("--")
- |EndElement(hello)
- |EndDocument
- "#,
- ParserConfig::new(),
- false
- );
-
- test(
- br#"<hello>--></hello>"#,
- br#"
- |StartDocument(1.0, UTF-8)
- |StartElement(hello)
- |Characters("-->")
- |EndElement(hello)
- |EndDocument
- "#,
- ParserConfig::new(),
- false
- );
-
- test(
- br#"<hello><![CDATA[--]]></hello>"#,
- br#"
- |StartDocument(1.0, UTF-8)
- |StartElement(hello)
- |CData("--")
- |EndElement(hello)
- |EndDocument
- "#,
- ParserConfig::new(),
- false
- );
-}
-
-#[test]
-fn issue_attribues_have_no_default_namespace () {
- test(
- br#"<hello xmlns="urn:foo" x="y"/>"#,
- br#"
- |StartDocument(1.0, UTF-8)
- |StartElement({urn:foo}hello [x="y"])
- |EndElement({urn:foo}hello)
- |EndDocument
- "#,
- ParserConfig::new(),
- false
- );
-}
-
-#[test]
-fn issue_replacement_character_entity_reference() {
- test(
- br#"<doc>&#55357;&#56628;</doc>"#,
- br#"
- |StartDocument(1.0, UTF-8)
- |StartElement(doc)
- |1:13 Invalid decimal character number in an entity: #55357
- "#,
- ParserConfig::new(),
- false,
- );
-
- test(
- br#"<doc>&#xd83d;&#xdd34;</doc>"#,
- br#"
- |StartDocument(1.0, UTF-8)
- |StartElement(doc)
- |1:13 Invalid hexadecimal character number in an entity: #xd83d
- "#,
- ParserConfig::new(),
- false,
- );
-
- test(
- br#"<doc>&#55357;&#56628;</doc>"#,
- format!(
- r#"
- |StartDocument(1.0, UTF-8)
- |StartElement(doc)
- |Characters("{replacement_character}{replacement_character}")
- |EndElement(doc)
- |EndDocument
- "#,
- replacement_character = "\u{fffd}"
- )
- .as_bytes(),
- ParserConfig::new()
- .replace_unknown_entity_references(true),
- false,
- );
-
- test(
- br#"<doc>&#xd83d;&#xdd34;</doc>"#,
- format!(
- r#"
- |StartDocument(1.0, UTF-8)
- |StartElement(doc)
- |Characters("{replacement_character}{replacement_character}")
- |EndElement(doc)
- |EndDocument
- "#,
- replacement_character = "\u{fffd}"
- )
- .as_bytes(),
- ParserConfig::new()
- .replace_unknown_entity_references(true),
- false,
- );
-}
-
-lazy_static! {
- // If PRINT_SPEC env variable is set, print the lines
- // to stderr instead of comparing with the output
- // it can be used like this:
- // PRINT_SPEC=1 cargo test --test event_reader sample_1_full 2> sample_1_full.txt
- static ref PRINT: bool = {
- for (key, value) in env::vars() {
- if key == "PRINT_SPEC" && value == "1" {
- return true;
- }
- }
- false
- };
-}
-
-// clones a lot but that's fine
-fn trim_until_bar(s: String) -> String {
- match s.trim() {
- ts if ts.starts_with('|') => return ts[1..].to_owned(),
- _ => {}
- }
- s
-}
-
-fn test(input: &[u8], output: &[u8], config: ParserConfig, test_position: bool) {
- let mut reader = config.create_reader(input);
- let mut spec_lines = BufReader::new(output).lines()
- .map(|line| line.unwrap())
- .enumerate()
- .map(|(i, line)| (i, trim_until_bar(line)))
- .filter(|&(_, ref line)| !line.trim().is_empty());
-
- loop {
- let e = reader.next();
- let line =
- if test_position {
- format!("{} {}", reader.position(), Event(&e))
- } else {
- format!("{}", Event(&e))
- };
-
- if *PRINT {
- writeln!(&mut stderr(), "{}", line).unwrap();
- } else {
- if let Some((n, spec)) = spec_lines.next() {
- if line != spec {
- const SPLITTER: &'static str = "-------------------";
- panic!("\n{}\nUnexpected event at line {}:\nExpected: {}\nFound: {}\n{}\n",
- SPLITTER, n + 1, spec, line, std::str::from_utf8(output).unwrap());
- }
- } else {
- panic!("Unexpected event: {}", line);
- }
- }
-
- match e {
- Ok(XmlEvent::EndDocument) | Err(_) => break,
- _ => {},
- }
- }
-}
-
-// Here we define our own string representation of events so we don't depend
-// on the specifics of Display implementation for XmlEvent and OwnedName.
-
-struct Name<'a>(&'a OwnedName);
-
-impl <'a> fmt::Display for Name<'a> {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- if let Some(ref namespace) = self.0.namespace {
- try! { write!(f, "{{{}}}", namespace) }
- }
-
- if let Some(ref prefix) = self.0.prefix {
- try! { write!(f, "{}:", prefix) }
- }
-
- write!(f, "{}", self.0.local_name)
- }
-}
-
-struct Event<'a>(&'a Result<XmlEvent>);
-
-impl<'a> fmt::Display for Event<'a> {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- let empty = String::new();
- match *self.0 {
- Ok(ref e) => match *e {
- XmlEvent::StartDocument { ref version, ref encoding, .. } =>
- write!(f, "StartDocument({}, {})", version, encoding),
- XmlEvent::EndDocument =>
- write!(f, "EndDocument"),
- XmlEvent::ProcessingInstruction { ref name, ref data } =>
- write!(f, "ProcessingInstruction({}={:?})", name,
- data.as_ref().unwrap_or(&empty)),
- XmlEvent::StartElement { ref name, ref attributes, .. } => {
- if attributes.is_empty() {
- write!(f, "StartElement({})", Name(name))
- }
- else {
- let attrs: Vec<_> = attributes.iter()
- .map(|a| format!("{}={:?}", Name(&a.name), a.value)) .collect();
- write!(f, "StartElement({} [{}])", Name(name), attrs.join(", "))
- }
- },
- XmlEvent::EndElement { ref name } =>
- write!(f, "EndElement({})", Name(name)),
- XmlEvent::Comment(ref data) =>
- write!(f, r#"Comment("{}")"#, data.escape_debug()),
- XmlEvent::CData(ref data) =>
- write!(f, r#"CData("{}")"#, data.escape_debug()),
- XmlEvent::Characters(ref data) =>
- write!(f, r#"Characters("{}")"#, data.escape_debug()),
- XmlEvent::Whitespace(ref data) =>
- write!(f, r#"Whitespace("{}")"#, data.escape_debug()),
- },
- Err(ref e) => e.fmt(f),
- }
- }
-}
diff --git a/tests/event_writer.rs b/tests/event_writer.rs
deleted file mode 100644
index dd64a43..0000000
--- a/tests/event_writer.rs
+++ /dev/null
@@ -1,269 +0,0 @@
-#![forbid(unsafe_code)]
-
-extern crate xml;
-
-use std::io::{BufReader, SeekFrom};
-use std::io::prelude::*;
-use std::fs::File;
-use std::str;
-
-use xml::reader::EventReader;
-use xml::writer::EmitterConfig;
-
-macro_rules! unwrap_all {
- ($($e:expr);+) => {{
- $($e.unwrap();)+
- }}
-}
-
-#[test]
-fn reading_writing_equal_with_namespaces() {
- let mut f = File::open("tests/documents/sample_2.xml").unwrap();
- let mut b = Vec::new();
-
- {
- let r = EventReader::new(BufReader::new(&mut f));
- let mut w = EmitterConfig::default().perform_indent(true).create_writer(&mut b);
-
- for e in r {
- match e {
- Ok(e) => if let Some(e) = e.as_writer_event() {
- match w.write(e) {
- Ok(_) => {},
- Err(e) => panic!("Writer error: {:?}", e)
- }
- },
- Err(e) => panic!("Error: {}", e)
- }
- }
- }
-
- f.seek(SeekFrom::Start(0)).unwrap();
- let mut fs = String::new();
- f.read_to_string(&mut fs).unwrap();
-
- let bs = String::from_utf8(b).unwrap();
-
- assert_eq!(fs.trim(), bs.trim());
-}
-
-#[test]
-fn writing_simple() {
- use xml::writer::XmlEvent;
-
- let mut b = Vec::new();
-
- {
- let mut w = EmitterConfig::new().write_document_declaration(false).create_writer(&mut b);
-
- w.write(XmlEvent::start_element("h:hello").ns("h", "urn:hello-world")).unwrap();
- w.write("hello world").unwrap();
- w.write(XmlEvent::end_element()).unwrap();
- }
-
- assert_eq!(
- str::from_utf8(&b).unwrap(),
- r#"<h:hello xmlns:h="urn:hello-world">hello world</h:hello>"#
- );
-}
-
-#[test]
-fn writing_empty_elements_with_normalizing() {
- use xml::writer::XmlEvent;
-
- let mut b = Vec::new();
-
- {
- let mut w = EmitterConfig::new().write_document_declaration(false).create_writer(&mut b);
-
- unwrap_all! {
- w.write(XmlEvent::start_element("hello"));
- w.write(XmlEvent::start_element("world"));
- w.write(XmlEvent::end_element());
- w.write(XmlEvent::end_element())
- }
- }
-
- assert_eq!(str::from_utf8(&b).unwrap(), r#"<hello><world /></hello>"#);
-}
-
-#[test]
-fn writing_empty_elements_without_normalizing() {
- use xml::writer::XmlEvent;
-
- let mut b = Vec::new();
-
- {
- let mut w = EmitterConfig::new()
- .write_document_declaration(false)
- .normalize_empty_elements(false)
- .create_writer(&mut b);
-
- unwrap_all! {
- w.write(XmlEvent::start_element("hello"));
- w.write(XmlEvent::start_element("world"));
- w.write(XmlEvent::end_element());
- w.write(XmlEvent::end_element())
- }
- }
-
- assert_eq!(str::from_utf8(&b).unwrap(), r#"<hello><world></world></hello>"#);
-}
-
-#[test]
-fn writing_empty_elements_without_pad_self_closing() {
- use xml::writer::XmlEvent;
-
- let mut b = Vec::new();
-
- {
- let mut w = EmitterConfig::new()
- .write_document_declaration(false)
- .pad_self_closing(false)
- .create_writer(&mut b);
-
- unwrap_all! {
- w.write(XmlEvent::start_element("hello"));
- w.write(XmlEvent::start_element("world"));
- w.write(XmlEvent::end_element());
- w.write(XmlEvent::end_element())
- }
- }
-
- assert_eq!(str::from_utf8(&b).unwrap(), r#"<hello><world/></hello>"#);
-}
-#[test]
-fn writing_empty_elements_pad_self_closing_explicit() {
- use xml::writer::XmlEvent;
-
- let mut b = Vec::new();
-
- {
- let mut w = EmitterConfig::new()
- .write_document_declaration(false)
- .pad_self_closing(true)
- .create_writer(&mut b);
-
- unwrap_all! {
- w.write(XmlEvent::start_element("hello"));
- w.write(XmlEvent::start_element("world"));
- w.write(XmlEvent::end_element());
- w.write(XmlEvent::end_element())
- }
- }
-
- assert_eq!(str::from_utf8(&b).unwrap(), r#"<hello><world /></hello>"#);
-}
-
-#[test]
-fn writing_comments_with_indentation() {
- use xml::writer::XmlEvent;
-
- let mut b = Vec::new();
-
- {
- let mut w = EmitterConfig::new()
- .write_document_declaration(false)
- .perform_indent(true)
- .create_writer(&mut b);
-
- unwrap_all! {
- w.write(XmlEvent::start_element("hello"));
- w.write(XmlEvent::start_element("world"));
- w.write(XmlEvent::comment(" this is a manually padded comment\t"));
- w.write(XmlEvent::comment("this is an unpadded comment"));
- w.write(XmlEvent::end_element());
- w.write(XmlEvent::end_element())
- }
- }
-
- assert_eq!(
- str::from_utf8(&b).unwrap(),
- "<hello>
- <world>
- <!-- this is a manually padded comment\t-->
- <!-- this is an unpadded comment -->
- </world>
-</hello>");
-}
-
-#[test]
-fn issue_112_overriding_namepace_prefix() {
- use xml::writer::XmlEvent;
-
- let mut b = Vec::new();
-
- {
- let mut w = EmitterConfig::new()
- .write_document_declaration(false)
- .create_writer(&mut b);
-
- unwrap_all! {
- w.write(XmlEvent::start_element("iq").ns("", "jabber:client").ns("a", "urn:A"));
- w.write(XmlEvent::start_element("bind").ns("", "urn:ietf:params:xml:ns:xmpp-bind"));
- w.write(XmlEvent::end_element());
- w.write(XmlEvent::start_element("whatever").ns("a", "urn:X"));
- w.write(XmlEvent::end_element());
- w.write(XmlEvent::end_element())
- }
- }
-
- assert_eq!(
- str::from_utf8(&b).unwrap(),
- r#"<iq xmlns="jabber:client" xmlns:a="urn:A"><bind xmlns="urn:ietf:params:xml:ns:xmpp-bind" /><whatever xmlns:a="urn:X" /></iq>"#
- )
-}
-
-#[test]
-fn attribute_escaping() {
- use xml::writer::XmlEvent;
-
- let mut b = Vec::new();
-
- {
- let mut w = EmitterConfig::new()
- .write_document_declaration(false)
- .perform_indent(true)
- .create_writer(&mut b);
-
- unwrap_all! {
- w.write(
- XmlEvent::start_element("hello")
- .attr("testLt", "<")
- .attr("testGt", ">")
- );
- w.write(XmlEvent::end_element());
- w.write(
- XmlEvent::start_element("hello")
- .attr("testQuot", "\"")
- .attr("testApos", "\'")
- );
- w.write(XmlEvent::end_element());
- w.write(
- XmlEvent::start_element("hello")
- .attr("testAmp", "&")
- );
- w.write(XmlEvent::end_element());
- w.write(
- XmlEvent::start_element("hello")
- .attr("testNl", "\n")
- .attr("testCr", "\r")
- );
- w.write(XmlEvent::end_element());
- w.write(
- XmlEvent::start_element("hello")
- .attr("testNl", "\\n")
- .attr("testCr", "\\r")
- );
- w.write(XmlEvent::end_element())
- }
- }
- assert_eq!(
- str::from_utf8(&b).unwrap(),
- "<hello testLt=\"&lt;\" testGt=\"&gt;\" />
-<hello testQuot=\"&quot;\" testApos=\"&apos;\" />
-<hello testAmp=\"&amp;\" />
-<hello testNl=\"&#xA;\" testCr=\"&#xD;\" />
-<hello testNl=\"\\n\" testCr=\"\\r\" />"
- );
-} \ No newline at end of file
diff --git a/tests/streaming.rs b/tests/streaming.rs
deleted file mode 100644
index a577a00..0000000
--- a/tests/streaming.rs
+++ /dev/null
@@ -1,103 +0,0 @@
-#![forbid(unsafe_code)]
-
-extern crate xml;
-
-use std::io::{Cursor, Write};
-
-use xml::EventReader;
-use xml::reader::ParserConfig;
-use xml::reader::XmlEvent;
-
-macro_rules! assert_match {
- ($actual:expr, $expected:pat) => {
- match $actual {
- $expected => {},
- _ => panic!("assertion failed: `(left matches right)` \
- (left: `{:?}`, right: `{}`", $actual, stringify!($expected))
- }
- };
- ($actual:expr, $expected:pat if $guard:expr) => {
- match $actual {
- $expected if $guard => {},
- _ => panic!("assertion failed: `(left matches right)` \
- (left: `{:?}`, right: `{} if {}`",
- $actual, stringify!($expected), stringify!($guard))
- }
- }
-}
-
-fn write_and_reset_position<W>(c: &mut Cursor<W>, data: &[u8]) where Cursor<W>: Write {
- let p = c.position();
- c.write_all(data).unwrap();
- c.set_position(p);
-}
-
-#[test]
-fn reading_streamed_content() {
- let buf = Cursor::new(b"<root>".to_vec());
- let reader = EventReader::new(buf);
-
- let mut it = reader.into_iter();
-
- assert_match!(it.next(), Some(Ok(XmlEvent::StartDocument { .. })));
- assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "root");
-
- write_and_reset_position(it.source_mut(), b"<child-1>content</child-1>");
- assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-1");
- assert_match!(it.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content");
- assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-1");
-
- write_and_reset_position(it.source_mut(), b"<child-2/>");
- assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-2");
- assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-2");
-
- write_and_reset_position(it.source_mut(), b"<child-3/>");
- assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-3");
- assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-3");
- // doesn't seem to work because of how tags parsing is done
-// write_and_reset_position(it.source_mut(), b"some text");
- // assert_match!(it.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "some text");
-
- write_and_reset_position(it.source_mut(), b"</root>");
- assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "root");
- assert_match!(it.next(), Some(Ok(XmlEvent::EndDocument)));
- assert_match!(it.next(), None);
-}
-
-#[test]
-fn reading_streamed_content2() {
- let buf = Cursor::new(b"<root>".to_vec());
- let mut config = ParserConfig::new();
- config.ignore_end_of_stream = true;
- let readerb = EventReader::new_with_config(buf, config);
-
- let mut reader = readerb.into_iter();
-
- assert_match!(reader.next(), Some(Ok(XmlEvent::StartDocument { .. })));
- assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "root");
-
- write_and_reset_position(reader.source_mut(), b"<child-1>content</child-1>");
- assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-1");
- assert_match!(reader.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content");
- assert_match!(reader.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-1");
-
- write_and_reset_position(reader.source_mut(), b"<child-2>content</child-2>");
-
- assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-2");
- assert_match!(reader.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content");
- assert_match!(reader.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-2");
- assert_match!(reader.next(), Some(Err(_)));
- write_and_reset_position(reader.source_mut(), b"<child-3></child-3>");
- assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-3");
- write_and_reset_position(reader.source_mut(), b"<child-4 type='get'");
- match reader.next() {
- None |
- Some(Ok(_)) => {
- panic!("At this point, parser must not detect something.");
- },
- Some(Err(_)) => {}
- };
- write_and_reset_position(reader.source_mut(), b" />");
- assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-4");
-}
-