From bec0e9a523cf8d6db5ea2c92c99f13d2014b7f80 Mon Sep 17 00:00:00 2001 From: Matthew Maurer Date: Wed, 14 Jun 2023 16:28:50 +0000 Subject: Upgrade xml-rs to 0.8.15-cvss-cries-wolf This project was upgraded with external_updater. Usage: tools/external_updater/updater.sh update rust/crates/xml-rs For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md Bug: 287255605 Test: TreeHugger Change-Id: I67a6db8d638bbf1b1318b1a865f50493ed1f6282 --- .cargo_vcs_info.json | 7 +- .github/workflows/main.yml | 31 - Android.bp | 58 +- Cargo.toml | 47 +- Cargo.toml.orig | 27 +- Changelog.md | 126 ---- METADATA | 14 +- README.md | 209 ++++++ Readme.md | 236 ------ design.md | 37 - src/analyze.rs | 110 ++- src/attribute.rs | 34 +- src/common.rs | 69 +- src/escape.rs | 159 ++-- src/lib.rs | 29 +- src/macros.rs | 32 +- src/name.rs | 63 +- src/namespace.rs | 60 +- src/reader.rs | 156 ++++ src/reader/config.rs | 129 +++- src/reader/error.rs | 197 ++++- src/reader/events.rs | 66 +- src/reader/lexer.rs | 664 ++++++++++++----- src/reader/mod.rs | 129 ---- src/reader/parser.rs | 796 +++++++++++++++++++++ src/reader/parser/inside_cdata.rs | 26 +- src/reader/parser/inside_closing_tag_name.rs | 21 +- src/reader/parser/inside_comment.rs | 21 +- src/reader/parser/inside_declaration.rs | 137 ++-- src/reader/parser/inside_doctype.rs | 237 +++++- src/reader/parser/inside_opening_tag.rs | 73 +- src/reader/parser/inside_processing_instruction.rs | 82 ++- src/reader/parser/inside_reference.rs | 122 ++-- src/reader/parser/mod.rs | 622 ---------------- src/reader/parser/outside_tag.rs | 208 ++++-- src/util.rs | 276 ++++++- src/writer.rs | 90 +++ src/writer/config.rs | 10 +- src/writer/emitter.rs | 135 ++-- src/writer/events.rs | 51 +- src/writer/mod.rs | 93 --- tests/documents/sample_1.xml | 34 - tests/documents/sample_1_full.txt | 58 -- tests/documents/sample_1_short.txt | 37 - tests/documents/sample_2.xml | 15 - tests/documents/sample_2_full.txt | 41 -- tests/documents/sample_2_short.txt | 30 - tests/documents/sample_3.xml | 13 - tests/documents/sample_3_full.txt | 23 - tests/documents/sample_3_short.txt | 14 - tests/documents/sample_4.xml | 15 - tests/documents/sample_4_full.txt | 23 - tests/documents/sample_4_short.txt | 14 - tests/documents/sample_5.xml | 7 - tests/documents/sample_5_short.txt | 7 - tests/documents/sample_6.xml | 4 - tests/documents/sample_6_full.txt | 8 - tests/event_reader.rs | 587 --------------- tests/event_writer.rs | 269 ------- tests/streaming.rs | 103 --- 60 files changed, 3411 insertions(+), 3580 deletions(-) delete mode 100644 .github/workflows/main.yml delete mode 100644 Changelog.md create mode 100644 README.md delete mode 100644 Readme.md delete mode 100644 design.md create mode 100644 src/reader.rs delete mode 100644 src/reader/mod.rs create mode 100644 src/reader/parser.rs delete mode 100644 src/reader/parser/mod.rs create mode 100644 src/writer.rs delete mode 100644 src/writer/mod.rs delete mode 100644 tests/documents/sample_1.xml delete mode 100644 tests/documents/sample_1_full.txt delete mode 100644 tests/documents/sample_1_short.txt delete mode 100644 tests/documents/sample_2.xml delete mode 100644 tests/documents/sample_2_full.txt delete mode 100644 tests/documents/sample_2_short.txt delete mode 100644 tests/documents/sample_3.xml delete mode 100644 tests/documents/sample_3_full.txt delete mode 100644 tests/documents/sample_3_short.txt delete mode 100644 tests/documents/sample_4.xml delete mode 100644 tests/documents/sample_4_full.txt delete mode 100644 tests/documents/sample_4_short.txt delete mode 100644 tests/documents/sample_5.xml delete mode 100644 tests/documents/sample_5_short.txt delete mode 100644 tests/documents/sample_6.xml delete mode 100644 tests/documents/sample_6_full.txt delete mode 100644 tests/event_reader.rs delete mode 100644 tests/event_writer.rs delete mode 100644 tests/streaming.rs diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json index 6e0c55d..f0a8a38 100644 --- a/.cargo_vcs_info.json +++ b/.cargo_vcs_info.json @@ -1,5 +1,6 @@ { "git": { - "sha1": "7cd06954fd6e22b7dbf9ea02ff4e22f9ff6309fd" - } -} + "sha1": "c4705ddc172950c28f9b229f368ad8f4cba81e3f" + }, + "path_in_vcs": "" +} \ No newline at end of file diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml deleted file mode 100644 index daca69f..0000000 --- a/.github/workflows/main.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: CI - -on: - push: - branches: [ master ] - pull_request: - branches: [ master ] - -jobs: - test: - runs-on: ubuntu-latest - strategy: - matrix: - rust: [stable, beta, nightly] - - steps: - - uses: actions/checkout@v2 - - - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: ${{ matrix.rust }} - override: true - - - uses: actions-rs/cargo@v1 - with: - command: build - - - uses: actions-rs/cargo@v1 - with: - command: test diff --git a/Android.bp b/Android.bp index 38ba911..16b2b9a 100644 --- a/Android.bp +++ b/Android.bp @@ -21,13 +21,12 @@ license { rust_library { name: "libxml_rust", stem: "libxml", - // has rustc warnings host_supported: true, crate_name: "xml", cargo_env_compat: true, - cargo_pkg_version: "0.8.4", + cargo_pkg_version: "0.8.15-cvss-cries-wolf", srcs: ["src/lib.rs"], - edition: "2015", + edition: "2021", apex_available: [ "//apex_available:platform", "com.android.virt", @@ -36,63 +35,14 @@ rust_library { vendor_available: true, } -rust_defaults { - name: "xml-rs_test_defaults", - crate_name: "xml_rs", - cargo_env_compat: true, - cargo_pkg_version: "0.8.4", - test_suites: ["general-tests"], - auto_gen_config: true, - edition: "2015", - rustlibs: [ - "liblazy_static", - "libxml_rust", - ], -} - -rust_test { - name: "xml-rs_test_tests_event_reader", - defaults: ["xml-rs_test_defaults"], - // has rustc warnings - host_supported: true, - srcs: ["tests/event_reader.rs"], - test_options: { - unit_test: true, - }, -} - -rust_test { - name: "xml-rs_test_tests_event_writer", - defaults: ["xml-rs_test_defaults"], - // has rustc warnings - host_supported: true, - srcs: ["tests/event_writer.rs"], - test_options: { - unit_test: true, - }, - data: ["tests/documents/*"], -} - -rust_test { - name: "xml-rs_test_tests_streaming", - defaults: ["xml-rs_test_defaults"], - // has rustc warnings - host_supported: true, - srcs: ["tests/streaming.rs"], - test_options: { - unit_test: true, - }, -} - rust_binary { name: "xml_analyze", - // has rustc warnings host_supported: true, crate_name: "xml_analyze", cargo_env_compat: true, - cargo_pkg_version: "0.8.4", + cargo_pkg_version: "0.8.15-cvss-cries-wolf", srcs: ["src/analyze.rs"], - edition: "2015", + edition: "2021", rustlibs: [ "libxml_rust", ], diff --git a/Cargo.toml b/Cargo.toml index e704337..3279206 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,24 +3,44 @@ # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies -# to registry (e.g., crates.io) dependencies +# to registry (e.g., crates.io) dependencies. # -# If you believe there's an error in this file please file an -# issue against the rust-lang/cargo repository. If you're -# editing this file be aware that the upstream Cargo.toml -# will likely look very different (and much more reasonable) +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. [package] +edition = "2021" +rust-version = "1.58" name = "xml-rs" -version = "0.8.4" +version = "0.8.15-cvss-cries-wolf" authors = ["Vladimir Matveev "] +include = [ + "src/**", + "LICENSE", + "README.md", +] description = "An XML library in pure Rust" -documentation = "http://docs.rs/xml-rs/" -readme = "Readme.md" -keywords = ["xml", "parsing", "parser"] -categories = ["parsing"] +homepage = "https://lib.rs/crates/xml-rs" +documentation = "https://docs.rs/xml-rs/" +readme = "README.md" +keywords = [ + "xml", + "parser", + "sax", + "parsing", + "writer", +] +categories = ["parser-implementations"] license = "MIT" -repository = "https://github.com/netvl/xml-rs" +repository = "https://github.com/kornelski/xml-rs" + +[package.metadata.docs.rs] +targets = ["x86_64-unknown-linux-gnu"] + +[package.metadata.release] +tag-message = "" +tag-name = "{{version}}" [lib] name = "xml" @@ -29,8 +49,9 @@ path = "src/lib.rs" [[bin]] name = "xml-analyze" path = "src/analyze.rs" + [dev-dependencies.doc-comment] version = "0.3" -[dev-dependencies.lazy_static] -version = "1.2.0" +[badges.maintenance] +status = "actively-developed" diff --git a/Cargo.toml.orig b/Cargo.toml.orig index c8df8e6..0282e7a 100644 --- a/Cargo.toml.orig +++ b/Cargo.toml.orig @@ -1,14 +1,18 @@ [package] name = "xml-rs" -version = "0.8.4" +version = "0.8.15-cvss-cries-wolf" authors = ["Vladimir Matveev "] license = "MIT" description = "An XML library in pure Rust" -repository = "https://github.com/netvl/xml-rs" -documentation = "http://docs.rs/xml-rs/" -readme = "Readme.md" -keywords = ["xml", "parsing", "parser"] -categories = ["parsing"] +repository = "https://github.com/kornelski/xml-rs" +homepage = "https://lib.rs/crates/xml-rs" +documentation = "https://docs.rs/xml-rs/" +readme = "README.md" +keywords = ["xml", "parser", "sax", "parsing", "writer"] +categories = ["parser-implementations"] +edition = "2021" +rust-version = "1.58" +include = ["src/**", "LICENSE", "README.md"] [lib] name = "xml" @@ -20,4 +24,13 @@ path = "src/analyze.rs" [dev-dependencies] doc-comment = "0.3" -lazy_static = "1.2.0" + +[badges] +maintenance = { status = "actively-developed" } + +[package.metadata.docs.rs] +targets = ["x86_64-unknown-linux-gnu"] + +[package.metadata.release] +tag-name = "{{version}}" +tag-message = "" diff --git a/Changelog.md b/Changelog.md deleted file mode 100644 index 3cca8b8..0000000 --- a/Changelog.md +++ /dev/null @@ -1,126 +0,0 @@ -## Version 0.8.4 - -* Fixed recognition of `?>`, `]]>` and `/>` tokens as characters. -* Fixed writer output operations to use `write_all` to ensure that the data - is written fully. -* The document declaration is now written before any characters automatically. - -## Version 0.8.3 - -* Added a new parser option, `ignore_root_level_whitespace`, which makes the parser - skip emitting whitespace events outside of the root element when set to `true`. - This helps with certain tasks like canonicalization. - -## Version 0.8.2 - -* Added a new parser option, `replace_unknown_entity_references`, which allows to ignore - invalid Unicode code points and replace them with a Unicode "replacement character" - during parsing. This can be helpful to deal with e.g. UTF-16 surrogate pairs. -* Added a new emitter option, `pad_self_closing`, which determines the style of the self-closing - elements when they are emitted: `` (`true`) vs `` (`false`). - -## Version 0.8.1 - -* Fixed various issues with tests introduced by updates in Rust. -* Adjusted the lexer to ignore contents of the `` tag. -* Removed unnecessary unsafety in tests. -* Added tests for doc comments in the readme file. -* Switched to GitHub Actions from Travis CI. - -## Version 0.8.0 - -* Same as 0.7.1, with 0.7.1 being yanked because of the incorrect semver bump. - -## Version 0.7.1 - -* Removed dependency on bitflags. -* Added the `XmlWriter::inner_mut()` method. -* Fixed some rustdoc warnings. - -## Version 0.7.0 - -* Same as 0.6.2, with 0.6.2 being yanked because of the incompatible bump of minimum required version of rustc. - -## Version 0.6.2 - -* Bumped `bitflags` to 1.0. - -## Version 0.6.1 - -* Fixed the writer to escape some special characters when writing attribute values. - -## Version 0.6.0 - -* Changed the target type of extra entities from `char` to `String`. This is an incompatible - change. - -## Version 0.5.0 - -* Added support for ignoring EOF errors in order to read documents from streams incrementally. -* Bumped `bitflags` to 0.9. - -## Version 0.4.1 - -* Added missing `Debug` implementation to `xml::writer::XmlEvent`. - -## Version 0.4.0 - -* Bumped version number, since changes introduced in 0.3.7 break backwards compatibility. - -## Version 0.3.8 - -* Fixed a problem introduced in 0.3.7 with entities in attributes causing parsing errors. - -## Version 0.3.7 - -* Fixed the problem with parsing non-whitespace character entities as whitespace (issue #140). -* Added support for configuring custom entities in the parser configuration. - -## Version 0.3.6 - -* Added an `Error` implementation for `EmitterError`. -* Fixed escaping of strings with multi-byte code points. - -## Version 0.3.5 - -* Added `Debug` implementation for `XmlVersion`. -* Fixed some failing tests. - -## Version 0.3.3 - -* Updated `bitflags` to 0.7. - -## Version 0.3.2 - -* Added `From` for `xml::reader::Error`, which improves usability of working with parsing errors. - -## Version 0.3.1 - -* Bumped `bitflags` dependency to 0.4, some internal warning fixes. - -## Version 0.3.0 - -* Changed error handling in `EventReader` - now I/O errors are properly bubbled up from the lexer. - -## Version 0.2.4 - -* Fixed #112 - incorrect handling of namespace redefinitions when writing a document. - -## Version 0.2.3 - -* Added `into_inner()` methods to `EventReader` and `EventWriter`. - -## Version 0.2.2 - -* Using `join` instead of the deprecated `connect`. -* Added a simple XML analyzer program which demonstrates library usage and can be used to check XML documents for well-formedness. -* Fixed incorrect handling of unqualified attribute names (#107). -* Added this changelog. - -## Version 0.2.1 - -* Fixed #105 - incorrect handling of double dashes. - -## Version 0.2.0 - -* Major update, includes proper document writing support and significant architecture changes. diff --git a/METADATA b/METADATA index 17fbefc..87bd4b7 100644 --- a/METADATA +++ b/METADATA @@ -1,3 +1,7 @@ +# This project was upgraded with external_updater. +# Usage: tools/external_updater/updater.sh update rust/crates/xml-rs +# For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md + name: "xml-rs" description: "An XML library in pure Rust" third_party { @@ -7,13 +11,13 @@ third_party { } url { type: ARCHIVE - value: "https://static.crates.io/crates/xml-rs/xml-rs-0.8.4.crate" + value: "https://static.crates.io/crates/xml-rs/xml-rs-0.8.15-cvss-cries-wolf.crate" } - version: "0.8.4" + version: "0.8.15-cvss-cries-wolf" license_type: NOTICE last_upgrade_date { - year: 2021 - month: 8 - day: 9 + year: 2023 + month: 6 + day: 14 } } diff --git a/README.md b/README.md new file mode 100644 index 0000000..fa4ba7f --- /dev/null +++ b/README.md @@ -0,0 +1,209 @@ +xml-rs, an XML library for Rust +=============================== + +[![CI](https://github.com/kornelski/xml-rs/actions/workflows/main.yml/badge.svg)](https://github.com/kornelski/xml-rs/actions/workflows/main.yml) +[![crates.io][crates-io-img]](https://lib.rs/crates/xml-rs) +[![docs][docs-img]](https://docs.rs/xml-rs/) + +[Documentation](https://docs.rs/xml-rs/) + + [crates-io-img]: https://img.shields.io/crates/v/xml-rs.svg + [docs-img]: https://img.shields.io/badge/docs-latest%20release-6495ed.svg + +xml-rs is an XML library for the [Rust](https://www.rust-lang.org/) programming language. +It supports reading and writing of XML documents in a streaming fashion (without DOM). + +### Features + +* API based on `Iterator`s and regular `String`s without tricky lifetimes. + +* XML spec conformance better than other pure-Rust libraries. + +* Support for UTF-16, UTF-8, ISO-8859-1, and ASCII encodings. + +* Written entirely in the safe Rust subset. + + +The API is heavily inspired by Java Streaming API for XML ([StAX][stax]). It contains a pull parser much like StAX event reader. It provides an iterator API, so you can leverage Rust's existing iterators library features. + + [stax]: https://en.wikipedia.org/wiki/StAX + +It also provides a streaming document writer much like StAX event writer. +This writer consumes its own set of events, but reader events can be converted to +writer events easily, and so it is possible to write XML transformation chains in a pretty +clean manner. + +This parser is mostly full-featured, however, there are limitations: +* Legacy code pages and non-Unicode encodings are not supported; +* DTD validation is not supported (but entities defined in the internal subset are supported); +* attribute value normalization is not performed, and end-of-line characters are not normalized either. + +Other than that the parser tries to be mostly XML-1.1-compliant. + +Writer is also mostly full-featured with the following limitations: +* no support for encodings other than UTF-8, +* no support for emitting `` declarations; +* more validations of input are needed, for example, checking that namespace prefixes are bounded + or comments are well-formed. + +Building and using +------------------ + +xml-rs uses [Cargo](https://crates.io), so add it with `cargo add xml` or modify `Cargo.toml`: + +```toml +[dependencies] +xml = "0.8" +``` + +The package exposes a single crate called `xml`. + +Reading XML documents +--------------------- + +[`xml::reader::EventReader`](EventReader) requires a [`Read`](stdread) instance to read from. It can be a `File` wrapped in `BufReader`, or a `Vec`, or a `&[u8]` slice. + +[EventReader]: https://docs.rs/xml-rs/latest/xml/reader/struct.EventReader.html +[stdread]: https://doc.rust-lang.org/stable/std/io/trait.Read.html + +`EventReader` implements `IntoIterator` trait, so you can use it in a `for` loop directly: + +```rust,no_run +use std::fs::File; +use std::io::BufReader; + +use xml::reader::{EventReader, XmlEvent}; + +fn main() -> std::io::Result<()> { + let file = File::open("file.xml")?; + let file = BufReader::new(file); // Buffering is important for performance + + let parser = EventReader::new(file); + let mut depth = 0; + for e in parser { + match e { + Ok(XmlEvent::StartElement { name, .. }) => { + println!("{:spaces$}+{name}", "", spaces = depth * 2); + depth += 1; + } + Ok(XmlEvent::EndElement { name }) => { + depth -= 1; + println!("{:spaces$}-{name}", "", spaces = depth * 2); + } + Err(e) => { + eprintln!("Error: {e}"); + break; + } + // There's more: https://docs.rs/xml-rs/latest/xml/reader/enum.XmlEvent.html + _ => {} + } + } + + Ok(()) +} +``` + +Document parsing can end normally or with an error. Regardless of exact cause, the parsing +process will be stopped, and the iterator will terminate normally. + +You can also have finer control over when to pull the next event from the parser using its own +`next()` method: + +```rust,ignore +match parser.next() { + ... +} +``` + +Upon the end of the document or an error, the parser will remember the last event and will always +return it in the result of `next()` call afterwards. If iterator is used, then it will yield +error or end-of-document event once and will produce `None` afterwards. + +It is also possible to tweak parsing process a little using [`xml::reader::ParserConfig`][ParserConfig] structure. +See its documentation for more information and examples. + +[ParserConfig]: https://docs.rs/xml-rs/latest/xml/reader/struct.ParserConfig.html + +You can find a more extensive example of using `EventReader` in `src/analyze.rs`, which is a +small program (BTW, it is built with `cargo build` and can be run after that) which shows various +statistics about specified XML document. It can also be used to check for well-formedness of +XML documents - if a document is not well-formed, this program will exit with an error. + +Writing XML documents +--------------------- + +xml-rs also provides a streaming writer much like StAX event writer. With it you can write an +XML document to any `Write` implementor. + +```rust,no_run +use std::io; +use xml::writer::{EmitterConfig, XmlEvent}; + +/// A simple demo syntax where "+foo" makes ``, "-foo" makes `` +fn make_event_from_line(line: &str) -> XmlEvent { + let line = line.trim(); + if let Some(name) = line.strip_prefix("+") { + XmlEvent::start_element(name).into() + } else if line.starts_with("-") { + XmlEvent::end_element().into() + } else { + XmlEvent::characters(line).into() + } +} + +fn main() -> io::Result<()> { + let input = io::stdin(); + let output = io::stdout(); + let mut writer = EmitterConfig::new() + .perform_indent(true) + .create_writer(output); + + let mut line = String::new(); + loop { + line.clear(); + let bytes_read = input.read_line(&mut line)?; + if bytes_read == 0 { + break; // EOF + } + + let event = make_event_from_line(&line); + if let Err(e) = writer.write(event) { + panic!("Write error: {e}") + } + } + Ok(()) +} +``` + +The code example above also demonstrates how to create a writer out of its configuration. +Similar thing also works with `EventReader`. + +The library provides an XML event building DSL which helps to construct complex events, +e.g. ones having namespace definitions. Some examples: + +```rust,ignore +// +XmlEvent::start_element("a:hello").attr("a:param", "value").ns("a", "urn:some:document") + +// +XmlEvent::start_element("hello").attr("b:config", "value").default_ns("urn:defaul:uri") + +// +XmlEvent::cdata("some unescaped text") +``` + +Of course, one can create `XmlEvent` enum variants directly instead of using the builder DSL. +There are more examples in [`xml::writer::XmlEvent`][XmlEvent] documentation. + +[XmlEvent]: https://docs.rs/xml-rs/latest/xml/reader/enum.XmlEvent.html + +The writer has multiple configuration options; see `EmitterConfig` documentation for more +information. + +[EmitterConfig]: https://docs.rs/xml-rs/latest/xml/writer/struct.EmitterConfig.html + +Bug reports +------------ + +Please report issues at: . + diff --git a/Readme.md b/Readme.md deleted file mode 100644 index 5ab88f8..0000000 --- a/Readme.md +++ /dev/null @@ -1,236 +0,0 @@ -xml-rs, an XML library for Rust -=============================== - -[![Build Status][build-status-img]](https://github.com/netvl/xml-rs/actions?query=workflow%3ACI) -[![crates.io][crates-io-img]](https://crates.io/crates/xml-rs) -[![docs][docs-img]](https://docs.rs/xml-rs/) - -[Documentation](https://docs.rs/xml-rs/) - - [build-status-img]: https://img.shields.io/github/workflow/status/netvl/xml-rs/CI/master?style=flat-square - [crates-io-img]: https://img.shields.io/crates/v/xml-rs.svg?style=flat-square - [docs-img]: https://img.shields.io/badge/docs-latest%20release-6495ed.svg?style=flat-square - -xml-rs is an XML library for [Rust](http://www.rust-lang.org/) programming language. -It is heavily inspired by Java [Streaming API for XML (StAX)][stax]. - - [stax]: https://en.wikipedia.org/wiki/StAX - -This library currently contains pull parser much like [StAX event reader][stax-reader]. -It provides iterator API, so you can leverage Rust's existing iterators library features. - - [stax-reader]: http://docs.oracle.com/javase/8/docs/api/javax/xml/stream/XMLEventReader.html - -It also provides a streaming document writer much like [StAX event writer][stax-writer]. -This writer consumes its own set of events, but reader events can be converted to -writer events easily, and so it is possible to write XML transformation chains in a pretty -clean manner. - - [stax-writer]: http://docs.oracle.com/javase/8/docs/api/javax/xml/stream/XMLEventWriter.html - -This parser is mostly full-featured, however, there are limitations: -* no other encodings but UTF-8 are supported yet, because no stream-based encoding library - is available now; when (or if) one will be available, I'll try to make use of it; -* DTD validation is not supported, `` declarations are completely ignored; thus no - support for custom entities too; internal DTD declarations are likely to cause parsing errors; -* attribute value normalization is not performed, and end-of-line characters are not normalized too. - -Other than that the parser tries to be mostly XML-1.0-compliant. - -Writer is also mostly full-featured with the following limitations: -* no support for encodings other than UTF-8, for the same reason as above; -* no support for emitting `` declarations; -* more validations of input are needed, for example, checking that namespace prefixes are bounded - or comments are well-formed. - -What is planned (highest priority first, approximately): - -0. missing features required by XML standard (e.g. aforementioned normalization and - proper DTD parsing); -1. miscellaneous features of the writer; -2. parsing into a DOM tree and its serialization back to XML text; -3. SAX-like callback-based parser (fairly easy to implement over pull parser); -4. DTD validation; -5. (let's dream a bit) XML Schema validation. - -Building and using ------------------- - -xml-rs uses [Cargo](http://crates.io), so just add a dependency section in your project's manifest: - -```toml -[dependencies] -xml-rs = "0.8" -``` - -The package exposes a single crate called `xml`: - -```rust -extern crate xml; -``` - -Reading XML documents ---------------------- - -`xml::reader::EventReader` requires a `Read` instance to read from. When a proper stream-based encoding -library is available, it is likely that xml-rs will be switched to use whatever character stream structure -this library would provide, but currently it is a `Read`. - -Using `EventReader` is very straightforward. Just provide a `Read` instance to obtain an iterator -over events: - -```rust,no_run -extern crate xml; - -use std::fs::File; -use std::io::BufReader; - -use xml::reader::{EventReader, XmlEvent}; - -fn indent(size: usize) -> String { - const INDENT: &'static str = " "; - (0..size).map(|_| INDENT) - .fold(String::with_capacity(size*INDENT.len()), |r, s| r + s) -} - -fn main() { - let file = File::open("file.xml").unwrap(); - let file = BufReader::new(file); - - let parser = EventReader::new(file); - let mut depth = 0; - for e in parser { - match e { - Ok(XmlEvent::StartElement { name, .. }) => { - println!("{}+{}", indent(depth), name); - depth += 1; - } - Ok(XmlEvent::EndElement { name }) => { - depth -= 1; - println!("{}-{}", indent(depth), name); - } - Err(e) => { - println!("Error: {}", e); - break; - } - _ => {} - } - } -} -``` - -`EventReader` implements `IntoIterator` trait, so you can just use it in a `for` loop directly. -Document parsing can end normally or with an error. Regardless of exact cause, the parsing -process will be stopped, and iterator will terminate normally. - -You can also have finer control over when to pull the next event from the parser using its own -`next()` method: - -```rust,ignore -match parser.next() { - ... -} -``` - -Upon the end of the document or an error the parser will remember that last event and will always -return it in the result of `next()` call afterwards. If iterator is used, then it will yield -error or end-of-document event once and will produce `None` afterwards. - -It is also possible to tweak parsing process a little using `xml::reader::ParserConfig` structure. -See its documentation for more information and examples. - -You can find a more extensive example of using `EventReader` in `src/analyze.rs`, which is a -small program (BTW, it is built with `cargo build` and can be run after that) which shows various -statistics about specified XML document. It can also be used to check for well-formedness of -XML documents - if a document is not well-formed, this program will exit with an error. - -Writing XML documents ---------------------- - -xml-rs also provides a streaming writer much like StAX event writer. With it you can write an -XML document to any `Write` implementor. - -```rust,no_run -extern crate xml; - -use std::fs::File; -use std::io::{self, Write}; - -use xml::writer::{EventWriter, EmitterConfig, XmlEvent, Result}; - -fn handle_event(w: &mut EventWriter, line: String) -> Result<()> { - let line = line.trim(); - let event: XmlEvent = if line.starts_with("+") && line.len() > 1 { - XmlEvent::start_element(&line[1..]).into() - } else if line.starts_with("-") { - XmlEvent::end_element().into() - } else { - XmlEvent::characters(&line).into() - }; - w.write(event) -} - -fn main() { - let mut file = File::create("output.xml").unwrap(); - - let mut input = io::stdin(); - let mut output = io::stdout(); - let mut writer = EmitterConfig::new().perform_indent(true).create_writer(&mut file); - loop { - print!("> "); output.flush().unwrap(); - let mut line = String::new(); - match input.read_line(&mut line) { - Ok(0) => break, - Ok(_) => match handle_event(&mut writer, line) { - Ok(_) => {} - Err(e) => panic!("Write error: {}", e) - }, - Err(e) => panic!("Input error: {}", e) - } - } -} -``` - -The code example above also demonstrates how to create a writer out of its configuration. -Similar thing also works with `EventReader`. - -The library provides an XML event building DSL which helps to construct complex events, -e.g. ones having namespace definitions. Some examples: - -```rust,ignore -// -XmlEvent::start_element("a:hello").attr("a:param", "value").ns("a", "urn:some:document") - -// -XmlEvent::start_element("hello").attr("b:config", "value").default_ns("urn:defaul:uri") - -// -XmlEvent::cdata("some unescaped text") -``` - -Of course, one can create `XmlEvent` enum variants directly instead of using the builder DSL. -There are more examples in `xml::writer::XmlEvent` documentation. - -The writer has multiple configuration options; see `EmitterConfig` documentation for more -information. - -Other things ------------- - -No performance tests or measurements are done. The implementation is rather naive, and no specific -optimizations are made. Hopefully the library is sufficiently fast to process documents of common size. -I intend to add benchmarks in future, but not until more important features are added. - -Known issues ------------- - -All known issues are present on GitHub issue tracker: . -Feel free to post any found problems there. - -License -------- - -This library is licensed under MIT license. - ---- -Copyright (C) Vladimir Matveev, 2014-2020 diff --git a/design.md b/design.md deleted file mode 100644 index da67c7b..0000000 --- a/design.md +++ /dev/null @@ -1,37 +0,0 @@ -# Reader - -Basic features: - * [x] Parsing XML 1.0 documents and returning a stream of events - - [ ] Support reading embedded DTD schemas - - [ ] Support for embedded entities - * [x] Support for namespaces and emitting namespace information in events - * [ ] \[maybe\] push-based wrapper - * Missing XML features - - [ ] Support for different encodings - - [ ] Attribute values normalization - - [ ] EOL characters normalization - -Advanced features: - * [ ] DTD schema validation - * [ ] XSD schema validation - -# Writer - -Basic features: - * [x] Writing basic XML 1.0 documents in UTF-8 - * [x] Writing XML 1.0 documents with namespace support - * [x] Support for writing elements with empty body as empty elements - * [x] Pretty-printed and compact output - * [ ] Writing XML document with embedded DTDs and DTD references - * Misc features: - - [ ] Support for different encodings - - [x] Support for writing CDATA as characters - - [ ] Checking events for invalid characters (e.g. `--` in comments) - - [ ] Check for namespaces more correctly, i.e. check both for prefix and namespace URI - - [ ] Support checking namespace prefix presence in the current namespace for events with prefix but without namespace - - [ ] Support checking namespace prefix for events with both prefix and namespace URI - -# Other - -DOM-based API: - * [ ] Basic support for DOM-based API diff --git a/src/analyze.rs b/src/analyze.rs index d369d2f..d50b2d9 100644 --- a/src/analyze.rs +++ b/src/analyze.rs @@ -1,37 +1,23 @@ #![forbid(unsafe_code)] -extern crate xml; - use std::cmp; +use std::collections::HashSet; use std::env; -use std::io::{self, Read, Write, BufReader}; use std::fs::File; -use std::collections::HashSet; +use std::io::{self, BufReader, Read}; -use xml::ParserConfig; use xml::reader::XmlEvent; +use xml::ParserConfig; -macro_rules! abort { - ($code:expr) => {::std::process::exit($code)}; - ($code:expr, $($args:tt)+) => {{ - writeln!(&mut ::std::io::stderr(), $($args)+).unwrap(); - ::std::process::exit($code); - }} -} - -fn main() { +fn main() -> Result<(), Box> { let mut file; let mut stdin; - let source: &mut Read = match env::args().nth(1) { - Some(file_name) => { - file = File::open(file_name) - .unwrap_or_else(|e| abort!(1, "Cannot open input file: {}", e)); - &mut file - } - None => { - stdin = io::stdin(); - &mut stdin - } + let source: &mut dyn Read = if let Some(file_name) = env::args().nth(1) { + file = File::open(file_name).map_err(|e| format!("Cannot open input file: {e}"))?; + &mut file + } else { + stdin = io::stdin(); + &mut stdin }; let reader = ParserConfig::new() @@ -51,49 +37,49 @@ fn main() { let mut max_depth = 0; for e in reader { + let e = e.map_err(|e| format!("Error parsing XML document: {e}"))?; match e { - Ok(e) => match e { - XmlEvent::StartDocument { version, encoding, standalone } => - println!( - "XML document version {}, encoded in {}, {}standalone", - version, encoding, if standalone.unwrap_or(false) { "" } else { "not " } - ), - XmlEvent::EndDocument => println!("Document finished"), - XmlEvent::ProcessingInstruction { .. } => processing_instructions += 1, - XmlEvent::Whitespace(_) => {} // can't happen due to configuration - XmlEvent::Characters(s) => { - character_blocks += 1; - characters += s.len(); - } - XmlEvent::CData(s) => { - cdata_blocks += 1; - characters += s.len(); - } - XmlEvent::Comment(s) => { - comment_blocks += 1; - comment_characters += s.len(); - } - XmlEvent::StartElement { namespace, .. } => { - depth += 1; - max_depth = cmp::max(max_depth, depth); - elements += 1; - namespaces.extend(namespace.0.into_iter().map(|(_, ns_uri)| ns_uri)); - } - XmlEvent::EndElement { .. } => { - depth -= 1; - } - }, - Err(e) => abort!(1, "Error parsing XML document: {}", e) - } + XmlEvent::StartDocument { version, encoding, standalone } => + println!( + "XML document version {}, encoded in {}, {}standalone", + version, encoding, if standalone.unwrap_or(false) { "" } else { "not " } + ), + XmlEvent::EndDocument => println!("Document finished"), + XmlEvent::ProcessingInstruction { .. } => processing_instructions += 1, + XmlEvent::Whitespace(_) => {} // can't happen due to configuration + XmlEvent::Characters(s) => { + character_blocks += 1; + characters += s.len(); + } + XmlEvent::CData(s) => { + cdata_blocks += 1; + characters += s.len(); + } + XmlEvent::Comment(s) => { + comment_blocks += 1; + comment_characters += s.len(); + } + XmlEvent::StartElement { namespace, .. } => { + depth += 1; + max_depth = cmp::max(max_depth, depth); + elements += 1; + namespaces.extend(namespace.0.into_values()); + } + XmlEvent::EndElement { .. } => { + depth -= 1; + } + }; } + namespaces.remove(xml::namespace::NS_EMPTY_URI); namespaces.remove(xml::namespace::NS_XMLNS_URI); namespaces.remove(xml::namespace::NS_XML_URI); - println!("Elements: {}, maximum depth: {}", elements, max_depth); + println!("Elements: {elements}, maximum depth: {max_depth}"); println!("Namespaces (excluding built-in): {}", namespaces.len()); - println!("Characters: {}, characters blocks: {}, CDATA blocks: {}", - characters, character_blocks, cdata_blocks); - println!("Comment blocks: {}, comment characters: {}", comment_blocks, comment_characters); - println!("Processing instructions (excluding built-in): {}", processing_instructions); + println!("Characters: {characters}, characters blocks: {character_blocks}, CDATA blocks: {cdata_blocks}"); + println!("Comment blocks: {comment_blocks}, comment characters: {comment_characters}"); + println!("Processing instructions (excluding built-in): {processing_instructions}"); + + Ok(()) } diff --git a/src/attribute.rs b/src/attribute.rs index 8728f49..112bf24 100644 --- a/src/attribute.rs +++ b/src/attribute.rs @@ -3,8 +3,8 @@ use std::fmt; -use name::{Name, OwnedName}; -use escape::escape_str_attribute; +use crate::escape::{Escaped, AttributeEscapes}; +use crate::name::{Name, OwnedName}; /// A borrowed version of an XML attribute. /// @@ -15,18 +15,19 @@ pub struct Attribute<'a> { pub name: Name<'a>, /// Attribute value. - pub value: &'a str + pub value: &'a str, } impl<'a> fmt::Display for Attribute<'a> { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}=\"{}\"", self.name, escape_str_attribute(self.value)) + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}=\"{}\"", self.name, Escaped::::new(self.value)) } } impl<'a> Attribute<'a> { /// Creates an owned attribute out of this borrowed one. #[inline] + #[must_use] pub fn to_owned(&self) -> OwnedAttribute { OwnedAttribute { name: self.name.into(), @@ -36,8 +37,9 @@ impl<'a> Attribute<'a> { /// Creates a borrowed attribute using the provided borrowed name and a borrowed string value. #[inline] + #[must_use] pub fn new(name: Name<'a>, value: &'a str) -> Attribute<'a> { - Attribute { name, value, } + Attribute { name, value } } } @@ -50,15 +52,17 @@ pub struct OwnedAttribute { pub name: OwnedName, /// Attribute value. - pub value: String + pub value: String, } impl OwnedAttribute { /// Returns a borrowed `Attribute` out of this owned one. - pub fn borrow(&self) -> Attribute { + #[must_use] + #[inline] + pub fn borrow(&self) -> Attribute<'_> { Attribute { name: self.name.borrow(), - value: &*self.value, + value: &self.value, } } @@ -73,27 +77,27 @@ impl OwnedAttribute { } impl fmt::Display for OwnedAttribute { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}=\"{}\"", self.name, escape_str_attribute(&*self.value)) + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}=\"{}\"", self.name, Escaped::::new(&self.value)) } } #[cfg(test)] mod tests { - use super::{Attribute}; + use super::Attribute; - use name::Name; + use crate::name::Name; #[test] fn attribute_display() { let attr = Attribute::new( Name::qualified("attribute", "urn:namespace", Some("n")), - "its value with > & \" ' < weird symbols" + "its value with > & \" ' < weird symbols", ); assert_eq!( &*attr.to_string(), "{urn:namespace}n:attribute=\"its value with > & " ' < weird symbols\"" - ) + ); } } diff --git a/src/common.rs b/src/common.rs index 029e851..a1bf3ac 100644 --- a/src/common.rs +++ b/src/common.rs @@ -14,6 +14,7 @@ pub struct TextPosition { impl TextPosition { /// Creates a new position initialized to the beginning of the document #[inline] + #[must_use] pub fn new() -> TextPosition { TextPosition { row: 0, column: 0 } } @@ -21,14 +22,14 @@ impl TextPosition { /// Advances the position in a line #[inline] pub fn advance(&mut self, count: u8) { - self.column += count as u64; + self.column += u64::from(count); } /// Advances the position in a line to the next tab position #[inline] pub fn advance_to_tab(&mut self, width: u8) { - let width = width as u64; - self.column += width - self.column % width + let width = u64::from(width); + self.column += width - self.column % width; } /// Advances the position to the beginning of the next line @@ -40,15 +41,15 @@ impl TextPosition { } impl fmt::Debug for TextPosition { - #[inline] - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + #[cold] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}:{}", self.row + 1, self.column + 1) } } impl fmt::Display for TextPosition { #[inline] - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}:{}", self.row + 1, self.column + 1) } } @@ -69,26 +70,27 @@ impl Position for TextPosition { } /// XML version enumeration. -#[derive(Copy, Clone, PartialEq, Eq)] +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] pub enum XmlVersion { /// XML version 1.0. Version10, /// XML version 1.1. - Version11 + Version11, } impl fmt::Display for XmlVersion { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { - XmlVersion::Version10 => write!(f, "1.0"), - XmlVersion::Version11 => write!(f, "1.1") - } + XmlVersion::Version10 => "1.0", + XmlVersion::Version11 => "1.1", + }.fmt(f) } } impl fmt::Debug for XmlVersion { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + #[cold] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Display::fmt(self, f) } } @@ -97,33 +99,45 @@ impl fmt::Debug for XmlVersion { /// as is defined by XML 1.1 specification, [section 2.3][1]. /// /// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn +#[must_use] +#[inline] pub fn is_whitespace_char(c: char) -> bool { - match c { - '\x20' | '\x09' | '\x0d' | '\x0a' => true, - _ => false - } + matches!(c, '\x20' | '\x0a' | '\x09' | '\x0d') } /// Checks whether the given string is compound only by white space -/// characters (`S`) using the previous is_whitespace_char to check +/// characters (`S`) using the previous `is_whitespace_char` to check /// all characters of this string pub fn is_whitespace_str(s: &str) -> bool { s.chars().all(is_whitespace_char) } +pub fn is_xml10_char(c: char) -> bool { + matches!(c, '\u{09}' | '\u{0A}' | '\u{0D}' | '\u{20}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..) +} + +pub fn is_xml11_char(c: char) -> bool { + matches!(c, '\u{01}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..) +} + +pub fn is_xml11_char_not_restricted(c: char) -> bool { + is_xml11_char(c) && !matches!(c, '\u{01}'..='\u{08}' | '\u{0B}'..='\u{0C}' | '\u{0E}'..='\u{1F}' | '\u{7F}'..='\u{84}' | '\u{86}'..='\u{9F}') +} + /// Checks whether the given character is a name start character (`NameStartChar`) /// as is defined by XML 1.1 specification, [section 2.3][1]. /// /// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn +#[must_use] pub fn is_name_start_char(c: char) -> bool { match c { - ':' | 'A'...'Z' | '_' | 'a'...'z' | - '\u{C0}'...'\u{D6}' | '\u{D8}'...'\u{F6}' | '\u{F8}'...'\u{2FF}' | - '\u{370}'...'\u{37D}' | '\u{37F}'...'\u{1FFF}' | - '\u{200C}'...'\u{200D}' | '\u{2070}'...'\u{218F}' | - '\u{2C00}'...'\u{2FEF}' | '\u{3001}'...'\u{D7FF}' | - '\u{F900}'...'\u{FDCF}' | '\u{FDF0}'...'\u{FFFD}' | - '\u{10000}'...'\u{EFFFF}' => true, + ':' | 'A'..='Z' | '_' | 'a'..='z' | + '\u{C0}'..='\u{D6}' | '\u{D8}'..='\u{F6}' | '\u{F8}'..='\u{2FF}' | + '\u{370}'..='\u{37D}' | '\u{37F}'..='\u{1FFF}' | + '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' | + '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' | + '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' | + '\u{10000}'..='\u{EFFFF}' => true, _ => false } } @@ -132,11 +146,12 @@ pub fn is_name_start_char(c: char) -> bool { /// as is defined by XML 1.1 specification, [section 2.3][1]. /// /// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn +#[must_use] pub fn is_name_char(c: char) -> bool { match c { _ if is_name_start_char(c) => true, - '-' | '.' | '0'...'9' | '\u{B7}' | - '\u{300}'...'\u{36F}' | '\u{203F}'...'\u{2040}' => true, + '-' | '.' | '0'..='9' | '\u{B7}' | + '\u{300}'..='\u{36F}' | '\u{203F}'..='\u{2040}' => true, _ => false } } diff --git a/src/escape.rs b/src/escape.rs index 18298b9..1fcfd06 100644 --- a/src/escape.rs +++ b/src/escape.rs @@ -1,81 +1,102 @@ //! Contains functions for performing XML special characters escaping. -use std::borrow::Cow; +use std::{borrow::Cow, marker::PhantomData, fmt::{Display, Result, Formatter}}; -enum Value { - Char(char), - Str(&'static str) -} +pub(crate) trait Escapes { + fn escape(c: u8) -> Option<&'static str>; -impl Value { - fn dispatch_for_attribute(c: char) -> Value { - match c { - '<' => Value::Str("<"), - '>' => Value::Str(">"), - '"' => Value::Str("""), - '\'' => Value::Str("'"), - '&' => Value::Str("&"), - '\n' => Value::Str(" "), - '\r' => Value::Str(" "), - _ => Value::Char(c) - } + fn byte_needs_escaping(c: u8) -> bool{ + Self::escape(c).is_some() } - fn dispatch_for_pcdata(c: char) -> Value { - match c { - '<' => Value::Str("<"), - '&' => Value::Str("&"), - _ => Value::Char(c) - } + fn str_needs_escaping(s: &str) -> bool{ + s.bytes().any(|c| Self::escape(c).is_some()) } } -enum Process<'a> { - Borrowed(&'a str), - Owned(String) +pub(crate) struct Escaped<'a, E: Escapes> { + _escape_phantom: PhantomData, + to_escape: &'a str, } -impl<'a> Process<'a> { - fn process(&mut self, (i, next): (usize, Value)) { - match next { - Value::Str(s) => match *self { - Process::Owned(ref mut o) => o.push_str(s), - Process::Borrowed(b) => { - let mut r = String::with_capacity(b.len() + s.len()); - r.push_str(&b[..i]); - r.push_str(s); - *self = Process::Owned(r); - } - }, - Value::Char(c) => match *self { - Process::Borrowed(_) => {} - Process::Owned(ref mut o) => o.push(c) - } +impl<'a, E: Escapes> Escaped<'a, E> { + pub fn new(s: &'a str) -> Self { + Escaped { + _escape_phantom: PhantomData, + to_escape: s, } } +} - fn into_result(self) -> Cow<'a, str> { - match self { - Process::Borrowed(b) => Cow::Borrowed(b), - Process::Owned(o) => Cow::Owned(o) + +impl<'a, E: Escapes> Display for Escaped<'a, E> { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + let mut total_remaining = self.to_escape; + + // find the next occurence + while let Some(n) = total_remaining + .bytes() + .position(E::byte_needs_escaping) + { + let (start, remaining) = total_remaining.split_at(n); + + f.write_str(start)?; + + // unwrap is safe because we checked is_some for position n earlier + let next_byte = remaining.bytes().next().unwrap(); + let replacement = E::escape(next_byte).unwrap(); + f.write_str(replacement)?; + + total_remaining = &remaining[1..]; } + + f.write_str(total_remaining) } } -impl<'a> Extend<(usize, Value)> for Process<'a> { - fn extend>(&mut self, it: I) { - for v in it.into_iter() { - self.process(v); - } +fn escape_str(s: &str) -> Cow<'_, str> { + if E::str_needs_escaping(s) { + Cow::Owned(format!("{}", Escaped::::new(s))) + } else { + Cow::Borrowed(s) } } -fn escape_str(s: &str, dispatch: fn(char) -> Value) -> Cow { - let mut p = Process::Borrowed(s); - p.extend(s.char_indices().map(|(ind, c)| (ind, dispatch(c)))); - p.into_result() +macro_rules! escapes { + { + $name: ident, + $($k: expr => $v: expr),* $(,)? + } => { + pub(crate) struct $name; + + impl Escapes for $name { + fn escape(c: u8) -> Option<&'static str> { + match c { + $( $k => Some($v),)* + _ => None + } + } + } + }; } +escapes!( + AttributeEscapes, + b'<' => "<", + b'>' => ">", + b'"' => """, + b'\'' => "'", + b'&' => "&", + b'\n' => " ", + b'\r' => " ", +); + +escapes!( + PcDataEscapes, + b'<' => "<", + b'&' => "&", +); + /// Performs escaping of common XML characters inside an attribute value. /// /// This function replaces several important markup characters with their @@ -86,13 +107,18 @@ fn escape_str(s: &str, dispatch: fn(char) -> Value) -> Cow { /// * `"` → `"` /// * `'` → `'` /// * `&` → `&` +/// +/// The following characters are escaped so that attributes are printed on +/// a single line: +/// * `\n` → ` ` +/// * `\r` → ` ` /// /// The resulting string is safe to use inside XML attribute values or in PCDATA sections. /// /// Does not perform allocations if the given string does not contain escapable characters. #[inline] -pub fn escape_str_attribute(s: &str) -> Cow { - escape_str(s, Value::dispatch_for_attribute) +#[must_use] pub fn escape_str_attribute(s: &str) -> Cow<'_, str> { + escape_str::(s) } /// Performs escaping of common XML characters inside PCDATA. @@ -107,15 +133,25 @@ pub fn escape_str_attribute(s: &str) -> Cow { /// /// Does not perform allocations if the given string does not contain escapable characters. #[inline] -pub fn escape_str_pcdata(s: &str) -> Cow { - escape_str(s, Value::dispatch_for_pcdata) +#[must_use] pub fn escape_str_pcdata(s: &str) -> Cow<'_, str> { + escape_str::(s) } #[cfg(test)] mod tests { - use super::{escape_str_pcdata, escape_str_attribute}; + use super::{escape_str_attribute, escape_str_pcdata}; - // TODO: add more tests + #[test] + fn test_escape_str_attribute() { + assert_eq!(escape_str_attribute("<>'\"&\n\r"), "<>'"& "); + assert_eq!(escape_str_attribute("no_escapes"), "no_escapes"); + } + + #[test] + fn test_escape_str_pcdata() { + assert_eq!(escape_str_pcdata("<&"), "<&"); + assert_eq!(escape_str_pcdata("no_escapes"), "no_escapes"); + } #[test] fn test_escape_multibyte_code_points() { @@ -123,4 +159,3 @@ mod tests { assert_eq!(escape_str_pcdata("☃<"), "☃<"); } } - diff --git a/src/lib.rs b/src/lib.rs index fb672ef..b1486d8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,29 +1,30 @@ //#![warn(missing_doc)] -#![allow(dead_code)] -#![allow(unused_variables)] #![forbid(non_camel_case_types)] #![forbid(unsafe_code)] +#![allow(clippy::redundant_closure_for_method_calls)] +#![allow(clippy::module_name_repetitions)] //! This crate currently provides an almost XML 1.0/1.1-compliant pull parser. +//! +//! Please note that functions of this parser may panic. +//! If a panic could cause a Denial Of Service in your codebase, *you're* responsible for wrapping access to this library in `catch_unwind`. #[cfg(doctest)] -#[macro_use] -extern crate doc_comment; +doc_comment::doctest!("../README.md"); -#[cfg(doctest)] -doctest!("../Readme.md"); - -pub use reader::EventReader; -pub use reader::ParserConfig; -pub use writer::EventWriter; -pub use writer::EmitterConfig; +pub use crate::reader::EventReader; +pub use crate::reader::ParserConfig; +pub use crate::util::Encoding; +pub use crate::writer::EmitterConfig; +pub use crate::writer::EventWriter; -pub mod macros; -pub mod name; pub mod attribute; pub mod common; pub mod escape; +#[doc(hidden)] // FIXME: not supposed to be public +pub mod macros; +pub mod name; pub mod namespace; pub mod reader; -pub mod writer; mod util; +pub mod writer; diff --git a/src/macros.rs b/src/macros.rs index 1cce3d6..25916d3 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -5,7 +5,8 @@ macro_rules! gen_setter { ($target:ty, $field:ident : into $t:ty) => { impl $target { - /// Sets the field to the provided value and returns updated config object. + /// See [`ParserConfig`][crate::ParserConfig] fields docs for details + #[inline] pub fn $field>(mut self, value: T) -> $target { self.$field = value.into(); self @@ -14,13 +15,38 @@ macro_rules! gen_setter { }; ($target:ty, $field:ident : val $t:ty) => { impl $target { - /// Sets the field to the provided value and returns updated config object. + /// See [`ParserConfig`][crate::ParserConfig] fields docs for details + #[inline] pub fn $field(mut self, value: $t) -> $target { self.$field = value; self } } - } + }; + ($target:ty, $field:ident : delegate $t:ty) => { + impl $target { + /// See [`ParserConfig`][crate::ParserConfig] fields docs for details + #[inline] + pub fn $field(mut self, value: $t) -> $target { + self.c.$field = value; + self + } + } + }; + ($target:ty, $field:ident : c2 $t:ty) => { + impl $target { + /// See [`ParserConfig2`][crate::reader::ParserConfig] fields docs for details + #[inline] + #[must_use] + pub fn $field(self, value: $t) -> ParserConfig2 { + ParserConfig2 { + c: self, + ..Default::default() + } + .$field(value) + } + } + }; } macro_rules! gen_setters { diff --git a/src/name.rs b/src/name.rs index a20eae2..fc11981 100644 --- a/src/name.rs +++ b/src/name.rs @@ -4,7 +4,7 @@ use std::fmt; use std::str::FromStr; -use namespace::NS_NO_PREFIX; +use crate::namespace::NS_NO_PREFIX; /// Represents a qualified XML name. /// @@ -53,16 +53,16 @@ pub struct Name<'a> { pub namespace: Option<&'a str>, /// A name prefix, e.g. `xsi` in `xsi:string`. - pub prefix: Option<&'a str> + pub prefix: Option<&'a str>, } impl<'a> From<&'a str> for Name<'a> { fn from(s: &'a str) -> Name<'a> { - let mut parts = s.splitn(2, ":").fuse(); + let mut parts = s.splitn(2, ':').fuse(); match (parts.next(), parts.next()) { (Some(name), None) => Name::local(name), (Some(prefix), Some(name)) => Name::prefixed(name, prefix), - _ => unreachable!() + _ => unreachable!(), } } } @@ -74,52 +74,56 @@ impl<'a> From<(&'a str, &'a str)> for Name<'a> { } impl<'a> fmt::Display for Name<'a> { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if let Some(namespace) = self.namespace { - write!(f, "{{{}}}", namespace)?; + write!(f, "{{{namespace}}}")?; } if let Some(prefix) = self.prefix { - write!(f, "{}:", prefix)?; + write!(f, "{prefix}:")?; } - write!(f, "{}", self.local_name) + f.write_str(self.local_name) } } impl<'a> Name<'a> { /// Returns an owned variant of the qualified name. + #[must_use] pub fn to_owned(&self) -> OwnedName { OwnedName { local_name: self.local_name.into(), - namespace: self.namespace.map(|s| s.into()), - prefix: self.prefix.map(|s| s.into()) + namespace: self.namespace.map(std::convert::Into::into), + prefix: self.prefix.map(std::convert::Into::into), } } /// Returns a new `Name` instance representing plain local name. #[inline] - pub fn local(local_name: &str) -> Name { + #[must_use] + pub fn local(local_name: &str) -> Name<'_> { Name { local_name, prefix: None, - namespace: None + namespace: None, } } /// Returns a new `Name` instance with the given local name and prefix. #[inline] + #[must_use] pub fn prefixed(local_name: &'a str, prefix: &'a str) -> Name<'a> { Name { local_name, namespace: None, - prefix: Some(prefix) + prefix: Some(prefix), } } /// Returns a new `Name` instance representing a qualified name with or without a prefix and /// with a namespace URI. #[inline] + #[must_use] pub fn qualified(local_name: &'a str, namespace: &'a str, prefix: Option<&'a str>) -> Name<'a> { Name { local_name, @@ -132,6 +136,7 @@ impl<'a> Name<'a> { /// /// This method is different from the autoimplemented `to_string()` because it does not /// include namespace URI in the result. + #[must_use] pub fn to_repr(&self) -> String { self.repr_display().to_string() } @@ -142,12 +147,14 @@ impl<'a> Name<'a> { /// This method is needed for efficiency purposes in order not to create unnecessary /// allocations. #[inline] - pub fn repr_display(&self) -> ReprDisplay { + #[must_use] + pub fn repr_display(&self) -> ReprDisplay<'_, '_> { ReprDisplay(self) } /// Returns either a prefix of this name or `namespace::NS_NO_PREFIX` constant. #[inline] + #[must_use] pub fn prefix_repr(&self) -> &str { self.prefix.unwrap_or(NS_NO_PREFIX) } @@ -155,13 +162,13 @@ impl<'a> Name<'a> { /// A wrapper around `Name` whose `Display` implementation prints the wrapped name as it is /// displayed in an XML document. -pub struct ReprDisplay<'a, 'b:'a>(&'a Name<'b>); +pub struct ReprDisplay<'a, 'b>(&'a Name<'b>); -impl<'a, 'b:'a> fmt::Display for ReprDisplay<'a, 'b> { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { +impl<'a, 'b: 'a> fmt::Display for ReprDisplay<'a, 'b> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self.0.prefix { Some(prefix) => write!(f, "{}:{}", prefix, self.0.local_name), - None => write!(f, "{}", self.0.local_name) + None => self.0.local_name.fmt(f), } } } @@ -183,18 +190,20 @@ pub struct OwnedName { impl fmt::Display for OwnedName { #[inline] - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Display::fmt(&self.borrow(), f) } } impl OwnedName { /// Constructs a borrowed `Name` based on this owned name. - pub fn borrow(&self) -> Name { + #[must_use] + #[inline] + pub fn borrow(&self) -> Name<'_> { Name { - local_name: &*self.local_name, - namespace: self.namespace.as_ref().map(|s| &**s), - prefix: self.prefix.as_ref().map(|s| &**s), + local_name: &self.local_name, + namespace: self.namespace.as_deref(), + prefix: self.prefix.as_deref(), } } @@ -217,22 +226,24 @@ impl OwnedName { OwnedName { local_name: local_name.into(), namespace: Some(namespace.into()), - prefix: prefix.map(|v| v.into()) + prefix: prefix.map(std::convert::Into::into), } } /// Returns an optional prefix by reference, equivalent to `self.borrow().prefix` /// but avoids extra work. #[inline] + #[must_use] pub fn prefix_ref(&self) -> Option<&str> { - self.prefix.as_ref().map(|s| &**s) + self.prefix.as_deref() } /// Returns an optional namespace by reference, equivalen to `self.borrow().namespace` /// but avoids extra work. #[inline] + #[must_use] pub fn namespace_ref(&self) -> Option<&str> { - self.namespace.as_ref().map(|s| &**s) + self.namespace.as_deref() } } diff --git a/src/namespace.rs b/src/namespace.rs index 1ab4a5c..216a982 100644 --- a/src/namespace.rs +++ b/src/namespace.rs @@ -1,9 +1,9 @@ //! Contains namespace manipulation types and functions. -use std::iter::{Map, Rev}; -use std::collections::btree_map::{BTreeMap, Entry}; use std::collections::btree_map::Iter as Entries; +use std::collections::btree_map::{BTreeMap, Entry}; use std::collections::HashSet; +use std::iter::{Map, Rev}; use std::slice::Iter; /// Designates prefix for namespace definitions. @@ -11,14 +11,14 @@ use std::slice::Iter; /// See [Namespaces in XML][namespace] spec for more information. /// /// [namespace]: http://www.w3.org/TR/xml-names/#ns-decl -pub const NS_XMLNS_PREFIX: &'static str = "xmlns"; +pub const NS_XMLNS_PREFIX: &str = "xmlns"; /// Designates the standard URI for `xmlns` prefix. /// -/// See [A Namespace Name for xmlns Attributes][1] for more information. +/// See [A Namespace Name for xmlns Attributes][namespace] for more information. /// /// [namespace]: http://www.w3.org/2000/xmlns/ -pub const NS_XMLNS_URI: &'static str = "http://www.w3.org/2000/xmlns/"; +pub const NS_XMLNS_URI: &str = "http://www.w3.org/2000/xmlns/"; /// Designates prefix for a namespace containing several special predefined attributes. /// @@ -29,12 +29,12 @@ pub const NS_XMLNS_URI: &'static str = "http://www.w3.org/2000/xmlns/"; /// [2]: http://www.w3.org/TR/REC-xml/#sec-lang-tag /// [3]: http://www.w3.org/TR/xmlbase/ /// [4]: http://www.w3.org/TR/xml-id/ -pub const NS_XML_PREFIX: &'static str = "xml"; +pub const NS_XML_PREFIX: &str = "xml"; /// Designates the standard URI for `xml` prefix. /// /// See `NS_XML_PREFIX` documentation for more information. -pub const NS_XML_URI: &'static str = "http://www.w3.org/XML/1998/namespace"; +pub const NS_XML_URI: &str = "http://www.w3.org/XML/1998/namespace"; /// Designates the absence of prefix in a qualified name. /// @@ -52,7 +52,7 @@ pub const NS_XML_URI: &'static str = "http://www.w3.org/XML/1998/namespace" /// By default empty prefix corresponds to absence of namespace, but this can change either /// when writing an XML document (manually) or when reading an XML document (based on namespace /// declarations). -pub const NS_NO_PREFIX: &'static str = ""; +pub const NS_NO_PREFIX: &str = ""; /// Designates an empty namespace URI, which is equivalent to absence of namespace. /// @@ -60,7 +60,7 @@ pub const NS_NO_PREFIX: &'static str = ""; /// empty prefix corresponds to absent namespace in `NamespaceStack` instances created with /// `NamespaceStack::default()`. Therefore, it can be used to restore `NS_NO_PREFIX` mapping /// in a namespace back to its default value. -pub const NS_EMPTY_URI: &'static str = ""; +pub const NS_EMPTY_URI: &str = ""; /// Namespace is a map from prefixes to namespace URIs. /// @@ -71,16 +71,21 @@ pub struct Namespace(pub BTreeMap); impl Namespace { /// Returns an empty namespace. #[inline] - pub fn empty() -> Namespace { Namespace(BTreeMap::new()) } + #[must_use] + pub fn empty() -> Namespace { + Namespace(BTreeMap::new()) + } /// Checks whether this namespace is empty. #[inline] + #[must_use] pub fn is_empty(&self) -> bool { self.0.is_empty() } /// Checks whether this namespace is essentially empty, that is, it does not contain /// anything but default mappings. + #[must_use] pub fn is_essentially_empty(&self) -> bool { // a shortcut for a namespace which is definitely not empty if self.0.len() > 3 { return false; } @@ -101,7 +106,7 @@ impl Namespace { /// # Return value /// `true` if this namespace contains the given prefix, `false` otherwise. #[inline] - pub fn contains>(&self, prefix: &P) -> bool { + pub fn contains>(&self, prefix: &P) -> bool { self.0.contains_key(prefix.as_ref()) } @@ -157,7 +162,7 @@ impl Namespace { /// /// # Return value /// Namespace URI corresponding to the given prefix, if it is present. - pub fn get<'a, P: ?Sized+AsRef>(&'a self, prefix: &P) -> Option<&'a str> { + pub fn get<'a, P: ?Sized + AsRef>(&'a self, prefix: &P) -> Option<&'a str> { self.0.get(prefix.as_ref()).map(|s| &**s) } } @@ -174,7 +179,7 @@ impl<'a> IntoIterator for &'a Namespace { fn into_iter(self) -> Self::IntoIter { fn mapper<'a>((prefix, uri): (&'a String, &'a String)) -> UriMapping<'a> { - (&*prefix, &*uri) + (prefix, uri) } self.0.iter().map(mapper) } @@ -190,7 +195,10 @@ pub struct NamespaceStack(pub Vec); impl NamespaceStack { /// Returns an empty namespace stack. #[inline] - pub fn empty() -> NamespaceStack { NamespaceStack(Vec::with_capacity(2)) } + #[must_use] + pub fn empty() -> NamespaceStack { + NamespaceStack(Vec::with_capacity(2)) + } /// Returns a namespace stack with default items in it. /// @@ -199,6 +207,7 @@ impl NamespaceStack { /// * `xml` → `http://www.w3.org/XML/1998/namespace`; /// * `xmlns` → `http://www.w3.org/2000/xmlns/`. #[inline] + #[must_use] pub fn default() -> NamespaceStack { let mut nst = NamespaceStack::empty(); nst.push_empty(); @@ -246,6 +255,7 @@ impl NamespaceStack { /// /// Panics if the stack is empty. #[inline] + #[must_use] pub fn peek(&self) -> &Namespace { self.0.last().unwrap() } @@ -294,7 +304,11 @@ impl NamespaceStack { pub fn put(&mut self, prefix: P, uri: U) -> bool where P: Into, U: Into { - self.0.last_mut().unwrap().put(prefix, uri) + if let Some(ns) = self.0.last_mut() { + ns.put(prefix, uri) + } else { + false + } } /// Performs a search for the given prefix in the whole stack. @@ -306,7 +320,7 @@ impl NamespaceStack { /// # Parameters /// * `prefix` --- namespace prefix. #[inline] - pub fn get<'a, P: ?Sized+AsRef>(&'a self, prefix: &P) -> Option<&'a str> { + pub fn get<'a, P: ?Sized + AsRef>(&'a self, prefix: &P) -> Option<&'a str> { let prefix = prefix.as_ref(); for ns in self.0.iter().rev() { match ns.get(prefix) { @@ -321,9 +335,10 @@ impl NamespaceStack { /// /// Namespaces are combined in left-to-right order, that is, rightmost namespace /// elements take priority over leftmost ones. + #[must_use] pub fn squash(&self) -> Namespace { let mut result = BTreeMap::new(); - for ns in self.0.iter() { + for ns in &self.0 { result.extend(ns.0.iter().map(|(k, v)| (k.clone(), v.clone()))); } Namespace(result) @@ -333,13 +348,14 @@ impl NamespaceStack { /// /// See `CheckedTarget` for more information. #[inline] - pub fn checked_target(&mut self) -> CheckedTarget { + pub fn checked_target(&mut self) -> CheckedTarget<'_> { CheckedTarget(self) } /// Returns an iterator over all mappings in this namespace stack. #[inline] - pub fn iter(&self) -> NamespaceStackMappings { + #[must_use] + pub fn iter(&self) -> NamespaceStackMappings<'_> { self.into_iter() } } @@ -361,7 +377,7 @@ impl NamespaceStack { pub struct NamespaceStackMappings<'a> { namespaces: Rev>, current_namespace: Option>, - used_keys: HashSet<&'a str> + used_keys: HashSet<&'a str>, } impl<'a> NamespaceStackMappings<'a> { @@ -379,7 +395,7 @@ impl<'a> Iterator for NamespaceStackMappings<'a> { if self.current_namespace.is_none() && !self.go_to_next_namespace() { return None; } - let next_item = self.current_namespace.as_mut().unwrap().next(); + let next_item = self.current_namespace.as_mut()?.next(); match next_item { // There is an element in the current namespace @@ -412,7 +428,7 @@ impl<'a> IntoIterator for &'a NamespaceStack { NamespaceStackMappings { namespaces: self.0.iter().rev(), current_namespace: None, - used_keys: HashSet::new() + used_keys: HashSet::new(), } } } diff --git a/src/reader.rs b/src/reader.rs new file mode 100644 index 0000000..71ea79b --- /dev/null +++ b/src/reader.rs @@ -0,0 +1,156 @@ +//! Contains high-level interface for a pull-based XML parser. +//! +//! The most important type in this module is `EventReader`, which provides an iterator +//! view for events in XML document. + +use std::io::Read; +use std::iter::FusedIterator; +use std::result; + +use crate::common::{Position, TextPosition}; + +pub use self::config::ParserConfig; +pub use self::config::ParserConfig2; + +pub use self::events::XmlEvent; +use self::parser::PullParser; + +mod config; +mod events; +mod lexer; +mod parser; + +mod error; +pub use self::error::{Error, ErrorKind}; + +/// A result type yielded by `XmlReader`. +pub type Result = result::Result; + +/// A wrapper around an `std::io::Read` instance which provides pull-based XML parsing. +pub struct EventReader { + source: R, + parser: PullParser, +} + +impl EventReader { + /// Creates a new reader, consuming the given stream. + #[inline] + pub fn new(source: R) -> EventReader { + EventReader::new_with_config(source, ParserConfig2::new()) + } + + /// Creates a new reader with the provded configuration, consuming the given stream. + #[inline] + pub fn new_with_config(source: R, config: impl Into) -> EventReader { + EventReader { source, parser: PullParser::new(config) } + } + + /// Pulls and returns next XML event from the stream. + /// + /// If returned event is `XmlEvent::Error` or `XmlEvent::EndDocument`, then + /// further calls to this method will return this event again. + #[inline] + pub fn next(&mut self) -> Result { + self.parser.next(&mut self.source) + } + + /// Skips all XML events until the next end tag at the current level. + /// + /// Convenience function that is useful for the case where you have + /// encountered a start tag that is of no interest and want to + /// skip the entire XML subtree until the corresponding end tag. + #[inline] + pub fn skip(&mut self) -> Result<()> { + let mut depth = 1; + + while depth > 0 { + match self.next()? { + XmlEvent::StartElement { .. } => depth += 1, + XmlEvent::EndElement { .. } => depth -= 1, + XmlEvent::EndDocument => unreachable!(), + _ => {} + } + } + + Ok(()) + } + + pub fn source(&self) -> &R { &self.source } + pub fn source_mut(&mut self) -> &mut R { &mut self.source } + + /// Unwraps this `EventReader`, returning the underlying reader. + /// + /// Note that this operation is destructive; unwrapping the reader and wrapping it + /// again with `EventReader::new()` will create a fresh reader which will attempt + /// to parse an XML document from the beginning. + pub fn into_inner(self) -> R { + self.source + } +} + +impl Position for EventReader { + /// Returns the position of the last event produced by the reader. + #[inline] + fn position(&self) -> TextPosition { + self.parser.position() + } +} + +impl IntoIterator for EventReader { + type Item = Result; + type IntoIter = Events; + + fn into_iter(self) -> Events { + Events { reader: self, finished: false } + } +} + +/// An iterator over XML events created from some type implementing `Read`. +/// +/// When the next event is `xml::event::Error` or `xml::event::EndDocument`, then +/// it will be returned by the iterator once, and then it will stop producing events. +pub struct Events { + reader: EventReader, + finished: bool, +} + +impl Events { + /// Unwraps the iterator, returning the internal `EventReader`. + #[inline] + pub fn into_inner(self) -> EventReader { + self.reader + } + + pub fn source(&self) -> &R { &self.reader.source } + pub fn source_mut(&mut self) -> &mut R { &mut self.reader.source } + +} + +impl FusedIterator for Events { +} + +impl Iterator for Events { + type Item = Result; + + #[inline] + fn next(&mut self) -> Option> { + if self.finished && !self.reader.parser.is_ignoring_end_of_stream() { + None + } else { + let ev = self.reader.next(); + if let Ok(XmlEvent::EndDocument) | Err(_) = ev { + self.finished = true; + } + Some(ev) + } + } +} + +impl<'r> EventReader<&'r [u8]> { + /// A convenience method to create an `XmlReader` from a string slice. + #[inline] + #[must_use] + pub fn from_str(source: &'r str) -> EventReader<&'r [u8]> { + EventReader::new(source.as_bytes()) + } +} diff --git a/src/reader/config.rs b/src/reader/config.rs index 0abb165..3351997 100644 --- a/src/reader/config.rs +++ b/src/reader/config.rs @@ -1,8 +1,9 @@ //! Contains parser configuration structure. -use std::io::Read; use std::collections::HashMap; +use std::io::Read; -use reader::EventReader; +use crate::reader::EventReader; +use crate::util::Encoding; /// Parser configuration structure. /// @@ -103,6 +104,8 @@ impl ParserConfig { /// .ignore_comments(true) /// .coalesce_characters(false); /// ``` + #[must_use] + #[inline] pub fn new() -> ParserConfig { ParserConfig { trim_whitespace: false, @@ -179,3 +182,125 @@ gen_setters! { ParserConfig, replace_unknown_entity_references: val bool, ignore_root_level_whitespace: val bool } + +/// Backwards-compatible extension of `ParserConfig`, which will eventually be merged into the original `ParserConfig` struct +#[derive(Clone, PartialEq, Eq, Debug)] +#[non_exhaustive] +pub struct ParserConfig2 { + pub(crate) c: ParserConfig, + + /// Use this encoding as the default. Necessary for UTF-16 files without BOM. + pub override_encoding: Option, + + /// Allow `` to contain unsupported encoding names, + /// and interpret them as Latin1 instead. This will mangle non-ASCII characters, but usually it won't fail parsing. + pub ignore_invalid_encoding_declarations: bool, + + /// Documents with multiple root elements are ill-formed + pub allow_multiple_root_elements: bool, +} + +impl Default for ParserConfig2 { + fn default() -> Self { + ParserConfig2 { + c: Default::default(), + override_encoding: None, + ignore_invalid_encoding_declarations: false, + allow_multiple_root_elements: true, + } + } +} + +impl ParserConfig2 { + #[inline] + #[must_use] + pub fn new() -> Self { + Self::default() + } + + /// Read character encoding from `Content-Type` header. + /// Set this when parsing XML documents fetched over HTTP. + /// + /// `text/*` MIME types do *not* imply latin1. UTF-8 is always the default fallback. + #[must_use] pub fn content_type(mut self, mime_type: &str) -> Self { + let charset = mime_type.split_once(';') + .and_then(|(_, args)| args.split_once("charset")) + .and_then(|(_, args)| args.split_once('=')); + if let Some((_, charset)) = charset { + let name = charset.trim().trim_matches('"'); + match name.parse() { + Ok(enc) => { + self.override_encoding = Some(enc); + }, + Err(_) => {}, + } + } + self + } + + /// Creates an XML reader with this configuration. + /// + /// This is a convenience method for configuring and creating a reader at the same time: + /// + /// ```rust + /// use xml::reader::ParserConfig; + /// + /// let mut source: &[u8] = b"..."; + /// + /// let reader = ParserConfig::new() + /// .trim_whitespace(true) + /// .ignore_comments(true) + /// .coalesce_characters(false) + /// .create_reader(&mut source); + /// ``` + /// + /// This method is exactly equivalent to calling `EventReader::new_with_config()` with + /// this configuration object. + #[inline] + pub fn create_reader(self, source: R) -> EventReader { + EventReader::new_with_config(source, self) + } +} + +impl From for ParserConfig2 { + #[inline] + fn from(c: ParserConfig) -> Self { + Self { + c, + ..Default::default() + } + } +} + +gen_setters! { ParserConfig2, + override_encoding: val Option, + allow_multiple_root_elements: val bool, + ignore_invalid_encoding_declarations: val bool +} + +gen_setters! { ParserConfig, + override_encoding: c2 Option, + ignore_invalid_encoding_declarations: c2 bool, + allow_multiple_root_elements: c2 bool, + content_type: c2 &str +} + +gen_setters! { ParserConfig2, + trim_whitespace: delegate bool, + whitespace_to_characters: delegate bool, + cdata_to_characters: delegate bool, + ignore_comments: delegate bool, + coalesce_characters: delegate bool, + ignore_end_of_stream: delegate bool, + replace_unknown_entity_references: delegate bool, + ignore_root_level_whitespace: delegate bool +} + +#[test] +fn mime_parse() { + let c = ParserConfig2::new().content_type("text/xml;charset=Us-AScii"); + assert_eq!(c.override_encoding, Some(Encoding::Ascii)); + + let c = ParserConfig2::new().content_type("text/xml;charset = \"UTF-16\""); + assert_eq!(c.override_encoding, Some(Encoding::Utf16)); +} diff --git a/src/reader/error.rs b/src/reader/error.rs index 92378e6..8af35ae 100644 --- a/src/reader/error.rs +++ b/src/reader/error.rs @@ -1,12 +1,15 @@ +use crate::Encoding; +use crate::reader::lexer::Token; -use std::io; use std::borrow::Cow; -use std::fmt; use std::error; +use std::error::Error as _; +use std::fmt; +use std::io; use std::str; -use util; -use common::{Position, TextPosition}; +use crate::common::{Position, TextPosition}; +use crate::util; #[derive(Debug)] pub enum ErrorKind { @@ -16,18 +19,127 @@ pub enum ErrorKind { UnexpectedEof, } +#[derive(Debug, Clone, PartialEq)] +#[non_exhaustive] +pub(crate) enum SyntaxError { + CannotRedefineXmlnsPrefix, + CannotRedefineXmlPrefix, + /// Recursive custom entity expanded to too many chars, it could be DoS + EntityTooBig, + EmptyEntity, + NoRootElement, + ProcessingInstructionWithoutName, + UnbalancedRootElement, + UnexpectedEof, + UnexpectedOpeningTag, + /// Missing `]]>` + UnclosedCdata, + UnexpectedQualifiedName(Token), + UnexpectedTokenOutsideRoot(Token), + UnexpectedToken(Token), + UnexpectedTokenInEntity(Token), + UnexpectedTokenInClosingTag(Token), + UnexpectedTokenInOpeningTag(Token), + InvalidQualifiedName(Box), + UnboundAttribute(Box), + UnboundElementPrefix(Box), + UnexpectedClosingTag(Box), + UnexpectedName(Box), + /// Found , Token), + CannotUndefinePrefix(Box), + InvalidCharacterEntity(u32), + InvalidDefaultNamespace(Box), + InvalidNamePrefix(Box), + InvalidNumericEntity(Box), + InvalidStandaloneDeclaration(Box), + InvalidXmlProcessingInstruction(Box), + RedefinedAttribute(Box), + UndefinedEntity(Box), + UnexpectedEntity(Box), + UnexpectedNameInsideXml(Box), + UnsupportedEncoding(Box), + /// In DTD + UnknownMarkupDeclaration(Box), + UnexpectedXmlVersion(Box), + ConflictingEncoding(Encoding, Encoding), + UnexpectedTokenBefore(&'static str, char), +} + +impl fmt::Display for SyntaxError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.to_cow().fmt(f) + } +} + +impl SyntaxError { + #[inline(never)] + #[cold] + pub(crate) fn to_cow(&self) -> Cow<'static, str> { + match *self { + Self::CannotRedefineXmlnsPrefix => "Cannot redefine XMLNS prefix".into(), + Self::CannotRedefineXmlPrefix => "Default XMLNS prefix cannot be rebound to another value".into(), + Self::EmptyEntity => "Encountered empty entity".into(), + Self::EntityTooBig => "Entity too big".into(), + Self::NoRootElement => "Unexpected end of stream: no root element found".into(), + Self::ProcessingInstructionWithoutName => "Encountered processing instruction without a name".into(), + Self::UnbalancedRootElement => "Unexpected end of stream: still inside the root element".into(), + Self::UnclosedCdata => "Unclosed "Unexpected end of stream".into(), + Self::UnexpectedOpeningTag => "'<' is not allowed in attributes".into(), + Self::CannotUndefinePrefix(ref ln) => format!("Cannot undefine prefix '{ln}'").into(), + Self::ConflictingEncoding(a, b) => format!("Declared encoding {a}, but uses {b}").into(), + Self::InvalidCharacterEntity(num) => format!("Invalid character U+{num:04X}").into(), + Self::InvalidDefaultNamespace(ref name) => format!( "Namespace '{name}' cannot be default").into(), + Self::InvalidNamePrefix(ref prefix) => format!("'{prefix}' cannot be an element name prefix").into(), + Self::InvalidNumericEntity(ref v) => format!("Invalid numeric entity: {v}").into(), + Self::InvalidQualifiedName(ref e) => format!("Qualified name is invalid: {e}").into(), + Self::InvalidStandaloneDeclaration(ref value) => format!("Invalid standalone declaration value: {value}").into(), + Self::InvalidXmlProcessingInstruction(ref name) => format!("Invalid processing instruction: format!("Attribute '{name}' is redefined").into(), + Self::UnboundAttribute(ref name) => format!("Attribute {name} prefix is unbound").into(), + Self::UnboundElementPrefix(ref name) => format!("Element {name} prefix is unbound").into(), + Self::UndefinedEntity(ref v) => format!("Undefined entity: {v}").into(), + Self::UnexpectedClosingTag(ref expected_got) => format!("Unexpected closing tag: {expected_got}").into(), + Self::UnexpectedEntity(ref name) => format!("Unexpected entity: {name}").into(), + Self::UnexpectedName(ref name) => format!("Unexpected name: {name}").into(), + Self::UnexpectedNameInsideXml(ref name) => format!("Unexpected name inside XML declaration: {name}").into(), + Self::UnexpectedProcessingInstruction(ref buf, token) => format!("Unexpected token inside processing instruction: format!("Unexpected token inside qualified name: {e}").into(), + Self::UnexpectedToken(token) => format!("Unexpected token: {token}").into(), + Self::UnexpectedTokenBefore(before, c) => format!("Unexpected token '{before}' before '{c}'").into(), + Self::UnexpectedTokenInClosingTag(token) => format!("Unexpected token inside closing tag: {token}").into(), + Self::UnexpectedTokenInEntity(token) => format!("Unexpected token inside entity: {token}").into(), + Self::UnexpectedTokenInOpeningTag(token) => format!("Unexpected token inside opening tag: {token}").into(), + Self::UnexpectedTokenOutsideRoot(token) => format!("Unexpected characters outside the root element: {token}").into(), + Self::UnexpectedXmlVersion(ref version) => format!("Invalid XML version: {version}").into(), + Self::UnknownMarkupDeclaration(ref v) => format!("Unknown markup declaration: {v}").into(), + Self::UnsupportedEncoding(ref v) => format!("Unsupported encoding: {v}").into(), + } + } +} + /// An XML parsing error. /// /// Consists of a 2D position in a document and a textual message describing the error. #[derive(Clone, PartialEq, Eq, Debug)] pub struct Error { - pos: TextPosition, - kind: ErrorKind, + pub(crate) pos: TextPosition, + pub(crate) kind: ErrorKind, } impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{} {}", self.pos, self.msg()) + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use self::ErrorKind::{Io, Syntax, UnexpectedEof, Utf8}; + + write!(f, "{} ", self.pos)?; + match &self.kind { + Io(io_error) => io_error.fmt(f), + Utf8(reason) => reason.fmt(f), + Syntax(msg) => f.write_str(msg), + UnexpectedEof => f.write_str("Unexpected EOF"), + } } } @@ -38,49 +150,59 @@ impl Position for Error { impl Error { /// Returns a reference to a message which is contained inside this error. - #[inline] - pub fn msg(&self) -> &str { - use self::ErrorKind::*; - match self.kind { - UnexpectedEof => &"Unexpected EOF", - Utf8(ref reason) => error_description(reason), - Io(ref io_error) => error_description(io_error), - Syntax(ref msg) => msg.as_ref(), + #[cold] + #[doc(hidden)] + #[allow(deprecated)] + #[must_use] pub fn msg(&self) -> &str { + use self::ErrorKind::{Io, Syntax, UnexpectedEof, Utf8}; + match &self.kind { + Io(io_error) => io_error.description(), + Utf8(reason) => reason.description(), + Syntax(msg) => msg.as_ref(), + UnexpectedEof => "Unexpected EOF", } } - pub fn kind(&self) -> &ErrorKind { &self.kind } + #[must_use] + #[inline] + pub fn kind(&self) -> &ErrorKind { + &self.kind + } } impl error::Error for Error { - #[inline] + #[allow(deprecated)] + #[cold] fn description(&self) -> &str { self.msg() } } impl<'a, P, M> From<(&'a P, M)> for Error where P: Position, M: Into> { + #[cold] fn from(orig: (&'a P, M)) -> Self { - Error{ + Error { pos: orig.0.position(), - kind: ErrorKind::Syntax(orig.1.into()) + kind: ErrorKind::Syntax(orig.1.into()), } } } impl From for Error { + #[cold] fn from(e: util::CharReadError) -> Self { - use util::CharReadError::*; - Error{ + use crate::util::CharReadError::{Io, UnexpectedEof, Utf8}; + Error { pos: TextPosition::new(), kind: match e { UnexpectedEof => ErrorKind::UnexpectedEof, Utf8(reason) => ErrorKind::Utf8(reason), Io(io_error) => ErrorKind::Io(io_error), - } + }, } } } impl From for Error { + #[cold] fn from(e: io::Error) -> Self { Error { pos: TextPosition::new(), @@ -90,26 +212,28 @@ impl From for Error { } impl Clone for ErrorKind { + #[cold] fn clone(&self) -> Self { - use self::ErrorKind::*; - match *self { + use self::ErrorKind::{Io, Syntax, UnexpectedEof, Utf8}; + match self { UnexpectedEof => UnexpectedEof, - Utf8(ref reason) => Utf8(reason.clone()), - Io(ref io_error) => Io(io::Error::new(io_error.kind(), error_description(io_error))), - Syntax(ref msg) => Syntax(msg.clone()), + Utf8(reason) => Utf8(*reason), + Io(io_error) => Io(io::Error::new(io_error.kind(), io_error.to_string())), + Syntax(msg) => Syntax(msg.clone()), } } } impl PartialEq for ErrorKind { + #[allow(deprecated)] fn eq(&self, other: &ErrorKind) -> bool { - use self::ErrorKind::*; + use self::ErrorKind::{Io, Syntax, UnexpectedEof, Utf8}; match (self, other) { - (&UnexpectedEof, &UnexpectedEof) => true, - (&Utf8(ref left), &Utf8(ref right)) => left == right, - (&Io(ref left), &Io(ref right)) => + (UnexpectedEof, UnexpectedEof) => true, + (Utf8(left), Utf8(right)) => left == right, + (Io(left), Io(right)) => left.kind() == right.kind() && - error_description(left) == error_description(right), - (&Syntax(ref left), &Syntax(ref right)) => + left.description() == right.description(), + (Syntax(left), Syntax(right)) => left == right, (_, _) => false, @@ -118,4 +242,7 @@ impl PartialEq for ErrorKind { } impl Eq for ErrorKind {} -fn error_description(e: &error::Error) -> &str { e.description() } +#[test] +fn err_size() { + assert!(std::mem::size_of::() <= 24); +} diff --git a/src/reader/events.rs b/src/reader/events.rs index 46d7621..de2b930 100644 --- a/src/reader/events.rs +++ b/src/reader/events.rs @@ -1,12 +1,12 @@ //! Contains `XmlEvent` datatype, instances of which are emitted by the parser. -use std::fmt; use std::borrow::Cow; +use std::fmt; -use name::OwnedName; -use attribute::OwnedAttribute; -use common::XmlVersion; -use namespace::Namespace; +use crate::attribute::OwnedAttribute; +use crate::common::XmlVersion; +use crate::name::OwnedName; +use crate::namespace::Namespace; /// An element of an XML input stream. /// @@ -36,7 +36,7 @@ pub enum XmlEvent { /// If XML document is not present or does not contain `standalone` attribute, /// defaults to `None`. This field is currently used for no other purpose than /// informational. - standalone: Option + standalone: Option, }, /// Denotes to the end of the document stream. @@ -54,7 +54,7 @@ pub enum XmlEvent { name: String, /// Processing instruction content. - data: Option + data: Option, }, /// Denotes a beginning of an XML element. @@ -80,7 +80,7 @@ pub enum XmlEvent { /// latter case it is emitted immediately after corresponding `StartElement` event. EndElement { /// Qualified name of the element. - name: OwnedName + name: OwnedName, }, /// Denotes CDATA content. @@ -111,19 +111,20 @@ pub enum XmlEvent { /// It is possible to configure a parser to emit `Characters` event instead of `Whitespace`. /// See `pull::ParserConfiguration` structure for more information. When combined with whitespace /// trimming, it will eliminate standalone whitespace from the event stream completely. - Whitespace(String) + Whitespace(String), } impl fmt::Debug for XmlEvent { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + #[cold] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { - XmlEvent::StartDocument { ref version, ref encoding, ref standalone } => - write!(f, "StartDocument({}, {}, {:?})", version, *encoding, *standalone), + XmlEvent::StartDocument { ref version, ref encoding, standalone } => + write!(f, "StartDocument({}, {}, {:?})", version, *encoding, standalone), XmlEvent::EndDocument => write!(f, "EndDocument"), XmlEvent::ProcessingInstruction { ref name, ref data } => write!(f, "ProcessingInstruction({}{})", *name, match *data { - Some(ref data) => format!(", {}", data), + Some(ref data) => format!(", {data}"), None => String::new() }), XmlEvent::StartElement { ref name, ref attributes, namespace: Namespace(ref namespace) } => @@ -136,15 +137,15 @@ impl fmt::Debug for XmlEvent { format!(", [{}]", attributes.join(", ")) }), XmlEvent::EndElement { ref name } => - write!(f, "EndElement({})", name), + write!(f, "EndElement({name})"), XmlEvent::Comment(ref data) => - write!(f, "Comment({})", data), + write!(f, "Comment({data})"), XmlEvent::CData(ref data) => - write!(f, "CData({})", data), + write!(f, "CData({data})"), XmlEvent::Characters(ref data) => - write!(f, "Characters({})", data), + write!(f, "Characters({data})"), XmlEvent::Whitespace(ref data) => - write!(f, "Whitespace({})", data) + write!(f, "Whitespace({data})") } } } @@ -188,32 +189,33 @@ impl XmlEvent { /// ``` /// /// Note that this API may change or get additions in future to improve its ergonomics. - pub fn as_writer_event<'a>(&'a self) -> Option<::writer::events::XmlEvent<'a>> { + #[must_use] + pub fn as_writer_event(&self) -> Option> { match *self { XmlEvent::StartDocument { version, ref encoding, standalone } => - Some(::writer::events::XmlEvent::StartDocument { - version: version, + Some(crate::writer::events::XmlEvent::StartDocument { + version, encoding: Some(encoding), - standalone: standalone + standalone }), XmlEvent::ProcessingInstruction { ref name, ref data } => - Some(::writer::events::XmlEvent::ProcessingInstruction { - name: name, - data: data.as_ref().map(|s| &s[..]) + Some(crate::writer::events::XmlEvent::ProcessingInstruction { + name, + data: data.as_ref().map(|s| &**s) }), XmlEvent::StartElement { ref name, ref attributes, ref namespace } => - Some(::writer::events::XmlEvent::StartElement { + Some(crate::writer::events::XmlEvent::StartElement { name: name.borrow(), attributes: attributes.iter().map(|a| a.borrow()).collect(), namespace: Cow::Borrowed(namespace) }), XmlEvent::EndElement { ref name } => - Some(::writer::events::XmlEvent::EndElement { name: Some(name.borrow()) }), - XmlEvent::Comment(ref data) => Some(::writer::events::XmlEvent::Comment(data)), - XmlEvent::CData(ref data) => Some(::writer::events::XmlEvent::CData(data)), - XmlEvent::Characters(ref data) => Some(::writer::events::XmlEvent::Characters(data)), - XmlEvent::Whitespace(ref data) => Some(::writer::events::XmlEvent::Characters(data)), - _ => None + Some(crate::writer::events::XmlEvent::EndElement { name: Some(name.borrow()) }), + XmlEvent::Comment(ref data) => Some(crate::writer::events::XmlEvent::Comment(data)), + XmlEvent::CData(ref data) => Some(crate::writer::events::XmlEvent::CData(data)), + XmlEvent::Characters(ref data) | + XmlEvent::Whitespace(ref data) => Some(crate::writer::events::XmlEvent::Characters(data)), + XmlEvent::EndDocument => None, } } } diff --git a/src/reader/lexer.rs b/src/reader/lexer.rs index c466db9..a8345ba 100644 --- a/src/reader/lexer.rs +++ b/src/reader/lexer.rs @@ -2,20 +2,25 @@ //! //! This module is for internal use. Use `xml::pull` module to do parsing. -use std::fmt; + +use crate::reader::ErrorKind; +use crate::reader::error::SyntaxError; use std::collections::VecDeque; +use std::fmt; use std::io::Read; use std::result; -use std::borrow::Cow; +use crate::common::{is_name_char, is_whitespace_char, Position, TextPosition, is_xml10_char, is_xml11_char}; +use crate::reader::Error; +use crate::util::{CharReader, Encoding}; -use common::{Position, TextPosition, is_whitespace_char, is_name_char}; -use reader::Error; -use util; +/// Limits to defend from billion laughs attack +const MAX_ENTITY_EXPANSION_LENGTH: usize = 1_000_000; +const MAX_ENTITY_EXPANSION_DEPTH: u8 = 10; /// `Token` represents a single lexeme of an XML document. These lexemes /// are used to perform actual parsing. #[derive(Copy, Clone, PartialEq, Eq, Debug)] -pub enum Token { +pub(crate) enum Token { /// `` @@ -34,12 +39,8 @@ pub enum Token { CommentStart, /// `-->` CommentEnd, - /// A chunk of characters, used for errors recovery. - Chunk(&'static str), /// Any non-special character except whitespace. Character(char), - /// Whitespace character. - Whitespace(char), /// `=` EqualsSign, /// `'` @@ -54,14 +55,16 @@ pub enum Token { ReferenceStart, /// `;` ReferenceEnd, + /// ` fmt::Result { + #[cold] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { - Token::Chunk(s) => write!(f, "{}", s), - Token::Character(c) | Token::Whitespace(c) => write!(f, "{}", c), - other => write!(f, "{}", match other { + Token::Character(c) => c.fmt(f), + other => match other { Token::OpeningTagStart => "<", Token::ProcessingInstructionStart => " " "=", Token::SingleQuote => "'", Token::DoubleQuote => "\"", + Token::MarkupDeclarationStart => " unreachable!() - }) + }.fmt(f), } } } @@ -103,47 +107,28 @@ impl Token { Token::EqualsSign => Some("="), Token::SingleQuote => Some("'"), Token::DoubleQuote => Some("\""), - Token::Chunk(s) => Some(s), _ => None } } // using String.push_str(token.to_string()) is simply way too slow pub fn push_to_string(&self, target: &mut String) { - match self.as_static_str() { - Some(s) => { target.push_str(s); } - None => { - match *self { - Token::Character(c) | Token::Whitespace(c) => target.push(c), - _ => unreachable!() - } - } - } - } - - /// Returns `true` if this token contains data that can be interpreted - /// as a part of the text. Surprisingly, this also means '>' and '=' and '"' and "'" and '-->'. - #[inline] - pub fn contains_char_data(&self) -> bool { - match *self { - Token::Whitespace(_) | Token::Chunk(_) | Token::Character(_) | Token::CommentEnd | - Token::TagEnd | Token::EqualsSign | Token::DoubleQuote | Token::SingleQuote | Token::CDataEnd | - Token::ProcessingInstructionEnd | Token::EmptyTagEnd => true, - _ => false - } - } - - /// Returns `true` if this token corresponds to a white space character. - #[inline] - pub fn is_whitespace(&self) -> bool { match *self { - Token::Whitespace(_) => true, - _ => false + Token::Character(c) => { + debug_assert!(is_xml10_char(c) || is_xml11_char(c)); + target.push(c) + }, + _ => if let Some(s) = self.as_static_str() { + target.push_str(s); + } } } } +#[derive(Copy, Clone)] enum State { + /// Default state + Normal, /// Triggered on '<' TagStarted, /// Triggered on '` + InsideMarkupDeclarationQuotedString(QuoteStyle), +} + +#[derive(Copy, Clone, Eq, PartialEq)] +enum QuoteStyle { + Single, Double } #[derive(Copy, Clone)] @@ -184,7 +184,7 @@ enum CDataStartedSubstate { } /// `Result` represents lexing result. It is either a token or an error message. -pub type Result = result::Result, Error>; +pub(crate) type Result, E = Error> = result::Result; /// Helps to set up a dispatch table for lexing large unambigous tokens like /// `, - st: State, - skip_errors: bool, - inside_comment: bool, + /// Default state to go back to after a tag end (may be `InsideDoctype`) + normal_state: State, inside_token: bool, - eof_handled: bool + eof_handled: bool, + reparse_depth: u8, + #[cfg(test)] + skip_errors: bool, } impl Position for Lexer { @@ -235,37 +239,33 @@ impl Position for Lexer { impl Lexer { /// Returns a new lexer with default state. - pub fn new() -> Lexer { + pub(crate) fn new() -> Lexer { Lexer { + reader: CharReader::new(), pos: TextPosition::new(), head_pos: TextPosition::new(), char_queue: VecDeque::with_capacity(4), // TODO: check size st: State::Normal, - skip_errors: false, - inside_comment: false, + normal_state: State::Normal, inside_token: false, - eof_handled: false + eof_handled: false, + reparse_depth: 0, + #[cfg(test)] + skip_errors: false, } } - /// Enables error handling so `next_token` will return `Some(Err(..))` - /// upon invalid lexeme. - #[inline] - pub fn enable_errors(&mut self) { self.skip_errors = false; } + pub(crate) fn encoding(&mut self) -> Encoding { + self.reader.encoding + } + + pub(crate) fn set_encoding(&mut self, encoding: Encoding) { + self.reader.encoding = encoding; + } /// Disables error handling so `next_token` will return `Some(Chunk(..))` /// upon invalid lexeme with this lexeme content. - #[inline] - pub fn disable_errors(&mut self) { self.skip_errors = true; } - - /// Enables special handling of some lexemes which should be done when we're parsing comment - /// internals. - #[inline] - pub fn inside_comment(&mut self) { self.inside_comment = true; } - - /// Disables the effect of `inside_comment()` method. - #[inline] - pub fn outside_comment(&mut self) { self.inside_comment = false; } + #[cfg(test)] fn disable_errors(&mut self) { self.skip_errors = true; } /// Reset the eof handled flag of the lexer. #[inline] @@ -293,23 +293,29 @@ impl Lexer { // Check if we have saved a char or two for ourselves while let Some(c) = self.char_queue.pop_front() { - match try!(self.read_next_token(c)) { + match self.dispatch_char(c)? { Some(t) => { self.inside_token = false; return Ok(Some(t)); } - None => {} // continue + None => {} // continue } } - + // if char_queue is empty, all circular reparsing is done + self.reparse_depth = 0; loop { - // TODO: this should handle multiple encodings - let c = match try!(util::next_char_from(b)) { - Some(c) => c, // got next char - None => break, // nothing to read left + let c = match self.reader.next_char_from(b)? { + Some(c) => c, // got next char + None => break, // nothing to read left }; - match try!(self.read_next_token(c)) { + if c == '\n' { + self.head_pos.new_line(); + } else { + self.head_pos.advance(1); + } + + match self.dispatch_char(c)? { Some(t) => { self.inside_token = false; return Ok(Some(t)); @@ -320,61 +326,67 @@ impl Lexer { } } + self.end_of_stream() + } + + #[inline(never)] + fn end_of_stream(&mut self) -> Result { // Handle end of stream self.eof_handled = true; self.pos = self.head_pos; match self.st { + State::InsideCdata | State::CDataClosing(_) => Err(self.error(SyntaxError::UnclosedCdata)), State::TagStarted | State::CommentOrCDataOrDoctypeStarted | State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) | State::CommentClosing(ClosingSubstate::Second) | - State::DoctypeFinishing(_) => - Err(self.error("Unexpected end of stream")), - State::ProcessingInstructionClosing => - Ok(Some(Token::Character('?'))), + State::InsideComment | State::InsideMarkupDeclaration | + State::InsideProcessingInstruction | State::ProcessingInstructionClosing | + State::InsideDoctype | State::InsideMarkupDeclarationQuotedString(_) => + Err(self.error(SyntaxError::UnexpectedEof)), State::EmptyTagClosing => Ok(Some(Token::Character('/'))), State::CommentClosing(ClosingSubstate::First) => Ok(Some(Token::Character('-'))), - State::CDataClosing(ClosingSubstate::First) => + State::InvalidCDataClosing(ClosingSubstate::First) => Ok(Some(Token::Character(']'))), - State::CDataClosing(ClosingSubstate::Second) => - Ok(Some(Token::Chunk("]]"))), + State::InvalidCDataClosing(ClosingSubstate::Second) => { + self.eof_handled = false; + self.move_to_with_unread(State::Normal, &[']'], Token::Character(']')) + }, State::Normal => - Ok(None) + Ok(None), } } - #[inline] - fn error>>(&self, msg: M) -> Error { - (self, msg).into() - } - - #[inline] - fn read_next_token(&mut self, c: char) -> Result { - let res = self.dispatch_char(c); - if self.char_queue.is_empty() { - if c == '\n' { - self.head_pos.new_line(); - } else { - self.head_pos.advance(1); - } + #[cold] + fn error(&self, e: SyntaxError) -> Error { + Error { + pos: self.position(), + kind: ErrorKind::Syntax(e.to_cow()), } - res } + + #[inline(never)] fn dispatch_char(&mut self, c: char) -> Result { match self.st { State::Normal => self.normal(c), State::TagStarted => self.tag_opened(c), + State::EmptyTagClosing => self.empty_element_closing(c), State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c), - State::CommentStarted => self.comment_started(c), + State::InsideCdata => self.inside_cdata(c), State::CDataStarted(s) => self.cdata_started(c, s), - State::DoctypeStarted(s) => self.doctype_started(c, s), - State::DoctypeFinishing(d) => self.doctype_finishing(c, d), + State::InsideComment => self.inside_comment_state(c), + State::CommentStarted => self.comment_started(c), + State::InsideProcessingInstruction => self.inside_processing_instruction(c), State::ProcessingInstructionClosing => self.processing_instruction_closing(c), - State::EmptyTagClosing => self.empty_element_closing(c), State::CommentClosing(s) => self.comment_closing(c, s), - State::CDataClosing(s) => self.cdata_closing(c, s) + State::CDataClosing(s) => self.cdata_closing(c, s), + State::InsideDoctype => self.inside_doctype(c), + State::DoctypeStarted(s) => self.doctype_started(c, s), + State::InvalidCDataClosing(s) => self.invalid_cdata_closing(c, s), + State::InsideMarkupDeclaration => self.markup_declaration(c), + State::InsideMarkupDeclarationQuotedString(q) => self.markup_declaration_string(c, q), } } @@ -391,18 +403,50 @@ impl Lexer { } #[inline] + fn move_to_and_reset_normal(&mut self, st: State, token: Token) -> Result { + self.normal_state = st; + self.st = st; + Ok(Some(token)) + } + fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result { - self.char_queue.extend(cs.iter().cloned()); + for c in cs.iter().rev().copied() { + self.char_queue.push_front(c); + } self.move_to_with(st, token) } + pub(crate) fn reparse(&mut self, markup: &str) -> Result<()> { + if markup.is_empty() { + return Ok(()); + } + + self.reparse_depth += 1; + if self.reparse_depth > MAX_ENTITY_EXPANSION_DEPTH || self.char_queue.len() > MAX_ENTITY_EXPANSION_LENGTH { + return Err(self.error(SyntaxError::EntityTooBig)) + } + + self.eof_handled = false; + self.char_queue.reserve(markup.len()); + for c in markup.chars().rev() { + self.char_queue.push_front(c); + } + + Ok(()) + } + fn handle_error(&mut self, chunk: &'static str, c: char) -> Result { - self.char_queue.push_back(c); - if self.skip_errors || (self.inside_comment && chunk != "--") { // FIXME: looks hacky - self.move_to_with(State::Normal, Token::Chunk(chunk)) - } else { - Err(self.error(format!("Unexpected token '{}' before '{}'", chunk, c))) + debug_assert!(!chunk.is_empty()); + + #[cfg(test)] + if self.skip_errors { + let mut chars = chunk.chars(); + let first = chars.next().unwrap_or('\0'); + self.char_queue.extend(chars); + self.char_queue.push_back(c); + return self.move_to_with(State::Normal, Token::Character(first)); } + Err(self.error(SyntaxError::UnexpectedTokenBefore(chunk, c))) } /// Encountered a char @@ -414,12 +458,39 @@ impl Lexer { '=' => Ok(Some(Token::EqualsSign)), '"' => Ok(Some(Token::DoubleQuote)), '\'' => Ok(Some(Token::SingleQuote)), - '?' => self.move_to(State::ProcessingInstructionClosing), - '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)), + ']' => self.move_to(State::InvalidCDataClosing(ClosingSubstate::First)), + '&' => Ok(Some(Token::ReferenceStart)), + ';' => Ok(Some(Token::ReferenceEnd)), + _ => Ok(Some(Token::Character(c))) + } + } + + fn inside_cdata(&mut self, c: char) -> Result { + match c { ']' => self.move_to(State::CDataClosing(ClosingSubstate::First)), + _ => Ok(Some(Token::Character(c))) + } + } + + fn inside_processing_instruction(&mut self, c: char) -> Result { + // These tokens are used by `` parser + match c { + '?' => self.move_to(State::ProcessingInstructionClosing), + '<' => Ok(Some(Token::OpeningTagStart)), + '>' => Ok(Some(Token::TagEnd)), + '/' => Ok(Some(Token::ClosingTagStart)), + '=' => Ok(Some(Token::EqualsSign)), + '"' => Ok(Some(Token::DoubleQuote)), + '\'' => Ok(Some(Token::SingleQuote)), '&' => Ok(Some(Token::ReferenceStart)), ';' => Ok(Some(Token::ReferenceEnd)), - _ if is_whitespace_char(c) => Ok(Some(Token::Whitespace(c))), + _ => Ok(Some(Token::Character(c))) + } + } + + fn inside_comment_state(&mut self, c: char) -> Result { + match c { + '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)), _ => Ok(Some(Token::Character(c))) } } @@ -427,11 +498,11 @@ impl Lexer { /// Encountered '<' fn tag_opened(&mut self, c: char) -> Result { match c { - '?' => self.move_to_with(State::Normal, Token::ProcessingInstructionStart), - '/' => self.move_to_with(State::Normal, Token::ClosingTagStart), + '?' => self.move_to_with(State::InsideProcessingInstruction, Token::ProcessingInstructionStart), + '/' => self.move_to_with(self.normal_state, Token::ClosingTagStart), '!' => self.move_to(State::CommentOrCDataOrDoctypeStarted), - _ if is_whitespace_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart), - _ if is_name_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart), + _ if is_whitespace_char(c) => self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart), + _ if is_name_char(c) => self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart), _ => self.handle_error("<", c) } } @@ -442,31 +513,55 @@ impl Lexer { '-' => self.move_to(State::CommentStarted), '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)), 'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)), - _ => self.handle_error(" { + self.move_to_with_unread(State::InsideMarkupDeclaration, &[c], Token::MarkupDeclarationStart) + }, + _ => self.handle_error(" Result { match c { - '-' => self.move_to_with(State::Normal, Token::CommentStart), - _ => self.handle_error(" self.move_to_with(State::InsideComment, Token::CommentStart), + _ => self.handle_error(" Result { - use self::CDataStartedSubstate::{E, C, CD, CDA, CDAT, CDATA}; + use self::CDataStartedSubstate::{C, CD, CDA, CDAT, CDATA, E}; dispatch_on_enum_state!(self, s, c, State::CDataStarted, E ; 'C' ; C ; " Result { + match c { + '<' => self.handle_error("' => self.move_to_with(self.normal_state, Token::TagEnd), + '&' => Ok(Some(Token::ReferenceStart)), + ';' => Ok(Some(Token::ReferenceEnd)), + '"' => self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Double), Token::DoubleQuote), + '\'' => self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Single), Token::SingleQuote), + _ => Ok(Some(Token::Character(c))), + } + } + + fn markup_declaration_string(&mut self, c: char, q: QuoteStyle) -> Result { + match c { + '"' if q == QuoteStyle::Double => self.move_to_with(State::InsideMarkupDeclaration, Token::DoubleQuote), + '\'' if q == QuoteStyle::Single => self.move_to_with(State::InsideMarkupDeclaration, Token::SingleQuote), + _ => Ok(Some(Token::Character(c))), + } + } + /// Encountered ' Result { use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP}; @@ -476,33 +571,36 @@ impl Lexer { DOC ; 'T' ; DOCT ; " Result { + fn inside_doctype(&mut self, c: char) -> Result { match c { - '<' => self.move_to(State::DoctypeFinishing(d + 1)), - '>' if d == 1 => self.move_to_with(State::Normal, Token::TagEnd), - '>' => self.move_to(State::DoctypeFinishing(d - 1)), - _ => Ok(None), + '>' => self.move_to_and_reset_normal(State::Normal, Token::TagEnd), + '<' => self.move_to(State::TagStarted), + '&' => Ok(Some(Token::ReferenceStart)), + ';' => Ok(Some(Token::ReferenceEnd)), + '"' => Ok(Some(Token::DoubleQuote)), + '\'' => Ok(Some(Token::SingleQuote)), + _ => Ok(Some(Token::Character(c))), } } /// Encountered '?' fn processing_instruction_closing(&mut self, c: char) -> Result { match c { - '>' => self.move_to_with(State::Normal, Token::ProcessingInstructionEnd), - _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('?')), + '>' => self.move_to_with(self.normal_state, Token::ProcessingInstructionEnd), + _ => self.move_to_with_unread(State::InsideProcessingInstruction, &[c], Token::Character('?')), } } /// Encountered '/' fn empty_element_closing(&mut self, c: char) -> Result { match c { - '>' => self.move_to_with(State::Normal, Token::EmptyTagEnd), - _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('/')), + '>' => self.move_to_with(self.normal_state, Token::EmptyTagEnd), + _ => self.move_to_with_unread(self.normal_state, &[c], Token::Character('/')), } } @@ -511,18 +609,13 @@ impl Lexer { match s { ClosingSubstate::First => match c { '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)), - _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('-')) + _ => self.move_to_with_unread(State::InsideComment, &[c], Token::Character('-')), }, ClosingSubstate::Second => match c { - '>' => self.move_to_with(State::Normal, Token::CommentEnd), + '>' => self.move_to_with(self.normal_state, Token::CommentEnd), // double dash not followed by a greater-than is a hard error inside comment - _ if self.inside_comment => self.handle_error("--", c), - // nothing else except comment closing starts with a double dash, and comment - // closing can never be after another dash, and also we're outside of a comment, - // therefore it is safe to push only the last read character to the list of unread - // characters and pass the double dash directly to the output - _ => self.move_to_with_unread(State::Normal, &[c], Token::Chunk("--")) - } + _ => self.handle_error("--", c), + }, } } @@ -531,19 +624,33 @@ impl Lexer { match s { ClosingSubstate::First => match c { ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)), - _ => self.move_to_with_unread(State::Normal, &[c], Token::Character(']')) + _ => self.move_to_with_unread(State::InsideCdata, &[c], Token::Character(']')), }, ClosingSubstate::Second => match c { '>' => self.move_to_with(State::Normal, Token::CDataEnd), - _ => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']')) - } + _ => self.move_to_with_unread(State::InsideCdata, &[']', c], Token::Character(']')), + }, + } + } + + /// Encountered ']' + fn invalid_cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result { + match s { + ClosingSubstate::First => match c { + ']' => self.move_to(State::InvalidCDataClosing(ClosingSubstate::Second)), + _ => self.move_to_with_unread(State::Normal, &[c], Token::Character(']')), + }, + ClosingSubstate::Second => match c { + '>' => self.move_to_with(self.normal_state, Token::CDataEnd), + _ => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']')), + }, } } } #[cfg(test)] mod tests { - use common::{Position}; + use crate::common::Position; use std::io::{BufReader, Cursor}; use super::{Lexer, Token}; @@ -563,13 +670,12 @@ mod tests { let err = err.unwrap_err(); assert_eq!($r as u64, err.position().row); assert_eq!($c as u64, err.position().column); - assert_eq!($s, err.msg()); }) ); macro_rules! assert_none( (for $lex:ident and $buf:ident) => ( - assert_eq!(Ok(None), $lex.next_token(&mut $buf)); + assert_eq!(Ok(None), $lex.next_token(&mut $buf)) ) ); @@ -577,6 +683,47 @@ mod tests { (Lexer::new(), BufReader::new(Cursor::new(s.to_owned().into_bytes()))) } + #[test] + fn tricky_pi() { + let (mut lex, mut buf) = make_lex_and_buf(r#""#); + + assert_oks!(for lex and buf ; + Token::ProcessingInstructionStart + Token::Character('x') + Token::OpeningTagStart // processing of relies on the extra tokens + Token::Character('!') + Token::Character('-') + Token::Character('-') + Token::Character(' ') + Token::ReferenceStart + Token::Character('?') + Token::ProcessingInstructionEnd + Token::OpeningTagStart + Token::Character('x') + Token::TagEnd + ); + assert_none!(for lex and buf); + } + + #[test] + fn reparser() { + let (mut lex, mut buf) = make_lex_and_buf(r#"&a;"#); + + assert_oks!(for lex and buf ; + Token::ReferenceStart + Token::Character('a') + Token::ReferenceEnd + ); + lex.reparse("").unwrap(); + assert_oks!(for lex and buf ; + Token::OpeningTagStart + Token::Character('h') + Token::Character('i') + Token::EmptyTagEnd + ); + assert_none!(for lex and buf); + } + #[test] fn simple_lexer_test() { let (mut lex, mut buf) = make_lex_and_buf( @@ -586,18 +733,18 @@ mod tests { assert_oks!(for lex and buf ; Token::OpeningTagStart Token::Character('a') - Token::Whitespace(' ') + Token::Character(' ') Token::Character('p') Token::EqualsSign Token::SingleQuote Token::Character('q') Token::SingleQuote Token::TagEnd - Token::Whitespace(' ') + Token::Character(' ') Token::Character('x') Token::OpeningTagStart Token::Character('b') - Token::Whitespace(' ') + Token::Character(' ') Token::Character('z') Token::EqualsSign Token::DoubleQuote @@ -605,7 +752,7 @@ mod tests { Token::DoubleQuote Token::TagEnd Token::Character('d') - Token::Whitespace('\t') + Token::Character('\t') Token::ClosingTagStart Token::Character('b') Token::TagEnd @@ -615,21 +762,21 @@ mod tests { Token::OpeningTagStart Token::Character('p') Token::EmptyTagEnd - Token::Whitespace(' ') + Token::Character(' ') Token::ProcessingInstructionStart Token::Character('n') Token::Character('m') - Token::Whitespace(' ') + Token::Character(' ') Token::ProcessingInstructionEnd - Token::Whitespace(' ') + Token::Character(' ') Token::CommentStart - Token::Whitespace(' ') + Token::Character(' ') Token::Character('a') - Token::Whitespace(' ') + Token::Character(' ') Token::Character('c') - Token::Whitespace(' ') + Token::Character(' ') Token::CommentEnd - Token::Whitespace(' ') + Token::Character(' ') Token::ReferenceStart Token::Character('n') Token::Character('b') @@ -651,16 +798,17 @@ mod tests { Token::Character('x') Token::Character('!') Token::Character('+') - Token::Whitespace(' ') + Token::Character(' ') Token::Character('/') Token::Character('/') - Token::Whitespace(' ') + Token::Character(' ') Token::Character('-') Token::Character('|') - Token::Whitespace(' ') + Token::Character(' ') Token::Character(']') Token::Character('z') - Token::Chunk("]]") + Token::Character(']') + Token::Character(']') ); assert_none!(for lex and buf); } @@ -677,12 +825,12 @@ mod tests { Token::TagEnd Token::CDataStart Token::Character('x') - Token::Whitespace(' ') + Token::Character(' ') Token::Character('y') - Token::Whitespace(' ') + Token::Character(' ') Token::Character('?') Token::CDataEnd - Token::Whitespace(' ') + Token::Character(' ') Token::ClosingTagStart Token::Character('a') Token::TagEnd @@ -690,6 +838,33 @@ mod tests { assert_none!(for lex and buf); } + #[test] + fn cdata_closers_test() { + let (mut lex, mut buf) = make_lex_and_buf( + r#" ]> ]]>]]"# + ); + + assert_oks!(for lex and buf ; + Token::CDataStart + Token::Character(']') + Token::Character(' ') + Token::Character('>') + Token::Character(' ') + Token::Character(']') + Token::Character('>') + Token::Character(' ') + Token::CDataEnd + Token::CommentStart + Token::CommentEnd + Token::Character(']') + Token::Character(']') + Token::OpeningTagStart + Token::Character('a') + Token::TagEnd + ); + assert_none!(for lex and buf); + } + #[test] fn doctype_test() { let (mut lex, mut buf) = make_lex_and_buf( @@ -700,26 +875,135 @@ mod tests { Token::Character('a') Token::TagEnd Token::DoctypeStart + Token::Character(' ') + Token::Character('a') + Token::Character('b') + Token::Character(' ') + Token::Character('x') + Token::Character('x') + Token::Character(' ') + Token::Character('z') Token::TagEnd - Token::Whitespace(' ') + Token::Character(' ') ); - assert_none!(for lex and buf) + assert_none!(for lex and buf); + } + + #[test] + fn tricky_comments() { + let (mut lex, mut buf) = make_lex_and_buf( + r#""# + ); + assert_oks!(for lex and buf ; + Token::OpeningTagStart + Token::Character('a') + Token::TagEnd + Token::CommentStart + Token::Character(' ') + Token::Character('C') + Token::Character(' ') + Token::Character('-') + Token::Character('>') + Token::CommentEnd + Token::ClosingTagStart + Token::Character('a') + Token::TagEnd + ); + assert_none!(for lex and buf); } #[test] fn doctype_with_internal_subset_test() { let (mut lex, mut buf) = make_lex_and_buf( - r#" ]> "# + r#">>"> ]> "# ); assert_oks!(for lex and buf ; Token::OpeningTagStart Token::Character('a') Token::TagEnd Token::DoctypeStart + Token::Character(' ') + Token::Character('a') + Token::Character('b') + Token::Character('[') + Token::MarkupDeclarationStart + Token::Character('E') + Token::Character('L') + Token::Character('E') + Token::Character('M') + Token::Character('E') + Token::Character('N') + Token::Character('T') + Token::Character(' ') + Token::Character('b') + Token::Character('a') + Token::Character(' ') + Token::DoubleQuote + Token::Character('>') + Token::Character('>') + Token::Character('>') + Token::DoubleQuote + Token::TagEnd + Token::Character(' ') + Token::Character(']') + Token::TagEnd + Token::Character(' ') + ); + assert_none!(for lex and buf); + } + + #[test] + fn doctype_internal_pi_comment() { + let (mut lex, mut buf) = make_lex_and_buf( + " ?> \n]>" + ); + assert_oks!(for lex and buf ; + Token::DoctypeStart + Token::Character(' ') + Token::Character('a') + Token::Character(' ') + Token::Character('[') + Token::Character('\n') + Token::MarkupDeclarationStart + Token::Character('E') + Token::Character('L') + Token::Character('E') + Token::Character('M') + Token::Character('E') + Token::Character('N') + Token::Character('T') + Token::Character(' ') + Token::Character('l') + Token::Character(' ') + Token::Character('A') + Token::Character('N') + Token::Character('Y') Token::TagEnd - Token::Whitespace(' ') + Token::Character(' ') + Token::CommentStart + Token::Character(' ') + Token::Character('<') + Token::Character('?') + Token::Character('n') + Token::Character('o') + Token::Character('n') + Token::Character('?') + Token::Character('>') + Token::CommentEnd + Token::Character(' ') + Token::ProcessingInstructionStart + Token::Character('p') + Token::Character('i') + Token::Character(' ') + Token::TagEnd // not really + Token::Character(' ') + Token::ProcessingInstructionEnd + Token::Character(' ') + Token::Character('\n') + Token::Character(']') + Token::TagEnd // DTD ); - assert_none!(for lex and buf) + assert_none!(for lex and buf); } #[test] @@ -735,7 +1019,8 @@ mod tests { eof_check!("/" ; Token::Character('/')); eof_check!("-" ; Token::Character('-')); eof_check!("]" ; Token::Character(']')); - eof_check!("]]" ; Token::Chunk("]]")); + eof_check!("]" ; Token::Character(']')); + eof_check!("]" ; Token::Character(']')); } #[test] @@ -756,7 +1041,6 @@ mod tests { eof_check!("()); + assert_eq!(2, std::mem::size_of::()); + } + #[test] fn error_in_cdata_started() { check_case!(" = result::Result; - -/// A wrapper around an `std::io::Read` instance which provides pull-based XML parsing. -pub struct EventReader { - source: R, - parser: PullParser -} - -impl EventReader { - /// Creates a new reader, consuming the given stream. - #[inline] - pub fn new(source: R) -> EventReader { - EventReader::new_with_config(source, ParserConfig::new()) - } - - /// Creates a new reader with the provded configuration, consuming the given stream. - #[inline] - pub fn new_with_config(source: R, config: ParserConfig) -> EventReader { - EventReader { source: source, parser: PullParser::new(config) } - } - - /// Pulls and returns next XML event from the stream. - /// - /// If returned event is `XmlEvent::Error` or `XmlEvent::EndDocument`, then - /// further calls to this method will return this event again. - #[inline] - pub fn next(&mut self) -> Result { - self.parser.next(&mut self.source) - } - - pub fn source(&self) -> &R { &self.source } - pub fn source_mut(&mut self) -> &mut R { &mut self.source } - - /// Unwraps this `EventReader`, returning the underlying reader. - /// - /// Note that this operation is destructive; unwrapping the reader and wrapping it - /// again with `EventReader::new()` will create a fresh reader which will attempt - /// to parse an XML document from the beginning. - pub fn into_inner(self) -> R { - self.source - } -} - -impl Position for EventReader { - /// Returns the position of the last event produced by the reader. - #[inline] - fn position(&self) -> TextPosition { - self.parser.position() - } -} - -impl IntoIterator for EventReader { - type Item = Result; - type IntoIter = Events; - - fn into_iter(self) -> Events { - Events { reader: self, finished: false } - } -} - -/// An iterator over XML events created from some type implementing `Read`. -/// -/// When the next event is `xml::event::Error` or `xml::event::EndDocument`, then -/// it will be returned by the iterator once, and then it will stop producing events. -pub struct Events { - reader: EventReader, - finished: bool -} - -impl Events { - /// Unwraps the iterator, returning the internal `EventReader`. - #[inline] - pub fn into_inner(self) -> EventReader { - self.reader - } - - pub fn source(&self) -> &R { &self.reader.source } - pub fn source_mut(&mut self) -> &mut R { &mut self.reader.source } - -} - -impl Iterator for Events { - type Item = Result; - - #[inline] - fn next(&mut self) -> Option> { - if self.finished && !self.reader.parser.is_ignoring_end_of_stream() { None } - else { - let ev = self.reader.next(); - match ev { - Ok(XmlEvent::EndDocument) | Err(_) => self.finished = true, - _ => {} - } - Some(ev) - } - } -} - -impl<'r> EventReader<&'r [u8]> { - /// A convenience method to create an `XmlReader` from a string slice. - #[inline] - pub fn from_str(source: &'r str) -> EventReader<&'r [u8]> { - EventReader::new(source.as_bytes()) - } -} diff --git a/src/reader/parser.rs b/src/reader/parser.rs new file mode 100644 index 0000000..dcdec89 --- /dev/null +++ b/src/reader/parser.rs @@ -0,0 +1,796 @@ +//! Contains an implementation of pull-based XML parser. + + +use crate::common::is_xml11_char; +use crate::common::is_xml10_char; +use crate::common::is_xml11_char_not_restricted; +use crate::reader::error::SyntaxError; +use std::collections::HashMap; +use std::io::prelude::*; + +use crate::attribute::OwnedAttribute; +use crate::common::{self, is_name_char, is_name_start_char, Position, TextPosition, XmlVersion, is_whitespace_char}; +use crate::name::OwnedName; +use crate::namespace::NamespaceStack; + +use crate::reader::config::ParserConfig2; +use crate::reader::events::XmlEvent; +use crate::reader::lexer::{Lexer, Token}; + +use super::{Error, ErrorKind}; + +macro_rules! gen_takes( + ($($field:ident -> $method:ident, $t:ty, $def:expr);+) => ( + $( + impl MarkupData { + #[inline] + #[allow(clippy::mem_replace_option_with_none)] + fn $method(&mut self) -> $t { + std::mem::replace(&mut self.$field, $def) + } + } + )+ + ) +); + +gen_takes!( + name -> take_name, String, String::new(); + ref_data -> take_ref_data, String, String::new(); + + encoding -> take_encoding, Option, None; + + element_name -> take_element_name, Option, None; + + attr_name -> take_attr_name, Option, None; + attributes -> take_attributes, Vec, vec!() +); + +mod inside_cdata; +mod inside_closing_tag_name; +mod inside_comment; +mod inside_declaration; +mod inside_doctype; +mod inside_opening_tag; +mod inside_processing_instruction; +mod inside_reference; +mod outside_tag; + +static DEFAULT_VERSION: XmlVersion = XmlVersion::Version10; +static DEFAULT_STANDALONE: Option = None; + +type ElementStack = Vec; +pub type Result = super::Result; + +/// Pull-based XML parser. +pub(crate) struct PullParser { + config: ParserConfig2, + lexer: Lexer, + st: State, + state_after_reference: State, + buf: String, + + /// From DTD internal subset + entities: HashMap, + + nst: NamespaceStack, + + data: MarkupData, + final_result: Option, + next_event: Option, + est: ElementStack, + pos: Vec, + + encountered: Encountered, + inside_whitespace: bool, + read_prefix_separator: bool, + pop_namespace: bool, +} + +// Keeps track when XML declaration can happen +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] +enum Encountered { + None = 0, + AnyChars, // whitespace before ) -> PullParser { + let config = config.into(); + Self::new_with_config2(config) + } + + #[inline] + fn new_with_config2(config: ParserConfig2) -> PullParser { + let mut lexer = Lexer::new(); + if let Some(enc) = config.override_encoding { + lexer.set_encoding(enc); + } + + let mut pos = Vec::with_capacity(16); + pos.push(TextPosition::new()); + + PullParser { + config, + lexer, + st: State::DocumentStart, + state_after_reference: State::OutsideTag, + buf: String::new(), + entities: HashMap::new(), + nst: NamespaceStack::default(), + + data: MarkupData { + name: String::new(), + version: None, + encoding: None, + standalone: None, + ref_data: String::new(), + element_name: None, + quote: None, + attr_name: None, + attributes: Vec::new(), + }, + final_result: None, + next_event: None, + est: Vec::new(), + pos, + + encountered: Encountered::None, + inside_whitespace: true, + read_prefix_separator: false, + pop_namespace: false, + } + } + + /// Checks if this parser ignores the end of stream errors. + pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.c.ignore_end_of_stream } + + #[inline(never)] + fn set_encountered(&mut self, new_encounter: Encountered) -> Option { + if new_encounter <= self.encountered { + return None; + } + let prev_enc = self.encountered; + self.encountered = new_encounter; + + // If declaration was not parsed and we have encountered an element, + // emit this declaration as the next event. + if prev_enc == Encountered::None { + self.push_pos(); + Some(Ok(XmlEvent::StartDocument { + version: DEFAULT_VERSION, + encoding: self.lexer.encoding().to_string(), + standalone: DEFAULT_STANDALONE, + })) + } else { + None + } + } +} + +impl Position for PullParser { + /// Returns the position of the last event produced by the parser + #[inline] + fn position(&self) -> TextPosition { + self.pos[0] + } +} + +#[derive(Copy, Clone, PartialEq)] +pub enum State { + OutsideTag, + InsideOpeningTag(OpeningTagSubstate), + InsideClosingTag(ClosingTagSubstate), + InsideProcessingInstruction(ProcessingInstructionSubstate), + InsideComment, + InsideCData, + InsideDeclaration(DeclarationSubstate), + InsideDoctype(DoctypeSubstate), + InsideReference, + DocumentStart, +} + +#[derive(Copy, Clone, PartialEq)] +pub enum DoctypeSubstate { + Outside, + String, + InsideName, + BeforeEntityName, + EntityName, + BeforeEntityValue, + EntityValue, + NumericReferenceStart, + NumericReference, + /// expansion + PEReferenceInValue, + PEReferenceInDtd, + /// name definition + PEReferenceDefinitionStart, + PEReferenceDefinition, + SkipDeclaration, + Comment, +} + +#[derive(Copy, Clone, PartialEq)] +pub enum OpeningTagSubstate { + InsideName, + + InsideTag, + + InsideAttributeName, + AfterAttributeName, + + InsideAttributeValue, + AfterAttributeValue, +} + +#[derive(Copy, Clone, PartialEq)] +pub enum ClosingTagSubstate { + CTInsideName, + CTAfterName, +} + +#[derive(Copy, Clone, PartialEq)] +pub enum ProcessingInstructionSubstate { + PIInsideName, + PIInsideData, +} + +#[derive(Copy, Clone, PartialEq)] +pub enum DeclarationSubstate { + BeforeVersion, + InsideVersion, + AfterVersion, + + InsideVersionValue, + AfterVersionValue, + + BeforeEncoding, + InsideEncoding, + AfterEncoding, + + InsideEncodingValue, + AfterEncodingValue, + + BeforeStandaloneDecl, + InsideStandaloneDecl, + AfterStandaloneDecl, + + InsideStandaloneDeclValue, + AfterStandaloneDeclValue, +} + +#[derive(PartialEq)] +enum QualifiedNameTarget { + AttributeNameTarget, + OpeningTagNameTarget, + ClosingTagNameTarget, +} + +#[derive(Copy, Clone, PartialEq, Eq)] +enum QuoteToken { + SingleQuoteToken, + DoubleQuoteToken, +} + +impl QuoteToken { + fn from_token(t: &Token) -> QuoteToken { + match *t { + Token::SingleQuote => QuoteToken::SingleQuoteToken, + Token::DoubleQuote => QuoteToken::DoubleQuoteToken, + _ => panic!("Unexpected token: {t}"), + } + } + + fn as_token(self) -> Token { + match self { + QuoteToken::SingleQuoteToken => Token::SingleQuote, + QuoteToken::DoubleQuoteToken => Token::DoubleQuote, + } + } +} + +struct MarkupData { + name: String, // used for processing instruction name + ref_data: String, // used for reference content + + version: Option, // used for XML declaration version + encoding: Option, // used for XML declaration encoding + standalone: Option, // used for XML declaration standalone parameter + + element_name: Option, // used for element name + + quote: Option, // used to hold opening quote for attribute value + attr_name: Option, // used to hold attribute name + attributes: Vec // used to hold all accumulated attributes +} + +impl PullParser { + /// Returns next event read from the given buffer. + /// + /// This method should be always called with the same buffer. If you call it + /// providing different buffers each time, the result will be undefined. + pub fn next(&mut self, r: &mut R) -> Result { + if let Some(ref ev) = self.final_result { + return ev.clone(); + } + + if let Some(ev) = self.next_event.take() { + return ev; + } + + if self.pop_namespace { + self.pop_namespace = false; + self.nst.pop(); + } + + loop { + debug_assert!(self.next_event.is_none()); + debug_assert!(!self.pop_namespace); + + // While lexer gives us Ok(maybe_token) -- we loop. + // Upon having a complete XML-event -- we return from the whole function. + match self.lexer.next_token(r) { + Ok(Some(token)) => { + match self.dispatch_token(token) { + None => {} // continue + Some(Ok(xml_event)) => { + self.next_pos(); + return Ok(xml_event) + }, + Some(Err(xml_error)) => { + self.next_pos(); + return self.set_final_result(Err(xml_error)) + }, + } + }, + Ok(None) => break, + Err(lexer_error) => { + return self.set_final_result(Err(lexer_error)) + }, + } + } + + self.handle_eof() + } + + /// Handle end of stream + fn handle_eof(&mut self) -> std::result::Result { + // Forward pos to the lexer head + self.next_pos(); + let ev = if self.depth() == 0 { + if self.encountered == Encountered::Element && self.st == State::OutsideTag { // all is ok + Ok(XmlEvent::EndDocument) + } else if self.encountered < Encountered::Element { + self.error(SyntaxError::NoRootElement) + } else { // self.st != State::OutsideTag + self.error(SyntaxError::UnexpectedEof) // TODO: add expected hint? + } + } else if self.config.c.ignore_end_of_stream { + self.final_result = None; + self.lexer.reset_eof_handled(); + return self.error(SyntaxError::UnbalancedRootElement); + } else { + self.error(SyntaxError::UnbalancedRootElement) + }; + self.set_final_result(ev) + } + + // This function is to be called when a terminal event is reached. + // The function sets up the `self.final_result` into `Some(result)` and return `result`. + #[inline] + fn set_final_result(&mut self, result: Result) -> Result { + self.final_result = Some(result.clone()); + result + } + + #[cold] + fn error(&self, e: SyntaxError) -> Result { + Err(Error { + pos: self.lexer.position(), + kind: ErrorKind::Syntax(e.to_cow()), + }) + } + + #[inline] + fn next_pos(&mut self) { + // unfortunately calls to next_pos will never be perfectly balanced with push_pos, + // at very least because parse errors and EOF can happen unexpectedly without a prior push. + if self.pos.len() > 0 { + if self.pos.len() > 1 { + self.pos.remove(0); + } else { + self.pos[0] = self.lexer.position(); + } + } + } + + #[inline] + #[track_caller] + fn push_pos(&mut self) { + debug_assert!(self.pos.len() != self.pos.capacity(), "You've found a bug in xml-rs, caused by calls to push_pos() in states that don't end up emitting events. + This case is ignored in release mode, and merely causes document positions to be out of sync. + Please file a bug and include the XML document that triggers this assert."); + + // it has capacity preallocated for more than it ever needs, so this reduces code size + if self.pos.len() != self.pos.capacity() { + self.pos.push(self.lexer.position()); + } else if self.pos.len() > 1 { + self.pos.remove(0); // this mitigates the excessive push_pos() call + } + } + + #[inline(never)] + fn dispatch_token(&mut self, t: Token) -> Option { + match self.st { + State::OutsideTag => self.outside_tag(t), + State::InsideOpeningTag(s) => self.inside_opening_tag(t, s), + State::InsideClosingTag(s) => self.inside_closing_tag_name(t, s), + State::InsideReference => self.inside_reference(t), + State::InsideComment => self.inside_comment(t), + State::InsideCData => self.inside_cdata(t), + State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s), + State::InsideDoctype(s) => self.inside_doctype(t, s), + State::InsideDeclaration(s) => self.inside_declaration(t, s), + State::DocumentStart => self.document_start(t), + } + } + + #[inline] + fn depth(&self) -> usize { + self.est.len() + } + + #[inline] + fn buf_has_data(&self) -> bool { + !self.buf.is_empty() + } + + #[inline] + fn take_buf(&mut self) -> String { + std::mem::take(&mut self.buf) + } + + #[inline] + fn into_state(&mut self, st: State, ev: Option) -> Option { + self.st = st; + ev + } + + #[inline] + fn into_state_continue(&mut self, st: State) -> Option { + self.into_state(st, None) + } + + #[inline] + fn into_state_emit(&mut self, st: State, ev: Result) -> Option { + self.into_state(st, Some(ev)) + } + + /// Dispatches tokens in order to process qualified name. If qualified name cannot be parsed, + /// an error is returned. + /// + /// # Parameters + /// * `t` --- next token; + /// * `on_name` --- a callback which is executed when whitespace is encountered. + fn read_qualified_name(&mut self, t: Token, target: QualifiedNameTarget, on_name: F) -> Option + where F: Fn(&mut PullParser, Token, OwnedName) -> Option { + // We can get here for the first time only when self.data.name contains zero or one character, + // but first character cannot be a colon anyway + if self.buf.len() <= 1 { + self.read_prefix_separator = false; + } + + let invoke_callback = move |this: &mut PullParser, t| { + let name = this.take_buf(); + match name.parse() { + Ok(name) => on_name(this, t, name), + Err(_) => Some(this.error(SyntaxError::InvalidQualifiedName(name.into()))) + } + }; + + match t { + // There can be only one colon, and not as the first character + Token::Character(':') if self.buf_has_data() && !self.read_prefix_separator => { + self.buf.push(':'); + self.read_prefix_separator = true; + None + } + + Token::Character(c) if c != ':' && (self.buf.is_empty() && is_name_start_char(c) || + self.buf_has_data() && is_name_char(c)) => { + self.buf.push(c); + None + }, + + Token::EqualsSign if target == QualifiedNameTarget::AttributeNameTarget => invoke_callback(self, t), + + Token::EmptyTagEnd if target == QualifiedNameTarget::OpeningTagNameTarget => invoke_callback(self, t), + + Token::TagEnd if target == QualifiedNameTarget::OpeningTagNameTarget || + target == QualifiedNameTarget::ClosingTagNameTarget => invoke_callback(self, t), + + Token::Character(c) if is_whitespace_char(c) => invoke_callback(self, t), + + _ => Some(self.error(SyntaxError::UnexpectedQualifiedName(t))) + } + } + + /// Dispatches tokens in order to process attribute value. + /// + /// # Parameters + /// * `t` --- next token; + /// * `on_value` --- a callback which is called when terminating quote is encountered. + fn read_attribute_value(&mut self, t: Token, on_value: F) -> Option + where F: Fn(&mut PullParser, String) -> Option { + match t { + Token::Character(c) if self.data.quote.is_none() && is_whitespace_char(c) => None, // skip leading whitespace + + Token::DoubleQuote | Token::SingleQuote => match self.data.quote { + None => { // Entered attribute value + self.data.quote = Some(QuoteToken::from_token(&t)); + None + } + Some(q) if q.as_token() == t => { + self.data.quote = None; + let value = self.take_buf(); + on_value(self, value) + } + _ => { + if let Token::Character(c) = t { + if !self.is_valid_xml_char_not_restricted(c) { + return Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))); + } + } + t.push_to_string(&mut self.buf); + None + } + }, + + Token::ReferenceStart if self.data.quote.is_some() => { + self.state_after_reference = self.st; + self.into_state_continue(State::InsideReference) + }, + + Token::OpeningTagStart => + Some(self.error(SyntaxError::UnexpectedOpeningTag)), + + Token::Character(c) if !self.is_valid_xml_char_not_restricted(c) => { + Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) + }, + + // Every character except " and ' and < is okay + _ if self.data.quote.is_some() => { + t.push_to_string(&mut self.buf); + None + } + + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), + } + } + + fn emit_start_element(&mut self, emit_end_element: bool) -> Option { + let mut name = self.data.take_element_name()?; + let mut attributes = self.data.take_attributes(); + + // check whether the name prefix is bound and fix its namespace + match self.nst.get(name.borrow().prefix_repr()) { + Some("") => name.namespace = None, // default namespace + Some(ns) => name.namespace = Some(ns.into()), + None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into()))) + } + + // check and fix accumulated attributes prefixes + for attr in &mut attributes { + if let Some(ref pfx) = attr.name.prefix { + let new_ns = match self.nst.get(pfx) { + Some("") => None, // default namespace + Some(ns) => Some(ns.into()), + None => return Some(self.error(SyntaxError::UnboundAttribute(attr.name.to_string().into()))) + }; + attr.name.namespace = new_ns; + } + } + + if emit_end_element { + self.pop_namespace = true; + self.next_event = Some(Ok(XmlEvent::EndElement { + name: name.clone() + })); + } else { + self.est.push(name.clone()); + } + let namespace = self.nst.squash(); + self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartElement { + name, + attributes, + namespace + })) + } + + fn emit_end_element(&mut self) -> Option { + let mut name = self.data.take_element_name()?; + + // check whether the name prefix is bound and fix its namespace + match self.nst.get(name.borrow().prefix_repr()) { + Some("") => name.namespace = None, // default namespace + Some(ns) => name.namespace = Some(ns.into()), + None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into()))) + } + + let op_name = self.est.pop()?; + + if name == op_name { + self.pop_namespace = true; + self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name })) + } else { + Some(self.error(SyntaxError::UnexpectedClosingTag(format!("{name} != {op_name}").into()))) + } + } + + #[inline] + fn is_valid_xml_char(&self, c: char) -> bool { + if Some(XmlVersion::Version11) == self.data.version { + is_xml11_char(c) + } else { + is_xml10_char(c) + } + } + + #[inline] + fn is_valid_xml_char_not_restricted(&self, c: char) -> bool { + if Some(XmlVersion::Version11) == self.data.version { + is_xml11_char_not_restricted(c) + } else { + is_xml10_char(c) + } + } +} + +#[cfg(test)] +mod tests { + use std::io::BufReader; + use crate::attribute::OwnedAttribute; + use crate::common::TextPosition; + use crate::name::OwnedName; + use crate::reader::events::XmlEvent; + use crate::reader::parser::PullParser; + use crate::reader::ParserConfig; + + fn new_parser() -> PullParser { + PullParser::new(ParserConfig::new()) + } + + macro_rules! expect_event( + ($r:expr, $p:expr, $t:pat) => ( + match $p.next(&mut $r) { + $t => {} + e => panic!("Unexpected event: {e:?}\nExpected: {}", stringify!($t)) + } + ); + ($r:expr, $p:expr, $t:pat => $c:expr ) => ( + match $p.next(&mut $r) { + $t if $c => {} + e => panic!("Unexpected event: {e:?}\nExpected: {} if {}", stringify!($t), stringify!($c)) + } + ) + ); + + macro_rules! test_data( + ($d:expr) => ({ + static DATA: &'static str = $d; + let r = BufReader::new(DATA.as_bytes()); + let p = new_parser(); + (r, p) + }) + ); + + #[test] + fn issue_3_semicolon_in_attribute_value() { + let (mut r, mut p) = test_data!(r#" + + "#); + + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); + expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, ref attributes, ref namespace }) => + *name == OwnedName::local("a") && + attributes.len() == 1 && + attributes[0] == OwnedAttribute::new(OwnedName::local("attr"), "zzz;zzz") && + namespace.is_essentially_empty() + ); + expect_event!(r, p, Ok(XmlEvent::EndElement { ref name }) => *name == OwnedName::local("a")); + expect_event!(r, p, Ok(XmlEvent::EndDocument)); + } + + #[test] + fn issue_140_entity_reference_inside_tag() { + let (mut r, mut p) = test_data!(r#" + + "#); + + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); + expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, .. }) => *name == OwnedName::local("bla")); + expect_event!(r, p, Ok(XmlEvent::Characters(ref s)) => s == "\u{266b}"); + expect_event!(r, p, Ok(XmlEvent::EndElement { ref name, .. }) => *name == OwnedName::local("bla")); + expect_event!(r, p, Ok(XmlEvent::EndDocument)); + } + + #[test] + fn issue_220_comment() { + let (mut r, mut p) = test_data!(r#""#); + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); + expect_event!(r, p, Ok(XmlEvent::StartElement { .. })); + expect_event!(r, p, Ok(XmlEvent::EndElement { .. })); + expect_event!(r, p, Ok(XmlEvent::EndDocument)); + + let (mut r, mut p) = test_data!(r#""#); + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); + expect_event!(r, p, Ok(XmlEvent::StartElement { .. })); + expect_event!(r, p, Err(_)); // ---> is forbidden in comments + + let (mut r, mut p) = test_data!(r#""#); + p.config.c.ignore_comments = false; + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); + expect_event!(r, p, Ok(XmlEvent::StartElement { .. })); + expect_event!(r, p, Ok(XmlEvent::Comment(s)) => s == " "#); + expect_event!(r, p, Err(_)); + + let (mut r, mut p) = test_data!(r#""#); + expect_event!(r, p, Err(_)); + + let (mut r, mut p) = test_data!(r#""#); + expect_event!(r, p, Err(_)); + + let (mut r, mut p) = test_data!(r#""#); + expect_event!(r, p, Err(_)); + + let (mut r, mut p) = test_data!(r#""#); + expect_event!(r, p, Err(_)); + } + + #[test] + fn opening_tag_in_attribute_value() { + use crate::reader::error::{SyntaxError, Error, ErrorKind}; + + let (mut r, mut p) = test_data!(r#" + + "#); + + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); + expect_event!(r, p, Err(ref e) => + *e == Error { + kind: ErrorKind::Syntax(SyntaxError::UnexpectedOpeningTag.to_cow()), + pos: TextPosition { row: 1, column: 24 } + } + ); + } + + #[test] + fn reference_err() { + let (mut r, mut p) = test_data!(r#" + && + "#); + + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); + expect_event!(r, p, Ok(XmlEvent::StartElement { .. })); + expect_event!(r, p, Err(_)); + } + + #[test] + fn state_size() { + assert_eq!(2, std::mem::size_of::()); + assert_eq!(1, std::mem::size_of::()); + } +} diff --git a/src/reader/parser/inside_cdata.rs b/src/reader/parser/inside_cdata.rs index 3269fb4..4f46f06 100644 --- a/src/reader/parser/inside_cdata.rs +++ b/src/reader/parser/inside_cdata.rs @@ -1,14 +1,14 @@ -use reader::events::XmlEvent; -use reader::lexer::Token; +use crate::reader::error::SyntaxError; +use crate::reader::lexer::Token; +use crate::{common::is_whitespace_char, reader::events::XmlEvent}; -use super::{Result, PullParser, State}; +use super::{PullParser, Result, State}; impl PullParser { pub fn inside_cdata(&mut self, t: Token) -> Option { match t { Token::CDataEnd => { - self.lexer.enable_errors(); - let event = if self.config.cdata_to_characters { + let event = if self.config.c.cdata_to_characters { None } else { let data = self.take_buf(); @@ -17,16 +17,18 @@ impl PullParser { self.into_state(State::OutsideTag, event) } - Token::Whitespace(_) => { - t.push_to_string(&mut self.buf); + Token::Character(c) if !self.is_valid_xml_char(c) => { + Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) + }, + Token::Character(c) => { + if !is_whitespace_char(c) { + self.inside_whitespace = false; + } + self.buf.push(c); None } - _ => { - self.inside_whitespace = false; - t.push_to_string(&mut self.buf); - None - } + _ => unreachable!(), } } } diff --git a/src/reader/parser/inside_closing_tag_name.rs b/src/reader/parser/inside_closing_tag_name.rs index 1d8074a..6d86808 100644 --- a/src/reader/parser/inside_closing_tag_name.rs +++ b/src/reader/parser/inside_closing_tag_name.rs @@ -1,8 +1,7 @@ -use namespace; - -use reader::lexer::Token; - -use super::{Result, PullParser, State, QualifiedNameTarget, ClosingTagSubstate}; +use crate::reader::error::SyntaxError; +use crate::{common::is_whitespace_char, namespace}; +use crate::reader::lexer::Token; +use super::{ClosingTagSubstate, PullParser, QualifiedNameTarget, Result, State}; impl PullParser { pub fn inside_closing_tag_name(&mut self, t: Token, s: ClosingTagSubstate) -> Option { @@ -11,24 +10,22 @@ impl PullParser { match name.prefix_ref() { Some(prefix) if prefix == namespace::NS_XML_PREFIX || prefix == namespace::NS_XMLNS_PREFIX => - // TODO: {:?} is bad, need something better - Some(self_error!(this; "'{:?}' cannot be an element name prefix", name.prefix)), + Some(this.error(SyntaxError::InvalidNamePrefix(prefix.into()))), _ => { this.data.element_name = Some(name.clone()); match token { - Token::Whitespace(_) => this.into_state_continue(State::InsideClosingTag(ClosingTagSubstate::CTAfterName)), Token::TagEnd => this.emit_end_element(), - _ => Some(self_error!(this; "Unexpected token inside closing tag: {}", token)) + Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideClosingTag(ClosingTagSubstate::CTAfterName)), + _ => Some(this.error(SyntaxError::UnexpectedTokenInClosingTag(token))) } } } }), ClosingTagSubstate::CTAfterName => match t { - Token::Whitespace(_) => None, // Skip whitespace Token::TagEnd => self.emit_end_element(), - _ => Some(self_error!(self; "Unexpected token inside closing tag: {}", t)) + Token::Character(c) if is_whitespace_char(c) => None, // Skip whitespace + _ => Some(self.error(SyntaxError::UnexpectedTokenInClosingTag(t))) } } } - } diff --git a/src/reader/parser/inside_comment.rs b/src/reader/parser/inside_comment.rs index fc98320..e4132c5 100644 --- a/src/reader/parser/inside_comment.rs +++ b/src/reader/parser/inside_comment.rs @@ -1,26 +1,26 @@ -use reader::events::XmlEvent; -use reader::lexer::Token; +use crate::reader::error::SyntaxError; +use crate::reader::events::XmlEvent; +use crate::reader::lexer::Token; -use super::{Result, PullParser, State}; +use super::{PullParser, Result, State}; impl PullParser { pub fn inside_comment(&mut self, t: Token) -> Option { match t { - // Double dash is illegal inside a comment - Token::Chunk(ref s) if &s[..] == "--" => Some(self_error!(self; "Unexpected token inside a comment: --")), - - Token::CommentEnd if self.config.ignore_comments => { - self.lexer.outside_comment(); + Token::CommentEnd if self.config.c.ignore_comments => { self.into_state_continue(State::OutsideTag) } Token::CommentEnd => { - self.lexer.outside_comment(); let data = self.take_buf(); self.into_state_emit(State::OutsideTag, Ok(XmlEvent::Comment(data))) } - _ if self.config.ignore_comments => None, // Do not modify buffer if ignoring the comment + Token::Character(c) if !self.is_valid_xml_char(c) => { + Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) + }, + + _ if self.config.c.ignore_comments => None, // Do not modify buffer if ignoring the comment _ => { t.push_to_string(&mut self.buf); @@ -28,5 +28,4 @@ impl PullParser { } } } - } diff --git a/src/reader/parser/inside_declaration.rs b/src/reader/parser/inside_declaration.rs index af39d10..4ff1427 100644 --- a/src/reader/parser/inside_declaration.rs +++ b/src/reader/parser/inside_declaration.rs @@ -1,44 +1,62 @@ - -use common::XmlVersion; - -use reader::events::XmlEvent; -use reader::lexer::Token; +use crate::common::{is_whitespace_char, XmlVersion}; +use crate::reader::error::SyntaxError; +use crate::reader::events::XmlEvent; +use crate::reader::lexer::Token; +use crate::util::Encoding; use super::{ - Result, PullParser, State, DeclarationSubstate, QualifiedNameTarget, - DEFAULT_VERSION, DEFAULT_ENCODING + DeclarationSubstate, Encountered, PullParser, QualifiedNameTarget, Result, State, + DEFAULT_VERSION, }; impl PullParser { + #[inline(never)] + fn emit_start_document(&mut self) -> Option { + debug_assert!(self.encountered == Encountered::None); + self.encountered = Encountered::Declaration; + + let version = self.data.version; + let encoding = self.data.take_encoding(); + let standalone = self.data.standalone; + + if let Some(new_encoding) = encoding.as_deref() { + let new_encoding = match new_encoding.parse() { + Ok(e) => e, + Err(_) if self.config.ignore_invalid_encoding_declarations => Encoding::Latin1, + Err(_) => return Some(self.error(SyntaxError::UnsupportedEncoding(new_encoding.into()))), + }; + let current_encoding = self.lexer.encoding(); + if current_encoding != new_encoding { + let set = match (current_encoding, new_encoding) { + (Encoding::Unknown | Encoding::Default, new) if new != Encoding::Utf16 => new, + (Encoding::Utf16Be | Encoding::Utf16Le, Encoding::Utf16) => current_encoding, + _ if self.config.ignore_invalid_encoding_declarations => current_encoding, + _ => return Some(self.error(SyntaxError::ConflictingEncoding(new_encoding, current_encoding))), + }; + self.lexer.set_encoding(set); + } + } + + let current_encoding = self.lexer.encoding(); + self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartDocument { + version: version.unwrap_or(DEFAULT_VERSION), + encoding: encoding.unwrap_or_else(move || current_encoding.to_string()), + standalone + })) + } + // TODO: remove redundancy via macros or extra methods pub fn inside_declaration(&mut self, t: Token, s: DeclarationSubstate) -> Option { - macro_rules! unexpected_token( - ($this:expr; $t:expr) => (Some($this.error(format!("Unexpected token inside XML declaration: {}", $t)))); - ($t:expr) => (unexpected_token!(self; $t)); - ); - - #[inline] - fn emit_start_document(this: &mut PullParser) -> Option { - this.parsed_declaration = true; - let version = this.data.take_version(); - let encoding = this.data.take_encoding(); - let standalone = this.data.take_standalone(); - this.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartDocument { - version: version.unwrap_or(DEFAULT_VERSION), - encoding: encoding.unwrap_or(DEFAULT_ENCODING.into()), - standalone: standalone - })) - } match s { DeclarationSubstate::BeforeVersion => match t { - Token::Whitespace(_) => None, // continue Token::Character('v') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersion)), - _ => unexpected_token!(t) + Token::Character(c) if is_whitespace_char(c) => None, // continue + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideVersion => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { - match &name.local_name[..] { + match &*name.local_name { "ersion" if name.namespace.is_none() => this.into_state_continue(State::InsideDeclaration( if token == Token::EqualsSign { @@ -47,18 +65,18 @@ impl PullParser { DeclarationSubstate::AfterVersion } )), - _ => unexpected_token!(this; name) + _ => Some(this.error(SyntaxError::UnexpectedNameInsideXml(name.to_string().into()))), } }), DeclarationSubstate::AfterVersion => match t { - Token::Whitespace(_) => None, Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersionValue)), - _ => unexpected_token!(t) + Token::Character(c) if is_whitespace_char(c) => None, + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideVersionValue => self.read_attribute_value(t, |this, value| { - this.data.version = match &value[..] { + this.data.version = match &*value { "1.0" => Some(XmlVersion::Version10), "1.1" => Some(XmlVersion::Version11), _ => None @@ -66,48 +84,60 @@ impl PullParser { if this.data.version.is_some() { this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterVersionValue)) } else { - Some(self_error!(this; "Unexpected XML version value: {}", value)) + Some(this.error(SyntaxError::UnexpectedXmlVersion(value.into()))) } }), DeclarationSubstate::AfterVersionValue => match t { - Token::Whitespace(_) => None, // skip whitespace + Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeEncoding)), + Token::ProcessingInstructionEnd => self.emit_start_document(), + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), + }, + + DeclarationSubstate::BeforeEncoding => match t { Token::Character('e') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncoding)), Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)), - Token::ProcessingInstructionEnd => emit_start_document(self), - _ => unexpected_token!(t) + Token::ProcessingInstructionEnd => self.emit_start_document(), + Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideEncoding => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { - match &name.local_name[..] { + match &*name.local_name { "ncoding" if name.namespace.is_none() => this.into_state_continue(State::InsideDeclaration( if token == Token::EqualsSign { DeclarationSubstate::InsideEncodingValue } else { DeclarationSubstate::AfterEncoding } )), - _ => unexpected_token!(this; name) + _ => Some(this.error(SyntaxError::UnexpectedName(name.to_string().into()))) } }), DeclarationSubstate::AfterEncoding => match t { - Token::Whitespace(_) => None, Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncodingValue)), - _ => unexpected_token!(t) + Token::Character(c) if is_whitespace_char(c) => None, + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideEncodingValue => self.read_attribute_value(t, |this, value| { this.data.encoding = Some(value); - this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeStandaloneDecl)) + this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterEncodingValue)) }), + DeclarationSubstate::AfterEncodingValue => match t { + Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeStandaloneDecl)), + Token::ProcessingInstructionEnd => self.emit_start_document(), + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), + }, + DeclarationSubstate::BeforeStandaloneDecl => match t { - Token::Whitespace(_) => None, // skip whitespace Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)), - Token::ProcessingInstructionEnd => emit_start_document(self), - _ => unexpected_token!(t) + Token::ProcessingInstructionEnd => self.emit_start_document(), + Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideStandaloneDecl => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { - match &name.local_name[..] { + match &*name.local_name { "tandalone" if name.namespace.is_none() => this.into_state_continue(State::InsideDeclaration( if token == Token::EqualsSign { @@ -116,18 +146,18 @@ impl PullParser { DeclarationSubstate::AfterStandaloneDecl } )), - _ => unexpected_token!(this; name) + _ => Some(this.error(SyntaxError::UnexpectedName(name.to_string().into()))), } }), DeclarationSubstate::AfterStandaloneDecl => match t { - Token::Whitespace(_) => None, Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDeclValue)), - _ => unexpected_token!(t) + Token::Character(c) if is_whitespace_char(c) => None, + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideStandaloneDeclValue => self.read_attribute_value(t, |this, value| { - let standalone = match &value[..] { + let standalone = match &*value { "yes" => Some(true), "no" => Some(false), _ => None @@ -136,16 +166,15 @@ impl PullParser { this.data.standalone = standalone; this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterStandaloneDeclValue)) } else { - Some(self_error!(this; "Invalid standalone declaration value: {}", value)) + Some(this.error(SyntaxError::InvalidStandaloneDeclaration(value.into()))) } }), DeclarationSubstate::AfterStandaloneDeclValue => match t { - Token::Whitespace(_) => None, // skip whitespace - Token::ProcessingInstructionEnd => emit_start_document(self), - _ => unexpected_token!(t) - } + Token::ProcessingInstructionEnd => self.emit_start_document(), + Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), + }, } } - } diff --git a/src/reader/parser/inside_doctype.rs b/src/reader/parser/inside_doctype.rs index 8dcf367..93ea470 100644 --- a/src/reader/parser/inside_doctype.rs +++ b/src/reader/parser/inside_doctype.rs @@ -1,16 +1,235 @@ -use reader::lexer::Token; +use crate::reader::error::SyntaxError; +use crate::common::{is_name_char, is_name_start_char, is_whitespace_char}; +use crate::reader::lexer::Token; -use super::{Result, PullParser, State}; +use super::{DoctypeSubstate, PullParser, QuoteToken, Result, State}; impl PullParser { - pub fn inside_doctype(&mut self, t: Token) -> Option { - match t { - Token::TagEnd => { - self.lexer.enable_errors(); - self.into_state_continue(State::OutsideTag) - } + pub fn inside_doctype(&mut self, t: Token, substate: DoctypeSubstate) -> Option { + match substate { + DoctypeSubstate::Outside => match t { + Token::TagEnd => self.into_state_continue(State::OutsideTag), + Token::MarkupDeclarationStart => { + self.buf.clear(); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::InsideName)) + }, + Token::Character('%') => { + self.data.ref_data.clear(); + self.data.ref_data.push('%'); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceInDtd)) + }, + Token::CommentStart => { + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Comment)) + }, + Token::SingleQuote | Token::DoubleQuote => { + // just discard string literals + self.data.quote = Some(super::QuoteToken::from_token(&t)); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::String)) + }, + Token::CDataEnd | Token::CDataStart => Some(self.error(SyntaxError::UnexpectedToken(t))), + // TODO: parse SYSTEM, and [ + _ => None, + }, + DoctypeSubstate::String => match t { + Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => { None }, + Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => { None }, + Token::SingleQuote | Token::DoubleQuote => { + self.data.quote = None; + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) + }, + _ => None, + }, + DoctypeSubstate::Comment => match t { + Token::CommentEnd => { + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) + }, + _ => None, + }, + DoctypeSubstate::InsideName => match t { + Token::Character(c @ 'A'..='Z') => { + self.buf.push(c); + None + }, + Token::Character(c) if is_whitespace_char(c) => { + match self.buf.as_str() { + "ENTITY" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityName)), + "NOTATION" | "ELEMENT" | "ATTLIST" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)), + s => Some(self.error(SyntaxError::UnknownMarkupDeclaration(s.into()))), + } - _ => None + }, + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), + }, + DoctypeSubstate::BeforeEntityName => { + self.data.name.clear(); + match t { + Token::Character(c) if is_whitespace_char(c) => None, + Token::Character('%') => { // % is for PEDecl + self.data.name.push('%'); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceDefinitionStart)) + }, + Token::Character(c) if is_name_start_char(c) => { + self.data.name.push(c); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityName)) + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + } + }, + DoctypeSubstate::EntityName => match t { + Token::Character(c) if is_whitespace_char(c) => { + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue)) + }, + Token::Character(c) if is_name_char(c) => { + self.data.name.push(c); + None + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::BeforeEntityValue => { + self.buf.clear(); + match t { + Token::Character(c) if is_whitespace_char(c) => None, + // SYSTEM/PUBLIC not supported + Token::Character('S' | 'P') => { + let name = self.data.take_name(); + self.entities.entry(name).or_insert_with(String::new); // Dummy value, but at least the name is recognized + + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)) + }, + Token::SingleQuote | Token::DoubleQuote => { + self.data.quote = Some(super::QuoteToken::from_token(&t)); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue)) + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + } + }, + DoctypeSubstate::EntityValue => match t { + Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => { self.buf.push('\''); None }, + Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => { self.buf.push('"'); None }, + Token::SingleQuote | Token::DoubleQuote => { + self.data.quote = None; + let name = self.data.take_name(); + let val = self.take_buf(); + self.entities.entry(name).or_insert(val); // First wins + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)) // FIXME + }, + Token::ReferenceStart | Token::Character('&') => { + self.data.ref_data.clear(); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::NumericReferenceStart)) + }, + Token::Character('%') => { + self.data.ref_data.clear(); + self.data.ref_data.push('%'); // include literal % in the name to distinguish from regular entities + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceInValue)) + }, + Token::Character(c) if !self.is_valid_xml_char(c) => { + Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) + }, + Token::Character(c) => { + self.buf.push(c); + None + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::PEReferenceDefinitionStart => match t { + Token::Character(c) if is_whitespace_char(c) => { + None + }, + Token::Character(c) if is_name_start_char(c) => { + debug_assert_eq!(self.data.name, "%"); + self.data.name.push(c); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceDefinition)) + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::PEReferenceDefinition => match t { + Token::Character(c) if is_name_char(c) => { + self.data.name.push(c); + None + }, + Token::Character(c) if is_whitespace_char(c) => { + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue)) + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::PEReferenceInDtd => match t { + Token::Character(c) if is_name_char(c) => { + self.data.ref_data.push(c); + None + }, + Token::ReferenceEnd | Token::Character(';') => { + let name = self.data.take_ref_data(); + match self.entities.get(&name) { + Some(ent) => { + if let Err(e) = self.lexer.reparse(ent) { + return Some(Err(e)); + } + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) + }, + None => Some(self.error(SyntaxError::UndefinedEntity(name.into()))), + } + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::PEReferenceInValue => match t { + Token::Character(c) if is_name_char(c) => { + self.data.ref_data.push(c); + None + }, + Token::ReferenceEnd | Token::Character(';') => { + let name = self.data.take_ref_data(); + match self.entities.get(&name) { + Some(ent) => { + self.buf.push_str(ent); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue)) + }, + None => Some(self.error(SyntaxError::UndefinedEntity(name.into()))), + } + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::NumericReferenceStart => match t { + Token::Character('#') => { + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::NumericReference)) + }, + Token::Character(c) if !self.is_valid_xml_char(c) => { + Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) + }, + Token::Character(c) => { + self.buf.push('&'); + self.buf.push(c); + // named entities are not expanded inside doctype + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue)) + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::NumericReference => match t { + Token::ReferenceEnd | Token::Character(';') => { + let r = self.data.take_ref_data(); + // https://www.w3.org/TR/xml/#sec-entexpand + match self.numeric_reference_from_str(&r) { + Ok(c) => { + self.buf.push(c); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue)) + } + Err(e) => Some(self.error(e)), + } + }, + Token::Character(c) if !self.is_valid_xml_char(c) => { + Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) + }, + Token::Character(c) => { + self.data.ref_data.push(c); + None + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::SkipDeclaration => match t { + Token::TagEnd => { + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) + }, + _ => None, + }, } } } diff --git a/src/reader/parser/inside_opening_tag.rs b/src/reader/parser/inside_opening_tag.rs index 533874f..b7f185a 100644 --- a/src/reader/parser/inside_opening_tag.rs +++ b/src/reader/parser/inside_opening_tag.rs @@ -1,26 +1,26 @@ -use common::is_name_start_char; -use attribute::OwnedAttribute; -use namespace; +use crate::reader::error::SyntaxError; +use crate::common::is_name_start_char; +use crate::namespace; +use crate::{attribute::OwnedAttribute, common::is_whitespace_char}; -use reader::lexer::Token; +use crate::reader::lexer::Token; -use super::{Result, PullParser, State, OpeningTagSubstate, QualifiedNameTarget}; +use super::{OpeningTagSubstate, PullParser, QualifiedNameTarget, Result, State}; impl PullParser { pub fn inside_opening_tag(&mut self, t: Token, s: OpeningTagSubstate) -> Option { - macro_rules! unexpected_token(($t:expr) => (Some(self_error!(self; "Unexpected token inside opening tag: {}", $t)))); match s { OpeningTagSubstate::InsideName => self.read_qualified_name(t, QualifiedNameTarget::OpeningTagNameTarget, |this, token, name| { match name.prefix_ref() { Some(prefix) if prefix == namespace::NS_XML_PREFIX || prefix == namespace::NS_XMLNS_PREFIX => - Some(self_error!(this; "'{:?}' cannot be an element name prefix", name.prefix)), + Some(this.error(SyntaxError::InvalidNamePrefix(prefix.into()))), _ => { this.data.element_name = Some(name.clone()); match token { Token::TagEnd => this.emit_start_element(false), Token::EmptyTagEnd => this.emit_start_element(true), - Token::Whitespace(_) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)), + Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)), _ => unreachable!() } } @@ -28,66 +28,65 @@ impl PullParser { }), OpeningTagSubstate::InsideTag => match t { - Token::Whitespace(_) => None, // skip whitespace + Token::TagEnd => self.emit_start_element(false), + Token::EmptyTagEnd => self.emit_start_element(true), + Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace Token::Character(c) if is_name_start_char(c) => { self.buf.push(c); self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeName)) } - Token::TagEnd => self.emit_start_element(false), - Token::EmptyTagEnd => self.emit_start_element(true), - _ => unexpected_token!(t) + _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))) }, OpeningTagSubstate::InsideAttributeName => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { this.data.attr_name = Some(name); match token { - Token::Whitespace(_) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeName)), Token::EqualsSign => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)), + Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeName)), _ => unreachable!() } }), OpeningTagSubstate::AfterAttributeName => match t { - Token::Whitespace(_) => None, Token::EqualsSign => self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)), - _ => unexpected_token!(t) + Token::Character(c) if is_whitespace_char(c) => None, + _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))) }, OpeningTagSubstate::InsideAttributeValue => self.read_attribute_value(t, |this, value| { - let name = this.data.take_attr_name().unwrap(); // unwrap() will always succeed here - + let name = this.data.take_attr_name()?; // will always succeed here // check that no attribute with such name is already present // if there is one, XML is not well-formed - if this.data.attributes.iter().find(|a| a.name == name).is_some() { // TODO: looks bad + if this.data.attributes.iter().any(|a| a.name == name) { // TODO: looks bad // TODO: ideally this error should point to the beginning of the attribute, // TODO: not the end of its value - Some(self_error!(this; "Attribute '{}' is redefined", name)) + Some(this.error(SyntaxError::RedefinedAttribute(name.to_string().into()))) } else { match name.prefix_ref() { // declaring a new prefix; it is sufficient to check prefix only // because "xmlns" prefix is reserved Some(namespace::NS_XMLNS_PREFIX) => { - let ln = &name.local_name[..]; + let ln = &*name.local_name; if ln == namespace::NS_XMLNS_PREFIX { - Some(self_error!(this; "Cannot redefine prefix '{}'", namespace::NS_XMLNS_PREFIX)) - } else if ln == namespace::NS_XML_PREFIX && &value[..] != namespace::NS_XML_URI { - Some(self_error!(this; "Prefix '{}' cannot be rebound to another value", namespace::NS_XML_PREFIX)) + Some(this.error(SyntaxError::CannotRedefineXmlnsPrefix)) + } else if ln == namespace::NS_XML_PREFIX && &*value != namespace::NS_XML_URI { + Some(this.error(SyntaxError::CannotRedefineXmlPrefix)) } else if value.is_empty() { - Some(self_error!(this; "Cannot undefine prefix '{}'", ln)) + Some(this.error(SyntaxError::CannotUndefinePrefix(ln.into()))) } else { this.nst.put(name.local_name.clone(), value); - this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)) + this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue)) } } // declaring default namespace - None if &name.local_name[..] == namespace::NS_XMLNS_PREFIX => - match &value[..] { - namespace::NS_XMLNS_PREFIX | namespace::NS_XML_PREFIX => - Some(self_error!(this; "Namespace '{}' cannot be default", value)), + None if &*name.local_name == namespace::NS_XMLNS_PREFIX => + match &*value { + namespace::NS_XMLNS_PREFIX | namespace::NS_XML_PREFIX | namespace::NS_XML_URI | namespace::NS_XMLNS_URI => + Some(this.error(SyntaxError::InvalidDefaultNamespace(value.into()))), _ => { this.nst.put(namespace::NS_NO_PREFIX, value.clone()); - this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)) + this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue)) } }, @@ -95,14 +94,20 @@ impl PullParser { _ => { this.data.attributes.push(OwnedAttribute { name: name.clone(), - value: value + value }); - this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)) + this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue)) } } } - }) + }), + + OpeningTagSubstate::AfterAttributeValue => match t { + Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)), + Token::TagEnd => self.emit_start_element(false), + Token::EmptyTagEnd => self.emit_start_element(true), + _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))) + }, } } - } diff --git a/src/reader/parser/inside_processing_instruction.rs b/src/reader/parser/inside_processing_instruction.rs index 8ddf6b8..96f6753 100644 --- a/src/reader/parser/inside_processing_instruction.rs +++ b/src/reader/parser/inside_processing_instruction.rs @@ -1,18 +1,20 @@ -use common::{ - is_name_start_char, is_name_char, -}; +use crate::reader::error::SyntaxError; +use crate::common::{is_name_char, is_name_start_char, is_whitespace_char}; -use reader::events::XmlEvent; -use reader::lexer::Token; +use crate::reader::events::XmlEvent; +use crate::reader::lexer::Token; -use super::{Result, PullParser, State, ProcessingInstructionSubstate, DeclarationSubstate}; +use super::{DeclarationSubstate, ProcessingInstructionSubstate, PullParser, Result, State, Encountered}; impl PullParser { pub fn inside_processing_instruction(&mut self, t: Token, s: ProcessingInstructionSubstate) -> Option { match s { ProcessingInstructionSubstate::PIInsideName => match t { - Token::Character(c) if !self.buf_has_data() && is_name_start_char(c) || - self.buf_has_data() && is_name_char(c) => self.append_char_continue(c), + Token::Character(c) if self.buf.is_empty() && is_name_start_char(c) || + self.buf_has_data() && is_name_char(c) => { + self.buf.push(c); + None + }, Token::ProcessingInstructionEnd => { // self.buf contains PI name @@ -20,70 +22,83 @@ impl PullParser { // Don't need to check for declaration because it has mandatory attributes // but there is none - match &name[..] { + match &*name { // Name is empty, it is an error - "" => Some(self_error!(self; "Encountered processing instruction without name")), + "" => Some(self.error(SyntaxError::ProcessingInstructionWithoutName)), // Found - Some(self_error!(self; "Invalid processing instruction: + Some(self.error(SyntaxError::InvalidXmlProcessingInstruction(name.into()))), // All is ok, emitting event _ => { - self.into_state_emit( - State::OutsideTag, - Ok(XmlEvent::ProcessingInstruction { - name: name, - data: None - }) - ) + debug_assert!(self.next_event.is_none(), "{:?}", self.next_event); + // can't have a PI before ` { + Token::Character(c) if is_whitespace_char(c) => { // self.buf contains PI name let name = self.take_buf(); - match &name[..] { + match &*name { // We have not ever encountered an element and have not parsed XML declaration - "xml" if !self.encountered_element && !self.parsed_declaration => + "xml" if self.encountered == Encountered::None => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeVersion)), // Found - Some(self_error!(self; "Invalid processing instruction: + Some(self.error(SyntaxError::InvalidXmlProcessingInstruction(name.into()))), // All is ok, starting parsing PI data _ => { - self.lexer.disable_errors(); // data is arbitrary, so disable errors self.data.name = name; - self.into_state_continue(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideData)) + // can't have a PI before ` Some(self_error!(self; "Unexpected token: { + let buf = self.take_buf(); + Some(self.error(SyntaxError::UnexpectedProcessingInstruction(buf.into(), t))) + } }, ProcessingInstructionSubstate::PIInsideData => match t { Token::ProcessingInstructionEnd => { - self.lexer.enable_errors(); let name = self.data.take_name(); let data = self.take_buf(); self.into_state_emit( State::OutsideTag, Ok(XmlEvent::ProcessingInstruction { - name: name, - data: Some(data) - }) + name, + data: Some(data), + }), ) }, + Token::Character(c) if !self.is_valid_xml_char(c) => { + Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) + }, + // Any other token should be treated as plain characters _ => { t.push_to_string(&mut self.buf); @@ -92,5 +107,4 @@ impl PullParser { }, } } - } diff --git a/src/reader/parser/inside_reference.rs b/src/reader/parser/inside_reference.rs index 60026d5..9a15e09 100644 --- a/src/reader/parser/inside_reference.rs +++ b/src/reader/parser/inside_reference.rs @@ -1,13 +1,11 @@ +use crate::reader::error::SyntaxError; use std::char; - -use common::{is_name_start_char, is_name_char, is_whitespace_str}; - -use reader::lexer::Token; - -use super::{Result, PullParser, State}; +use crate::common::{is_name_char, is_name_start_char, is_whitespace_char}; +use crate::reader::lexer::Token; +use super::{PullParser, Result, State}; impl PullParser { - pub fn inside_reference(&mut self, t: Token, prev_st: State) -> Option { + pub fn inside_reference(&mut self, t: Token) -> Option { match t { Token::Character(c) if !self.data.ref_data.is_empty() && is_name_char(c) || self.data.ref_data.is_empty() && (is_name_start_char(c) || c == '#') => { @@ -16,74 +14,64 @@ impl PullParser { } Token::ReferenceEnd => { - // TODO: check for unicode correctness let name = self.data.take_ref_data(); - let name_len = name.len(); // compute once - let c = match &name[..] { - "lt" => Ok('<'.to_string()), - "gt" => Ok('>'.to_string()), - "amp" => Ok('&'.to_string()), - "apos" => Ok('\''.to_string()), - "quot" => Ok('"'.to_string()), - "" => Err(self_error!(self; "Encountered empty entity")), - _ if name_len > 2 && name.starts_with("#x") => { - let num_str = &name[2..name_len]; - if num_str == "0" { - Err(self_error!(self; "Null character entity is not allowed")) - } else { - if self.config.replace_unknown_entity_references { - match u32::from_str_radix(num_str, 16).ok().map(|i| char::from_u32(i).unwrap_or('\u{fffd}')) { - Some(c) => Ok(c.to_string()), - None => Err(self_error!(self; "Invalid hexadecimal character number in an entity: {}", name)) - } - } else { - match u32::from_str_radix(num_str, 16).ok().and_then(char::from_u32) { - Some(c) => Ok(c.to_string()), - None => Err(self_error!(self; "Invalid hexadecimal character number in an entity: {}", name)) - } - } - } - } - _ if name_len > 1 && name.starts_with('#') => { - let num_str = &name[1..name_len]; - if num_str == "0" { - Err(self_error!(self; "Null character entity is not allowed")) - } else { - if self.config.replace_unknown_entity_references { - match u32::from_str_radix(num_str, 10).ok().map(|i| char::from_u32(i).unwrap_or('\u{fffd}')) { - Some(c) => Ok(c.to_string()), - None => Err(self_error!(self; "Invalid decimal character number in an entity: {}", name)) - } - } - else { - match u32::from_str_radix(num_str, 10).ok().and_then(char::from_u32) { - Some(c) => Ok(c.to_string()), - None => Err(self_error!(self; "Invalid decimal character number in an entity: {}", name)) - } - } - } + if name.is_empty() { + return Some(self.error(SyntaxError::EmptyEntity)); + } + + let c = match &*name { + "lt" => Some('<'), + "gt" => Some('>'), + "amp" => Some('&'), + "apos" => Some('\''), + "quot" => Some('"'), + _ if name.starts_with('#') => match self.numeric_reference_from_str(&name[1..]) { + Ok(c) => Some(c), + Err(e) => return Some(self.error(e)) }, - _ => { - if let Some(v) = self.config.extra_entities.get(&name) { - Ok(v.clone()) - } else { - Err(self_error!(self; "Unexpected entity: {}", name)) - } - } + _ => None, }; - match c { - Ok(c) => { - self.buf.push_str(&c); - if prev_st == State::OutsideTag && !is_whitespace_str(&c) { - self.inside_whitespace = false; + if let Some(c) = c { + self.buf.push(c); + } else if let Some(v) = self.config.c.extra_entities.get(&name) { + self.buf.push_str(v); + } else if let Some(v) = self.entities.get(&name) { + if self.state_after_reference == State::OutsideTag { + // an entity can expand to *elements*, so outside of a tag it needs a full reparse + if let Err(e) = self.lexer.reparse(v) { + return Some(Err(e)); } - self.into_state_continue(prev_st) + } else { + // however, inside attributes it's not allowed to affect attribute quoting, + // so it can't be fed to the lexer + self.buf.push_str(v); } - Err(e) => Some(e) + } else { + return Some(self.error(SyntaxError::UnexpectedEntity(name.into()))); + } + let prev_st = self.state_after_reference; + if prev_st == State::OutsideTag && !is_whitespace_char(self.buf.chars().last().unwrap_or('\0')) { + self.inside_whitespace = false; } + self.into_state_continue(prev_st) } - _ => Some(self_error!(self; "Unexpected token inside an entity: {}", t)) + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + } + } + + pub(crate) fn numeric_reference_from_str(&self, num_str: &str) -> std::result::Result { + let val = if let Some(hex) = num_str.strip_prefix('x') { + u32::from_str_radix(hex, 16).map_err(move |_| SyntaxError::InvalidNumericEntity(num_str.into()))? + } else { + u32::from_str_radix(num_str, 10).map_err(move |_| SyntaxError::InvalidNumericEntity(num_str.into()))? + }; + match char::from_u32(val) { + Some(c) if self.is_valid_xml_char(c) => Ok(c), + None if self.config.c.replace_unknown_entity_references => { + Ok('\u{fffd}') + }, + _ => Err(SyntaxError::InvalidCharacterEntity(val)), } } } diff --git a/src/reader/parser/mod.rs b/src/reader/parser/mod.rs deleted file mode 100644 index 58ca3a6..0000000 --- a/src/reader/parser/mod.rs +++ /dev/null @@ -1,622 +0,0 @@ -//! Contains an implementation of pull-based XML parser. - -use std::mem; -use std::borrow::Cow; -use std::io::prelude::*; - -use common::{ - self, - XmlVersion, Position, TextPosition, - is_name_start_char, is_name_char, -}; -use name::OwnedName; -use attribute::OwnedAttribute; -use namespace::NamespaceStack; - -use reader::events::XmlEvent; -use reader::config::ParserConfig; -use reader::lexer::{Lexer, Token}; - -macro_rules! gen_takes( - ($($field:ident -> $method:ident, $t:ty, $def:expr);+) => ( - $( - impl MarkupData { - #[inline] - fn $method(&mut self) -> $t { - mem::replace(&mut self.$field, $def) - } - } - )+ - ) -); - -gen_takes!( - name -> take_name, String, String::new(); - ref_data -> take_ref_data, String, String::new(); - - version -> take_version, Option, None; - encoding -> take_encoding, Option, None; - standalone -> take_standalone, Option, None; - - element_name -> take_element_name, Option, None; - - attr_name -> take_attr_name, Option, None; - attributes -> take_attributes, Vec, vec!() -); - -macro_rules! self_error( - ($this:ident; $msg:expr) => ($this.error($msg)); - ($this:ident; $fmt:expr, $($arg:expr),+) => ($this.error(format!($fmt, $($arg),+))) -); - -mod outside_tag; -mod inside_processing_instruction; -mod inside_declaration; -mod inside_doctype; -mod inside_opening_tag; -mod inside_closing_tag_name; -mod inside_comment; -mod inside_cdata; -mod inside_reference; - -static DEFAULT_VERSION: XmlVersion = XmlVersion::Version10; -static DEFAULT_ENCODING: &'static str = "UTF-8"; -static DEFAULT_STANDALONE: Option = None; - -type ElementStack = Vec; -pub type Result = super::Result; - -/// Pull-based XML parser. -pub struct PullParser { - config: ParserConfig, - lexer: Lexer, - st: State, - buf: String, - nst: NamespaceStack, - - data: MarkupData, - final_result: Option, - next_event: Option, - est: ElementStack, - pos: Vec, - - encountered_element: bool, - parsed_declaration: bool, - inside_whitespace: bool, - read_prefix_separator: bool, - pop_namespace: bool -} - -impl PullParser { - /// Returns a new parser using the given config. - pub fn new(config: ParserConfig) -> PullParser { - PullParser { - config: config, - lexer: Lexer::new(), - st: State::OutsideTag, - buf: String::new(), - nst: NamespaceStack::default(), - - data: MarkupData { - name: String::new(), - version: None, - encoding: None, - standalone: None, - ref_data: String::new(), - element_name: None, - quote: None, - attr_name: None, - attributes: Vec::new() - }, - final_result: None, - next_event: None, - est: Vec::new(), - pos: vec![TextPosition::new()], - - encountered_element: false, - parsed_declaration: false, - inside_whitespace: true, - read_prefix_separator: false, - pop_namespace: false - } - } - - /// Checks if this parser ignores the end of stream errors. - pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.ignore_end_of_stream } -} - -impl Position for PullParser { - /// Returns the position of the last event produced by the parser - #[inline] - fn position(&self) -> TextPosition { - self.pos[0] - } -} - -#[derive(Clone, PartialEq)] -pub enum State { - OutsideTag, - InsideOpeningTag(OpeningTagSubstate), - InsideClosingTag(ClosingTagSubstate), - InsideProcessingInstruction(ProcessingInstructionSubstate), - InsideComment, - InsideCData, - InsideDeclaration(DeclarationSubstate), - InsideDoctype, - InsideReference(Box) -} - -#[derive(Clone, PartialEq)] -pub enum OpeningTagSubstate { - InsideName, - - InsideTag, - - InsideAttributeName, - AfterAttributeName, - - InsideAttributeValue, -} - -#[derive(Clone, PartialEq)] -pub enum ClosingTagSubstate { - CTInsideName, - CTAfterName -} - -#[derive(Clone, PartialEq)] -pub enum ProcessingInstructionSubstate { - PIInsideName, - PIInsideData -} - -#[derive(Clone, PartialEq)] -pub enum DeclarationSubstate { - BeforeVersion, - InsideVersion, - AfterVersion, - - InsideVersionValue, - AfterVersionValue, - - InsideEncoding, - AfterEncoding, - - InsideEncodingValue, - - BeforeStandaloneDecl, - InsideStandaloneDecl, - AfterStandaloneDecl, - - InsideStandaloneDeclValue, - AfterStandaloneDeclValue -} - -#[derive(PartialEq)] -enum QualifiedNameTarget { - AttributeNameTarget, - OpeningTagNameTarget, - ClosingTagNameTarget -} - -#[derive(Copy, Clone, PartialEq, Eq)] -enum QuoteToken { - SingleQuoteToken, - DoubleQuoteToken -} - -impl QuoteToken { - fn from_token(t: &Token) -> QuoteToken { - match *t { - Token::SingleQuote => QuoteToken::SingleQuoteToken, - Token::DoubleQuote => QuoteToken::DoubleQuoteToken, - _ => panic!("Unexpected token: {}", t) - } - } - - fn as_token(self) -> Token { - match self { - QuoteToken::SingleQuoteToken => Token::SingleQuote, - QuoteToken::DoubleQuoteToken => Token::DoubleQuote - } - } -} - -struct MarkupData { - name: String, // used for processing instruction name - ref_data: String, // used for reference content - - version: Option, // used for XML declaration version - encoding: Option, // used for XML declaration encoding - standalone: Option, // used for XML declaration standalone parameter - - element_name: Option, // used for element name - - quote: Option, // used to hold opening quote for attribute value - attr_name: Option, // used to hold attribute name - attributes: Vec // used to hold all accumulated attributes -} - -impl PullParser { - /// Returns next event read from the given buffer. - /// - /// This method should be always called with the same buffer. If you call it - /// providing different buffers each time, the result will be undefined. - pub fn next(&mut self, r: &mut R) -> Result { - if let Some(ref ev) = self.final_result { - return ev.clone(); - } - - if let Some(ev) = self.next_event.take() { - return ev; - } - - if self.pop_namespace { - self.pop_namespace = false; - self.nst.pop(); - } - - loop { - // While lexer gives us Ok(maybe_token) -- we loop. - // Upon having a complete XML-event -- we return from the whole function. - match self.lexer.next_token(r) { - Ok(maybe_token) => - match maybe_token { - None => break, - Some(token) => - match self.dispatch_token(token) { - None => {} // continue - Some(Ok(XmlEvent::EndDocument)) => - return { - self.next_pos(); - self.set_final_result(Ok(XmlEvent::EndDocument)) - }, - Some(Ok(xml_event)) => - return { - self.next_pos(); - Ok(xml_event) - }, - Some(Err(xml_error)) => - return { - self.next_pos(); - self.set_final_result(Err(xml_error)) - }, - } - }, - Err(lexer_error) => - return self.set_final_result(Err(lexer_error)), - } - } - - // Handle end of stream - // Forward pos to the lexer head - self.next_pos(); - let ev = if self.depth() == 0 { - if self.encountered_element && self.st == State::OutsideTag { // all is ok - Ok(XmlEvent::EndDocument) - } else if !self.encountered_element { - self_error!(self; "Unexpected end of stream: no root element found") - } else { // self.st != State::OutsideTag - self_error!(self; "Unexpected end of stream") // TODO: add expected hint? - } - } else { - if self.config.ignore_end_of_stream { - self.final_result = None; - self.lexer.reset_eof_handled(); - return self_error!(self; "Unexpected end of stream: still inside the root element"); - } else { - self_error!(self; "Unexpected end of stream: still inside the root element") - } - }; - self.set_final_result(ev) - } - - // This function is to be called when a terminal event is reached. - // The function sets up the `self.final_result` into `Some(result)` and return `result`. - fn set_final_result(&mut self, result: Result) -> Result { - self.final_result = Some(result.clone()); - result - } - - #[inline] - fn error>>(&self, msg: M) -> Result { - Err((&self.lexer, msg).into()) - } - - #[inline] - fn next_pos(&mut self) { - if self.pos.len() > 1 { - self.pos.remove(0); - } else { - self.pos[0] = self.lexer.position(); - } - } - - #[inline] - fn push_pos(&mut self) { - self.pos.push(self.lexer.position()); - } - - fn dispatch_token(&mut self, t: Token) -> Option { - match self.st.clone() { - State::OutsideTag => self.outside_tag(t), - State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s), - State::InsideDeclaration(s) => self.inside_declaration(t, s), - State::InsideDoctype => self.inside_doctype(t), - State::InsideOpeningTag(s) => self.inside_opening_tag(t, s), - State::InsideClosingTag(s) => self.inside_closing_tag_name(t, s), - State::InsideComment => self.inside_comment(t), - State::InsideCData => self.inside_cdata(t), - State::InsideReference(s) => self.inside_reference(t, *s) - } - } - - #[inline] - fn depth(&self) -> usize { - self.est.len() - } - - #[inline] - fn buf_has_data(&self) -> bool { - self.buf.len() > 0 - } - - #[inline] - fn take_buf(&mut self) -> String { - mem::replace(&mut self.buf, String::new()) - } - - #[inline] - fn append_char_continue(&mut self, c: char) -> Option { - self.buf.push(c); - None - } - - #[inline] - fn into_state(&mut self, st: State, ev: Option) -> Option { - self.st = st; - ev - } - - #[inline] - fn into_state_continue(&mut self, st: State) -> Option { - self.into_state(st, None) - } - - #[inline] - fn into_state_emit(&mut self, st: State, ev: Result) -> Option { - self.into_state(st, Some(ev)) - } - - /// Dispatches tokens in order to process qualified name. If qualified name cannot be parsed, - /// an error is returned. - /// - /// # Parameters - /// * `t` --- next token; - /// * `on_name` --- a callback which is executed when whitespace is encountered. - fn read_qualified_name(&mut self, t: Token, target: QualifiedNameTarget, on_name: F) -> Option - where F: Fn(&mut PullParser, Token, OwnedName) -> Option { - // We can get here for the first time only when self.data.name contains zero or one character, - // but first character cannot be a colon anyway - if self.buf.len() <= 1 { - self.read_prefix_separator = false; - } - - let invoke_callback = |this: &mut PullParser, t| { - let name = this.take_buf(); - match name.parse() { - Ok(name) => on_name(this, t, name), - Err(_) => Some(self_error!(this; "Qualified name is invalid: {}", name)) - } - }; - - match t { - // There can be only one colon, and not as the first character - Token::Character(':') if self.buf_has_data() && !self.read_prefix_separator => { - self.buf.push(':'); - self.read_prefix_separator = true; - None - } - - Token::Character(c) if c != ':' && (!self.buf_has_data() && is_name_start_char(c) || - self.buf_has_data() && is_name_char(c)) => - self.append_char_continue(c), - - Token::EqualsSign if target == QualifiedNameTarget::AttributeNameTarget => invoke_callback(self, t), - - Token::EmptyTagEnd if target == QualifiedNameTarget::OpeningTagNameTarget => invoke_callback(self, t), - - Token::TagEnd if target == QualifiedNameTarget::OpeningTagNameTarget || - target == QualifiedNameTarget::ClosingTagNameTarget => invoke_callback(self, t), - - Token::Whitespace(_) => invoke_callback(self, t), - - _ => Some(self_error!(self; "Unexpected token inside qualified name: {}", t)) - } - } - - /// Dispatches tokens in order to process attribute value. - /// - /// # Parameters - /// * `t` --- next token; - /// * `on_value` --- a callback which is called when terminating quote is encountered. - fn read_attribute_value(&mut self, t: Token, on_value: F) -> Option - where F: Fn(&mut PullParser, String) -> Option { - match t { - Token::Whitespace(_) if self.data.quote.is_none() => None, // skip leading whitespace - - Token::DoubleQuote | Token::SingleQuote => match self.data.quote { - None => { // Entered attribute value - self.data.quote = Some(QuoteToken::from_token(&t)); - None - } - Some(q) if q.as_token() == t => { - self.data.quote = None; - let value = self.take_buf(); - on_value(self, value) - } - _ => { - t.push_to_string(&mut self.buf); - None - } - }, - - Token::ReferenceStart => { - let st = Box::new(self.st.clone()); - self.into_state_continue(State::InsideReference(st)) - } - - Token::OpeningTagStart => - Some(self_error!(self; "Unexpected token inside attribute value: <")), - - // Every character except " and ' and < is okay - _ => { - t.push_to_string(&mut self.buf); - None - } - } - } - - fn emit_start_element(&mut self, emit_end_element: bool) -> Option { - let mut name = self.data.take_element_name().unwrap(); - let mut attributes = self.data.take_attributes(); - - // check whether the name prefix is bound and fix its namespace - match self.nst.get(name.borrow().prefix_repr()) { - Some("") => name.namespace = None, // default namespace - Some(ns) => name.namespace = Some(ns.into()), - None => return Some(self_error!(self; "Element {} prefix is unbound", name)) - } - - // check and fix accumulated attributes prefixes - for attr in attributes.iter_mut() { - if let Some(ref pfx) = attr.name.prefix { - let new_ns = match self.nst.get(pfx) { - Some("") => None, // default namespace - Some(ns) => Some(ns.into()), - None => return Some(self_error!(self; "Attribute {} prefix is unbound", attr.name)) - }; - attr.name.namespace = new_ns; - } - } - - if emit_end_element { - self.pop_namespace = true; - self.next_event = Some(Ok(XmlEvent::EndElement { - name: name.clone() - })); - } else { - self.est.push(name.clone()); - } - let namespace = self.nst.squash(); - self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartElement { - name: name, - attributes: attributes, - namespace: namespace - })) - } - - fn emit_end_element(&mut self) -> Option { - let mut name = self.data.take_element_name().unwrap(); - - // check whether the name prefix is bound and fix its namespace - match self.nst.get(name.borrow().prefix_repr()) { - Some("") => name.namespace = None, // default namespace - Some(ns) => name.namespace = Some(ns.into()), - None => return Some(self_error!(self; "Element {} prefix is unbound", name)) - } - - let op_name = self.est.pop().unwrap(); - - if name == op_name { - self.pop_namespace = true; - self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name: name })) - } else { - Some(self_error!(self; "Unexpected closing tag: {}, expected {}", name, op_name)) - } - } - -} - -#[cfg(test)] -mod tests { - use std::io::BufReader; - - use common::{Position, TextPosition}; - use name::OwnedName; - use attribute::OwnedAttribute; - use reader::parser::PullParser; - use reader::ParserConfig; - use reader::events::XmlEvent; - - fn new_parser() -> PullParser { - PullParser::new(ParserConfig::new()) - } - - macro_rules! expect_event( - ($r:expr, $p:expr, $t:pat) => ( - match $p.next(&mut $r) { - $t => {} - e => panic!("Unexpected event: {:?}", e) - } - ); - ($r:expr, $p:expr, $t:pat => $c:expr ) => ( - match $p.next(&mut $r) { - $t if $c => {} - e => panic!("Unexpected event: {:?}", e) - } - ) - ); - - macro_rules! test_data( - ($d:expr) => ({ - static DATA: &'static str = $d; - let r = BufReader::new(DATA.as_bytes()); - let p = new_parser(); - (r, p) - }) - ); - - #[test] - fn issue_3_semicolon_in_attribute_value() { - let (mut r, mut p) = test_data!(r#" - - "#); - - expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); - expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, ref attributes, ref namespace }) => - *name == OwnedName::local("a") && - attributes.len() == 1 && - attributes[0] == OwnedAttribute::new(OwnedName::local("attr"), "zzz;zzz") && - namespace.is_essentially_empty() - ); - expect_event!(r, p, Ok(XmlEvent::EndElement { ref name }) => *name == OwnedName::local("a")); - expect_event!(r, p, Ok(XmlEvent::EndDocument)); - } - - #[test] - fn issue_140_entity_reference_inside_tag() { - let (mut r, mut p) = test_data!(r#" - - "#); - - expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); - expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, .. }) => *name == OwnedName::local("bla")); - expect_event!(r, p, Ok(XmlEvent::Characters(ref s)) => s == "\u{266b}"); - expect_event!(r, p, Ok(XmlEvent::EndElement { ref name, .. }) => *name == OwnedName::local("bla")); - expect_event!(r, p, Ok(XmlEvent::EndDocument)); - } - - #[test] - fn opening_tag_in_attribute_value() { - let (mut r, mut p) = test_data!(r#" - - "#); - - expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); - expect_event!(r, p, Err(ref e) => - e.msg() == "Unexpected token inside attribute value: <" && - e.position() == TextPosition { row: 1, column: 24 } - ); - } -} diff --git a/src/reader/parser/outside_tag.rs b/src/reader/parser/outside_tag.rs index d3f7598..8104224 100644 --- a/src/reader/parser/outside_tag.rs +++ b/src/reader/parser/outside_tag.rs @@ -1,130 +1,196 @@ -use common::is_whitespace_char; - -use reader::events::XmlEvent; -use reader::lexer::Token; +use crate::reader::error::SyntaxError; +use crate::common::is_whitespace_char; +use crate::reader::events::XmlEvent; +use crate::reader::lexer::Token; use super::{ - Result, PullParser, State, ClosingTagSubstate, OpeningTagSubstate, - ProcessingInstructionSubstate, DEFAULT_VERSION, DEFAULT_ENCODING, DEFAULT_STANDALONE + ClosingTagSubstate, DoctypeSubstate, Encountered, OpeningTagSubstate, + ProcessingInstructionSubstate, PullParser, Result, State, }; impl PullParser { pub fn outside_tag(&mut self, t: Token) -> Option { match t { - Token::ReferenceStart => - self.into_state_continue(State::InsideReference(Box::new(State::OutsideTag))), - - Token::Whitespace(_) if self.depth() == 0 && self.config.ignore_root_level_whitespace => None, // skip whitespace outside of the root element + Token::Character(c) => { + if is_whitespace_char(c) { + // skip whitespace outside of the root element + if (self.config.c.trim_whitespace && self.buf.is_empty()) || + (self.depth() == 0 && self.config.c.ignore_root_level_whitespace) { + return None; + } + } else { + self.inside_whitespace = false; + if self.depth() == 0 { + return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t))); + } + } - Token::Whitespace(_) if self.config.trim_whitespace && !self.buf_has_data() => None, + if !self.is_valid_xml_char_not_restricted(c) { + return Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))); + } - Token::Whitespace(c) => { - if !self.buf_has_data() { + if self.buf.is_empty() { self.push_pos(); } - self.append_char_continue(c) - } - - _ if t.contains_char_data() && self.depth() == 0 => - Some(self_error!(self; "Unexpected characters outside the root element: {}", t)), + self.buf.push(c); + None + }, - _ if t.contains_char_data() => { // Non-whitespace char data - if !self.buf_has_data() { - self.push_pos(); + Token::CommentEnd | Token::TagEnd | Token::EqualsSign | + Token::DoubleQuote | Token::SingleQuote | + Token::ProcessingInstructionEnd | Token::EmptyTagEnd => { + if self.depth() == 0 { + return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t))); } self.inside_whitespace = false; - t.push_to_string(&mut self.buf); + + if let Some(s) = t.as_static_str() { + if self.buf.is_empty() { + self.push_pos(); + } + self.buf.push_str(s); + } None - } + }, + + Token::ReferenceStart if self.depth() > 0 => { + self.state_after_reference = State::OutsideTag; + self.into_state_continue(State::InsideReference) + }, - Token::ReferenceEnd => { // Semi-colon in a text outside an entity + Token::ReferenceEnd if self.depth() > 0 => { // Semi-colon in a text outside an entity self.inside_whitespace = false; Token::ReferenceEnd.push_to_string(&mut self.buf); None - } + }, - Token::CommentStart if self.config.coalesce_characters && self.config.ignore_comments => { + Token::CommentStart if self.config.c.coalesce_characters && self.config.c.ignore_comments => { + let next_event = self.set_encountered(Encountered::Comment); // We need to switch the lexer into a comment mode inside comments - self.lexer.inside_comment(); - self.into_state_continue(State::InsideComment) + self.into_state(State::InsideComment, next_event) } - Token::CDataStart if self.config.coalesce_characters && self.config.cdata_to_characters => { - if !self.buf_has_data() { + Token::CDataStart if self.depth() > 0 && self.config.c.coalesce_characters && self.config.c.cdata_to_characters => { + if self.buf.is_empty() { self.push_pos(); } - // We need to disable lexing errors inside CDATA - self.lexer.disable_errors(); self.into_state_continue(State::InsideCData) - } + }, _ => { // Encountered some markup event, flush the buffer as characters // or a whitespace let mut next_event = if self.buf_has_data() { let buf = self.take_buf(); - if self.inside_whitespace && self.config.trim_whitespace { + if self.inside_whitespace && self.config.c.trim_whitespace { None - } else if self.inside_whitespace && !self.config.whitespace_to_characters { + } else if self.inside_whitespace && !self.config.c.whitespace_to_characters { Some(Ok(XmlEvent::Whitespace(buf))) - } else if self.config.trim_whitespace { + } else if self.config.c.trim_whitespace { Some(Ok(XmlEvent::Characters(buf.trim_matches(is_whitespace_char).into()))) } else { Some(Ok(XmlEvent::Characters(buf))) } } else { None }; self.inside_whitespace = true; // Reset inside_whitespace flag - self.push_pos(); - match t { - Token::ProcessingInstructionStart => - self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName), next_event), - - Token::DoctypeStart if !self.encountered_element => { - // We don't have a doctype event so skip this position - // FIXME: update when we have a doctype event - self.next_pos(); - self.lexer.disable_errors(); - self.into_state(State::InsideDoctype, next_event) - } - Token::OpeningTagStart => { - // If declaration was not parsed and we have encountered an element, - // emit this declaration as the next event. - if !self.parsed_declaration { - self.parsed_declaration = true; - let sd_event = XmlEvent::StartDocument { - version: DEFAULT_VERSION, - encoding: DEFAULT_ENCODING.into(), - standalone: DEFAULT_STANDALONE - }; - // next_event is always none here because we're outside of - // the root element - next_event = Some(Ok(sd_event)); - self.push_pos(); + // pos is popped whenever an event is emitted, so pushes must happen only if there will be an event to balance it + // and ignored comments don't pop + if t != Token::CommentStart || !self.config.c.ignore_comments { + self.push_pos(); + } + match t { + Token::OpeningTagStart if self.depth() > 0 || self.encountered < Encountered::Element || self.config.allow_multiple_root_elements => { + if let Some(e) = self.set_encountered(Encountered::Element) { + next_event = Some(e); } - self.encountered_element = true; self.nst.push_empty(); self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event) - } + }, Token::ClosingTagStart if self.depth() > 0 => self.into_state(State::InsideClosingTag(ClosingTagSubstate::CTInsideName), next_event), Token::CommentStart => { + if let Some(e) = self.set_encountered(Encountered::Comment) { + next_event = Some(e); + } // We need to switch the lexer into a comment mode inside comments - self.lexer.inside_comment(); self.into_state(State::InsideComment, next_event) - } + }, + + Token::DoctypeStart if self.encountered < Encountered::Doctype => { + if let Some(e) = self.set_encountered(Encountered::Doctype) { + next_event = Some(e); + } - Token::CDataStart => { - // We need to disable lexing errors inside CDATA - self.lexer.disable_errors(); + // We don't have a doctype event so skip this position + // FIXME: update when we have a doctype event + self.next_pos(); + self.into_state(State::InsideDoctype(DoctypeSubstate::Outside), next_event) + }, + + Token::ProcessingInstructionStart => + self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName), next_event), + + Token::CDataStart if self.depth() > 0 => { self.into_state(State::InsideCData, next_event) - } + }, - _ => Some(self_error!(self; "Unexpected token: {}", t)) + _ => Some(self.error(SyntaxError::UnexpectedToken(t))) } } } } + + pub fn document_start(&mut self, t: Token) -> Option { + debug_assert!(self.encountered < Encountered::Declaration); + + match t { + Token::Character(c) => { + let next_event = self.set_encountered(Encountered::AnyChars); + + if !is_whitespace_char(c) { + return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t))); + } + self.inside_whitespace = true; + + // skip whitespace outside of the root element + if (self.config.c.trim_whitespace && self.buf.is_empty()) || + (self.depth() == 0 && self.config.c.ignore_root_level_whitespace) { + return self.into_state(State::OutsideTag, next_event); + } + + self.push_pos(); + self.buf.push(c); + self.into_state(State::OutsideTag, next_event) + }, + + Token::CommentStart => { + let next_event = self.set_encountered(Encountered::Comment); + self.into_state(State::InsideComment, next_event) + } + + Token::OpeningTagStart => { + let next_event = self.set_encountered(Encountered::Element); + self.nst.push_empty(); + self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event) + }, + + Token::DoctypeStart => { + let next_event = self.set_encountered(Encountered::Doctype); + // We don't have a doctype event so skip this position + // FIXME: update when we have a doctype event + self.next_pos(); + self.into_state(State::InsideDoctype(DoctypeSubstate::Outside), next_event) + }, + + Token::ProcessingInstructionStart => { + self.push_pos(); + self.into_state_continue(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName)) + }, + + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), + } + } } diff --git a/src/util.rs b/src/util.rs index 23fee04..07d0336 100644 --- a/src/util.rs +++ b/src/util.rs @@ -1,107 +1,305 @@ -use std::io::{self, Read}; -use std::str; use std::fmt; +use std::io::{self, Read}; +use std::str::{self, FromStr}; #[derive(Debug)] pub enum CharReadError { UnexpectedEof, Utf8(str::Utf8Error), - Io(io::Error) + Io(io::Error), } impl From for CharReadError { + #[cold] fn from(e: str::Utf8Error) -> CharReadError { CharReadError::Utf8(e) } } impl From for CharReadError { + #[cold] fn from(e: io::Error) -> CharReadError { CharReadError::Io(e) } } impl fmt::Display for CharReadError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - use self::CharReadError::*; + #[cold] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use self::CharReadError::{Io, UnexpectedEof, Utf8}; match *self { UnexpectedEof => write!(f, "unexpected end of stream"), - Utf8(ref e) => write!(f, "UTF-8 decoding error: {}", e), - Io(ref e) => write!(f, "I/O error: {}", e) + Utf8(ref e) => write!(f, "UTF-8 decoding error: {e}"), + Io(ref e) => write!(f, "I/O error: {e}"), } } } -pub fn next_char_from(source: &mut R) -> Result, CharReadError> { - const MAX_CODEPOINT_LEN: usize = 4; +/// Character encoding used for parsing +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +#[non_exhaustive] +pub enum Encoding { + /// Explicitly UTF-8 only + Utf8, + /// UTF-8 fallback, but can be any 8-bit encoding + Default, + /// ISO-8859-1 + Latin1, + /// US-ASCII + Ascii, + /// Big-Endian + Utf16Be, + /// Little-Endian + Utf16Le, + /// Unknown endianness yet, will be sniffed + Utf16, + /// Not determined yet, may be sniffed to be anything + Unknown, +} - let mut bytes = source.bytes(); - let mut buf = [0u8; MAX_CODEPOINT_LEN]; - let mut pos = 0; +// Rustc inlines eq_ignore_ascii_case and creates kilobytes of code! +#[inline(never)] +fn icmp(lower: &str, varcase: &str) -> bool { + lower.bytes().zip(varcase.bytes()).all(|(l, v)| l == v.to_ascii_lowercase()) +} - loop { - let next = match bytes.next() { - Some(Ok(b)) => b, - Some(Err(e)) => return Err(e.into()), - None if pos == 0 => return Ok(None), - None => return Err(CharReadError::UnexpectedEof) - }; - buf[pos] = next; - pos += 1; +impl FromStr for Encoding { + type Err = &'static str; + + fn from_str(val: &str) -> Result { + if ["utf-8", "utf8"].into_iter().any(move |label| icmp(label, val)) { + Ok(Encoding::Utf8) + } else if ["iso-8859-1", "latin1"].into_iter().any(move |label| icmp(label, val)) { + Ok(Encoding::Latin1) + } else if ["utf-16", "utf16"].into_iter().any(move |label| icmp(label, val)) { + Ok(Encoding::Utf16) + } else if ["ascii", "us-ascii"].into_iter().any(move |label| icmp(label, val)) { + Ok(Encoding::Ascii) + } else { + Err("unknown encoding name") + } + } +} + +impl fmt::Display for Encoding { + #[cold] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(match self { + Encoding::Utf8 => "UTF-8", + Encoding::Default => "UTF-8", + Encoding::Latin1 => "ISO-8859-1", + Encoding::Ascii => "US-ASCII", + Encoding::Utf16Be => "UTF-16", + Encoding::Utf16Le => "UTF-16", + Encoding::Utf16 => "UTF-16", + Encoding::Unknown => "(unknown)", + }) + } +} + +pub(crate) struct CharReader { + pub encoding: Encoding, +} + +impl CharReader { + pub fn new() -> Self { + Self { + encoding: Encoding::Unknown, + } + } + + pub fn next_char_from(&mut self, source: &mut R) -> Result, CharReadError> { + let mut bytes = source.bytes(); + const MAX_CODEPOINT_LEN: usize = 4; + + let mut buf = [0u8; MAX_CODEPOINT_LEN]; + let mut pos = 0; + loop { + let next = match bytes.next() { + Some(Ok(b)) => b, + Some(Err(e)) => return Err(e.into()), + None if pos == 0 => return Ok(None), + None => return Err(CharReadError::UnexpectedEof), + }; + + match self.encoding { + Encoding::Utf8 | Encoding::Default => { + // fast path for ASCII subset + if pos == 0 && next.is_ascii() { + return Ok(Some(next.into())); + } - match str::from_utf8(&buf[..pos]) { - Ok(s) => return Ok(s.chars().next()), // always Some(..) - Err(_) if pos < MAX_CODEPOINT_LEN => {}, - Err(e) => return Err(e.into()) + buf[pos] = next; + pos += 1; + + match str::from_utf8(&buf[..pos]) { + Ok(s) => return Ok(s.chars().next()), // always Some(..) + Err(_) if pos < MAX_CODEPOINT_LEN => continue, + Err(e) => return Err(e.into()), + } + }, + Encoding::Latin1 => { + return Ok(Some(next.into())); + }, + Encoding::Ascii => { + if next.is_ascii() { + return Ok(Some(next.into())); + } else { + return Err(CharReadError::Io(io::Error::new(io::ErrorKind::InvalidData, "char is not ASCII"))); + } + }, + Encoding::Unknown | Encoding::Utf16 => { + buf[pos] = next; + pos += 1; + + // sniff BOM + if pos <= 3 && buf[..pos] == [0xEF, 0xBB, 0xBF][..pos] { + if pos == 3 && self.encoding != Encoding::Utf16 { + pos = 0; + self.encoding = Encoding::Utf8; + } + } else if pos <= 2 && buf[..pos] == [0xFE, 0xFF][..pos] { + if pos == 2 { + pos = 0; + self.encoding = Encoding::Utf16Be; + } + } else if pos <= 2 && buf[..pos] == [0xFF, 0xFE][..pos] { + if pos == 2 { + pos = 0; + self.encoding = Encoding::Utf16Le; + } + } else if pos == 1 && self.encoding == Encoding::Utf16 { + // sniff ASCII char in UTF-16 + self.encoding = if next == 0 { Encoding::Utf16Be } else { Encoding::Utf16Le }; + } else { + // UTF-8 is the default, but XML decl can change it to other 8-bit encoding + self.encoding = Encoding::Default; + if pos == 1 && next.is_ascii() { + return Ok(Some(next.into())); + } + } + }, + Encoding::Utf16Be => { + buf[pos] = next; + pos += 1; + if pos == 2 { + if let Some(Ok(c)) = char::decode_utf16([u16::from_be_bytes(buf[..2].try_into().unwrap())]).next() { + return Ok(Some(c)); + } + } else if pos == 4 { // surrogate + return char::decode_utf16([u16::from_be_bytes(buf[..2].try_into().unwrap()), u16::from_be_bytes(buf[2..4].try_into().unwrap())]) + .next().transpose() + .map_err(|e| CharReadError::Io(io::Error::new(io::ErrorKind::InvalidData, e))); + } + }, + Encoding::Utf16Le => { + buf[pos] = next; + pos += 1; + if pos == 2 { + if let Some(Ok(c)) = char::decode_utf16([u16::from_le_bytes(buf[..2].try_into().unwrap())]).next() { + return Ok(Some(c)); + } + } else if pos == 4 { // surrogate + return char::decode_utf16([u16::from_le_bytes(buf[..2].try_into().unwrap()), u16::from_le_bytes(buf[2..4].try_into().unwrap())]) + .next().transpose() + .map_err(|e| CharReadError::Io(io::Error::new(io::ErrorKind::InvalidData, e))); + } + }, + } } } } #[cfg(test)] mod tests { + use super::{CharReadError, CharReader, Encoding}; + #[test] fn test_next_char_from() { use std::io; - use std::error::Error; let mut bytes: &[u8] = "correct".as_bytes(); // correct ASCII - assert_eq!(super::next_char_from(&mut bytes).unwrap(), Some('c')); + assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('c')); + + let mut bytes: &[u8] = b"\xEF\xBB\xBF\xE2\x80\xA2!"; // BOM + assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('•')); + + let mut bytes: &[u8] = b"\xEF\xBB\xBFx123"; // BOM + assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('x')); + + let mut bytes: &[u8] = b"\xEF\xBB\xBF"; // Nothing after BOM + assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), None); + + let mut bytes: &[u8] = b"\xEF\xBB"; // Nothing after BO + assert!(matches!(CharReader::new().next_char_from(&mut bytes), Err(CharReadError::UnexpectedEof))); + + let mut bytes: &[u8] = b"\xEF\xBB\x42"; // Nothing after BO + assert!(matches!(CharReader::new().next_char_from(&mut bytes), Err(_))); + + let mut bytes: &[u8] = b"\xFE\xFF\x00\x42"; // UTF-16 + assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('B')); + + let mut bytes: &[u8] = b"\xFF\xFE\x42\x00"; // UTF-16 + assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('B')); + + let mut bytes: &[u8] = b"\xFF\xFE"; // UTF-16 + assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), None); + + let mut bytes: &[u8] = b"\xFF\xFE\x00"; // UTF-16 + assert!(matches!(CharReader::new().next_char_from(&mut bytes), Err(CharReadError::UnexpectedEof))); let mut bytes: &[u8] = "правильно".as_bytes(); // correct BMP - assert_eq!(super::next_char_from(&mut bytes).unwrap(), Some('п')); + assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('п')); + + let mut bytes: &[u8] = "правильно".as_bytes(); + assert_eq!(CharReader { encoding: Encoding::Utf16Be }.next_char_from(&mut bytes).unwrap(), Some('킿')); + + let mut bytes: &[u8] = "правильно".as_bytes(); + assert_eq!(CharReader { encoding: Encoding::Utf16Le }.next_char_from(&mut bytes).unwrap(), Some('뿐')); + + let mut bytes: &[u8] = b"\xD8\xD8\x80"; + assert!(matches!(CharReader { encoding: Encoding::Utf16 }.next_char_from(&mut bytes), Err(_))); + + let mut bytes: &[u8] = b"\x00\x42"; + assert_eq!(CharReader { encoding: Encoding::Utf16 }.next_char_from(&mut bytes).unwrap(), Some('B')); + + let mut bytes: &[u8] = b"\x42\x00"; + assert_eq!(CharReader { encoding: Encoding::Utf16 }.next_char_from(&mut bytes).unwrap(), Some('B')); + + let mut bytes: &[u8] = b"\x00"; + assert!(matches!(CharReader { encoding: Encoding::Utf16Be }.next_char_from(&mut bytes), Err(_))); let mut bytes: &[u8] = "😊".as_bytes(); // correct non-BMP - assert_eq!(super::next_char_from(&mut bytes).unwrap(), Some('😊')); + assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('😊')); let mut bytes: &[u8] = b""; // empty - assert_eq!(super::next_char_from(&mut bytes).unwrap(), None); + assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), None); let mut bytes: &[u8] = b"\xf0\x9f\x98"; // incomplete code point - match super::next_char_from(&mut bytes).unwrap_err() { + match CharReader::new().next_char_from(&mut bytes).unwrap_err() { super::CharReadError::UnexpectedEof => {}, - e => panic!("Unexpected result: {:?}", e) + e => panic!("Unexpected result: {e:?}") }; let mut bytes: &[u8] = b"\xff\x9f\x98\x32"; // invalid code point - match super::next_char_from(&mut bytes).unwrap_err() { + match CharReader::new().next_char_from(&mut bytes).unwrap_err() { super::CharReadError::Utf8(_) => {}, - e => panic!("Unexpected result: {:?}", e) + e => panic!("Unexpected result: {e:?}") }; - // error during read struct ErrorReader; impl io::Read for ErrorReader { - fn read(&mut self, buf: &mut [u8]) -> io::Result { + fn read(&mut self, _: &mut [u8]) -> io::Result { Err(io::Error::new(io::ErrorKind::Other, "test error")) } } let mut r = ErrorReader; - match super::next_char_from(&mut r).unwrap_err() { + match CharReader::new().next_char_from(&mut r).unwrap_err() { super::CharReadError::Io(ref e) if e.kind() == io::ErrorKind::Other && - e.description() == "test error" => {}, - e => panic!("Unexpected result: {:?}", e) + e.to_string().contains("test error") => {}, + e => panic!("Unexpected result: {e:?}") } } } diff --git a/src/writer.rs b/src/writer.rs new file mode 100644 index 0000000..e2b70ec --- /dev/null +++ b/src/writer.rs @@ -0,0 +1,90 @@ +//! Contains high-level interface for an events-based XML emitter. +//! +//! The most important type in this module is `EventWriter` which allows writing an XML document +//! to some output stream. + +pub use self::config::EmitterConfig; +pub use self::emitter::EmitterError as Error; +pub use self::emitter::Result; +pub use self::events::XmlEvent; + +use self::emitter::Emitter; + +use std::io::prelude::*; + +mod config; +mod emitter; +pub mod events; + +/// A wrapper around an `std::io::Write` instance which emits XML document according to provided +/// events. +pub struct EventWriter { + sink: W, + emitter: Emitter, +} + +impl EventWriter { + /// Creates a new `EventWriter` out of an `std::io::Write` instance using the default + /// configuration. + #[inline] + pub fn new(sink: W) -> EventWriter { + EventWriter::new_with_config(sink, EmitterConfig::new()) + } + + /// Creates a new `EventWriter` out of an `std::io::Write` instance using the provided + /// configuration. + #[inline] + pub fn new_with_config(sink: W, config: EmitterConfig) -> EventWriter { + EventWriter { + sink, + emitter: Emitter::new(config), + } + } + + /// Writes the next piece of XML document according to the provided event. + /// + /// Note that output data may not exactly correspond to the written event because + /// of various configuration options. For example, `XmlEvent::EndElement` may + /// correspond to a separate closing element or it may cause writing an empty element. + /// Another example is that `XmlEvent::CData` may be represented as characters in + /// the output stream. + pub fn write<'a, E>(&mut self, event: E) -> Result<()> where E: Into> { + match event.into() { + XmlEvent::StartDocument { version, encoding, standalone } => + self.emitter.emit_start_document(&mut self.sink, version, encoding.unwrap_or("UTF-8"), standalone), + XmlEvent::ProcessingInstruction { name, data } => + self.emitter.emit_processing_instruction(&mut self.sink, name, data), + XmlEvent::StartElement { name, attributes, namespace } => { + self.emitter.namespace_stack_mut().push_empty().checked_target().extend(namespace.as_ref()); + self.emitter.emit_start_element(&mut self.sink, name, &attributes) + } + XmlEvent::EndElement { name } => { + let r = self.emitter.emit_end_element(&mut self.sink, name); + self.emitter.namespace_stack_mut().try_pop(); + r + } + XmlEvent::Comment(content) => self.emitter.emit_comment(&mut self.sink, content), + XmlEvent::CData(content) => self.emitter.emit_cdata(&mut self.sink, content), + XmlEvent::Characters(content) => self.emitter.emit_characters(&mut self.sink, content), + } + } + + /// Returns a mutable reference to the underlying `Writer`. + /// + /// Note that having a reference to the underlying sink makes it very easy to emit invalid XML + /// documents. Use this method with care. Valid use cases for this method include accessing + /// methods like `Write::flush`, which do not emit new data but rather change the state + /// of the stream itself. + pub fn inner_mut(&mut self) -> &mut W { + &mut self.sink + } + + /// Unwraps this `EventWriter`, returning the underlying writer. + /// + /// Note that this is a destructive operation: unwrapping a writer and then wrapping + /// it again with `EventWriter::new()` will create a fresh writer whose state will be + /// blank; for example, accumulated namespaces will be reset. + pub fn into_inner(self) -> W { + self.sink + } +} diff --git a/src/writer/config.rs b/src/writer/config.rs index ebabf18..c7841bc 100644 --- a/src/writer/config.rs +++ b/src/writer/config.rs @@ -1,9 +1,8 @@ //! Contains emitter configuration structure. -use std::io::Write; use std::borrow::Cow; - -use writer::EventWriter; +use std::io::Write; +use crate::writer::EventWriter; /// Emitter configuration structure. /// @@ -98,10 +97,11 @@ impl EmitterConfig { /// .normalize_empty_elements(false); /// ``` #[inline] + #[must_use] pub fn new() -> EmitterConfig { EmitterConfig { line_separator: "\n".into(), - indent_string: " ".into(), // two spaces + indent_string: " ".into(), // two spaces perform_indent: false, perform_escaping: true, write_document_declaration: true, @@ -109,7 +109,7 @@ impl EmitterConfig { cdata_to_characters: false, keep_element_names_stack: true, autopad_comments: true, - pad_self_closing: true + pad_self_closing: true, } } diff --git a/src/writer/emitter.rs b/src/writer/emitter.rs index ba80f66..8e74b5f 100644 --- a/src/writer/emitter.rs +++ b/src/writer/emitter.rs @@ -1,18 +1,17 @@ +use std::error::Error; +use std::fmt; use std::io; use std::io::prelude::*; -use std::fmt; use std::result; -use std::borrow::Cow; -use std::error::Error; -use common; -use name::{Name, OwnedName}; -use attribute::Attribute; -use escape::{escape_str_attribute, escape_str_pcdata}; -use common::XmlVersion; -use namespace::{NamespaceStack, NS_NO_PREFIX, NS_EMPTY_URI, NS_XMLNS_PREFIX, NS_XML_PREFIX}; +use crate::attribute::Attribute; +use crate::common; +use crate::common::XmlVersion; +use crate::escape::{AttributeEscapes, Escaped, PcDataEscapes}; +use crate::name::{Name, OwnedName}; +use crate::namespace::{NamespaceStack, NS_EMPTY_URI, NS_NO_PREFIX, NS_XMLNS_PREFIX, NS_XML_PREFIX}; -use writer::config::EmitterConfig; +use crate::writer::config::EmitterConfig; /// An error which may be returned by `XmlWriter` when writing XML events. #[derive(Debug)] @@ -32,47 +31,35 @@ pub enum EmitterError { /// End element name is not specified when it is needed, for example, when automatic /// closing is not enabled in configuration. - EndElementNameIsNotSpecified + EndElementNameIsNotSpecified, } impl From for EmitterError { + #[cold] fn from(err: io::Error) -> EmitterError { EmitterError::Io(err) } } impl fmt::Display for EmitterError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - - write!(f, "emitter error: ")?; - match *self { - EmitterError::Io(ref e) => - write!(f, "I/O error: {}", e), - ref other => - write!(f, "{}", other.description()), + #[cold] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("emitter error: ")?; + match self { + EmitterError::Io(e) => write!(f, "I/O error: {e}"), + EmitterError::DocumentStartAlreadyEmitted => f.write_str("document start event has already been emitted"), + EmitterError::LastElementNameNotAvailable => f.write_str("last element name is not available"), + EmitterError::EndElementNameIsNotEqualToLastStartElementName => f.write_str("end element name is not equal to last start element name"), + EmitterError::EndElementNameIsNotSpecified => f.write_str("end element name is not specified and can't be inferred"), } } } impl Error for EmitterError { - fn description(&self) -> &str { - match *self { - EmitterError::Io(_) => - "I/O error", - EmitterError::DocumentStartAlreadyEmitted => - "document start event has already been emitted", - EmitterError::LastElementNameNotAvailable => - "last element name is not available", - EmitterError::EndElementNameIsNotEqualToLastStartElementName => - "end element name is not equal to last start element name", - EmitterError::EndElementNameIsNotSpecified => - "end element name is not specified and can't be inferred", - } - } } /// A result type yielded by `XmlWriter`. -pub type Result = result::Result; +pub type Result = result::Result; // TODO: split into a low-level fast writer without any checks and formatting logic and a // high-level indenting validating writer @@ -87,23 +74,26 @@ pub struct Emitter { element_names: Vec, start_document_emitted: bool, - just_wrote_start_element: bool + just_wrote_start_element: bool, } impl Emitter { pub fn new(config: EmitterConfig) -> Emitter { + let mut indent_stack = Vec::with_capacity(16); + indent_stack.push(IndentFlags::WroteNothing); + Emitter { config, nst: NamespaceStack::empty(), indent_level: 0, - indent_stack: vec![IndentFlags::WroteNothing], + indent_stack, element_names: Vec::new(), start_document_emitted: false, - just_wrote_start_element: false + just_wrote_start_element: false, } } } @@ -124,27 +114,26 @@ impl Emitter { #[inline] fn wrote_text(&self) -> bool { - *self.indent_stack.last().unwrap() == IndentFlags::WroteText + self.indent_stack.last().map_or(false, |&e| e == IndentFlags::WroteText) } #[inline] fn wrote_markup(&self) -> bool { - *self.indent_stack.last().unwrap() == IndentFlags::WroteMarkup + self.indent_stack.last().map_or(false, |&e| e == IndentFlags::WroteMarkup) } #[inline] fn set_wrote_text(&mut self) { - *self.indent_stack.last_mut().unwrap() = IndentFlags::WroteText; + if let Some(e) = self.indent_stack.last_mut() { + *e = IndentFlags::WroteText; + } } #[inline] fn set_wrote_markup(&mut self) { - *self.indent_stack.last_mut().unwrap() = IndentFlags::WroteMarkup; - } - - #[inline] - fn reset_state(&mut self) { - *self.indent_stack.last_mut().unwrap() = IndentFlags::WroteNothing; + if let Some(e) = self.indent_stack.last_mut() { + *e = IndentFlags::WroteMarkup; + } } fn write_newline(&mut self, target: &mut W, level: usize) -> Result<()> { @@ -216,7 +205,7 @@ impl Emitter { self.before_markup(target)?; let result = { let mut write = move || { - write!(target, "")?; @@ -280,8 +269,8 @@ impl Emitter { } fn emit_start_element_initial(&mut self, target: &mut W, - name: Name, - attributes: &[Attribute]) -> Result<()> + name: Name<'_>, + attributes: &[Attribute<'_>]) -> Result<()> where W: Write { self.check_document_started(target)?; @@ -295,8 +284,8 @@ impl Emitter { } pub fn emit_start_element(&mut self, target: &mut W, - name: Name, - attributes: &[Attribute]) -> Result<()> + name: Name<'_>, + attributes: &[Attribute<'_>]) -> Result<()> where W: Write { if self.config.keep_element_names_stack { @@ -324,29 +313,31 @@ impl Emitter { //prefix if self.nst.get(prefix) == Some(uri) => Ok(()), // emit xmlns only if it is overridden NS_NO_PREFIX => if uri != NS_EMPTY_URI { - write!(target, " xmlns=\"{}\"", uri) + write!(target, " xmlns=\"{uri}\"") } else { Ok(()) }, // everything else - prefix => write!(target, " xmlns:{}=\"{}\"", prefix, uri) + prefix => write!(target, " xmlns:{prefix}=\"{uri}\"") }?; } Ok(()) } pub fn emit_attributes(&mut self, target: &mut W, - attributes: &[Attribute]) -> Result<()> { - for attr in attributes.iter() { - write!( - target, " {}=\"{}\"", - attr.name.repr_display(), - if self.config.perform_escaping { escape_str_attribute(attr.value) } else { Cow::Borrowed(attr.value) } - )? + attributes: &[Attribute<'_>]) -> Result<()> { + for attr in attributes.iter() { + write!(target, " {}=\"", attr.name.repr_display())?; + if self.config.perform_escaping { + write!(target, "{}", Escaped::::new(attr.value))?; + } else { + write!(target, "{}", attr.value)?; + } + write!(target, "\"")?; } Ok(()) } pub fn emit_end_element(&mut self, target: &mut W, - name: Option) -> Result<()> { + name: Option>) -> Result<()> { let owned_name = if self.config.keep_element_names_stack { Some(self.element_names.pop().ok_or(EmitterError::LastElementNameNotAvailable)?) } else { @@ -403,13 +394,13 @@ impl Emitter { content: &str) -> Result<()> { self.check_document_started(target)?; self.fix_non_empty_element(target)?; - target.write_all( - (if self.config.perform_escaping { - escape_str_pcdata(content) - } else { - Cow::Borrowed(content) - }).as_bytes() - )?; + + if self.config.perform_escaping { + write!(target, "{}", Escaped::::new(content))?; + } else { + target.write_all(content.as_bytes())?; + } + self.after_text(); Ok(()) } @@ -420,7 +411,7 @@ impl Emitter { // TODO: add escaping dashes at the end of the comment let autopad_comments = self.config.autopad_comments; - let write = |target: &mut W| -> Result<()> { + let write = move |target: &mut W| -> Result<()> { target.write_all(b" - test - kkss" = ddd' > - ddddd!e3--> - test - kkss" = ddd' > - ddddd!e3-->"#, - br#" - |1:14 Unexpected token '--' before ' ' - "#, - ParserConfig::new(), - false - ); - - test( - br#""#, - br#" - |1:14 Unexpected token '--' before '-' - "#, - ParserConfig::new(), - false - ); -} - -#[test] -fn tabs_1() { - test( - b"\t\t", - br#" - |1:2 StartDocument(1.0, UTF-8) - |1:2 StartElement(a) - |1:6 StartElement(b) - |1:6 EndElement(b) - |1:10 EndElement(a) - |1:14 EndDocument - "#, - ParserConfig::new() - .trim_whitespace(true), - true - ); -} - -#[test] -fn issue_32_unescaped_cdata_end() { - test( - br#"]]>"#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement(hello) - |Characters("]]>") - |EndElement(hello) - |EndDocument - "#, - ParserConfig::new(), - false - ); -} - -#[test] -fn issue_unescaped_processing_instruction_end() { - test( - br#"?>"#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement(hello) - |Characters("?>") - |EndElement(hello) - |EndDocument - "#, - ParserConfig::new(), - false - ); -} - -#[test] -fn issue_unescaped_empty_tag_end() { - test( - br#"/>"#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement(hello) - |Characters("/>") - |EndElement(hello) - |EndDocument - "#, - ParserConfig::new(), - false - ); -} - -#[test] -fn issue_83_duplicate_attributes() { - test( - br#""#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement(hello) - |1:30 Attribute 'a' is redefined - "#, - ParserConfig::new(), - false - ); -} - -#[test] -fn issue_93_large_characters_in_entity_references() { - test( - r#"&𤶼;"#.as_bytes(), - r#" - |StartDocument(1.0, UTF-8) - |StartElement(hello) - |1:10 Unexpected entity: 𤶼 - "#.as_bytes(), // FIXME: it shouldn't be 10, looks like indices are off slightly - ParserConfig::new(), - false - ) -} - -#[test] -fn issue_98_cdata_ending_with_right_bracket() { - test( - br#""#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement(hello) - |CData("Foo [Bar]") - |EndElement(hello) - |EndDocument - "#, - ParserConfig::new(), - false - ) -} - -#[test] -fn issue_105_unexpected_double_dash() { - test( - br#"-- "#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement(hello) - |Characters("-- ") - |EndElement(hello) - |EndDocument - "#, - ParserConfig::new(), - false - ); - - test( - br#"--"#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement(hello) - |Characters("--") - |EndElement(hello) - |EndDocument - "#, - ParserConfig::new(), - false - ); - - test( - br#"-->"#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement(hello) - |Characters("-->") - |EndElement(hello) - |EndDocument - "#, - ParserConfig::new(), - false - ); - - test( - br#""#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement(hello) - |CData("--") - |EndElement(hello) - |EndDocument - "#, - ParserConfig::new(), - false - ); -} - -#[test] -fn issue_attribues_have_no_default_namespace () { - test( - br#""#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement({urn:foo}hello [x="y"]) - |EndElement({urn:foo}hello) - |EndDocument - "#, - ParserConfig::new(), - false - ); -} - -#[test] -fn issue_replacement_character_entity_reference() { - test( - br#"��"#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement(doc) - |1:13 Invalid decimal character number in an entity: #55357 - "#, - ParserConfig::new(), - false, - ); - - test( - br#"��"#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement(doc) - |1:13 Invalid hexadecimal character number in an entity: #xd83d - "#, - ParserConfig::new(), - false, - ); - - test( - br#"��"#, - format!( - r#" - |StartDocument(1.0, UTF-8) - |StartElement(doc) - |Characters("{replacement_character}{replacement_character}") - |EndElement(doc) - |EndDocument - "#, - replacement_character = "\u{fffd}" - ) - .as_bytes(), - ParserConfig::new() - .replace_unknown_entity_references(true), - false, - ); - - test( - br#"��"#, - format!( - r#" - |StartDocument(1.0, UTF-8) - |StartElement(doc) - |Characters("{replacement_character}{replacement_character}") - |EndElement(doc) - |EndDocument - "#, - replacement_character = "\u{fffd}" - ) - .as_bytes(), - ParserConfig::new() - .replace_unknown_entity_references(true), - false, - ); -} - -lazy_static! { - // If PRINT_SPEC env variable is set, print the lines - // to stderr instead of comparing with the output - // it can be used like this: - // PRINT_SPEC=1 cargo test --test event_reader sample_1_full 2> sample_1_full.txt - static ref PRINT: bool = { - for (key, value) in env::vars() { - if key == "PRINT_SPEC" && value == "1" { - return true; - } - } - false - }; -} - -// clones a lot but that's fine -fn trim_until_bar(s: String) -> String { - match s.trim() { - ts if ts.starts_with('|') => return ts[1..].to_owned(), - _ => {} - } - s -} - -fn test(input: &[u8], output: &[u8], config: ParserConfig, test_position: bool) { - let mut reader = config.create_reader(input); - let mut spec_lines = BufReader::new(output).lines() - .map(|line| line.unwrap()) - .enumerate() - .map(|(i, line)| (i, trim_until_bar(line))) - .filter(|&(_, ref line)| !line.trim().is_empty()); - - loop { - let e = reader.next(); - let line = - if test_position { - format!("{} {}", reader.position(), Event(&e)) - } else { - format!("{}", Event(&e)) - }; - - if *PRINT { - writeln!(&mut stderr(), "{}", line).unwrap(); - } else { - if let Some((n, spec)) = spec_lines.next() { - if line != spec { - const SPLITTER: &'static str = "-------------------"; - panic!("\n{}\nUnexpected event at line {}:\nExpected: {}\nFound: {}\n{}\n", - SPLITTER, n + 1, spec, line, std::str::from_utf8(output).unwrap()); - } - } else { - panic!("Unexpected event: {}", line); - } - } - - match e { - Ok(XmlEvent::EndDocument) | Err(_) => break, - _ => {}, - } - } -} - -// Here we define our own string representation of events so we don't depend -// on the specifics of Display implementation for XmlEvent and OwnedName. - -struct Name<'a>(&'a OwnedName); - -impl <'a> fmt::Display for Name<'a> { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - if let Some(ref namespace) = self.0.namespace { - try! { write!(f, "{{{}}}", namespace) } - } - - if let Some(ref prefix) = self.0.prefix { - try! { write!(f, "{}:", prefix) } - } - - write!(f, "{}", self.0.local_name) - } -} - -struct Event<'a>(&'a Result); - -impl<'a> fmt::Display for Event<'a> { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let empty = String::new(); - match *self.0 { - Ok(ref e) => match *e { - XmlEvent::StartDocument { ref version, ref encoding, .. } => - write!(f, "StartDocument({}, {})", version, encoding), - XmlEvent::EndDocument => - write!(f, "EndDocument"), - XmlEvent::ProcessingInstruction { ref name, ref data } => - write!(f, "ProcessingInstruction({}={:?})", name, - data.as_ref().unwrap_or(&empty)), - XmlEvent::StartElement { ref name, ref attributes, .. } => { - if attributes.is_empty() { - write!(f, "StartElement({})", Name(name)) - } - else { - let attrs: Vec<_> = attributes.iter() - .map(|a| format!("{}={:?}", Name(&a.name), a.value)) .collect(); - write!(f, "StartElement({} [{}])", Name(name), attrs.join(", ")) - } - }, - XmlEvent::EndElement { ref name } => - write!(f, "EndElement({})", Name(name)), - XmlEvent::Comment(ref data) => - write!(f, r#"Comment("{}")"#, data.escape_debug()), - XmlEvent::CData(ref data) => - write!(f, r#"CData("{}")"#, data.escape_debug()), - XmlEvent::Characters(ref data) => - write!(f, r#"Characters("{}")"#, data.escape_debug()), - XmlEvent::Whitespace(ref data) => - write!(f, r#"Whitespace("{}")"#, data.escape_debug()), - }, - Err(ref e) => e.fmt(f), - } - } -} diff --git a/tests/event_writer.rs b/tests/event_writer.rs deleted file mode 100644 index dd64a43..0000000 --- a/tests/event_writer.rs +++ /dev/null @@ -1,269 +0,0 @@ -#![forbid(unsafe_code)] - -extern crate xml; - -use std::io::{BufReader, SeekFrom}; -use std::io::prelude::*; -use std::fs::File; -use std::str; - -use xml::reader::EventReader; -use xml::writer::EmitterConfig; - -macro_rules! unwrap_all { - ($($e:expr);+) => {{ - $($e.unwrap();)+ - }} -} - -#[test] -fn reading_writing_equal_with_namespaces() { - let mut f = File::open("tests/documents/sample_2.xml").unwrap(); - let mut b = Vec::new(); - - { - let r = EventReader::new(BufReader::new(&mut f)); - let mut w = EmitterConfig::default().perform_indent(true).create_writer(&mut b); - - for e in r { - match e { - Ok(e) => if let Some(e) = e.as_writer_event() { - match w.write(e) { - Ok(_) => {}, - Err(e) => panic!("Writer error: {:?}", e) - } - }, - Err(e) => panic!("Error: {}", e) - } - } - } - - f.seek(SeekFrom::Start(0)).unwrap(); - let mut fs = String::new(); - f.read_to_string(&mut fs).unwrap(); - - let bs = String::from_utf8(b).unwrap(); - - assert_eq!(fs.trim(), bs.trim()); -} - -#[test] -fn writing_simple() { - use xml::writer::XmlEvent; - - let mut b = Vec::new(); - - { - let mut w = EmitterConfig::new().write_document_declaration(false).create_writer(&mut b); - - w.write(XmlEvent::start_element("h:hello").ns("h", "urn:hello-world")).unwrap(); - w.write("hello world").unwrap(); - w.write(XmlEvent::end_element()).unwrap(); - } - - assert_eq!( - str::from_utf8(&b).unwrap(), - r#"hello world"# - ); -} - -#[test] -fn writing_empty_elements_with_normalizing() { - use xml::writer::XmlEvent; - - let mut b = Vec::new(); - - { - let mut w = EmitterConfig::new().write_document_declaration(false).create_writer(&mut b); - - unwrap_all! { - w.write(XmlEvent::start_element("hello")); - w.write(XmlEvent::start_element("world")); - w.write(XmlEvent::end_element()); - w.write(XmlEvent::end_element()) - } - } - - assert_eq!(str::from_utf8(&b).unwrap(), r#""#); -} - -#[test] -fn writing_empty_elements_without_normalizing() { - use xml::writer::XmlEvent; - - let mut b = Vec::new(); - - { - let mut w = EmitterConfig::new() - .write_document_declaration(false) - .normalize_empty_elements(false) - .create_writer(&mut b); - - unwrap_all! { - w.write(XmlEvent::start_element("hello")); - w.write(XmlEvent::start_element("world")); - w.write(XmlEvent::end_element()); - w.write(XmlEvent::end_element()) - } - } - - assert_eq!(str::from_utf8(&b).unwrap(), r#""#); -} - -#[test] -fn writing_empty_elements_without_pad_self_closing() { - use xml::writer::XmlEvent; - - let mut b = Vec::new(); - - { - let mut w = EmitterConfig::new() - .write_document_declaration(false) - .pad_self_closing(false) - .create_writer(&mut b); - - unwrap_all! { - w.write(XmlEvent::start_element("hello")); - w.write(XmlEvent::start_element("world")); - w.write(XmlEvent::end_element()); - w.write(XmlEvent::end_element()) - } - } - - assert_eq!(str::from_utf8(&b).unwrap(), r#""#); -} -#[test] -fn writing_empty_elements_pad_self_closing_explicit() { - use xml::writer::XmlEvent; - - let mut b = Vec::new(); - - { - let mut w = EmitterConfig::new() - .write_document_declaration(false) - .pad_self_closing(true) - .create_writer(&mut b); - - unwrap_all! { - w.write(XmlEvent::start_element("hello")); - w.write(XmlEvent::start_element("world")); - w.write(XmlEvent::end_element()); - w.write(XmlEvent::end_element()) - } - } - - assert_eq!(str::from_utf8(&b).unwrap(), r#""#); -} - -#[test] -fn writing_comments_with_indentation() { - use xml::writer::XmlEvent; - - let mut b = Vec::new(); - - { - let mut w = EmitterConfig::new() - .write_document_declaration(false) - .perform_indent(true) - .create_writer(&mut b); - - unwrap_all! { - w.write(XmlEvent::start_element("hello")); - w.write(XmlEvent::start_element("world")); - w.write(XmlEvent::comment(" this is a manually padded comment\t")); - w.write(XmlEvent::comment("this is an unpadded comment")); - w.write(XmlEvent::end_element()); - w.write(XmlEvent::end_element()) - } - } - - assert_eq!( - str::from_utf8(&b).unwrap(), - " - - - - -"); -} - -#[test] -fn issue_112_overriding_namepace_prefix() { - use xml::writer::XmlEvent; - - let mut b = Vec::new(); - - { - let mut w = EmitterConfig::new() - .write_document_declaration(false) - .create_writer(&mut b); - - unwrap_all! { - w.write(XmlEvent::start_element("iq").ns("", "jabber:client").ns("a", "urn:A")); - w.write(XmlEvent::start_element("bind").ns("", "urn:ietf:params:xml:ns:xmpp-bind")); - w.write(XmlEvent::end_element()); - w.write(XmlEvent::start_element("whatever").ns("a", "urn:X")); - w.write(XmlEvent::end_element()); - w.write(XmlEvent::end_element()) - } - } - - assert_eq!( - str::from_utf8(&b).unwrap(), - r#""# - ) -} - -#[test] -fn attribute_escaping() { - use xml::writer::XmlEvent; - - let mut b = Vec::new(); - - { - let mut w = EmitterConfig::new() - .write_document_declaration(false) - .perform_indent(true) - .create_writer(&mut b); - - unwrap_all! { - w.write( - XmlEvent::start_element("hello") - .attr("testLt", "<") - .attr("testGt", ">") - ); - w.write(XmlEvent::end_element()); - w.write( - XmlEvent::start_element("hello") - .attr("testQuot", "\"") - .attr("testApos", "\'") - ); - w.write(XmlEvent::end_element()); - w.write( - XmlEvent::start_element("hello") - .attr("testAmp", "&") - ); - w.write(XmlEvent::end_element()); - w.write( - XmlEvent::start_element("hello") - .attr("testNl", "\n") - .attr("testCr", "\r") - ); - w.write(XmlEvent::end_element()); - w.write( - XmlEvent::start_element("hello") - .attr("testNl", "\\n") - .attr("testCr", "\\r") - ); - w.write(XmlEvent::end_element()) - } - } - assert_eq!( - str::from_utf8(&b).unwrap(), - " - - - -" - ); -} \ No newline at end of file diff --git a/tests/streaming.rs b/tests/streaming.rs deleted file mode 100644 index a577a00..0000000 --- a/tests/streaming.rs +++ /dev/null @@ -1,103 +0,0 @@ -#![forbid(unsafe_code)] - -extern crate xml; - -use std::io::{Cursor, Write}; - -use xml::EventReader; -use xml::reader::ParserConfig; -use xml::reader::XmlEvent; - -macro_rules! assert_match { - ($actual:expr, $expected:pat) => { - match $actual { - $expected => {}, - _ => panic!("assertion failed: `(left matches right)` \ - (left: `{:?}`, right: `{}`", $actual, stringify!($expected)) - } - }; - ($actual:expr, $expected:pat if $guard:expr) => { - match $actual { - $expected if $guard => {}, - _ => panic!("assertion failed: `(left matches right)` \ - (left: `{:?}`, right: `{} if {}`", - $actual, stringify!($expected), stringify!($guard)) - } - } -} - -fn write_and_reset_position(c: &mut Cursor, data: &[u8]) where Cursor: Write { - let p = c.position(); - c.write_all(data).unwrap(); - c.set_position(p); -} - -#[test] -fn reading_streamed_content() { - let buf = Cursor::new(b"".to_vec()); - let reader = EventReader::new(buf); - - let mut it = reader.into_iter(); - - assert_match!(it.next(), Some(Ok(XmlEvent::StartDocument { .. }))); - assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "root"); - - write_and_reset_position(it.source_mut(), b"content"); - assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-1"); - assert_match!(it.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content"); - assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-1"); - - write_and_reset_position(it.source_mut(), b""); - assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-2"); - assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-2"); - - write_and_reset_position(it.source_mut(), b""); - assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-3"); - assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-3"); - // doesn't seem to work because of how tags parsing is done -// write_and_reset_position(it.source_mut(), b"some text"); - // assert_match!(it.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "some text"); - - write_and_reset_position(it.source_mut(), b""); - assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "root"); - assert_match!(it.next(), Some(Ok(XmlEvent::EndDocument))); - assert_match!(it.next(), None); -} - -#[test] -fn reading_streamed_content2() { - let buf = Cursor::new(b"".to_vec()); - let mut config = ParserConfig::new(); - config.ignore_end_of_stream = true; - let readerb = EventReader::new_with_config(buf, config); - - let mut reader = readerb.into_iter(); - - assert_match!(reader.next(), Some(Ok(XmlEvent::StartDocument { .. }))); - assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "root"); - - write_and_reset_position(reader.source_mut(), b"content"); - assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-1"); - assert_match!(reader.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content"); - assert_match!(reader.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-1"); - - write_and_reset_position(reader.source_mut(), b"content"); - - assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-2"); - assert_match!(reader.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content"); - assert_match!(reader.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-2"); - assert_match!(reader.next(), Some(Err(_))); - write_and_reset_position(reader.source_mut(), b""); - assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-3"); - write_and_reset_position(reader.source_mut(), b" { - panic!("At this point, parser must not detect something."); - }, - Some(Err(_)) => {} - }; - write_and_reset_position(reader.source_mut(), b" />"); - assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-4"); -} - -- cgit v1.2.3