diff options
author | Sasha Smundak <asmundak@google.com> | 2020-06-02 14:56:13 -0700 |
---|---|---|
committer | Sasha Smundak <asmundak@google.com> | 2020-06-02 15:19:39 -0700 |
commit | 911f836dc405291b8d655328e785c00d3093528b (patch) | |
tree | d9c7512dfe416e2c5cd2e30fb081b0a70ac6ecad | |
parent | 5144115e7cda2641b0a3db1dd53ac557bbb58851 (diff) | |
parent | 23e6ba8cc2de65cb79fedf9827731040ddd4bdf5 (diff) | |
download | go-etree-911f836dc405291b8d655328e785c00d3093528b.tar.gz |
Merge sso://github/beevik/etree, add mandatory files
Bug: 158031244
Test: N/A
Change-Id: Idda5b583f92c0d952e1af23dcf90eed94db2f843
-rw-r--r-- | .travis.yml | 18 | ||||
-rw-r--r-- | CONTRIBUTORS | 10 | ||||
-rw-r--r-- | LICENSE | 24 | ||||
-rw-r--r-- | METADATA | 20 | ||||
-rw-r--r-- | MODULE_LICENSE_BSD | 0 | ||||
-rw-r--r-- | README.md | 205 | ||||
-rw-r--r-- | RELEASE_NOTES.md | 109 | ||||
-rw-r--r-- | etree.go | 1505 | ||||
-rw-r--r-- | etree_test.go | 1115 | ||||
-rw-r--r-- | example_test.go | 69 | ||||
-rw-r--r-- | go.mod | 3 | ||||
-rw-r--r-- | helpers.go | 276 | ||||
-rw-r--r-- | path.go | 580 | ||||
-rw-r--r-- | path_test.go | 222 |
14 files changed, 4156 insertions, 0 deletions
diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..e12bb98 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,18 @@ +language: go +sudo: false + +env: + - GO111MODULE=on + +go: + - 1.11.x + - 1.12.x + - tip + +matrix: + allow_failures: + - go: tip + +script: + - go vet ./... + - go test -v ./... diff --git a/CONTRIBUTORS b/CONTRIBUTORS new file mode 100644 index 0000000..03211a8 --- /dev/null +++ b/CONTRIBUTORS @@ -0,0 +1,10 @@ +Brett Vickers (beevik) +Felix Geisendörfer (felixge) +Kamil Kisiel (kisielk) +Graham King (grahamking) +Matt Smith (ma314smith) +Michal Jemala (michaljemala) +Nicolas Piganeau (npiganeau) +Chris Brown (ccbrown) +Earncef Sequeira (earncef) +Gabriel de Labachelerie (wuzuf) @@ -0,0 +1,24 @@ +Copyright 2015-2019 Brett Vickers. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDER ``AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/METADATA b/METADATA new file mode 100644 index 0000000..bcfee03 --- /dev/null +++ b/METADATA @@ -0,0 +1,20 @@ +name: "go-etree" +description: + "The etree package is a lightweight, pure go package that expresses XML in " + "the form of an element tree. Its design was inspired by the Python " + "ElementTree module." + +third_party { + url { + type: HOMEPAGE + value: "https://github.com/beevik/etree" + } + url { + type: GIT + value: "https://github.com/beevik/etree.git" + } + version: "v1.1.0" + last_upgrade_date { year: 2020 month: 2 day: 22 } + license_type: NOTICE +} + diff --git a/MODULE_LICENSE_BSD b/MODULE_LICENSE_BSD new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/MODULE_LICENSE_BSD diff --git a/README.md b/README.md new file mode 100644 index 0000000..08ec26b --- /dev/null +++ b/README.md @@ -0,0 +1,205 @@ +[![Build Status](https://travis-ci.org/beevik/etree.svg?branch=master)](https://travis-ci.org/beevik/etree) +[![GoDoc](https://godoc.org/github.com/beevik/etree?status.svg)](https://godoc.org/github.com/beevik/etree) + +etree +===== + +The etree package is a lightweight, pure go package that expresses XML in +the form of an element tree. Its design was inspired by the Python +[ElementTree](http://docs.python.org/2/library/xml.etree.elementtree.html) +module. + +Some of the package's capabilities and features: + +* Represents XML documents as trees of elements for easy traversal. 
+* Imports, serializes, modifies or creates XML documents from scratch. +* Writes and reads XML to/from files, byte slices, strings and io interfaces. +* Performs simple or complex searches with lightweight XPath-like query APIs. +* Auto-indents XML using spaces or tabs for better readability. +* Implemented in pure go; depends only on standard go libraries. +* Built on top of the go [encoding/xml](http://golang.org/pkg/encoding/xml) + package. + +### Creating an XML document + +The following example creates an XML document from scratch using the etree +package and outputs its indented contents to stdout. +```go +doc := etree.NewDocument() +doc.CreateProcInst("xml", `version="1.0" encoding="UTF-8"`) +doc.CreateProcInst("xml-stylesheet", `type="text/xsl" href="style.xsl"`) + +people := doc.CreateElement("People") +people.CreateComment("These are all known people") + +jon := people.CreateElement("Person") +jon.CreateAttr("name", "Jon") + +sally := people.CreateElement("Person") +sally.CreateAttr("name", "Sally") + +doc.Indent(2) +doc.WriteTo(os.Stdout) +``` + +Output: +```xml +<?xml version="1.0" encoding="UTF-8"?> +<?xml-stylesheet type="text/xsl" href="style.xsl"?> +<People> + <!--These are all known people--> + <Person name="Jon"/> + <Person name="Sally"/> +</People> +``` + +### Reading an XML file + +Suppose you have a file on disk called `bookstore.xml` containing the +following data: + +```xml +<bookstore xmlns:p="urn:schemas-books-com:prices"> + + <book category="COOKING"> + <title lang="en">Everyday Italian</title> + <author>Giada De Laurentiis</author> + <year>2005</year> + <p:price>30.00</p:price> + </book> + + <book category="CHILDREN"> + <title lang="en">Harry Potter</title> + <author>J K. 
Rowling</author> + <year>2005</year> + <p:price>29.99</p:price> + </book> + + <book category="WEB"> + <title lang="en">XQuery Kick Start</title> + <author>James McGovern</author> + <author>Per Bothner</author> + <author>Kurt Cagle</author> + <author>James Linn</author> + <author>Vaidyanathan Nagarajan</author> + <year>2003</year> + <p:price>49.99</p:price> + </book> + + <book category="WEB"> + <title lang="en">Learning XML</title> + <author>Erik T. Ray</author> + <year>2003</year> + <p:price>39.95</p:price> + </book> + +</bookstore> +``` + +This code reads the file's contents into an etree document. +```go +doc := etree.NewDocument() +if err := doc.ReadFromFile("bookstore.xml"); err != nil { + panic(err) +} +``` + +You can also read XML from a string, a byte slice, or an `io.Reader`. + +### Processing elements and attributes + +This example illustrates several ways to access elements and attributes using +etree selection queries. +```go +root := doc.SelectElement("bookstore") +fmt.Println("ROOT element:", root.Tag) + +for _, book := range root.SelectElements("book") { + fmt.Println("CHILD element:", book.Tag) + if title := book.SelectElement("title"); title != nil { + lang := title.SelectAttrValue("lang", "unknown") + fmt.Printf(" TITLE: %s (%s)\n", title.Text(), lang) + } + for _, attr := range book.Attr { + fmt.Printf(" ATTR: %s=%s\n", attr.Key, attr.Value) + } +} +``` +Output: +``` +ROOT element: bookstore +CHILD element: book + TITLE: Everyday Italian (en) + ATTR: category=COOKING +CHILD element: book + TITLE: Harry Potter (en) + ATTR: category=CHILDREN +CHILD element: book + TITLE: XQuery Kick Start (en) + ATTR: category=WEB +CHILD element: book + TITLE: Learning XML (en) + ATTR: category=WEB +``` + +### Path queries + +This example uses etree's path functions to select all book titles that fall +into the category of 'WEB'. 
The double-slash prefix in the path causes the +search for book elements to occur recursively; book elements may appear at any +level of the XML hierarchy. +```go +for _, t := range doc.FindElements("//book[@category='WEB']/title") { + fmt.Println("Title:", t.Text()) +} +``` + +Output: +``` +Title: XQuery Kick Start +Title: Learning XML +``` + +This example finds the first book element under the root bookstore element and +outputs the tag and text of each of its child elements. +```go +for _, e := range doc.FindElements("./bookstore/book[1]/*") { + fmt.Printf("%s: %s\n", e.Tag, e.Text()) +} +``` + +Output: +``` +title: Everyday Italian +author: Giada De Laurentiis +year: 2005 +price: 30.00 +``` + +This example finds all books with a price of 49.99 and outputs their titles. +```go +path := etree.MustCompilePath("./bookstore/book[p:price='49.99']/title") +for _, e := range doc.FindElementsPath(path) { + fmt.Println(e.Text()) +} +``` + +Output: +``` +XQuery Kick Start +``` + +Note that this example uses the FindElementsPath function, which takes as an +argument a pre-compiled path object. Use precompiled paths when you plan to +search with the same path more than once. + +### Other features + +These are just a few examples of the things the etree package can do. See the +[documentation](http://godoc.org/github.com/beevik/etree) for a complete +description of its capabilities. + +### Contributing + +This project accepts contributions. Just fork the repo and submit a pull +request! diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md new file mode 100644 index 0000000..ee59d7a --- /dev/null +++ b/RELEASE_NOTES.md @@ -0,0 +1,109 @@ +Release v1.1.0 +============== + +**New Features** + +* New attribute helpers. + * Added the `Element.SortAttrs` method, which lexicographically sorts an + element's attributes by key. +* New `ReadSettings` properties. + * Added `Entity` for the support of custom entity maps. +* New `WriteSettings` properties. 
+ * Added `UseCRLF` to allow the output of CR-LF newlines instead of the + default LF newlines. This is useful on Windows systems. +* Additional support for text and CDATA sections. + * The `Element.Text` method now returns the concatenation of all consecutive + character data tokens immediately following an element's opening tag. + * Added `Element.SetCData` to replace the character data immediately + following an element's opening tag with a CDATA section. + * Added `Element.CreateCData` to create and add a CDATA section child + `CharData` token to an element. + * Added `Element.CreateText` to create and add a child text `CharData` token + to an element. + * Added `NewCData` to create a parentless CDATA section `CharData` token. + * Added `NewText` to create a parentless text `CharData` + token. + * Added `CharData.IsCData` to detect if the token contains a CDATA section. + * Added `CharData.IsWhitespace` to detect if the token contains whitespace + inserted by one of the document Indent functions. + * Modified `Element.SetText` so that it replaces a run of consecutive + character data tokens following the element's opening tag (instead of just + the first one). +* New "tail text" support. + * Added the `Element.Tail` method, which returns the text immediately + following an element's closing tag. + * Added the `Element.SetTail` method, which modifies the text immediately + following an element's closing tag. +* New element child insertion and removal methods. + * Added the `Element.InsertChildAt` method, which inserts a new child token + before the specified child token index. + * Added the `Element.RemoveChildAt` method, which removes the child token at + the specified child token index. +* New element and attribute queries. + * Added the `Element.Index` method, which returns the element's index within + its parent element's child token list. + * Added the `Element.NamespaceURI` method to return the namespace URI + associated with an element. 
+ * Added the `Attr.NamespaceURI` method to return the namespace URI + associated with an attribute. + * Added the `Attr.Element` method to return the element that an attribute + belongs to. +* New Path filter functions. + * Added `[local-name()='val']` to keep elements whose unprefixed tag matches + the desired value. + * Added `[name()='val']` to keep elements whose full tag matches the desired + value. + * Added `[namespace-prefix()='val']` to keep elements whose namespace prefix + matches the desired value. + * Added `[namespace-uri()='val']` to keep elements whose namespace URI + matches the desired value. + +**Bug Fixes** + +* A default XML `CharsetReader` is now used to prevent failed parsing of XML + documents using certain encodings. + ([Issue](https://github.com/beevik/etree/issues/53)). +* All characters are now properly escaped according to XML parsing rules. + ([Issue](https://github.com/beevik/etree/issues/55)). +* The `Document.Indent` and `Document.IndentTabs` functions no longer insert + empty string `CharData` tokens. + +**Deprecated** + +* `Element` + * The `InsertChild` method is deprecated. Use `InsertChildAt` instead. + * The `CreateCharData` method is deprecated. Use `CreateText` instead. +* `CharData` + * The `NewCharData` method is deprecated. Use `NewText` instead. + + +Release v1.0.1 +============== + +**Changes** + +* Added support for absolute etree Path queries. An absolute path begins with + `/` or `//` and begins its search from the element's document root. +* Added [`GetPath`](https://godoc.org/github.com/beevik/etree#Element.GetPath) + and [`GetRelativePath`](https://godoc.org/github.com/beevik/etree#Element.GetRelativePath) + functions to the [`Element`](https://godoc.org/github.com/beevik/etree#Element) + type. + +**Breaking changes** + +* A path starting with `//` is now interpreted as an absolute path. 
+ Previously, it was interpreted as a relative path starting from the element + whose + [`FindElement`](https://godoc.org/github.com/beevik/etree#Element.FindElement) + method was called. To remain compatible with this release, all paths + prefixed with `//` should be prefixed with `.//` when called from any + element other than the document's root. +* [**edit 2/1/2019**]: Minor releases should not contain breaking changes. + Even though this breaking change was very minor, it was a mistake to include + it in this minor release. In the future, all breaking changes will be + limited to major releases (e.g., version 2.0.0). + +Release v1.0.0 +============== + +Initial release. diff --git a/etree.go b/etree.go new file mode 100644 index 0000000..8a8c9bb --- /dev/null +++ b/etree.go @@ -0,0 +1,1505 @@ +// Copyright 2015-2019 Brett Vickers. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package etree provides XML services through an Element Tree +// abstraction. +package etree + +import ( + "bufio" + "bytes" + "encoding/xml" + "errors" + "io" + "os" + "sort" + "strings" +) + +const ( + // NoIndent is used with the Document Indent function to disable all + // indenting. + NoIndent = -1 +) + +// ErrXML is returned when XML parsing fails due to incorrect formatting. +var ErrXML = errors.New("etree: invalid XML format") + +// ReadSettings determine the default behavior of the Document's ReadFrom* +// methods. +type ReadSettings struct { + // CharsetReader to be passed to standard xml.Decoder. Default: nil. + CharsetReader func(charset string, input io.Reader) (io.Reader, error) + + // Permissive allows input containing common mistakes such as missing tags + // or attribute values. Default: false. + Permissive bool + + // Entity to be passed to standard xml.Decoder. Default: nil. + Entity map[string]string +} + +// newReadSettings creates a default ReadSettings record. 
+func newReadSettings() ReadSettings { + return ReadSettings{ + CharsetReader: func(label string, input io.Reader) (io.Reader, error) { + return input, nil + }, + Permissive: false, + } +} + +// dup creates a duplicate of the ReadSettings object. +func (s *ReadSettings) dup() ReadSettings { + var entityCopy map[string]string + if s.Entity != nil { + entityCopy = make(map[string]string) + for k, v := range s.Entity { + entityCopy[k] = v + } + } + return ReadSettings{ + CharsetReader: s.CharsetReader, + Permissive: s.Permissive, + Entity: entityCopy, + } +} + +// WriteSettings determine the behavior of the Document's WriteTo* and +// Indent* methods. +type WriteSettings struct { + // CanonicalEndTags forces the production of XML end tags, even for + // elements that have no child elements. Default: false. + CanonicalEndTags bool + + // CanonicalText forces the production of XML character references for + // text data characters &, <, and >. If false, XML character references + // are also produced for " and '. Default: false. + CanonicalText bool + + // CanonicalAttrVal forces the production of XML character references for + // attribute value characters &, < and ". If false, XML character + // references are also produced for > and '. Default: false. + CanonicalAttrVal bool + + // UseCRLF causes the document's indentation methods to use a carriage + // return followed by a linefeed ("\r\n") when outputting a newline. If + // false, only a linefeed is used ("\n"). Default: false. + UseCRLF bool +} + +// newWriteSettings creates a default WriteSettings record. +func newWriteSettings() WriteSettings { + return WriteSettings{ + CanonicalEndTags: false, + CanonicalText: false, + CanonicalAttrVal: false, + UseCRLF: false, + } +} + +// dup creates a duplicate of the WriteSettings object. 
+func (s *WriteSettings) dup() WriteSettings { + return *s +} + +// A Token is an interface type used to represent XML elements, character +// data, CDATA sections, XML comments, XML directives, and XML processing +// instructions. +type Token interface { + Parent() *Element + Index() int + dup(parent *Element) Token + setParent(parent *Element) + setIndex(index int) + writeTo(w *bufio.Writer, s *WriteSettings) +} + +// A Document is a container holding a complete XML tree. +// +// A document has a single embedded element, which contains zero or more child +// tokens, one of which is usually the root element. The embedded element may +// include other children such as processing instruction tokens or character +// data tokens. The document's embedded element is never directly serialized; +// only its children are. +// +// A document also contains read and write settings, which influence the way +// the document is deserialized, serialized, and indented. +type Document struct { + Element + ReadSettings ReadSettings + WriteSettings WriteSettings +} + +// An Element represents an XML element, its attributes, and its child tokens. +type Element struct { + Space, Tag string // namespace prefix and tag + Attr []Attr // key-value attribute pairs + Child []Token // child tokens (elements, comments, etc.) + parent *Element // parent element + index int // token index in parent's children +} + +// An Attr represents a key-value attribute within an XML element. +type Attr struct { + Space, Key string // The attribute's namespace prefix and key + Value string // The attribute value string + element *Element // element containing the attribute +} + +// charDataFlags are used with CharData tokens to store additional settings. +type charDataFlags uint8 + +const ( + // The CharData contains only whitespace. + whitespaceFlag charDataFlags = 1 << iota + + // The CharData contains a CDATA section. 
+ cdataFlag +) + +// CharData may be used to represent simple text data or a CDATA section +// within an XML document. The Data property should never be modified +// directly; use the SetData method instead. +type CharData struct { + Data string // the simple text or CDATA section content + parent *Element + index int + flags charDataFlags +} + +// A Comment represents an XML comment. +type Comment struct { + Data string // the comment's text + parent *Element + index int +} + +// A Directive represents an XML directive. +type Directive struct { + Data string // the directive string + parent *Element + index int +} + +// A ProcInst represents an XML processing instruction. +type ProcInst struct { + Target string // the processing instruction target + Inst string // the processing instruction value + parent *Element + index int +} + +// NewDocument creates an XML document without a root element. +func NewDocument() *Document { + return &Document{ + Element: Element{Child: make([]Token, 0)}, + ReadSettings: newReadSettings(), + WriteSettings: newWriteSettings(), + } +} + +// NewDocumentWithRoot creates an XML document and sets the element 'e' as its +// root element. If the element 'e' is already part of another document, it is +// first removed from its existing document. +func NewDocumentWithRoot(e *Element) *Document { + d := NewDocument() + d.SetRoot(e) + return d +} + +// Copy returns a recursive, deep copy of the document. +func (d *Document) Copy() *Document { + return &Document{ + Element: *(d.Element.dup(nil).(*Element)), + ReadSettings: d.ReadSettings.dup(), + WriteSettings: d.WriteSettings.dup(), + } +} + +// Root returns the root element of the document. It returns nil if there is +// no root element. +func (d *Document) Root() *Element { + for _, t := range d.Child { + if c, ok := t.(*Element); ok { + return c + } + } + return nil +} + +// SetRoot replaces the document's root element with the element 'e'. 
If the +// document already has a root element when this function is called, then the +// existing root element is unbound from the document. If the element 'e' is +// part of another document, then it is unbound from the other document. +func (d *Document) SetRoot(e *Element) { + if e.parent != nil { + e.parent.RemoveChild(e) + } + + // If there is already a root element, replace it. + p := &d.Element + for i, t := range p.Child { + if _, ok := t.(*Element); ok { + t.setParent(nil) + t.setIndex(-1) + p.Child[i] = e + e.setParent(p) + e.setIndex(i) + return + } + } + + // No existing root element, so add it. + p.addChild(e) +} + +// ReadFrom reads XML from the reader 'r' into this document. The function +// returns the number of bytes read and any error encountered. +func (d *Document) ReadFrom(r io.Reader) (n int64, err error) { + return d.Element.readFrom(r, d.ReadSettings) +} + +// ReadFromFile reads XML from a local file at path 'filepath' into this +// document. +func (d *Document) ReadFromFile(filepath string) error { + f, err := os.Open(filepath) + if err != nil { + return err + } + defer f.Close() + _, err = d.ReadFrom(f) + return err +} + +// ReadFromBytes reads XML from the byte slice 'b' into this document. +func (d *Document) ReadFromBytes(b []byte) error { + _, err := d.ReadFrom(bytes.NewReader(b)) + return err +} + +// ReadFromString reads XML from the string 's' into this document. +func (d *Document) ReadFromString(s string) error { + _, err := d.ReadFrom(strings.NewReader(s)) + return err +} + +// WriteTo serializes the document out to the writer 'w'. The function returns +// the number of bytes written and any error encountered. +func (d *Document) WriteTo(w io.Writer) (n int64, err error) { + cw := newCountWriter(w) + b := bufio.NewWriter(cw) + for _, c := range d.Child { + c.writeTo(b, &d.WriteSettings) + } + err, n = b.Flush(), cw.bytes + return +} + +// WriteToFile serializes the document out to the file at path 'filepath'. 
+func (d *Document) WriteToFile(filepath string) error { + f, err := os.Create(filepath) + if err != nil { + return err + } + defer f.Close() + _, err = d.WriteTo(f) + return err +} + +// WriteToBytes serializes this document into a slice of bytes. +func (d *Document) WriteToBytes() (b []byte, err error) { + var buf bytes.Buffer + if _, err = d.WriteTo(&buf); err != nil { + return + } + return buf.Bytes(), nil +} + +// WriteToString serializes this document into a string. +func (d *Document) WriteToString() (s string, err error) { + var b []byte + if b, err = d.WriteToBytes(); err != nil { + return + } + return string(b), nil +} + +type indentFunc func(depth int) string + +// Indent modifies the document's element tree by inserting character data +// tokens containing newlines and indentation. The amount of indentation per +// depth level is given by the 'spaces' parameter. Pass etree.NoIndent for +// 'spaces' if you want no indentation at all. +func (d *Document) Indent(spaces int) { + var indent indentFunc + switch { + case spaces < 0: + indent = func(depth int) string { return "" } + case d.WriteSettings.UseCRLF: + indent = func(depth int) string { return indentCRLF(depth*spaces, indentSpaces) } + default: + indent = func(depth int) string { return indentLF(depth*spaces, indentSpaces) } + } + d.Element.indent(0, indent) +} + +// IndentTabs modifies the document's element tree by inserting CharData +// tokens containing newlines and tabs for indentation. One tab is used per +// indentation level. +func (d *Document) IndentTabs() { + var indent indentFunc + switch d.WriteSettings.UseCRLF { + case true: + indent = func(depth int) string { return indentCRLF(depth, indentTabs) } + default: + indent = func(depth int) string { return indentLF(depth, indentTabs) } + } + d.Element.indent(0, indent) +} + +// NewElement creates an unparented element with the specified tag (i.e., +// name). The tag may include a namespace prefix followed by a colon. 
+func NewElement(tag string) *Element { + space, stag := spaceDecompose(tag) + return newElement(space, stag, nil) +} + +// newElement is a helper function that creates an element and binds it to +// a parent element if possible. +func newElement(space, tag string, parent *Element) *Element { + e := &Element{ + Space: space, + Tag: tag, + Attr: make([]Attr, 0), + Child: make([]Token, 0), + parent: parent, + index: -1, + } + if parent != nil { + parent.addChild(e) + } + return e +} + +// Copy creates a recursive, deep copy of the element and all its attributes +// and children. The returned element has no parent but can be parented to +// another element using AddChild, or added to a document with SetRoot or +// NewDocumentWithRoot. +func (e *Element) Copy() *Element { + return e.dup(nil).(*Element) +} + +// FullTag returns the element e's complete tag, including namespace prefix if +// present. +func (e *Element) FullTag() string { + if e.Space == "" { + return e.Tag + } + return e.Space + ":" + e.Tag +} + +// NamespaceURI returns the XML namespace URI associated with the element. An +// unprefixed element resolves to the nearest declared default namespace URI; +// the empty string is returned when no matching declaration is found. +func (e *Element) NamespaceURI() string { + if e.Space == "" { + return e.findDefaultNamespaceURI() + } + return e.findLocalNamespaceURI(e.Space) +} + +// findLocalNamespaceURI finds the namespace URI corresponding to the +// requested prefix. +func (e *Element) findLocalNamespaceURI(prefix string) string { + for _, a := range e.Attr { + if a.Space == "xmlns" && a.Key == prefix { + return a.Value + } + } + + if e.parent == nil { + return "" + } + + return e.parent.findLocalNamespaceURI(prefix) +} + +// findDefaultNamespaceURI finds the default namespace URI of the element. 
+func (e *Element) findDefaultNamespaceURI() string { + for _, a := range e.Attr { + if a.Space == "" && a.Key == "xmlns" { + return a.Value + } + } + + if e.parent == nil { + return "" + } + + return e.parent.findDefaultNamespaceURI() +} + +// namespacePrefix returns the namespace prefix associated with the element. +func (e *Element) namespacePrefix() string { + return e.Space +} + +// name returns the tag associated with the element. +func (e *Element) name() string { + return e.Tag +} + +// Text returns all character data immediately following the element's opening +// tag. +func (e *Element) Text() string { + if len(e.Child) == 0 { + return "" + } + + text := "" + for _, ch := range e.Child { + if cd, ok := ch.(*CharData); ok { + if text == "" { + text = cd.Data + } else { + text += cd.Data + } + } else { + break + } + } + return text +} + +// SetText replaces all character data immediately following an element's +// opening tag with the requested string. +func (e *Element) SetText(text string) { + e.replaceText(0, text, 0) +} + +// SetCData replaces all character data immediately following an element's +// opening tag with a CDATA section. +func (e *Element) SetCData(text string) { + e.replaceText(0, text, cdataFlag) +} + +// Tail returns all character data immediately following the element's end +// tag. +func (e *Element) Tail() string { + if e.Parent() == nil { + return "" + } + + p := e.Parent() + i := e.Index() + + text := "" + for _, ch := range p.Child[i+1:] { + if cd, ok := ch.(*CharData); ok { + if text == "" { + text = cd.Data + } else { + text += cd.Data + } + } else { + break + } + } + return text +} + +// SetTail replaces all character data immediately following the element's end +// tag with the requested string. 
+func (e *Element) SetTail(text string) { + if e.Parent() == nil { + return + } + + p := e.Parent() + p.replaceText(e.Index()+1, text, 0) +} + +// replaceText is a helper function that replaces a series of chardata tokens +// starting at index i with the requested text. +func (e *Element) replaceText(i int, text string, flags charDataFlags) { + end := e.findTermCharDataIndex(i) + + switch { + case end == i: + if text != "" { + // insert a new chardata token at index i + cd := newCharData(text, flags, nil) + e.InsertChildAt(i, cd) + } + + case end == i+1: + if text == "" { + // remove the chardata token at index i + e.RemoveChildAt(i) + } else { + // replace the first and only character token at index i + cd := e.Child[i].(*CharData) + cd.Data, cd.flags = text, flags + } + + default: + if text == "" { + // remove all chardata tokens starting from index i + copy(e.Child[i:], e.Child[end:]) + removed := end - i + e.Child = e.Child[:len(e.Child)-removed] + for j := i; j < len(e.Child); j++ { + e.Child[j].setIndex(j) + } + } else { + // replace the first chardata token at index i and remove all + // subsequent chardata tokens + cd := e.Child[i].(*CharData) + cd.Data, cd.flags = text, flags + copy(e.Child[i+1:], e.Child[end:]) + removed := end - (i + 1) + e.Child = e.Child[:len(e.Child)-removed] + for j := i + 1; j < len(e.Child); j++ { + e.Child[j].setIndex(j) + } + } + } +} + +// findTermCharDataIndex finds the index of the first child token that isn't +// a CharData token. It starts from the requested start index. +func (e *Element) findTermCharDataIndex(start int) int { + for i := start; i < len(e.Child); i++ { + if _, ok := e.Child[i].(*CharData); !ok { + return i + } + } + return len(e.Child) +} + +// CreateElement creates a new element with the specified tag (i.e., name) and +// adds it as the last child token of this element. The tag may include a +// prefix followed by a colon. 
+func (e *Element) CreateElement(tag string) *Element { + space, stag := spaceDecompose(tag) + return newElement(space, stag, e) +} + +// AddChild adds the token 't' as the last child of the element. If token 't' +// was already the child of another element, it is first removed from its +// parent element. +func (e *Element) AddChild(t Token) { + if t.Parent() != nil { + t.Parent().RemoveChild(t) + } + e.addChild(t) +} + +// InsertChild inserts the token 't' into this element's list of children just +// before the element's existing child token 'ex'. If the existing element +// 'ex' does not appear in this element's list of child tokens, then 't' is +// added to the end of this element's list of child tokens. If token 't' is +// already the child of another element, it is first removed from the other +// element's list of child tokens. +// +// Deprecated: InsertChild is deprecated. Use InsertChildAt instead. +func (e *Element) InsertChild(ex Token, t Token) { + if ex == nil || ex.Parent() != e { + e.AddChild(t) + return + } + + if t.Parent() != nil { + t.Parent().RemoveChild(t) + } + + t.setParent(e) + + i := ex.Index() + e.Child = append(e.Child, nil) + copy(e.Child[i+1:], e.Child[i:]) + e.Child[i] = t + + for j := i; j < len(e.Child); j++ { + e.Child[j].setIndex(j) + } +} + +// InsertChildAt inserts the token 't' into this element's list of child +// tokens just before the requested 'index'. If the index is greater than or +// equal to the length of the list of child tokens, then the token 't' is +// added to the end of the list of child tokens. 
+func (e *Element) InsertChildAt(index int, t Token) { + if index >= len(e.Child) { + e.AddChild(t) + return + } + + if t.Parent() != nil { + if t.Parent() == e && t.Index() > index { + index-- + } + t.Parent().RemoveChild(t) + } + + t.setParent(e) + + e.Child = append(e.Child, nil) + copy(e.Child[index+1:], e.Child[index:]) + e.Child[index] = t + + for j := index; j < len(e.Child); j++ { + e.Child[j].setIndex(j) + } +} + +// RemoveChild attempts to remove the token 't' from this element's list of +// child tokens. If the token 't' was a child of this element, then it is +// removed and returned. Otherwise, nil is returned. +func (e *Element) RemoveChild(t Token) Token { + if t.Parent() != e { + return nil + } + return e.RemoveChildAt(t.Index()) +} + +// RemoveChildAt removes the child token appearing in slot 'index' of this +// element's list of child tokens. The removed child token is then returned. +// If the index is out of bounds, no child is removed and nil is returned. +func (e *Element) RemoveChildAt(index int) Token { + if index >= len(e.Child) { + return nil + } + + t := e.Child[index] + for j := index + 1; j < len(e.Child); j++ { + e.Child[j].setIndex(j - 1) + } + e.Child = append(e.Child[:index], e.Child[index+1:]...) + t.setIndex(-1) + t.setParent(nil) + return t +} + +// ReadFrom reads XML from the reader ;ri' and stores the result as a new +// child of this element. 
+func (e *Element) readFrom(ri io.Reader, settings ReadSettings) (n int64, err error) { + r := newCountReader(ri) + dec := xml.NewDecoder(r) + dec.CharsetReader = settings.CharsetReader + dec.Strict = !settings.Permissive + dec.Entity = settings.Entity + var stack stack + stack.push(e) + for { + t, err := dec.RawToken() + switch { + case err == io.EOF: + return r.bytes, nil + case err != nil: + return r.bytes, err + case stack.empty(): + return r.bytes, ErrXML + } + + top := stack.peek().(*Element) + + switch t := t.(type) { + case xml.StartElement: + e := newElement(t.Name.Space, t.Name.Local, top) + for _, a := range t.Attr { + e.createAttr(a.Name.Space, a.Name.Local, a.Value, e) + } + stack.push(e) + case xml.EndElement: + stack.pop() + case xml.CharData: + data := string(t) + var flags charDataFlags + if isWhitespace(data) { + flags = whitespaceFlag + } + newCharData(data, flags, top) + case xml.Comment: + newComment(string(t), top) + case xml.Directive: + newDirective(string(t), top) + case xml.ProcInst: + newProcInst(t.Target, string(t.Inst), top) + } + } +} + +// SelectAttr finds an element attribute matching the requested 'key' and, if +// found, returns a pointer to the matching attribute. The function returns +// nil if no matching attribute is found. The key may include a namespace +// prefix followed by a colon. +func (e *Element) SelectAttr(key string) *Attr { + space, skey := spaceDecompose(key) + for i, a := range e.Attr { + if spaceMatch(space, a.Space) && skey == a.Key { + return &e.Attr[i] + } + } + return nil +} + +// SelectAttrValue finds an element attribute matching the requested 'key' and +// returns its value if found. If no matching attribute is found, the function +// returns the 'dflt' value instead. The key may include a namespace prefix +// followed by a colon. 
+func (e *Element) SelectAttrValue(key, dflt string) string { + space, skey := spaceDecompose(key) + for _, a := range e.Attr { + if spaceMatch(space, a.Space) && skey == a.Key { + return a.Value + } + } + return dflt +} + +// ChildElements returns all elements that are children of this element. +func (e *Element) ChildElements() []*Element { + var elements []*Element + for _, t := range e.Child { + if c, ok := t.(*Element); ok { + elements = append(elements, c) + } + } + return elements +} + +// SelectElement returns the first child element with the given 'tag' (i.e., +// name). The function returns nil if no child element matching the tag is +// found. The tag may include a namespace prefix followed by a colon. +func (e *Element) SelectElement(tag string) *Element { + space, stag := spaceDecompose(tag) + for _, t := range e.Child { + if c, ok := t.(*Element); ok && spaceMatch(space, c.Space) && stag == c.Tag { + return c + } + } + return nil +} + +// SelectElements returns a slice of all child elements with the given 'tag' +// (i.e., name). The tag may include a namespace prefix followed by a colon. +func (e *Element) SelectElements(tag string) []*Element { + space, stag := spaceDecompose(tag) + var elements []*Element + for _, t := range e.Child { + if c, ok := t.(*Element); ok && spaceMatch(space, c.Space) && stag == c.Tag { + elements = append(elements, c) + } + } + return elements +} + +// FindElement returns the first element matched by the XPath-like 'path' +// string. The function returns nil if no child element is found using the +// path. It panics if an invalid path string is supplied. +func (e *Element) FindElement(path string) *Element { + return e.FindElementPath(MustCompilePath(path)) +} + +// FindElementPath returns the first element matched by the 'path' object. The +// function returns nil if no element is found using the path. 
+func (e *Element) FindElementPath(path Path) *Element { + p := newPather() + elements := p.traverse(e, path) + if len(elements) > 0 { + return elements[0] + } + return nil +} + +// FindElements returns a slice of elements matched by the XPath-like 'path' +// string. The function returns nil if no child element is found using the +// path. It panics if an invalid path string is supplied. +func (e *Element) FindElements(path string) []*Element { + return e.FindElementsPath(MustCompilePath(path)) +} + +// FindElementsPath returns a slice of elements matched by the 'path' object. +func (e *Element) FindElementsPath(path Path) []*Element { + p := newPather() + return p.traverse(e, path) +} + +// GetPath returns the absolute path of the element. The absolute path is the +// full path from the document's root. +func (e *Element) GetPath() string { + path := []string{} + for seg := e; seg != nil; seg = seg.Parent() { + if seg.Tag != "" { + path = append(path, seg.Tag) + } + } + + // Reverse the path. + for i, j := 0, len(path)-1; i < j; i, j = i+1, j-1 { + path[i], path[j] = path[j], path[i] + } + + return "/" + strings.Join(path, "/") +} + +// GetRelativePath returns the path of this element relative to the 'source' +// element. If the two elements are not part of the same element tree, then +// the function returns the empty string. +func (e *Element) GetRelativePath(source *Element) string { + var path []*Element + + if source == nil { + return "" + } + + // Build a reverse path from the element toward the root. Stop if the + // source element is encountered. + var seg *Element + for seg = e; seg != nil && seg != source; seg = seg.Parent() { + path = append(path, seg) + } + + // If we found the source element, reverse the path and compose the + // string. + if seg == source { + if len(path) == 0 { + return "." 
+ } + parts := []string{} + for i := len(path) - 1; i >= 0; i-- { + parts = append(parts, path[i].Tag) + } + return "./" + strings.Join(parts, "/") + } + + // The source wasn't encountered, so climb from the source element toward + // the root of the tree until an element in the reversed path is + // encountered. + + findPathIndex := func(e *Element, path []*Element) int { + for i, ee := range path { + if e == ee { + return i + } + } + return -1 + } + + climb := 0 + for seg = source; seg != nil; seg = seg.Parent() { + i := findPathIndex(seg, path) + if i >= 0 { + path = path[:i] // truncate at found segment + break + } + climb++ + } + + // No element in the reversed path was encountered, so the two elements + // must not be part of the same tree. + if seg == nil { + return "" + } + + // Reverse the (possibly truncated) path and prepend ".." segments to + // climb. + parts := []string{} + for i := 0; i < climb; i++ { + parts = append(parts, "..") + } + for i := len(path) - 1; i >= 0; i-- { + parts = append(parts, path[i].Tag) + } + return strings.Join(parts, "/") +} + +// indent recursively inserts proper indentation between an XML element's +// child tokens. +func (e *Element) indent(depth int, indent indentFunc) { + e.stripIndent() + n := len(e.Child) + if n == 0 { + return + } + + oldChild := e.Child + e.Child = make([]Token, 0, n*2+1) + isCharData, firstNonCharData := false, true + for _, c := range oldChild { + // Insert NL+indent before child if it's not character data. + // Exceptions: when it's the first non-character-data child, or when + // the child is at root depth. + _, isCharData = c.(*CharData) + if !isCharData { + if !firstNonCharData || depth > 0 { + s := indent(depth) + if s != "" { + newCharData(s, whitespaceFlag, e) + } + } + firstNonCharData = false + } + + e.addChild(c) + + // Recursively process child elements. + if ce, ok := c.(*Element); ok { + ce.indent(depth+1, indent) + } + } + + // Insert NL+indent before the last child. 
+ if !isCharData { + if !firstNonCharData || depth > 0 { + s := indent(depth - 1) + if s != "" { + newCharData(s, whitespaceFlag, e) + } + } + } +} + +// stripIndent removes any previously inserted indentation. +func (e *Element) stripIndent() { + // Count the number of non-indent child tokens + n := len(e.Child) + for _, c := range e.Child { + if cd, ok := c.(*CharData); ok && cd.IsWhitespace() { + n-- + } + } + if n == len(e.Child) { + return + } + + // Strip out indent CharData + newChild := make([]Token, n) + j := 0 + for _, c := range e.Child { + if cd, ok := c.(*CharData); ok && cd.IsWhitespace() { + continue + } + newChild[j] = c + newChild[j].setIndex(j) + j++ + } + e.Child = newChild +} + +// dup duplicates the element. +func (e *Element) dup(parent *Element) Token { + ne := &Element{ + Space: e.Space, + Tag: e.Tag, + Attr: make([]Attr, len(e.Attr)), + Child: make([]Token, len(e.Child)), + parent: parent, + index: e.index, + } + for i, t := range e.Child { + ne.Child[i] = t.dup(ne) + } + copy(ne.Attr, e.Attr) + return ne +} + +// Parent returns this element's parent element. It returns nil if this +// element has no parent. +func (e *Element) Parent() *Element { + return e.parent +} + +// Index returns the index of this element within its parent element's +// list of child tokens. If this element has no parent, then the function +// returns -1. +func (e *Element) Index() int { + return e.index +} + +// setParent replaces this element token's parent. +func (e *Element) setParent(parent *Element) { + e.parent = parent +} + +// setIndex sets this element token's index within its parent's Child slice. +func (e *Element) setIndex(index int) { + e.index = index +} + +// writeTo serializes the element to the writer w. 
+func (e *Element) writeTo(w *bufio.Writer, s *WriteSettings) { + w.WriteByte('<') + w.WriteString(e.FullTag()) + for _, a := range e.Attr { + w.WriteByte(' ') + a.writeTo(w, s) + } + if len(e.Child) > 0 { + w.WriteByte('>') + for _, c := range e.Child { + c.writeTo(w, s) + } + w.Write([]byte{'<', '/'}) + w.WriteString(e.FullTag()) + w.WriteByte('>') + } else { + if s.CanonicalEndTags { + w.Write([]byte{'>', '<', '/'}) + w.WriteString(e.FullTag()) + w.WriteByte('>') + } else { + w.Write([]byte{'/', '>'}) + } + } +} + +// addChild adds a child token to the element e. +func (e *Element) addChild(t Token) { + t.setParent(e) + t.setIndex(len(e.Child)) + e.Child = append(e.Child, t) +} + +// CreateAttr creates an attribute with the specified 'key' and 'value' and +// adds it to this element. If an attribute with same key already exists on +// this element, then its value is replaced. The key may include a namespace +// prefix followed by a colon. +func (e *Element) CreateAttr(key, value string) *Attr { + space, skey := spaceDecompose(key) + return e.createAttr(space, skey, value, e) +} + +// createAttr is a helper function that creates attributes. +func (e *Element) createAttr(space, key, value string, parent *Element) *Attr { + for i, a := range e.Attr { + if space == a.Space && key == a.Key { + e.Attr[i].Value = value + return &e.Attr[i] + } + } + a := Attr{ + Space: space, + Key: key, + Value: value, + element: parent, + } + e.Attr = append(e.Attr, a) + return &e.Attr[len(e.Attr)-1] +} + +// RemoveAttr removes the first attribute of this element whose key matches +// 'key'. It returns a copy of the removed attribute if a match is found. If +// no match is found, it returns nil. The key may include a namespace prefix +// followed by a colon. +func (e *Element) RemoveAttr(key string) *Attr { + space, skey := spaceDecompose(key) + for i, a := range e.Attr { + if space == a.Space && skey == a.Key { + e.Attr = append(e.Attr[0:i], e.Attr[i+1:]...) 
+ return &Attr{ + Space: a.Space, + Key: a.Key, + Value: a.Value, + element: nil, + } + } + } + return nil +} + +// SortAttrs sorts this element's attributes lexicographically by key. +func (e *Element) SortAttrs() { + sort.Sort(byAttr(e.Attr)) +} + +type byAttr []Attr + +func (a byAttr) Len() int { + return len(a) +} + +func (a byAttr) Swap(i, j int) { + a[i], a[j] = a[j], a[i] +} + +func (a byAttr) Less(i, j int) bool { + sp := strings.Compare(a[i].Space, a[j].Space) + if sp == 0 { + return strings.Compare(a[i].Key, a[j].Key) < 0 + } + return sp < 0 +} + +// FullKey returns this attribute's complete key, including namespace prefix +// if present. +func (a *Attr) FullKey() string { + if a.Space == "" { + return a.Key + } + return a.Space + ":" + a.Key +} + +// Element returns a pointer to the element containing this attribute. +func (a *Attr) Element() *Element { + return a.element +} + +// NamespaceURI returns the XML namespace URI associated with this attribute. +// The function returns the empty string if the attribute is unprefixed or +// if the attribute is part of the XML default namespace. +func (a *Attr) NamespaceURI() string { + if a.Space == "" { + return "" + } + return a.element.findLocalNamespaceURI(a.Space) +} + +// writeTo serializes the attribute to the writer. +func (a *Attr) writeTo(w *bufio.Writer, s *WriteSettings) { + w.WriteString(a.FullKey()) + w.WriteString(`="`) + var m escapeMode + if s.CanonicalAttrVal { + m = escapeCanonicalAttr + } else { + m = escapeNormal + } + escapeString(w, a.Value, m) + w.WriteByte('"') +} + +// NewText creates an unparented CharData token containing simple text data. +func NewText(text string) *CharData { + return newCharData(text, 0, nil) +} + +// NewCData creates an unparented XML character CDATA section with 'data' as +// its content. +func NewCData(data string) *CharData { + return newCharData(data, cdataFlag, nil) +} + +// NewCharData creates an unparented CharData token containing simple text +// data. 
+// +// Deprecated: NewCharData is deprecated. Instead, use NewText, which does the +// same thing. +func NewCharData(data string) *CharData { + return newCharData(data, 0, nil) +} + +// newCharData creates a character data token and binds it to a parent +// element. If parent is nil, the CharData token remains unbound. +func newCharData(data string, flags charDataFlags, parent *Element) *CharData { + c := &CharData{ + Data: data, + parent: nil, + index: -1, + flags: flags, + } + if parent != nil { + parent.addChild(c) + } + return c +} + +// CreateText creates a CharData token simple text data and adds it to the +// end of this element's list of child tokens. +func (e *Element) CreateText(text string) *CharData { + return newCharData(text, 0, e) +} + +// CreateCData creates a CharData token containing a CDATA section with 'data' +// as its content and adds it to the end of this element's list of child +// tokens. +func (e *Element) CreateCData(data string) *CharData { + return newCharData(data, cdataFlag, e) +} + +// CreateCharData creates a CharData token simple text data and adds it to the +// end of this element's list of child tokens. +// +// Deprecated: CreateCharData is deprecated. Instead, use CreateText, which +// does the same thing. +func (e *Element) CreateCharData(data string) *CharData { + return newCharData(data, 0, e) +} + +// SetData modifies the content of the CharData token. In the case of a +// CharData token containing simple text, the simple text is modified. In the +// case of a CharData token containing a CDATA section, the CDATA section's +// content is modified. +func (c *CharData) SetData(text string) { + c.Data = text + if isWhitespace(text) { + c.flags |= whitespaceFlag + } else { + c.flags &= ^whitespaceFlag + } +} + +// IsCData returns true if this CharData token is contains a CDATA section. It +// returns false if the CharData token contains simple text. 
+func (c *CharData) IsCData() bool { + return (c.flags & cdataFlag) != 0 +} + +// IsWhitespace returns true if this CharData token contains only whitespace. +func (c *CharData) IsWhitespace() bool { + return (c.flags & whitespaceFlag) != 0 +} + +// Parent returns this CharData token's parent element, or nil if it has no +// parent. +func (c *CharData) Parent() *Element { + return c.parent +} + +// Index returns the index of this CharData token within its parent element's +// list of child tokens. If this CharData token has no parent, then the +// function returns -1. +func (c *CharData) Index() int { + return c.index +} + +// dup duplicates the character data. +func (c *CharData) dup(parent *Element) Token { + return &CharData{ + Data: c.Data, + flags: c.flags, + parent: parent, + index: c.index, + } +} + +// setParent replaces the character data token's parent. +func (c *CharData) setParent(parent *Element) { + c.parent = parent +} + +// setIndex sets the CharData token's index within its parent element's Child +// slice. +func (c *CharData) setIndex(index int) { + c.index = index +} + +// writeTo serializes character data to the writer. +func (c *CharData) writeTo(w *bufio.Writer, s *WriteSettings) { + if c.IsCData() { + w.WriteString(`<![CDATA[`) + w.WriteString(c.Data) + w.WriteString(`]]>`) + } else { + var m escapeMode + if s.CanonicalText { + m = escapeCanonicalText + } else { + m = escapeNormal + } + escapeString(w, c.Data, m) + } +} + +// NewComment creates an unparented comment token. +func NewComment(comment string) *Comment { + return newComment(comment, nil) +} + +// NewComment creates a comment token and sets its parent element to 'parent'. 
+func newComment(comment string, parent *Element) *Comment { + c := &Comment{ + Data: comment, + parent: nil, + index: -1, + } + if parent != nil { + parent.addChild(c) + } + return c +} + +// CreateComment creates a comment token using the specified 'comment' string +// and adds it as the last child token of this element. +func (e *Element) CreateComment(comment string) *Comment { + return newComment(comment, e) +} + +// dup duplicates the comment. +func (c *Comment) dup(parent *Element) Token { + return &Comment{ + Data: c.Data, + parent: parent, + index: c.index, + } +} + +// Parent returns comment token's parent element, or nil if it has no parent. +func (c *Comment) Parent() *Element { + return c.parent +} + +// Index returns the index of this Comment token within its parent element's +// list of child tokens. If this Comment token has no parent, then the +// function returns -1. +func (c *Comment) Index() int { + return c.index +} + +// setParent replaces the comment token's parent. +func (c *Comment) setParent(parent *Element) { + c.parent = parent +} + +// setIndex sets the Comment token's index within its parent element's Child +// slice. +func (c *Comment) setIndex(index int) { + c.index = index +} + +// writeTo serialies the comment to the writer. +func (c *Comment) writeTo(w *bufio.Writer, s *WriteSettings) { + w.WriteString("<!--") + w.WriteString(c.Data) + w.WriteString("-->") +} + +// NewDirective creates an unparented XML directive token. +func NewDirective(data string) *Directive { + return newDirective(data, nil) +} + +// newDirective creates an XML directive and binds it to a parent element. If +// parent is nil, the Directive remains unbound. 
+func newDirective(data string, parent *Element) *Directive { + d := &Directive{ + Data: data, + parent: nil, + index: -1, + } + if parent != nil { + parent.addChild(d) + } + return d +} + +// CreateDirective creates an XML directive token with the specified 'data' +// value and adds it as the last child token of this element. +func (e *Element) CreateDirective(data string) *Directive { + return newDirective(data, e) +} + +// dup duplicates the directive. +func (d *Directive) dup(parent *Element) Token { + return &Directive{ + Data: d.Data, + parent: parent, + index: d.index, + } +} + +// Parent returns directive token's parent element, or nil if it has no +// parent. +func (d *Directive) Parent() *Element { + return d.parent +} + +// Index returns the index of this Directive token within its parent element's +// list of child tokens. If this Directive token has no parent, then the +// function returns -1. +func (d *Directive) Index() int { + return d.index +} + +// setParent replaces the directive token's parent. +func (d *Directive) setParent(parent *Element) { + d.parent = parent +} + +// setIndex sets the Directive token's index within its parent element's Child +// slice. +func (d *Directive) setIndex(index int) { + d.index = index +} + +// writeTo serializes the XML directive to the writer. +func (d *Directive) writeTo(w *bufio.Writer, s *WriteSettings) { + w.WriteString("<!") + w.WriteString(d.Data) + w.WriteString(">") +} + +// NewProcInst creates an unparented XML processing instruction. +func NewProcInst(target, inst string) *ProcInst { + return newProcInst(target, inst, nil) +} + +// newProcInst creates an XML processing instruction and binds it to a parent +// element. If parent is nil, the ProcInst remains unbound. 
+func newProcInst(target, inst string, parent *Element) *ProcInst { + p := &ProcInst{ + Target: target, + Inst: inst, + parent: nil, + index: -1, + } + if parent != nil { + parent.addChild(p) + } + return p +} + +// CreateProcInst creates an XML processing instruction token with the +// sepcified 'target' and instruction 'inst'. It is then added as the last +// child token of this element. +func (e *Element) CreateProcInst(target, inst string) *ProcInst { + return newProcInst(target, inst, e) +} + +// dup duplicates the procinst. +func (p *ProcInst) dup(parent *Element) Token { + return &ProcInst{ + Target: p.Target, + Inst: p.Inst, + parent: parent, + index: p.index, + } +} + +// Parent returns processing instruction token's parent element, or nil if it +// has no parent. +func (p *ProcInst) Parent() *Element { + return p.parent +} + +// Index returns the index of this ProcInst token within its parent element's +// list of child tokens. If this ProcInst token has no parent, then the +// function returns -1. +func (p *ProcInst) Index() int { + return p.index +} + +// setParent replaces the processing instruction token's parent. +func (p *ProcInst) setParent(parent *Element) { + p.parent = parent +} + +// setIndex sets the processing instruction token's index within its parent +// element's Child slice. +func (p *ProcInst) setIndex(index int) { + p.index = index +} + +// writeTo serializes the processing instruction to the writer. +func (p *ProcInst) writeTo(w *bufio.Writer, s *WriteSettings) { + w.WriteString("<?") + w.WriteString(p.Target) + if p.Inst != "" { + w.WriteByte(' ') + w.WriteString(p.Inst) + } + w.WriteString("?>") +} diff --git a/etree_test.go b/etree_test.go new file mode 100644 index 0000000..501f4ad --- /dev/null +++ b/etree_test.go @@ -0,0 +1,1115 @@ +// Copyright 2015-2019 Brett Vickers. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package etree + +import ( + "encoding/xml" + "io" + "strings" + "testing" +) + +func newDocumentFromString(t *testing.T, s string) *Document { + t.Helper() + doc := NewDocument() + err := doc.ReadFromString(s) + if err != nil { + t.Error("etree: failed to parse document") + } + return doc +} + +func checkStrEq(t *testing.T, got, want string) { + t.Helper() + if got != want { + t.Errorf("etree: unexpected result.\nGot:\n%s\nWanted:\n%s\n", got, want) + } +} + +func checkStrBinaryEq(t *testing.T, got, want string) { + t.Helper() + if got != want { + t.Errorf("etree: unexpected result.\nGot:\n%v\nWanted:\n%v\n", []byte(got), []byte(want)) + } +} + +func checkIntEq(t *testing.T, got, want int) { + t.Helper() + if got != want { + t.Errorf("etree: unexpected integer. Got: %d. Wanted: %d\n", got, want) + } +} + +func checkBoolEq(t *testing.T, got, want bool) { + t.Helper() + if got != want { + t.Errorf("etree: unexpected boolean. Got: %v. Wanted: %v\n", got, want) + } +} + +func checkElementEq(t *testing.T, got, want *Element) { + t.Helper() + if got != want { + t.Errorf("etree: unexpected element. Got: %v. Wanted: %v.\n", got, want) + } +} + +func checkDocEq(t *testing.T, doc *Document, expected string) { + t.Helper() + doc.Indent(NoIndent) + s, err := doc.WriteToString() + if err != nil { + t.Error("etree: failed to serialize document") + } + if s != expected { + t.Errorf("etree: unexpected document.\nGot:\n%s\nWanted:\n%s\n", s, expected) + } +} + +func checkIndexes(t *testing.T, e *Element) { + t.Helper() + for i := 0; i < len(e.Child); i++ { + c := e.Child[i] + if c.Index() != i { + t.Errorf("Child index mismatch. 
Got %d, expected %d.", c.Index(), i) + } + if ce, ok := c.(*Element); ok { + checkIndexes(t, ce) + } + } +} + +func TestDocument(t *testing.T) { + // Create a document + doc := NewDocument() + doc.CreateProcInst("xml", `version="1.0" encoding="UTF-8"`) + doc.CreateProcInst("xml-stylesheet", `type="text/xsl" href="style.xsl"`) + store := doc.CreateElement("store") + store.CreateAttr("xmlns:t", "urn:books-com:titles") + store.CreateDirective("Directive") + store.CreateComment("This is a comment") + book := store.CreateElement("book") + book.CreateAttr("lang", "fr") + book.CreateAttr("lang", "en") + title := book.CreateElement("t:title") + title.SetText("Nicholas Nickleby") + title.SetText("Great Expectations") + author := book.CreateElement("author") + author.CreateCharData("Charles Dickens") + review := book.CreateElement("review") + review.CreateCData("<<< Will be replaced") + review.SetCData(">>> Excellent book") + doc.IndentTabs() + + checkIndexes(t, &doc.Element) + + // Serialize the document to a string + s, err := doc.WriteToString() + if err != nil { + t.Error("etree: failed to serialize document") + } + + // Make sure the serialized XML matches expectation. 
+ expected := `<?xml version="1.0" encoding="UTF-8"?> +<?xml-stylesheet type="text/xsl" href="style.xsl"?> +<store xmlns:t="urn:books-com:titles"> + <!Directive> + <!--This is a comment--> + <book lang="en"> + <t:title>Great Expectations</t:title> + <author>Charles Dickens</author> + <review><![CDATA[>>> Excellent book]]></review> + </book> +</store> +` + checkStrEq(t, s, expected) + + // Test the structure of the XML + if doc.Root() != store { + t.Error("etree: root mismatch") + } + if len(store.ChildElements()) != 1 || len(store.Child) != 7 { + t.Error("etree: incorrect tree structure") + } + if len(book.ChildElements()) != 3 || len(book.Attr) != 1 || len(book.Child) != 7 { + t.Error("etree: incorrect tree structure") + } + if len(title.ChildElements()) != 0 || len(title.Child) != 1 || len(title.Attr) != 0 { + t.Error("etree: incorrect tree structure") + } + if len(author.ChildElements()) != 0 || len(author.Child) != 1 || len(author.Attr) != 0 { + t.Error("etree: incorrect tree structure") + } + if len(review.ChildElements()) != 0 || len(review.Child) != 1 || len(review.Attr) != 0 { + t.Error("etree: incorrect tree structure") + } + if book.parent != store || store.parent != &doc.Element || doc.parent != nil { + t.Error("etree: incorrect tree structure") + } + if title.parent != book || author.parent != book { + t.Error("etree: incorrect tree structure") + } + + // Perform some basic queries on the document + elements := doc.SelectElements("store") + if len(elements) != 1 || elements[0] != store { + t.Error("etree: incorrect SelectElements result") + } + element := doc.SelectElement("store") + if element != store { + t.Error("etree: incorrect SelectElement result") + } + elements = store.SelectElements("book") + if len(elements) != 1 || elements[0] != book { + t.Error("etree: incorrect SelectElements result") + } + element = store.SelectElement("book") + if element != book { + t.Error("etree: incorrect SelectElement result") + } + attr := book.SelectAttr("lang") 
+ if attr == nil || attr.Key != "lang" || attr.Value != "en" { + t.Error("etree: incorrect SelectAttr result") + } + if book.SelectAttrValue("lang", "unknown") != "en" { + t.Error("etree: incorrect SelectAttrValue result") + } + if book.SelectAttrValue("t:missing", "unknown") != "unknown" { + t.Error("etree: incorrect SelectAttrValue result") + } + attr = book.RemoveAttr("lang") + if attr.Value != "en" { + t.Error("etree: incorrect RemoveAttr result") + } + book.CreateAttr("lang", "de") + attr = book.RemoveAttr("lang") + if attr.Value != "de" { + t.Error("etree: incorrect RemoveAttr result") + } + element = book.SelectElement("t:title") + if element != title || element.Text() != "Great Expectations" || len(element.Attr) != 0 { + t.Error("etree: incorrect SelectElement result") + } + element = book.SelectElement("title") + if element != title { + t.Error("etree: incorrect SelectElement result") + } + element = book.SelectElement("p:title") + if element != nil { + t.Error("etree: incorrect SelectElement result") + } + element = book.RemoveChildAt(title.Index()).(*Element) + if element != title { + t.Error("etree: incorrect RemoveElement result") + } + element = book.SelectElement("title") + if element != nil { + t.Error("etree: incorrect SelectElement result") + } + element = book.SelectElement("review") + if element != review || element.Text() != ">>> Excellent book" || len(element.Attr) != 0 { + t.Error("etree: incorrect SelectElement result") + } +} + +func TestDocumentReadNonUTF8Encodings(t *testing.T) { + s := `<?xml version="1.0" encoding="ISO-8859-1"?> + <store> + <book lang="en"> + <title>Great Expectations</title> + <author>Charles Dickens</author> + </book> +</store>` + + doc := NewDocument() + doc.ReadSettings.CharsetReader = func(label string, input io.Reader) (io.Reader, error) { + return input, nil + } + err := doc.ReadFromString(s) + if err != nil { + t.Fatal("etree: incorrect ReadFromString result") + } +} + +func TestDocumentReadPermissive(t 
*testing.T) { + s := "<select disabled></select>" + + doc := NewDocument() + err := doc.ReadFromString(s) + if err == nil { + t.Fatal("etree: incorrect ReadFromString result") + } + + doc.ReadSettings.Permissive = true + err = doc.ReadFromString(s) + if err != nil { + t.Fatal("etree: incorrect ReadFromString result") + } +} + +func TestDocumentReadHTMLEntities(t *testing.T) { + s := `<store> + <book lang="en"> + <title>→ Great Expectations</title> + <author>Charles Dickens</author> + </book> +</store>` + + doc := NewDocument() + err := doc.ReadFromString(s) + if err == nil { + t.Fatal("etree: incorrect ReadFromString result") + } + + doc.ReadSettings.Entity = xml.HTMLEntity + err = doc.ReadFromString(s) + if err != nil { + t.Fatal("etree: incorrect ReadFromString result") + } +} + +func TestEscapeCodes(t *testing.T) { + cases := []struct { + input string + normal string + attrCanonical string + textCanonical string + }{ + { + "&<>'\"\t\n\r", + "<e a=\"&<>'"\t\n\r\">&<>'"\t\n\r</e>", + "<e a=\"&<>'"	

\">&<>'"\t\n\r</e>", + "<e a=\"&<>'"\t\n\r\">&<>'\"\t\n
</e>", + }, + { + "\x00\x1f\x08\x09\x0a\x0d", + "<e a=\"���\t\n\r\">���\t\n\r</e>", + "<e a=\"���	

\">���\t\n\r</e>", + "<e a=\"���\t\n\r\">���\t\n
</e>", + }, + } + for _, c := range cases { + doc := NewDocument() + + e := doc.CreateElement("e") + e.SetText(c.input) + e.CreateAttr("a", c.input) + + doc.WriteSettings.CanonicalText = false + doc.WriteSettings.CanonicalAttrVal = false + s, err := doc.WriteToString() + if err != nil { + t.Error("etree: Escape test produced inocrrect result.") + } + checkStrEq(t, s, c.normal) + + doc.WriteSettings.CanonicalText = false + doc.WriteSettings.CanonicalAttrVal = true + s, err = doc.WriteToString() + if err != nil { + t.Error("etree: Escape test produced inocrrect result.") + } + checkStrEq(t, s, c.attrCanonical) + + doc.WriteSettings.CanonicalText = true + doc.WriteSettings.CanonicalAttrVal = false + s, err = doc.WriteToString() + if err != nil { + t.Error("etree: Escape test produced inocrrect result.") + } + checkStrEq(t, s, c.textCanonical) + } +} + +func TestCanonical(t *testing.T) { + BOM := "\xef\xbb\xbf" + + doc := NewDocument() + doc.WriteSettings.CanonicalEndTags = true + doc.WriteSettings.CanonicalText = true + doc.WriteSettings.CanonicalAttrVal = true + doc.CreateCharData(BOM) + doc.CreateProcInst("xml-stylesheet", `type="text/xsl" href="style.xsl"`) + + people := doc.CreateElement("People") + people.CreateComment("These are all known people") + + jon := people.CreateElement("Person") + jon.CreateAttr("name", "Jon O'Reilly") + jon.SetText("\r<'\">&\u0004\u0005\u001f�") + + sally := people.CreateElement("Person") + sally.CreateAttr("name", "Sally") + sally.CreateAttr("escape", "\r\n\t<'\">&") + + doc.Indent(2) + s, err := doc.WriteToString() + if err != nil { + t.Error("etree: WriteSettings WriteTo produced incorrect result.") + } + + expected := BOM + `<?xml-stylesheet type="text/xsl" href="style.xsl"?> +<People> + <!--These are all known people--> + <Person name="Jon O'Reilly">
<'">&����</Person> + <Person name="Sally" escape="
	<'">&"></Person> +</People> +` + checkStrEq(t, s, expected) +} + +func TestCopy(t *testing.T) { + s := `<store> + <book lang="en"> + <title>Great Expectations</title> + <author>Charles Dickens</author> + </book> +</store>` + + doc := newDocumentFromString(t, s) + + s1, err := doc.WriteToString() + if err != nil { + t.Error("etree: incorrect WriteToString result") + } + + doc2 := doc.Copy() + checkIndexes(t, &doc2.Element) + s2, err := doc2.WriteToString() + if err != nil { + t.Error("etree: incorrect Copy result") + } + + if s1 != s2 { + t.Error("etree: mismatched Copy result") + t.Error("wanted:\n" + s1) + t.Error("got:\n" + s2) + } + + e1 := doc.FindElement("./store/book/title") + e2 := doc2.FindElement("./store/book/title") + if e1 == nil || e2 == nil { + t.Error("etree: incorrect FindElement result") + } + if e1 == e2 { + t.Error("etree: incorrect FindElement result") + } + + e1.parent.RemoveChildAt(e1.Index()) + s1, _ = doc.WriteToString() + s2, _ = doc2.WriteToString() + if s1 == s2 { + t.Error("etree: incorrect result after RemoveElement") + } +} + +func TestGetPath(t *testing.T) { + s := `<a> + <b1> + <c1> + <d1/> + <d1a/> + </c1> + </b1> + <b2> + <c2> + <d2/> + </c2> + </b2> +</a>` + + doc := newDocumentFromString(t, s) + + cases := []struct { + from string + to string + relpath string + topath string + }{ + {"a", ".", "..", "/"}, + {".", "a", "./a", "/a"}, + {"a/b1/c1/d1", ".", "../../../..", "/"}, + {".", "a/b1/c1/d1", "./a/b1/c1/d1", "/a/b1/c1/d1"}, + {"a", "a", ".", "/a"}, + {"a/b1", "a/b1/c1", "./c1", "/a/b1/c1"}, + {"a/b1/c1", "a/b1", "..", "/a/b1"}, + {"a/b1/c1", "a/b1/c1", ".", "/a/b1/c1"}, + {"a", "a/b1", "./b1", "/a/b1"}, + {"a/b1", "a", "..", "/a"}, + {"a", "a/b1/c1", "./b1/c1", "/a/b1/c1"}, + {"a/b1/c1", "a", "../..", "/a"}, + {"a/b1/c1/d1", "a", "../../..", "/a"}, + {"a", "a/b1/c1/d1", "./b1/c1/d1", "/a/b1/c1/d1"}, + {"a/b1", "a/b2", "../b2", "/a/b2"}, + {"a/b2", "a/b1", "../b1", "/a/b1"}, + {"a/b1/c1/d1", "a/b2/c2/d2", "../../../b2/c2/d2", 
"/a/b2/c2/d2"}, + {"a/b2/c2/d2", "a/b1/c1/d1", "../../../b1/c1/d1", "/a/b1/c1/d1"}, + {"a/b1/c1/d1", "a/b1/c1/d1a", "../d1a", "/a/b1/c1/d1a"}, + } + + for _, c := range cases { + fe := doc.FindElement(c.from) + te := doc.FindElement(c.to) + + rp := te.GetRelativePath(fe) + if rp != c.relpath { + t.Errorf("GetRelativePath from '%s' to '%s'. Expected '%s', got '%s'.\n", c.from, c.to, c.relpath, rp) + } + + p := te.GetPath() + if p != c.topath { + t.Errorf("GetPath for '%s'. Expected '%s', got '%s'.\n", c.to, c.topath, p) + } + } +} + +func TestInsertChild(t *testing.T) { + s := `<book lang="en"> + <t:title>Great Expectations</t:title> + <author>Charles Dickens</author> +</book> +` + + doc := newDocumentFromString(t, s) + + year := NewElement("year") + year.SetText("1861") + + book := doc.FindElement("//book") + book.InsertChildAt(book.SelectElement("t:title").Index(), year) + + expected1 := `<book lang="en"> + <year>1861</year> + <t:title>Great Expectations</t:title> + <author>Charles Dickens</author> +</book> +` + doc.Indent(2) + s1, _ := doc.WriteToString() + checkStrEq(t, s1, expected1) + + book.RemoveChildAt(year.Index()) + book.InsertChildAt(book.SelectElement("author").Index(), year) + + expected2 := `<book lang="en"> + <t:title>Great Expectations</t:title> + <year>1861</year> + <author>Charles Dickens</author> +</book> +` + doc.Indent(2) + s2, _ := doc.WriteToString() + checkStrEq(t, s2, expected2) + + book.RemoveChildAt(year.Index()) + book.InsertChildAt(len(book.Child), year) + + expected3 := `<book lang="en"> + <t:title>Great Expectations</t:title> + <author>Charles Dickens</author> + <year>1861</year> +</book> +` + doc.Indent(2) + s3, _ := doc.WriteToString() + checkStrEq(t, s3, expected3) + + book.RemoveChildAt(year.Index()) + book.InsertChildAt(999, year) + + expected4 := `<book lang="en"> + <t:title>Great Expectations</t:title> + <author>Charles Dickens</author> + <year>1861</year> +</book> +` + doc.Indent(2) + s4, _ := doc.WriteToString() + 
checkStrEq(t, s4, expected4) +} + +func TestCdata(t *testing.T) { + var tests = []struct { + in, out string + }{ + {`<tag>1234567</tag>`, "1234567"}, + {`<tag><![CDATA[1234567]]></tag>`, "1234567"}, + {`<tag>1<![CDATA[2]]>3<![CDATA[4]]>5<![CDATA[6]]>7</tag>`, "1234567"}, + {`<tag>1<![CDATA[2]]>3<inner>4</inner>5<![CDATA[6]]>7</tag>`, "123"}, + {`<tag>1<inner>4</inner>5<![CDATA[6]]>7</tag>`, "1"}, + {`<tag><![CDATA[1]]><inner>4</inner>5<![CDATA[6]]>7</tag>`, "1"}, + } + + for _, test := range tests { + doc := NewDocument() + err := doc.ReadFromString(test.in) + if err != nil { + t.Fatal("etree ReadFromString: " + err.Error()) + } + + tag := doc.FindElement("tag") + if tag.Text() != test.out { + t.Fatalf("etree invalid cdata. Expected: %v. Got: %v\n", test.out, tag.Text()) + } + } +} + +func TestAddChild(t *testing.T) { + s := `<book lang="en"> + <t:title>Great Expectations</t:title> + <author>Charles Dickens</author> +</book> +` + doc1 := newDocumentFromString(t, s) + + doc2 := NewDocument() + root := doc2.CreateElement("root") + + for _, e := range doc1.FindElements("//book/*") { + root.AddChild(e) + } + + expected1 := `<book lang="en"/> +` + doc1.Indent(2) + s1, _ := doc1.WriteToString() + checkStrEq(t, s1, expected1) + + expected2 := `<root> + <t:title>Great Expectations</t:title> + <author>Charles Dickens</author> +</root> +` + doc2.Indent(2) + s2, _ := doc2.WriteToString() + checkStrEq(t, s2, expected2) +} + +func TestSetRoot(t *testing.T) { + s := `<?test a="wow"?> +<book> + <title>Great Expectations</title> + <author>Charles Dickens</author> +</book> +` + doc := newDocumentFromString(t, s) + + origroot := doc.Root() + if origroot.Parent() != &doc.Element { + t.Error("Root incorrect") + } + + newroot := NewElement("root") + doc.SetRoot(newroot) + + if doc.Root() != newroot { + t.Error("doc.Root() != newroot") + } + if origroot.Parent() != nil { + t.Error("origroot.Parent() != nil") + } + + expected1 := `<?test a="wow"?> +<root/> +` + doc.Indent(2) + s1, _ := 
doc.WriteToString() + checkStrEq(t, s1, expected1) + + doc.SetRoot(origroot) + doc.Indent(2) + expected2 := s + s2, _ := doc.WriteToString() + checkStrEq(t, s2, expected2) + + doc2 := NewDocument() + doc2.CreateProcInst("test", `a="wow"`) + doc2.SetRoot(NewElement("root")) + doc2.Indent(2) + expected3 := expected1 + s3, _ := doc2.WriteToString() + checkStrEq(t, s3, expected3) + + doc2.SetRoot(doc.Root()) + doc2.Indent(2) + expected4 := s + s4, _ := doc2.WriteToString() + checkStrEq(t, s4, expected4) + + expected5 := `<?test a="wow"?> +` + doc.Indent(2) + s5, _ := doc.WriteToString() + checkStrEq(t, s5, expected5) +} + +func TestSortAttrs(t *testing.T) { + s := `<el foo='5' Foo='2' aaa='4' สวัสดี='7' AAA='1' a01='3' z='6' a:ZZZ='9' a:AAA='8'/>` + doc := newDocumentFromString(t, s) + doc.Root().SortAttrs() + doc.Indent(2) + out, _ := doc.WriteToString() + checkStrEq(t, out, `<el AAA="1" Foo="2" a01="3" aaa="4" foo="5" z="6" สวัสดี="7" a:AAA="8" a:ZZZ="9"/>`+"\n") +} + +func TestCharsetReaderEncoding(t *testing.T) { + cases := []string{ + `<?xml version="1.0" encoding="ISO-8859-1"?><foo></foo>`, + `<?xml version="1.0" encoding="UTF-8"?><foo></foo>`, + `<?xml version="1.0" encoding="US-ASCII"?><foo></foo>`, + } + + for _, c := range cases { + doc := NewDocument() + if err := doc.ReadFromBytes([]byte(c)); err != nil { + t.Error(err) + } + } +} + +func TestCharData(t *testing.T) { + doc := NewDocument() + root := doc.CreateElement("root") + root.CreateCharData("This ") + root.CreateCData("is ") + e1 := NewText("a ") + e2 := NewCData("text ") + root.AddChild(e1) + root.AddChild(e2) + root.CreateCharData("Element!!") + + s, err := doc.WriteToString() + if err != nil { + t.Error("etree: failed to serialize document") + } + + checkStrEq(t, s, `<root>This <![CDATA[is ]]>a <![CDATA[text ]]>Element!!</root>`) + + // Check we can parse the output + err = doc.ReadFromString(s) + if err != nil { + t.Fatal("etree: incorrect ReadFromString result") + } + if doc.Root().Text() != 
"This is a text Element!!" { + t.Error("etree: invalid text") + } +} + +func TestIndentSettings(t *testing.T) { + doc := NewDocument() + root := doc.CreateElement("root") + ch1 := root.CreateElement("child1") + ch1.CreateElement("child2") + + // First test with NoIndent. + doc.Indent(NoIndent) + s, err := doc.WriteToString() + if err != nil { + t.Error("etree: failed to serialize document") + } + expected := "<root><child1><child2/></child1></root>" + checkStrEq(t, s, expected) + + // Run all indent test cases. + tests := []struct { + useTabs, useCRLF bool + ws, nl string + }{ + {false, false, " ", "\n"}, + {false, true, " ", "\r\n"}, + {true, false, "\t", "\n"}, + {true, true, "\t", "\r\n"}, + } + + for _, test := range tests { + doc.WriteSettings.UseCRLF = test.useCRLF + if test.useTabs { + doc.IndentTabs() + s, err := doc.WriteToString() + if err != nil { + t.Error("etree: failed to serialize document") + } + tab := test.ws + expected := "<root>" + test.nl + tab + "<child1>" + test.nl + + tab + tab + "<child2/>" + test.nl + tab + + "</child1>" + test.nl + "</root>" + test.nl + checkStrEq(t, s, expected) + } else { + for i := 0; i < 256; i++ { + doc.Indent(i) + s, err := doc.WriteToString() + if err != nil { + t.Error("etree: failed to serialize document") + } + tab := strings.Repeat(test.ws, i) + expected := "<root>" + test.nl + tab + "<child1>" + test.nl + + tab + tab + "<child2/>" + test.nl + tab + + "</child1>" + test.nl + "</root>" + test.nl + checkStrEq(t, s, expected) + } + } + } +} + +func TestTokenIndexing(t *testing.T) { + s := `<?xml version="1.0" encoding="UTF-8"?> +<?xml-stylesheet type="text/xsl" href="style.xsl"?> +<store xmlns:t="urn:books-com:titles"> + <!Directive> + <!--This is a comment--> + <book lang="en"> + <t:title>Great Expectations</t:title> + <author>Charles Dickens</author> + <review/> + </book> +</store>` + + doc := newDocumentFromString(t, s) + review := doc.FindElement("/store/book/review") + review.SetText("Excellent") + + 
checkIndexes(t, &doc.Element) + + doc.Indent(4) + checkIndexes(t, &doc.Element) + + doc.Indent(NoIndent) + checkIndexes(t, &doc.Element) + + e := NewElement("foo") + store := doc.SelectElement("store") + store.InsertChildAt(0, e) + checkIndexes(t, &doc.Element) + + store.RemoveChildAt(0) + checkIndexes(t, &doc.Element) +} + +func TestSetText(t *testing.T) { + doc := NewDocument() + root := doc.CreateElement("root") + + checkDocEq(t, doc, `<root/>`) + checkStrEq(t, root.Text(), "") + checkIntEq(t, len(root.Child), 0) + + root.SetText("foo") + checkDocEq(t, doc, `<root>foo</root>`) + checkStrEq(t, root.Text(), "foo") + checkIntEq(t, len(root.Child), 1) + + root.SetText("bar") + checkDocEq(t, doc, `<root>bar</root>`) + checkStrEq(t, root.Text(), "bar") + checkIntEq(t, len(root.Child), 1) + + root.CreateCData("cdata") + checkDocEq(t, doc, `<root>bar<![CDATA[cdata]]></root>`) + checkStrEq(t, root.Text(), "barcdata") + checkIntEq(t, len(root.Child), 2) + + root.SetText("qux") + checkDocEq(t, doc, `<root>qux</root>`) + checkStrEq(t, root.Text(), "qux") + checkIntEq(t, len(root.Child), 1) + + root.CreateCData("cdata") + checkDocEq(t, doc, `<root>qux<![CDATA[cdata]]></root>`) + checkStrEq(t, root.Text(), "quxcdata") + checkIntEq(t, len(root.Child), 2) + + root.SetCData("baz") + checkDocEq(t, doc, `<root><![CDATA[baz]]></root>`) + checkStrEq(t, root.Text(), "baz") + checkIntEq(t, len(root.Child), 1) + + root.CreateText("corge") + root.CreateCData("grault") + root.CreateText("waldo") + root.CreateCData("fred") + root.CreateElement("child") + checkDocEq(t, doc, `<root><![CDATA[baz]]>corge<![CDATA[grault]]>waldo<![CDATA[fred]]><child/></root>`) + checkStrEq(t, root.Text(), "bazcorgegraultwaldofred") + checkIntEq(t, len(root.Child), 6) + + root.SetText("plugh") + checkDocEq(t, doc, `<root>plugh<child/></root>`) + checkStrEq(t, root.Text(), "plugh") + checkIntEq(t, len(root.Child), 2) + + root.SetText("") + checkDocEq(t, doc, `<root><child/></root>`) + checkStrEq(t, root.Text(), 
"") + checkIntEq(t, len(root.Child), 1) + + root.SetText("") + checkDocEq(t, doc, `<root><child/></root>`) + checkStrEq(t, root.Text(), "") + checkIntEq(t, len(root.Child), 1) + + root.RemoveChildAt(0) + root.CreateText("corge") + root.CreateCData("grault") + root.CreateText("waldo") + root.CreateCData("fred") + root.CreateElement("child") + checkDocEq(t, doc, `<root>corge<![CDATA[grault]]>waldo<![CDATA[fred]]><child/></root>`) + checkStrEq(t, root.Text(), "corgegraultwaldofred") + checkIntEq(t, len(root.Child), 5) + + root.SetText("") + checkDocEq(t, doc, `<root><child/></root>`) + checkStrEq(t, root.Text(), "") + checkIntEq(t, len(root.Child), 1) +} + +func TestSetTail(t *testing.T) { + doc := NewDocument() + root := doc.CreateElement("root") + child := root.CreateElement("child") + root.CreateText("\n\t") + child.SetText("foo") + checkDocEq(t, doc, "<root><child>foo</child>\n\t</root>") + checkStrEq(t, child.Tail(), "\n\t") + checkIntEq(t, len(root.Child), 2) + checkIntEq(t, len(child.Child), 1) + + root.CreateCData(" ") + checkDocEq(t, doc, "<root><child>foo</child>\n\t<![CDATA[ ]]></root>") + checkStrEq(t, child.Tail(), "\n\t ") + checkIntEq(t, len(root.Child), 3) + checkIntEq(t, len(child.Child), 1) + + child.SetTail("") + checkDocEq(t, doc, "<root><child>foo</child></root>") + checkStrEq(t, child.Tail(), "") + checkIntEq(t, len(root.Child), 1) + checkIntEq(t, len(child.Child), 1) + + child.SetTail("\t\t\t") + checkDocEq(t, doc, "<root><child>foo</child>\t\t\t</root>") + checkStrEq(t, child.Tail(), "\t\t\t") + checkIntEq(t, len(root.Child), 2) + checkIntEq(t, len(child.Child), 1) + + child.SetTail("\t\n\n\t") + checkDocEq(t, doc, "<root><child>foo</child>\t\n\n\t</root>") + checkStrEq(t, child.Tail(), "\t\n\n\t") + checkIntEq(t, len(root.Child), 2) + checkIntEq(t, len(child.Child), 1) + + child.SetTail("") + checkDocEq(t, doc, "<root><child>foo</child></root>") + checkStrEq(t, child.Tail(), "") + checkIntEq(t, len(root.Child), 1) + checkIntEq(t, 
len(child.Child), 1) +} + +func TestAttrParent(t *testing.T) { + doc := NewDocument() + root := doc.CreateElement("root") + attr1 := root.CreateAttr("bar", "1") + attr2 := root.CreateAttr("qux", "2") + + checkIntEq(t, len(root.Attr), 2) + checkElementEq(t, attr1.Element(), root) + checkElementEq(t, attr2.Element(), root) + + attr1 = root.RemoveAttr("bar") + attr2 = root.RemoveAttr("qux") + checkElementEq(t, attr1.Element(), nil) + checkElementEq(t, attr2.Element(), nil) + + s := `<root a="1" b="2" c="3" d="4"/>` + err := doc.ReadFromString(s) + if err != nil { + t.Error("etree: failed to parse document") + } + + root = doc.SelectElement("root") + for i := range root.Attr { + checkElementEq(t, root.Attr[i].Element(), root) + } +} + +func TestDefaultNamespaceURI(t *testing.T) { + s := ` +<root xmlns="https://root.example.com" xmlns:attrib="https://attrib.example.com" attrib:a="foo" b="bar"> + <child1 xmlns="https://child.example.com" attrib:a="foo"> + <grandchild1 xmlns="https://grandchild.example.com" a="foo"> + </grandchild1> + <grandchild2 a="foo"> + <greatgrandchild1 attrib:a="foo"/> + </grandchild2> + </child1> + <child2 a="foo"/> +</root>` + + doc := newDocumentFromString(t, s) + root := doc.SelectElement("root") + child1 := root.SelectElement("child1") + child2 := root.SelectElement("child2") + grandchild1 := child1.SelectElement("grandchild1") + grandchild2 := child1.SelectElement("grandchild2") + greatgrandchild1 := grandchild2.SelectElement("greatgrandchild1") + + checkStrEq(t, doc.NamespaceURI(), "") + checkStrEq(t, root.NamespaceURI(), "https://root.example.com") + checkStrEq(t, child1.NamespaceURI(), "https://child.example.com") + checkStrEq(t, child2.NamespaceURI(), "https://root.example.com") + checkStrEq(t, grandchild1.NamespaceURI(), "https://grandchild.example.com") + checkStrEq(t, grandchild2.NamespaceURI(), "https://child.example.com") + checkStrEq(t, greatgrandchild1.NamespaceURI(), "https://child.example.com") + + checkStrEq(t, 
root.Attr[0].NamespaceURI(), "") + checkStrEq(t, root.Attr[1].NamespaceURI(), "") + checkStrEq(t, root.Attr[2].NamespaceURI(), "https://attrib.example.com") + checkStrEq(t, root.Attr[3].NamespaceURI(), "") + checkStrEq(t, child1.Attr[0].NamespaceURI(), "") + checkStrEq(t, child1.Attr[1].NamespaceURI(), "https://attrib.example.com") + checkStrEq(t, child2.Attr[0].NamespaceURI(), "") + checkStrEq(t, grandchild1.Attr[0].NamespaceURI(), "") + checkStrEq(t, grandchild1.Attr[1].NamespaceURI(), "") + checkStrEq(t, grandchild2.Attr[0].NamespaceURI(), "") + checkStrEq(t, greatgrandchild1.Attr[0].NamespaceURI(), "https://attrib.example.com") + + f := doc.FindElements("//*[namespace-uri()='https://root.example.com']") + if len(f) != 2 || f[0] != root || f[1] != child2 { + t.Error("etree: failed namespace-uri test") + } + + f = doc.FindElements("//*[namespace-uri()='https://child.example.com']") + if len(f) != 3 || f[0] != child1 || f[1] != grandchild2 || f[2] != greatgrandchild1 { + t.Error("etree: failed namespace-uri test") + } + + f = doc.FindElements("//*[namespace-uri()='https://grandchild.example.com']") + if len(f) != 1 || f[0] != grandchild1 { + t.Error("etree: failed namespace-uri test") + } + + f = doc.FindElements("//*[namespace-uri()='']") + if len(f) != 0 { + t.Error("etree: failed namespace-uri test") + } + + f = doc.FindElements("//*[namespace-uri()='foo']") + if len(f) != 0 { + t.Error("etree: failed namespace-uri test") + } +} + +func TestLocalNamespaceURI(t *testing.T) { + s := ` +<a:root xmlns:a="https://root.example.com"> + <b:child1 xmlns:b="https://child.example.com"> + <c:grandchild1 xmlns:c="https://grandchild.example.com"/> + <b:grandchild2> + <a:greatgrandchild1/> + </b:grandchild2> + <a:grandchild3/> + <grandchild4/> + </b:child1> + <a:child2> + </a:child2> + <child3> + </child3> +</a:root>` + + doc := newDocumentFromString(t, s) + root := doc.SelectElement("root") + child1 := root.SelectElement("child1") + child2 := root.SelectElement("child2") + 
child3 := root.SelectElement("child3") + grandchild1 := child1.SelectElement("grandchild1") + grandchild2 := child1.SelectElement("grandchild2") + grandchild3 := child1.SelectElement("grandchild3") + grandchild4 := child1.SelectElement("grandchild4") + greatgrandchild1 := grandchild2.SelectElement("greatgrandchild1") + + checkStrEq(t, doc.NamespaceURI(), "") + checkStrEq(t, root.NamespaceURI(), "https://root.example.com") + checkStrEq(t, child1.NamespaceURI(), "https://child.example.com") + checkStrEq(t, child2.NamespaceURI(), "https://root.example.com") + checkStrEq(t, child3.NamespaceURI(), "") + checkStrEq(t, grandchild1.NamespaceURI(), "https://grandchild.example.com") + checkStrEq(t, grandchild2.NamespaceURI(), "https://child.example.com") + checkStrEq(t, grandchild3.NamespaceURI(), "https://root.example.com") + checkStrEq(t, grandchild4.NamespaceURI(), "") + checkStrEq(t, greatgrandchild1.NamespaceURI(), "https://root.example.com") + + f := doc.FindElements("//*[namespace-uri()='https://root.example.com']") + if len(f) != 4 || f[0] != root || f[1] != child2 || f[2] != grandchild3 || f[3] != greatgrandchild1 { + t.Error("etree: failed namespace-uri test") + } + + f = doc.FindElements("//*[namespace-uri()='https://child.example.com']") + if len(f) != 2 || f[0] != child1 || f[1] != grandchild2 { + t.Error("etree: failed namespace-uri test") + } + + f = doc.FindElements("//*[namespace-uri()='https://grandchild.example.com']") + if len(f) != 1 || f[0] != grandchild1 { + t.Error("etree: failed namespace-uri test") + } + + f = doc.FindElements("//*[namespace-uri()='']") + if len(f) != 2 || f[0] != child3 || f[1] != grandchild4 { + t.Error("etree: failed namespace-uri test") + } + + f = doc.FindElements("//*[namespace-uri()='foo']") + if len(f) != 0 { + t.Error("etree: failed namespace-uri test") + } +} + +func TestWhitespace(t *testing.T) { + s := "<root>\n\t<child>\n\t\t<grandchild> x</grandchild>\n </child>\n</root>" + + doc := newDocumentFromString(t, s) + root 
:= doc.Root() + checkIntEq(t, len(root.Child), 3) + + cd := root.Child[0].(*CharData) + checkBoolEq(t, cd.IsWhitespace(), true) + checkStrBinaryEq(t, cd.Data, "\n\t") + + cd = root.Child[2].(*CharData) + checkBoolEq(t, cd.IsWhitespace(), true) + checkStrBinaryEq(t, cd.Data, "\n") + + child := root.SelectElement("child") + checkIntEq(t, len(child.Child), 3) + + cd = child.Child[0].(*CharData) + checkBoolEq(t, cd.IsWhitespace(), true) + checkStrBinaryEq(t, cd.Data, "\n\t\t") + + cd = child.Child[2].(*CharData) + checkBoolEq(t, cd.IsWhitespace(), true) + checkStrBinaryEq(t, cd.Data, "\n ") + + grandchild := child.SelectElement("grandchild") + checkIntEq(t, len(grandchild.Child), 1) + + cd = grandchild.Child[0].(*CharData) + checkBoolEq(t, cd.IsWhitespace(), false) + + cd.SetData(" ") + checkBoolEq(t, cd.IsWhitespace(), true) + + cd.SetData(" x") + checkBoolEq(t, cd.IsWhitespace(), false) + + cd.SetData("\t\n\r ") + checkBoolEq(t, cd.IsWhitespace(), true) + + cd.SetData("\uFFFD") + checkBoolEq(t, cd.IsWhitespace(), false) + + cd.SetData("") + checkBoolEq(t, cd.IsWhitespace(), true) +} diff --git a/example_test.go b/example_test.go new file mode 100644 index 0000000..45fc4ca --- /dev/null +++ b/example_test.go @@ -0,0 +1,69 @@ +// Copyright 2015-2019 Brett Vickers. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package etree + +import "os" + +// Create an etree Document, add XML entities to it, and serialize it +// to stdout. 
+func ExampleDocument_creating() { + doc := NewDocument() + doc.CreateProcInst("xml", `version="1.0" encoding="UTF-8"`) + doc.CreateProcInst("xml-stylesheet", `type="text/xsl" href="style.xsl"`) + + people := doc.CreateElement("People") + people.CreateComment("These are all known people") + + jon := people.CreateElement("Person") + jon.CreateAttr("name", "Jon O'Reilly") + + sally := people.CreateElement("Person") + sally.CreateAttr("name", "Sally") + + doc.Indent(2) + doc.WriteTo(os.Stdout) + // Output: + // <?xml version="1.0" encoding="UTF-8"?> + // <?xml-stylesheet type="text/xsl" href="style.xsl"?> + // <People> + // <!--These are all known people--> + // <Person name="Jon O'Reilly"/> + // <Person name="Sally"/> + // </People> +} + +func ExampleDocument_reading() { + doc := NewDocument() + if err := doc.ReadFromFile("document.xml"); err != nil { + panic(err) + } +} + +func ExamplePath() { + xml := ` +<bookstore> + <book> + <title>Great Expectations</title> + <author>Charles Dickens</author> + </book> + <book> + <title>Ulysses</title> + <author>James Joyce</author> + </book> +</bookstore>` + + doc := NewDocument() + doc.ReadFromString(xml) + for _, e := range doc.FindElements(".//book[author='Charles Dickens']") { + doc := NewDocumentWithRoot(e.Copy()) + doc.Indent(2) + doc.WriteTo(os.Stdout) + } + // Output: + // <book> + // <title>Great Expectations</title> + // <author>Charles Dickens</author> + // </book> +} @@ -0,0 +1,3 @@ +module github.com/beevik/etree + +go 1.12 diff --git a/helpers.go b/helpers.go new file mode 100644 index 0000000..825e14e --- /dev/null +++ b/helpers.go @@ -0,0 +1,276 @@ +// Copyright 2015-2019 Brett Vickers. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package etree + +import ( + "bufio" + "io" + "strings" + "unicode/utf8" +) + +// A simple stack +type stack struct { + data []interface{} +} + +func (s *stack) empty() bool { + return len(s.data) == 0 +} + +func (s *stack) push(value interface{}) { + s.data = append(s.data, value) +} + +func (s *stack) pop() interface{} { + value := s.data[len(s.data)-1] + s.data[len(s.data)-1] = nil + s.data = s.data[:len(s.data)-1] + return value +} + +func (s *stack) peek() interface{} { + return s.data[len(s.data)-1] +} + +// A fifo is a simple first-in-first-out queue. +type fifo struct { + data []interface{} + head, tail int +} + +func (f *fifo) add(value interface{}) { + if f.len()+1 >= len(f.data) { + f.grow() + } + f.data[f.tail] = value + if f.tail++; f.tail == len(f.data) { + f.tail = 0 + } +} + +func (f *fifo) remove() interface{} { + value := f.data[f.head] + f.data[f.head] = nil + if f.head++; f.head == len(f.data) { + f.head = 0 + } + return value +} + +func (f *fifo) len() int { + if f.tail >= f.head { + return f.tail - f.head + } + return len(f.data) - f.head + f.tail +} + +func (f *fifo) grow() { + c := len(f.data) * 2 + if c == 0 { + c = 4 + } + buf, count := make([]interface{}, c), f.len() + if f.tail >= f.head { + copy(buf[0:count], f.data[f.head:f.tail]) + } else { + hindex := len(f.data) - f.head + copy(buf[0:hindex], f.data[f.head:]) + copy(buf[hindex:count], f.data[:f.tail]) + } + f.data, f.head, f.tail = buf, 0, count +} + +// countReader implements a proxy reader that counts the number of +// bytes read from its encapsulated reader. +type countReader struct { + r io.Reader + bytes int64 +} + +func newCountReader(r io.Reader) *countReader { + return &countReader{r: r} +} + +func (cr *countReader) Read(p []byte) (n int, err error) { + b, err := cr.r.Read(p) + cr.bytes += int64(b) + return b, err +} + +// countWriter implements a proxy writer that counts the number of +// bytes written by its encapsulated writer. 
type countWriter struct {
	w     io.Writer // destination that receives every Write
	bytes int64     // running total of bytes the destination reported as written
}

// newCountWriter returns a countWriter that forwards to w and starts
// counting from zero.
func newCountWriter(w io.Writer) *countWriter {
	return &countWriter{w: w}
}

// Write passes p to the wrapped writer and adds the number of bytes that
// writer reports to the running total. The count is updated even when the
// wrapped writer returns an error, so partial writes are still tallied.
func (cw *countWriter) Write(p []byte) (n int, err error) {
	n, err = cw.w.Write(p)
	cw.bytes += int64(n)
	return n, err
}

// isWhitespace returns true if the string contains only whitespace
// characters (space, tab, line feed, carriage return). The empty string
// is considered whitespace.
func isWhitespace(s string) bool {
	for i := 0; i < len(s); i++ {
		switch s[i] {
		case ' ', '\t', '\n', '\r':
			// still whitespace; keep scanning
		default:
			return false
		}
	}
	return true
}

// spaceMatch returns true if namespace a is the empty string
// or if namespace a equals namespace b.
func spaceMatch(a, b string) bool {
	return a == "" || a == b
}

// spaceDecompose breaks a namespace:tag identifier at the first ':'
// and returns the two parts. When there is no ':' the namespace part is
// empty and the whole string is returned as the key.
func spaceDecompose(str string) (space, key string) {
	colon := strings.IndexByte(str, ':')
	if colon == -1 {
		return "", str
	}
	return str[:colon], str[colon+1:]
}

// Strings used by indentCRLF and indentLF. Each is a CRLF followed by a
// run of one repeated indent character. Shallow indents are produced by
// slicing these tables; only indents deeper than the table need to build
// a new string.
const (
	indentSpaces = "\r\n                "
	indentTabs   = "\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
)

// indentCRLF returns a CRLF newline followed by n copies of the first
// non-CRLF character in the source string. A negative n yields just the
// CRLF. source must be at least three bytes long ("\r\n" plus one indent
// character), which holds for indentSpaces and indentTabs.
func indentCRLF(n int, source string) string {
	switch {
	case n < 0:
		return source[:2]
	case n < len(source)-1:
		// The table already holds enough indent characters: slice it.
		return source[:n+2]
	default:
		// Deeper than the table: extend with copies of the indent character.
		return source + strings.Repeat(source[2:3], n-len(source)+2)
	}
}

// indentLF returns a LF newline followed by n copies of the first non-LF
// character in the source string. It uses the same tables as indentCRLF
// but skips the leading '\r'. A negative n yields just the LF.
func indentLF(n int, source string) string {
	switch {
	case n < 0:
		return source[1:2]
	case n < len(source)-1:
		// The table already holds enough indent characters: slice it.
		return source[1 : n+2]
	default:
		// Deeper than the table: extend with copies of the indent character.
		return source[1:] + strings.Repeat(source[2:3], n-len(source)+2)
	}
}

// nextIndex returns the index of the next occurrence of sep in s,
// starting from offset. It returns -1 if the sep string is not found.
+func nextIndex(s, sep string, offset int) int { + switch i := strings.Index(s[offset:], sep); i { + case -1: + return -1 + default: + return offset + i + } +} + +// isInteger returns true if the string s contains an integer. +func isInteger(s string) bool { + for i := 0; i < len(s); i++ { + if (s[i] < '0' || s[i] > '9') && !(i == 0 && s[i] == '-') { + return false + } + } + return true +} + +type escapeMode byte + +const ( + escapeNormal escapeMode = iota + escapeCanonicalText + escapeCanonicalAttr +) + +// escapeString writes an escaped version of a string to the writer. +func escapeString(w *bufio.Writer, s string, m escapeMode) { + var esc []byte + last := 0 + for i := 0; i < len(s); { + r, width := utf8.DecodeRuneInString(s[i:]) + i += width + switch r { + case '&': + esc = []byte("&amp;") + case '<': + esc = []byte("&lt;") + case '>': + if m == escapeCanonicalAttr { + continue + } + esc = []byte("&gt;") + case '\'': + if m != escapeNormal { + continue + } + esc = []byte("&apos;") + case '"': + if m == escapeCanonicalText { + continue + } + esc = []byte("&quot;") + case '\t': + if m != escapeCanonicalAttr { + continue + } + esc = []byte("&#x9;") + case '\n': + if m != escapeCanonicalAttr { + continue + } + esc = []byte("&#xA;") + case '\r': + if m == escapeNormal { + continue + } + esc = []byte("&#xD;") + default: + if !isInCharacterRange(r) || (r == 0xFFFD && width == 1) { + esc = []byte("\uFFFD") + break + } + continue + } + w.WriteString(s[last : i-width]) + w.Write(esc) + last = i + } + w.WriteString(s[last:]) +} + +func isInCharacterRange(r rune) bool { + return r == 0x09 || + r == 0x0A || + r == 0x0D || + r >= 0x20 && r <= 0xD7FF || + r >= 0xE000 && r <= 0xFFFD || + r >= 0x10000 && r <= 0x10FFFF +} @@ -0,0 +1,580 @@ +// Copyright 2015-2019 Brett Vickers. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package etree + +import ( + "strconv" + "strings" +) + +/* +A Path is a string that represents a search path through an etree starting +from the document root or an arbitrary element. Paths are used with the +Element object's Find* methods to locate and return desired elements. + +A Path consists of a series of slash-separated "selectors", each of which may +be modified by one or more bracket-enclosed "filters". Selectors are used to +traverse the etree from element to element, while filters are used to narrow +the list of candidate elements at each node. + +Although etree Path strings are structurally and behaviorally similar to XPath +strings (https://www.w3.org/TR/1999/REC-xpath-19991116/), they have a more +limited set of selectors and filtering options. + +The following selectors are supported by etree paths: + + . Select the current element. + .. Select the parent of the current element. + * Select all child elements of the current element. + / Select the root element when used at the start of a path. + // Select all descendants of the current element. + tag Select all child elements with a name matching the tag. + +The following basic filters are supported: + + [@attrib] Keep elements with an attribute named attrib. + [@attrib='val'] Keep elements with an attribute named attrib and value matching val. + [tag] Keep elements with a child element named tag. 
+ [tag='val'] Keep elements with a child element named tag and text matching val. + [n] Keep the n-th element, where n is a numeric index starting from 1. + +The following function-based filters are supported: + + [text()] Keep elements with non-empty text. + [text()='val'] Keep elements whose text matches val. + [local-name()='val'] Keep elements whose un-prefixed tag matches val. + [name()='val'] Keep elements whose full tag exactly matches val. + [namespace-prefix()] Keep elements with non-empty namespace prefixes. + [namespace-prefix()='val'] Keep elements whose namespace prefix matches val. + [namespace-uri()] Keep elements with non-empty namespace URIs. + [namespace-uri()='val'] Keep elements whose namespace URI matches val. + +Below are some examples of etree path strings. + +Select the bookstore child element of the root element: + /bookstore + +Beginning from the root element, select the title elements of all descendant +book elements having a 'category' attribute of 'WEB': + //book[@category='WEB']/title + +Beginning from the current element, select the first descendant book element +with a title child element containing the text 'Great Expectations': + .//book[title='Great Expectations'][1] + +Beginning from the current element, select all child elements of book elements +with an attribute 'language' set to 'english': + ./book/*[@language='english'] + +Beginning from the current element, select all child elements of book elements +containing the text 'special': + ./book/*[text()='special'] + +Beginning from the current element, select all descendant book elements whose +title child element has a 'language' attribute of 'french': + .//book/title[@language='french']/.. 
+ +Beginning from the current element, select all descendant book elements +belonging to the http://www.w3.org/TR/html4/ namespace: + .//book[namespace-uri()='http://www.w3.org/TR/html4/'] + +*/ +type Path struct { + segments []segment +} + +// ErrPath is returned by path functions when an invalid etree path is provided. +type ErrPath string + +// Error returns the string describing a path error. +func (err ErrPath) Error() string { + return "etree: " + string(err) +} + +// CompilePath creates an optimized version of an XPath-like string that +// can be used to query elements in an element tree. +func CompilePath(path string) (Path, error) { + var comp compiler + segments := comp.parsePath(path) + if comp.err != ErrPath("") { + return Path{nil}, comp.err + } + return Path{segments}, nil +} + +// MustCompilePath creates an optimized version of an XPath-like string that +// can be used to query elements in an element tree. Panics if an error +// occurs. Use this function to create Paths when you know the path is +// valid (i.e., if it's hard-coded). +func MustCompilePath(path string) Path { + p, err := CompilePath(path) + if err != nil { + panic(err) + } + return p +} + +// A segment is a portion of a path between "/" characters. +// It contains one selector and zero or more [filters]. +type segment struct { + sel selector + filters []filter +} + +func (seg *segment) apply(e *Element, p *pather) { + seg.sel.apply(e, p) + for _, f := range seg.filters { + f.apply(p) + } +} + +// A selector selects XML elements for consideration by the +// path traversal. +type selector interface { + apply(e *Element, p *pather) +} + +// A filter pares down a list of candidate XML elements based +// on a path filter in [brackets]. +type filter interface { + apply(p *pather) +} + +// A pather is helper object that traverses an element tree using +// a Path object. It collects and deduplicates all elements matching +// the path query. 
+type pather struct { + queue fifo + results []*Element + inResults map[*Element]bool + candidates []*Element + scratch []*Element // used by filters +} + +// A node represents an element and the remaining path segments that +// should be applied against it by the pather. +type node struct { + e *Element + segments []segment +} + +func newPather() *pather { + return &pather{ + results: make([]*Element, 0), + inResults: make(map[*Element]bool), + candidates: make([]*Element, 0), + scratch: make([]*Element, 0), + } +} + +// traverse follows the path from the element e, collecting +// and then returning all elements that match the path's selectors +// and filters. +func (p *pather) traverse(e *Element, path Path) []*Element { + for p.queue.add(node{e, path.segments}); p.queue.len() > 0; { + p.eval(p.queue.remove().(node)) + } + return p.results +} + +// eval evalutes the current path node by applying the remaining +// path's selector rules against the node's element. +func (p *pather) eval(n node) { + p.candidates = p.candidates[0:0] + seg, remain := n.segments[0], n.segments[1:] + seg.apply(n.e, p) + + if len(remain) == 0 { + for _, c := range p.candidates { + if in := p.inResults[c]; !in { + p.inResults[c] = true + p.results = append(p.results, c) + } + } + } else { + for _, c := range p.candidates { + p.queue.add(node{c, remain}) + } + } +} + +// A compiler generates a compiled path from a path string. +type compiler struct { + err ErrPath +} + +// parsePath parses an XPath-like string describing a path +// through an element tree and returns a slice of segment +// descriptors. 
+func (c *compiler) parsePath(path string) []segment { + // If path ends with //, fix it + if strings.HasSuffix(path, "//") { + path += "*" + } + + var segments []segment + + // Check for an absolute path + if strings.HasPrefix(path, "/") { + segments = append(segments, segment{new(selectRoot), []filter{}}) + path = path[1:] + } + + // Split path into segments + for _, s := range splitPath(path) { + segments = append(segments, c.parseSegment(s)) + if c.err != ErrPath("") { + break + } + } + return segments +} + +func splitPath(path string) []string { + var pieces []string + start := 0 + inquote := false + for i := 0; i+1 <= len(path); i++ { + if path[i] == '\'' { + inquote = !inquote + } else if path[i] == '/' && !inquote { + pieces = append(pieces, path[start:i]) + start = i + 1 + } + } + return append(pieces, path[start:]) +} + +// parseSegment parses a path segment between / characters. +func (c *compiler) parseSegment(path string) segment { + pieces := strings.Split(path, "[") + seg := segment{ + sel: c.parseSelector(pieces[0]), + filters: []filter{}, + } + for i := 1; i < len(pieces); i++ { + fpath := pieces[i] + if fpath[len(fpath)-1] != ']' { + c.err = ErrPath("path has invalid filter [brackets].") + break + } + seg.filters = append(seg.filters, c.parseFilter(fpath[:len(fpath)-1])) + } + return seg +} + +// parseSelector parses a selector at the start of a path segment. +func (c *compiler) parseSelector(path string) selector { + switch path { + case ".": + return new(selectSelf) + case "..": + return new(selectParent) + case "*": + return new(selectChildren) + case "": + return new(selectDescendants) + default: + return newSelectChildrenByTag(path) + } +} + +var fnTable = map[string]func(e *Element) string{ + "local-name": (*Element).name, + "name": (*Element).FullTag, + "namespace-prefix": (*Element).namespacePrefix, + "namespace-uri": (*Element).NamespaceURI, + "text": (*Element).Text, +} + +// parseFilter parses a path filter contained within [brackets]. 
+func (c *compiler) parseFilter(path string) filter { + if len(path) == 0 { + c.err = ErrPath("path contains an empty filter expression.") + return nil + } + + // Filter contains [@attr='val'], [fn()='val'], or [tag='val']? + eqindex := strings.Index(path, "='") + if eqindex >= 0 { + rindex := nextIndex(path, "'", eqindex+2) + if rindex != len(path)-1 { + c.err = ErrPath("path has mismatched filter quotes.") + return nil + } + + key := path[:eqindex] + value := path[eqindex+2 : rindex] + + switch { + case key[0] == '@': + return newFilterAttrVal(key[1:], value) + case strings.HasSuffix(key, "()"): + name := key[:len(key)-2] + if fn, ok := fnTable[name]; ok { + return newFilterFuncVal(fn, value) + } + c.err = ErrPath("path has unknown function " + name) + return nil + default: + return newFilterChildText(key, value) + } + } + + // Filter contains [@attr], [N], [tag] or [fn()] + switch { + case path[0] == '@': + return newFilterAttr(path[1:]) + case strings.HasSuffix(path, "()"): + name := path[:len(path)-2] + if fn, ok := fnTable[name]; ok { + return newFilterFunc(fn) + } + c.err = ErrPath("path has unknown function " + name) + return nil + case isInteger(path): + pos, _ := strconv.Atoi(path) + switch { + case pos > 0: + return newFilterPos(pos - 1) + default: + return newFilterPos(pos) + } + default: + return newFilterChild(path) + } +} + +// selectSelf selects the current element into the candidate list. +type selectSelf struct{} + +func (s *selectSelf) apply(e *Element, p *pather) { + p.candidates = append(p.candidates, e) +} + +// selectRoot selects the element's root node. +type selectRoot struct{} + +func (s *selectRoot) apply(e *Element, p *pather) { + root := e + for root.parent != nil { + root = root.parent + } + p.candidates = append(p.candidates, root) +} + +// selectParent selects the element's parent into the candidate list. 
+type selectParent struct{} + +func (s *selectParent) apply(e *Element, p *pather) { + if e.parent != nil { + p.candidates = append(p.candidates, e.parent) + } +} + +// selectChildren selects the element's child elements into the +// candidate list. +type selectChildren struct{} + +func (s *selectChildren) apply(e *Element, p *pather) { + for _, c := range e.Child { + if c, ok := c.(*Element); ok { + p.candidates = append(p.candidates, c) + } + } +} + +// selectDescendants selects all descendant child elements +// of the element into the candidate list. +type selectDescendants struct{} + +func (s *selectDescendants) apply(e *Element, p *pather) { + var queue fifo + for queue.add(e); queue.len() > 0; { + e := queue.remove().(*Element) + p.candidates = append(p.candidates, e) + for _, c := range e.Child { + if c, ok := c.(*Element); ok { + queue.add(c) + } + } + } +} + +// selectChildrenByTag selects into the candidate list all child +// elements of the element having the specified tag. +type selectChildrenByTag struct { + space, tag string +} + +func newSelectChildrenByTag(path string) *selectChildrenByTag { + s, l := spaceDecompose(path) + return &selectChildrenByTag{s, l} +} + +func (s *selectChildrenByTag) apply(e *Element, p *pather) { + for _, c := range e.Child { + if c, ok := c.(*Element); ok && spaceMatch(s.space, c.Space) && s.tag == c.Tag { + p.candidates = append(p.candidates, c) + } + } +} + +// filterPos filters the candidate list, keeping only the +// candidate at the specified index. 
+type filterPos struct { + index int +} + +func newFilterPos(pos int) *filterPos { + return &filterPos{pos} +} + +func (f *filterPos) apply(p *pather) { + if f.index >= 0 { + if f.index < len(p.candidates) { + p.scratch = append(p.scratch, p.candidates[f.index]) + } + } else { + if -f.index <= len(p.candidates) { + p.scratch = append(p.scratch, p.candidates[len(p.candidates)+f.index]) + } + } + p.candidates, p.scratch = p.scratch, p.candidates[0:0] +} + +// filterAttr filters the candidate list for elements having +// the specified attribute. +type filterAttr struct { + space, key string +} + +func newFilterAttr(str string) *filterAttr { + s, l := spaceDecompose(str) + return &filterAttr{s, l} +} + +func (f *filterAttr) apply(p *pather) { + for _, c := range p.candidates { + for _, a := range c.Attr { + if spaceMatch(f.space, a.Space) && f.key == a.Key { + p.scratch = append(p.scratch, c) + break + } + } + } + p.candidates, p.scratch = p.scratch, p.candidates[0:0] +} + +// filterAttrVal filters the candidate list for elements having +// the specified attribute with the specified value. +type filterAttrVal struct { + space, key, val string +} + +func newFilterAttrVal(str, value string) *filterAttrVal { + s, l := spaceDecompose(str) + return &filterAttrVal{s, l, value} +} + +func (f *filterAttrVal) apply(p *pather) { + for _, c := range p.candidates { + for _, a := range c.Attr { + if spaceMatch(f.space, a.Space) && f.key == a.Key && f.val == a.Value { + p.scratch = append(p.scratch, c) + break + } + } + } + p.candidates, p.scratch = p.scratch, p.candidates[0:0] +} + +// filterFunc filters the candidate list for elements satisfying a custom +// boolean function. 
+type filterFunc struct { + fn func(e *Element) string +} + +func newFilterFunc(fn func(e *Element) string) *filterFunc { + return &filterFunc{fn} +} + +func (f *filterFunc) apply(p *pather) { + for _, c := range p.candidates { + if f.fn(c) != "" { + p.scratch = append(p.scratch, c) + } + } + p.candidates, p.scratch = p.scratch, p.candidates[0:0] +} + +// filterFuncVal filters the candidate list for elements containing a value +// matching the result of a custom function. +type filterFuncVal struct { + fn func(e *Element) string + val string +} + +func newFilterFuncVal(fn func(e *Element) string, value string) *filterFuncVal { + return &filterFuncVal{fn, value} +} + +func (f *filterFuncVal) apply(p *pather) { + for _, c := range p.candidates { + if f.fn(c) == f.val { + p.scratch = append(p.scratch, c) + } + } + p.candidates, p.scratch = p.scratch, p.candidates[0:0] +} + +// filterChild filters the candidate list for elements having +// a child element with the specified tag. +type filterChild struct { + space, tag string +} + +func newFilterChild(str string) *filterChild { + s, l := spaceDecompose(str) + return &filterChild{s, l} +} + +func (f *filterChild) apply(p *pather) { + for _, c := range p.candidates { + for _, cc := range c.Child { + if cc, ok := cc.(*Element); ok && + spaceMatch(f.space, cc.Space) && + f.tag == cc.Tag { + p.scratch = append(p.scratch, c) + } + } + } + p.candidates, p.scratch = p.scratch, p.candidates[0:0] +} + +// filterChildText filters the candidate list for elements having +// a child element with the specified tag and text. 
+type filterChildText struct { + space, tag, text string +} + +func newFilterChildText(str, text string) *filterChildText { + s, l := spaceDecompose(str) + return &filterChildText{s, l, text} +} + +func (f *filterChildText) apply(p *pather) { + for _, c := range p.candidates { + for _, cc := range c.Child { + if cc, ok := cc.(*Element); ok && + spaceMatch(f.space, cc.Space) && + f.tag == cc.Tag && + f.text == cc.Text() { + p.scratch = append(p.scratch, c) + } + } + } + p.candidates, p.scratch = p.scratch, p.candidates[0:0] +} diff --git a/path_test.go b/path_test.go new file mode 100644 index 0000000..ed0b570 --- /dev/null +++ b/path_test.go @@ -0,0 +1,222 @@ +// Copyright 2015-2019 Brett Vickers. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package etree + +import "testing" + +var testXML = ` +<?xml version="1.0" encoding="UTF-8"?> +<bookstore xmlns:p="urn:books-com:prices"> + + <!Directive> + + <book category="COOKING"> + <title lang="en">Everyday Italian</title> + <author>Giada De Laurentiis</author> + <year>2005</year> + <p:price>30.00</p:price> + <editor>Clarkson Potter</editor> + </book> + + <book category="CHILDREN"> + <title lang="en" sku="150">Harry Potter</title> + <author>J K. Rowling</author> + <year>2005</year> + <p:price p:tax="1.99">29.99</p:price> + <editor></editor> + <editor/> + </book> + + <book category="WEB"> + <title lang="en">XQuery Kick Start</title> + <author>James McGovern</author> + <author>Per Bothner</author> + <author>Kurt Cagle</author> + <author>James Linn</author> + <author>Vaidyanathan Nagarajan</author> + <year>2003</year> + <price>49.99</p:price> + <editor> + </editor> + </book> + + <!-- Final book --> + <book category="WEB" path="/books/xml"> + <title lang="en">Learning XML</title> + <author>Erik T. 
Ray</author> + <year>2003</year> + <p:price>39.95</p:price> + </book> + +</bookstore> +` + +type test struct { + path string + result interface{} +} + +type errorResult string + +var tests = []test{ + // basic queries + {"./bookstore/book/title", []string{"Everyday Italian", "Harry Potter", "XQuery Kick Start", "Learning XML"}}, + {"./bookstore/book/author", []string{"Giada De Laurentiis", "J K. Rowling", "James McGovern", "Per Bothner", "Kurt Cagle", "James Linn", "Vaidyanathan Nagarajan", "Erik T. Ray"}}, + {"./bookstore/book/year", []string{"2005", "2005", "2003", "2003"}}, + {"./bookstore/book/p:price", []string{"30.00", "29.99", "39.95"}}, + {"./bookstore/book/isbn", nil}, + + // descendant queries + {"//title", []string{"Everyday Italian", "Harry Potter", "XQuery Kick Start", "Learning XML"}}, + {"//book/title", []string{"Everyday Italian", "Harry Potter", "XQuery Kick Start", "Learning XML"}}, + {".//title", []string{"Everyday Italian", "Harry Potter", "XQuery Kick Start", "Learning XML"}}, + {".//bookstore//title", []string{"Everyday Italian", "Harry Potter", "XQuery Kick Start", "Learning XML"}}, + {".//book/title", []string{"Everyday Italian", "Harry Potter", "XQuery Kick Start", "Learning XML"}}, + {".//p:price/.", []string{"30.00", "29.99", "39.95"}}, + {".//price", []string{"30.00", "29.99", "49.99", "39.95"}}, + + // positional queries + {"./bookstore/book[1]/title", "Everyday Italian"}, + {"./bookstore/book[4]/title", "Learning XML"}, + {"./bookstore/book[5]/title", nil}, + {"./bookstore/book[3]/author[0]", "James McGovern"}, + {"./bookstore/book[3]/author[1]", "James McGovern"}, + {"./bookstore/book[3]/author[3]/./.", "Kurt Cagle"}, + {"./bookstore/book[3]/author[6]", nil}, + {"./bookstore/book[-1]/title", "Learning XML"}, + {"./bookstore/book[-4]/title", "Everyday Italian"}, + {"./bookstore/book[-5]/title", nil}, + + // text function queries + {"./bookstore/book[author='James McGovern']/title", "XQuery Kick Start"}, + {"./bookstore/book[author='Per 
Bothner']/title", "XQuery Kick Start"}, + {"./bookstore/book[author='Kurt Cagle']/title", "XQuery Kick Start"}, + {"./bookstore/book[author='James Linn']/title", "XQuery Kick Start"}, + {"./bookstore/book[author='Vaidyanathan Nagarajan']/title", "XQuery Kick Start"}, + {"//book[p:price='29.99']/title", "Harry Potter"}, + {"//book[price='29.99']/title", "Harry Potter"}, + {"//book/price[text()='29.99']", "29.99"}, + {"//book/author[text()='Kurt Cagle']", "Kurt Cagle"}, + {"//book/editor[text()]", []string{"Clarkson Potter", "\n\t\t"}}, + + // namespace function queries + {"//*[namespace-uri()]", []string{"30.00", "29.99", "39.95"}}, + {"//*[namespace-uri()='urn:books-com:prices']", []string{"30.00", "29.99", "39.95"}}, + {"//*[namespace-uri()='foo']", nil}, + {"//*[namespace-prefix()]", []string{"30.00", "29.99", "39.95"}}, + {"//*[namespace-prefix()='p']", []string{"30.00", "29.99", "39.95"}}, + {"//*[name()='p:price']", []string{"30.00", "29.99", "39.95"}}, + {"//*[local-name()='price']", []string{"30.00", "29.99", "49.99", "39.95"}}, + {"//price[namespace-uri()='']", []string{"49.99"}}, + {"//price[namespace-prefix()='']", []string{"49.99"}}, + {"//price[name()='price']", []string{"49.99"}}, + {"//price[local-name()='price']", []string{"30.00", "29.99", "49.99", "39.95"}}, + + // attribute queries + {"./bookstore/book[@category='WEB']/title", []string{"XQuery Kick Start", "Learning XML"}}, + {"./bookstore/book[@path='/books/xml']/title", []string{"Learning XML"}}, + {"./bookstore/book[@category='COOKING']/title[@lang='en']", "Everyday Italian"}, + {"./bookstore/book/title[@lang='en'][@sku='150']", "Harry Potter"}, + {"./bookstore/book/title[@lang='fr']", nil}, + {"//p:price[@p:tax='1.99']", []string{"29.99"}}, + {"//p:price[@tax='1.99']", []string{"29.99"}}, + {"//p:price[@p:tax]", []string{"29.99"}}, + {"//p:price[@tax]", []string{"29.99"}}, + + // parent queries + {"./bookstore/book[@category='COOKING']/title/../../book[4]/title", "Learning XML"}, + + // root 
queries + {"/bookstore/book[1]/title", "Everyday Italian"}, + {"/bookstore/book[4]/title", "Learning XML"}, + {"/bookstore/book[5]/title", nil}, + {"/bookstore/book[3]/author[0]", "James McGovern"}, + {"/bookstore/book[3]/author[1]", "James McGovern"}, + {"/bookstore/book[3]/author[3]/./.", "Kurt Cagle"}, + {"/bookstore/book[3]/author[6]", nil}, + {"/bookstore/book[-1]/title", "Learning XML"}, + {"/bookstore/book[-4]/title", "Everyday Italian"}, + {"/bookstore/book[-5]/title", nil}, + + // bad paths + {"./bookstore/book[]", errorResult("etree: path contains an empty filter expression.")}, + {"./bookstore/book[@category='WEB'", errorResult("etree: path has invalid filter [brackets].")}, + {"./bookstore/book[@category='WEB]", errorResult("etree: path has mismatched filter quotes.")}, + {"./bookstore/book[author]a", errorResult("etree: path has invalid filter [brackets].")}, +} + +func TestPath(t *testing.T) { + doc := NewDocument() + err := doc.ReadFromString(testXML) + if err != nil { + t.Error(err) + } + + for _, test := range tests { + path, err := CompilePath(test.path) + if err != nil { + if r, ok := test.result.(errorResult); !ok || err.Error() != string(r) { + fail(t, test) + } + continue + } + + // Test both FindElementsPath and FindElementPath + element := doc.FindElementPath(path) + elements := doc.FindElementsPath(path) + + switch s := test.result.(type) { + case errorResult: + fail(t, test) + case nil: + if element != nil || len(elements) != 0 { + fail(t, test) + } + case string: + if element == nil || element.Text() != s || + len(elements) != 1 || elements[0].Text() != s { + fail(t, test) + } + case []string: + if element == nil || element.Text() != s[0] || len(elements) != len(s) { + fail(t, test) + continue + } + for i := 0; i < len(elements); i++ { + if elements[i].Text() != s[i] { + fail(t, test) + break + } + } + } + + } +} + +func fail(t *testing.T, test test) { + t.Helper() + t.Errorf("etree: failed test '%s'\n", test.path) +} + +func 
TestAbsolutePath(t *testing.T) { + doc := NewDocument() + err := doc.ReadFromString(testXML) + if err != nil { + t.Error(err) + } + + elements := doc.FindElements("//book/author") + for _, e := range elements { + title := e.FindElement("/bookstore/book[1]/title") + if title == nil || title.Text() != "Everyday Italian" { + t.Errorf("etree: absolute path test failed") + } + + title = e.FindElement("//book[p:price='29.99']/title") + if title == nil || title.Text() != "Harry Potter" { + t.Errorf("etree: absolute path test failed") + } + } +} |