diff options
Diffstat (limited to 'src/org/ccil/cowan/tagsoup/Parser.java')
-rw-r--r-- | src/org/ccil/cowan/tagsoup/Parser.java | 1114 |
1 files changed, 1114 insertions, 0 deletions
diff --git a/src/org/ccil/cowan/tagsoup/Parser.java b/src/org/ccil/cowan/tagsoup/Parser.java new file mode 100644 index 0000000..0997f23 --- /dev/null +++ b/src/org/ccil/cowan/tagsoup/Parser.java @@ -0,0 +1,1114 @@ +// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan. +// +// TagSoup is licensed under the Apache License, +// Version 2.0. You may obtain a copy of this license at +// http://www.apache.org/licenses/LICENSE-2.0 . You may also have +// additional legal rights not granted by this license. +// +// TagSoup is distributed in the hope that it will be useful, but +// unless required by applicable law or agreed to in writing, TagSoup +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS +// OF ANY KIND, either express or implied; not even the implied warranty +// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +// +// +// The TagSoup parser + +package org.ccil.cowan.tagsoup; +import java.util.HashMap; +import java.util.ArrayList; +import java.io.*; +import java.net.URL; +import java.net.URLConnection; +import org.xml.sax.*; +import org.xml.sax.helpers.DefaultHandler; +import org.xml.sax.ext.LexicalHandler; + + +/** +The SAX parser class. +**/ +public class Parser extends DefaultHandler implements ScanHandler, XMLReader, LexicalHandler { + + // XMLReader implementation + + private ContentHandler theContentHandler = this; + private LexicalHandler theLexicalHandler = this; + private DTDHandler theDTDHandler = this; + private ErrorHandler theErrorHandler = this; + private EntityResolver theEntityResolver = this; + private Schema theSchema; + private Scanner theScanner; + private AutoDetector theAutoDetector; + + // Default values for feature flags + + private static boolean DEFAULT_NAMESPACES = true; + private static boolean DEFAULT_IGNORE_BOGONS = false; + private static boolean DEFAULT_BOGONS_EMPTY = false; + private static boolean DEFAULT_ROOT_BOGONS = true; + private static boolean DEFAULT_DEFAULT_ATTRIBUTES = true; + private static boolean DEFAULT_TRANSLATE_COLONS = false; + private static boolean DEFAULT_RESTART_ELEMENTS = true; + private static boolean DEFAULT_IGNORABLE_WHITESPACE = false; + private static boolean DEFAULT_CDATA_ELEMENTS = true; + + // Feature flags. + + private boolean namespaces = DEFAULT_NAMESPACES; + private boolean ignoreBogons = DEFAULT_IGNORE_BOGONS; + private boolean bogonsEmpty = DEFAULT_BOGONS_EMPTY; + private boolean rootBogons = DEFAULT_ROOT_BOGONS; + private boolean defaultAttributes = DEFAULT_DEFAULT_ATTRIBUTES; + private boolean translateColons = DEFAULT_TRANSLATE_COLONS; + private boolean restartElements = DEFAULT_RESTART_ELEMENTS; + private boolean ignorableWhitespace = DEFAULT_IGNORABLE_WHITESPACE; + private boolean CDATAElements = DEFAULT_CDATA_ELEMENTS; + + /** + A value of "true" indicates namespace URIs and unprefixed local + names for element and attribute names will be available. + **/ + public final static String namespacesFeature = + "http://xml.org/sax/features/namespaces"; + + /** + A value of "true" indicates that XML qualified names (with prefixes) + and attributes (including xmlns* attributes) will be available. + We don't support this value. + **/ + public final static String namespacePrefixesFeature = + "http://xml.org/sax/features/namespace-prefixes"; + + /** + Reports whether this parser processes external general entities + (it doesn't). + **/ + public final static String externalGeneralEntitiesFeature = + "http://xml.org/sax/features/external-general-entities"; + + /** + Reports whether this parser processes external parameter entities + (it doesn't). + **/ + public final static String externalParameterEntitiesFeature = + "http://xml.org/sax/features/external-parameter-entities"; + + /** + May be examined only during a parse, after the startDocument() + callback has been completed; read-only. The value is true if + the document specified standalone="yes" in its XML declaration, + and otherwise is false. (It's always false.) + **/ + public final static String isStandaloneFeature = + "http://xml.org/sax/features/is-standalone"; + + /** + A value of "true" indicates that the LexicalHandler will report + the beginning and end of parameter entities (it won't). + **/ + public final static String lexicalHandlerParameterEntitiesFeature = + "http://xml.org/sax/features/lexical-handler/parameter-entities"; + + /** + A value of "true" indicates that system IDs in declarations will + be absolutized (relative to their base URIs) before reporting. + (This returns true but doesn't actually do anything.) + **/ + public final static String resolveDTDURIsFeature = + "http://xml.org/sax/features/resolve-dtd-uris"; + + /** + Has a value of "true" if all XML names (for elements, + prefixes, attributes, entities, notations, and local + names), as well as Namespace URIs, will have been interned + using java.lang.String.intern. This supports fast testing of + equality/inequality against string constants, rather than forcing + slower calls to String.equals(). (We always intern.) + **/ + public final static String stringInterningFeature = + "http://xml.org/sax/features/string-interning"; + + /** + Returns "true" if the Attributes objects passed by this + parser in ContentHandler.startElement() implement the + org.xml.sax.ext.Attributes2 interface. (They don't.) + **/ + + public final static String useAttributes2Feature = + "http://xml.org/sax/features/use-attributes2"; + + /** + Returns "true" if the Locator objects passed by this parser + in ContentHandler.setDocumentLocator() implement the + org.xml.sax.ext.Locator2 interface. (They don't.) + **/ + public final static String useLocator2Feature = + "http://xml.org/sax/features/use-locator2"; + + /** + Returns "true" if, when setEntityResolver is given an object + implementing the org.xml.sax.ext.EntityResolver2 interface, + those new methods will be used. (They won't be.) + **/ + public final static String useEntityResolver2Feature = + "http://xml.org/sax/features/use-entity-resolver2"; + + /** + Controls whether the parser is reporting all validity errors + (We don't report any validity errors.) + **/ + public final static String validationFeature = + "http://xml.org/sax/features/validation"; + + /** + Controls whether the parser reports Unicode normalization + errors as described in section 2.13 and Appendix B of the XML + 1.1 Recommendation. (We don't normalize.) + **/ + public final static String unicodeNormalizationCheckingFeature = +"http://xml.org/sax/features/unicode-normalization-checking"; + + /** + Controls whether, when the namespace-prefixes feature is set, + the parser treats namespace declaration attributes as being in + the http://www.w3.org/2000/xmlns/ namespace. (It doesn't.) + **/ + public final static String xmlnsURIsFeature = + "http://xml.org/sax/features/xmlns-uris"; + + /** + Returns "true" if the parser supports both XML 1.1 and XML 1.0. + (Always false.) + **/ + public final static String XML11Feature = + "http://xml.org/sax/features/xml-1.1"; + + /** + A value of "true" indicates that the parser will ignore + unknown elements. + **/ + public final static String ignoreBogonsFeature = + "http://www.ccil.org/~cowan/tagsoup/features/ignore-bogons"; + + /** + A value of "true" indicates that the parser will give unknown + elements a content model of EMPTY; a value of "false", a + content model of ANY. + **/ + public final static String bogonsEmptyFeature = + "http://www.ccil.org/~cowan/tagsoup/features/bogons-empty"; + + /** + A value of "true" indicates that the parser will allow unknown + elements to be the root element. + **/ + public final static String rootBogonsFeature = + "http://www.ccil.org/~cowan/tagsoup/features/root-bogons"; + + /** + A value of "true" indicates that the parser will return default + attribute values for missing attributes that have default values. + **/ + public final static String defaultAttributesFeature = + "http://www.ccil.org/~cowan/tagsoup/features/default-attributes"; + + /** + A value of "true" indicates that the parser will + translate colons into underscores in names. + **/ + public final static String translateColonsFeature = + "http://www.ccil.org/~cowan/tagsoup/features/translate-colons"; + + /** + A value of "true" indicates that the parser will + attempt to restart the restartable elements. + **/ + public final static String restartElementsFeature = + "http://www.ccil.org/~cowan/tagsoup/features/restart-elements"; + + /** + A value of "true" indicates that the parser will + transmit whitespace in element-only content via the SAX + ignorableWhitespace callback. Normally this is not done, + because HTML is an SGML application and SGML suppresses + such whitespace. + **/ + public final static String ignorableWhitespaceFeature = + "http://www.ccil.org/~cowan/tagsoup/features/ignorable-whitespace"; + + /** + A value of "true" indicates that the parser will treat CDATA + elements specially. Normally true, since the input is by + default HTML. + **/ + public final static String CDATAElementsFeature = + "http://www.ccil.org/~cowan/tagsoup/features/cdata-elements"; + + /** + Used to see some syntax events that are essential in some + applications: comments, CDATA delimiters, selected general + entity inclusions, and the start and end of the DTD (and + declaration of document element name). The Object must implement + org.xml.sax.ext.LexicalHandler. + **/ + public final static String lexicalHandlerProperty = + "http://xml.org/sax/properties/lexical-handler"; + + /** + Specifies the Scanner object this Parser uses. + **/ + public final static String scannerProperty = + "http://www.ccil.org/~cowan/tagsoup/properties/scanner"; + + /** + Specifies the Schema object this Parser uses. + **/ + public final static String schemaProperty = + "http://www.ccil.org/~cowan/tagsoup/properties/schema"; + + /** + Specifies the AutoDetector (for encoding detection) this Parser uses. + **/ + public final static String autoDetectorProperty = + "http://www.ccil.org/~cowan/tagsoup/properties/auto-detector"; + + // Due to sucky Java order of initialization issues, these + // entries are maintained separately from the initial values of + // the corresponding instance variables, but care must be taken + // to keep them in sync. + + private HashMap theFeatures = new HashMap(); + { + theFeatures.put(namespacesFeature, truthValue(DEFAULT_NAMESPACES)); + theFeatures.put(namespacePrefixesFeature, Boolean.FALSE); + theFeatures.put(externalGeneralEntitiesFeature, Boolean.FALSE); + theFeatures.put(externalParameterEntitiesFeature, Boolean.FALSE); + theFeatures.put(isStandaloneFeature, Boolean.FALSE); + theFeatures.put(lexicalHandlerParameterEntitiesFeature, + Boolean.FALSE); + theFeatures.put(resolveDTDURIsFeature, Boolean.TRUE); + theFeatures.put(stringInterningFeature, Boolean.TRUE); + theFeatures.put(useAttributes2Feature, Boolean.FALSE); + theFeatures.put(useLocator2Feature, Boolean.FALSE); + theFeatures.put(useEntityResolver2Feature, Boolean.FALSE); + theFeatures.put(validationFeature, Boolean.FALSE); + theFeatures.put(xmlnsURIsFeature, Boolean.FALSE); + theFeatures.put(xmlnsURIsFeature, Boolean.FALSE); + theFeatures.put(XML11Feature, Boolean.FALSE); + theFeatures.put(ignoreBogonsFeature, truthValue(DEFAULT_IGNORE_BOGONS)); + theFeatures.put(bogonsEmptyFeature, truthValue(DEFAULT_BOGONS_EMPTY)); + theFeatures.put(rootBogonsFeature, truthValue(DEFAULT_ROOT_BOGONS)); + theFeatures.put(defaultAttributesFeature, truthValue(DEFAULT_DEFAULT_ATTRIBUTES)); + theFeatures.put(translateColonsFeature, truthValue(DEFAULT_TRANSLATE_COLONS)); + theFeatures.put(restartElementsFeature, truthValue(DEFAULT_RESTART_ELEMENTS)); + theFeatures.put(ignorableWhitespaceFeature, truthValue(DEFAULT_IGNORABLE_WHITESPACE)); + theFeatures.put(CDATAElementsFeature, truthValue(DEFAULT_CDATA_ELEMENTS)); + } + + // Private clone of Boolean.valueOf that is guaranteed to return + // Boolean.TRUE or Boolean.FALSE + private static Boolean truthValue(boolean b) { + return b ? Boolean.TRUE : Boolean.FALSE; + } + + + public boolean getFeature (String name) + throws SAXNotRecognizedException, SAXNotSupportedException { + Boolean b = (Boolean)theFeatures.get(name); + if (b == null) { + throw new SAXNotRecognizedException("Unknown feature " + name); + } + return b.booleanValue(); + } + + public void setFeature (String name, boolean value) + throws SAXNotRecognizedException, SAXNotSupportedException { + Boolean b = (Boolean)theFeatures.get(name); + if (b == null) { + throw new SAXNotRecognizedException("Unknown feature " + name); + } + if (value) theFeatures.put(name, Boolean.TRUE); + else theFeatures.put(name, Boolean.FALSE); + + if (name.equals(namespacesFeature)) namespaces = value; + else if (name.equals(ignoreBogonsFeature)) ignoreBogons = value; + else if (name.equals(bogonsEmptyFeature)) bogonsEmpty = value; + else if (name.equals(rootBogonsFeature)) rootBogons = value; + else if (name.equals(defaultAttributesFeature)) defaultAttributes = value; + else if (name.equals(translateColonsFeature)) translateColons = value; + else if (name.equals(restartElementsFeature)) restartElements = value; + else if (name.equals(ignorableWhitespaceFeature)) ignorableWhitespace = value; + else if (name.equals(CDATAElementsFeature)) CDATAElements = value; + } + + public Object getProperty (String name) + throws SAXNotRecognizedException, SAXNotSupportedException { + if (name.equals(lexicalHandlerProperty)) { + return theLexicalHandler == this ? null : theLexicalHandler; + } + else if (name.equals(scannerProperty)) { + return theScanner; + } + else if (name.equals(schemaProperty)) { + return theSchema; + } + else if (name.equals(autoDetectorProperty)) { + return theAutoDetector; + } + else { + throw new SAXNotRecognizedException("Unknown property " + name); + } + } + + public void setProperty (String name, Object value) + throws SAXNotRecognizedException, SAXNotSupportedException { + if (name.equals(lexicalHandlerProperty)) { + if (value == null) { + theLexicalHandler = this; + } + else if (value instanceof LexicalHandler) { + theLexicalHandler = (LexicalHandler)value; + } + else { + throw new SAXNotSupportedException("Your lexical handler is not a LexicalHandler"); + } + } + else if (name.equals(scannerProperty)) { + if (value instanceof Scanner) { + theScanner = (Scanner)value; + } + else { + throw new SAXNotSupportedException("Your scanner is not a Scanner"); + } + } + else if (name.equals(schemaProperty)) { + if (value instanceof Schema) { + theSchema = (Schema)value; + } + else { + throw new SAXNotSupportedException("Your schema is not a Schema"); + } + } + else if (name.equals(autoDetectorProperty)) { + if (value instanceof AutoDetector) { + theAutoDetector = (AutoDetector)value; + } + else { + throw new SAXNotSupportedException("Your auto-detector is not an AutoDetector"); + } + } + else { + throw new SAXNotRecognizedException("Unknown property " + name); + } + } + + public void setEntityResolver (EntityResolver resolver) { + theEntityResolver = (resolver == null) ? this : resolver; + } + + public EntityResolver getEntityResolver () { + return (theEntityResolver == this) ? null : theEntityResolver; + } + + public void setDTDHandler (DTDHandler handler) { + theDTDHandler = (handler == null) ? this : handler; + } + + public DTDHandler getDTDHandler () { + return (theDTDHandler == this) ? null : theDTDHandler; + } + + public void setContentHandler (ContentHandler handler) { + theContentHandler = (handler == null) ? this : handler; + } + + public ContentHandler getContentHandler () { + return (theContentHandler == this) ? null : theContentHandler; + } + + public void setErrorHandler (ErrorHandler handler) { + theErrorHandler = (handler == null) ? this : handler; + } + + public ErrorHandler getErrorHandler () { + return (theErrorHandler == this) ? null : theErrorHandler; + } + + public void parse (InputSource input) throws IOException, SAXException { + setup(); + Reader r = getReader(input); + theContentHandler.startDocument(); + theScanner.resetDocumentLocator(input.getPublicId(), input.getSystemId()); + if (theScanner instanceof Locator) { + theContentHandler.setDocumentLocator((Locator)theScanner); + } + if (!(theSchema.getURI().equals(""))) + theContentHandler.startPrefixMapping(theSchema.getPrefix(), + theSchema.getURI()); + theScanner.scan(r, this); + } + + public void parse (String systemid) throws IOException, SAXException { + parse(new InputSource(systemid)); + } + + // Sets up instance variables that haven't been set by setFeature + private void setup() { + if (theSchema == null) theSchema = new HTMLSchema(); + if (theScanner == null) theScanner = new HTMLScanner(); + if (theAutoDetector == null) { + theAutoDetector = new AutoDetector() { + public Reader autoDetectingReader(InputStream i) { + return new InputStreamReader(i); + } + }; + } + theStack = new Element(theSchema.getElementType("<root>"), defaultAttributes); + thePCDATA = new Element(theSchema.getElementType("<pcdata>"), defaultAttributes); + theNewElement = null; + theAttributeName = null; + thePITarget = null; + theSaved = null; + theEntity = 0; + virginStack = true; + theDoctypeName = theDoctypePublicId = theDoctypeSystemId = null; + } + + // Return a Reader based on the contents of an InputSource + // Buffer both the InputStream and the Reader + private Reader getReader(InputSource s) throws SAXException, IOException { + Reader r = s.getCharacterStream(); + InputStream i = s.getByteStream(); + String encoding = s.getEncoding(); + String publicid = s.getPublicId(); + String systemid = s.getSystemId(); + if (r == null) { + if (i == null) i = getInputStream(publicid, systemid); +// i = new BufferedInputStream(i); + if (encoding == null) { + r = theAutoDetector.autoDetectingReader(i); + } + else { + try { + r = new InputStreamReader(i, encoding); + } + catch (UnsupportedEncodingException e) { + r = new InputStreamReader(i); + } + } + } +// r = new BufferedReader(r); + return r; + } + + // Get an InputStream based on a publicid and a systemid + private InputStream getInputStream(String publicid, String systemid) throws IOException, SAXException { + URL basis = new URL("file", "", System.getProperty("user.dir") + "/."); + URL url = new URL(basis, systemid); + URLConnection c = url.openConnection(); + return c.getInputStream(); + } + // We don't process publicids (who uses them anyhow?) + + // ScanHandler implementation + + private Element theNewElement = null; + private String theAttributeName = null; + private boolean theDoctypeIsPresent = false; + private String theDoctypePublicId = null; + private String theDoctypeSystemId = null; + private String theDoctypeName = null; + private String thePITarget = null; + private Element theStack = null; + private Element theSaved = null; + private Element thePCDATA = null; + private int theEntity = 0; // needs to support chars past U+FFFF + + public void adup(char[] buff, int offset, int length) throws SAXException { + if (theNewElement == null || theAttributeName == null) return; + theNewElement.setAttribute(theAttributeName, null, theAttributeName); + theAttributeName = null; + } + + public void aname(char[] buff, int offset, int length) throws SAXException { + if (theNewElement == null) return; + // Currently we don't rely on Schema to canonicalize + // attribute names. + theAttributeName = makeName(buff, offset, length).toLowerCase(); +// System.err.println("%% Attribute name " + theAttributeName); + } + + public void aval(char[] buff, int offset, int length) throws SAXException { + if (theNewElement == null || theAttributeName == null) return; + String value = new String(buff, offset, length); +// System.err.println("%% Attribute value [" + value + "]"); + value = expandEntities(value); + theNewElement.setAttribute(theAttributeName, null, value); + theAttributeName = null; +// System.err.println("%% Aval done"); + } + + // Expand entity references in attribute values selectively. + // Currently we expand a reference iff it is properly terminated + // with a semicolon. + private String expandEntities(String src) { + int refStart = -1; + int len = src.length(); + char[] dst = new char[len]; + int dstlen = 0; + for (int i = 0; i < len; i++) { + char ch = src.charAt(i); + dst[dstlen++] = ch; +// System.err.print("i = " + i + ", d = " + dstlen + ", ch = [" + ch + "] "); + if (ch == '&' && refStart == -1) { + // start of a ref excluding & + refStart = dstlen; +// System.err.println("start of ref"); + } + else if (refStart == -1) { + // not in a ref +// System.err.println("not in ref"); + } + else if (Character.isLetter(ch) || + Character.isDigit(ch) || + ch == '#') { + // valid entity char +// System.err.println("valid"); + } + else if (ch == ';') { + // properly terminated ref +// System.err.print("got [" + new String(dst, refStart, dstlen-refStart-1) + "]"); + int ent = lookupEntity(dst, refStart, dstlen - refStart - 1); +// System.err.println(" = " + ent); + if (ent > 0xFFFF) { + ent -= 0x10000; + dst[refStart - 1] = (char)((ent>>10) + 0xD800); + dst[refStart] = (char)((ent&0x3FF) + 0xDC00); + dstlen = refStart + 1; + } + else if (ent != 0) { + dst[refStart - 1] = (char)ent; + dstlen = refStart; + } + refStart = -1; + } + else { + // improperly terminated ref +// System.err.println("end of ref"); + refStart = -1; + } + } + return new String(dst, 0, dstlen); + } + + public void entity(char[] buff, int offset, int length) throws SAXException { + theEntity = lookupEntity(buff, offset, length); + } + + // Process numeric character references, + // deferring to the schema for named ones. + private int lookupEntity(char[] buff, int offset, int length) { + int result = 0; + if (length < 1) return result; +// System.err.println("%% Entity at " + offset + " " + length); +// System.err.println("%% Got entity [" + new String(buff, offset, length) + "]"); + if (buff[offset] == '#') { + if (length > 1 && (buff[offset+1] == 'x' + || buff[offset+1] == 'X')) { + try { + return Integer.parseInt(new String(buff, offset + 2, length - 2), 16); + } + catch (NumberFormatException e) { return 0; } + } + try { + return Integer.parseInt(new String(buff, offset + 1, length - 1), 10); + } + catch (NumberFormatException e) { return 0; } + } + return theSchema.getEntity(new String(buff, offset, length)); + } + + public void eof(char[] buff, int offset, int length) throws SAXException { + if (virginStack) rectify(thePCDATA); + while (theStack.next() != null) { + pop(); + } + if (!(theSchema.getURI().equals(""))) + theContentHandler.endPrefixMapping(theSchema.getPrefix()); + theContentHandler.endDocument(); + } + + public void etag(char[] buff, int offset, int length) throws SAXException { + if (etag_cdata(buff, offset, length)) return; + etag_basic(buff, offset, length); + } + + private static char[] etagchars = {'<', '/', '>'}; + public boolean etag_cdata(char[] buff, int offset, int length) throws SAXException { + String currentName = theStack.name(); + // If this is a CDATA element and the tag doesn't match, + // or isn't properly formed (junk after the name), + // restart CDATA mode and process the tag as characters. + if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) { + boolean realTag = (length == currentName.length()); + if (realTag) { + for (int i = 0; i < length; i++) { + if (Character.toLowerCase(buff[offset + i]) != Character.toLowerCase(currentName.charAt(i))) { + realTag = false; + break; + } + } + } + if (!realTag) { + theContentHandler.characters(etagchars, 0, 2); + theContentHandler.characters(buff, offset, length); + theContentHandler.characters(etagchars, 2, 1); + theScanner.startCDATA(); + return true; + } + } + return false; + } + + public void etag_basic(char[] buff, int offset, int length) throws SAXException { + theNewElement = null; + String name; + if (length != 0) { + // Canonicalize case of name + name = makeName(buff, offset, length); +// System.err.println("got etag [" + name + "]"); + ElementType type = theSchema.getElementType(name); + if (type == null) return; // mysterious end-tag + name = type.name(); + } + else { + name = theStack.name(); + } +// System.err.println("%% Got end of " + name); + + Element sp; + boolean inNoforce = false; + for (sp = theStack; sp != null; sp = sp.next()) { + if (sp.name().equals(name)) break; + if ((sp.flags() & Schema.F_NOFORCE) != 0) inNoforce = true; + } + + if (sp == null) return; // Ignore unknown etags + if (sp.next() == null || sp.next().next() == null) return; + if (inNoforce) { // inside an F_NOFORCE element? + sp.preclose(); // preclose the matching element + } + else { // restartably pop everything above us + while (theStack != sp) { + restartablyPop(); + } + pop(); + } + // pop any preclosed elements now at the top + while (theStack.isPreclosed()) { + pop(); + } + restart(null); + } + + // Push restartables on the stack if possible + // e is the next element to be started, if we know what it is + private void restart(Element e) throws SAXException { + while (theSaved != null && theStack.canContain(theSaved) && + (e == null || theSaved.canContain(e))) { + Element next = theSaved.next(); + push(theSaved); + theSaved = next; + } + } + + // Pop the stack irrevocably + private void pop() throws SAXException { + if (theStack == null) return; // empty stack + String name = theStack.name(); + String localName = theStack.localName(); + String namespace = theStack.namespace(); + String prefix = prefixOf(name); + +// System.err.println("%% Popping " + name); + if (!namespaces) namespace = localName = ""; + theContentHandler.endElement(namespace, localName, name); + if (foreign(prefix, namespace)) { + theContentHandler.endPrefixMapping(prefix); +// System.err.println("%% Unmapping [" + prefix + "] for elements to " + namespace); + } + Attributes atts = theStack.atts(); + for (int i = atts.getLength() - 1; i >= 0; i--) { + String attNamespace = atts.getURI(i); + String attPrefix = prefixOf(atts.getQName(i)); + if (foreign(attPrefix, attNamespace)) { + theContentHandler.endPrefixMapping(attPrefix); +// System.err.println("%% Unmapping [" + attPrefix + "] for attributes to " + attNamespace); + } + } + theStack = theStack.next(); + } + + // Pop the stack restartably + private void restartablyPop() throws SAXException { + Element popped = theStack; + pop(); + if (restartElements && (popped.flags() & Schema.F_RESTART) != 0) { + popped.anonymize(); + popped.setNext(theSaved); + theSaved = popped; + } + } + + // Push element onto stack + private boolean virginStack = true; + private void push(Element e) throws SAXException { + String name = e.name(); + String localName = e.localName(); + String namespace = e.namespace(); + String prefix = prefixOf(name); + +// System.err.println("%% Pushing " + name); + e.clean(); + if (!namespaces) namespace = localName = ""; + if (virginStack && localName.equalsIgnoreCase(theDoctypeName)) { + try { + theEntityResolver.resolveEntity(theDoctypePublicId, theDoctypeSystemId); + } catch (IOException ew) { } // Can't be thrown for root I believe. + } + if (foreign(prefix, namespace)) { + theContentHandler.startPrefixMapping(prefix, namespace); +// System.err.println("%% Mapping [" + prefix + "] for elements to " + namespace); + } + Attributes atts = e.atts(); + int len = atts.getLength(); + for (int i = 0; i < len; i++) { + String attNamespace = atts.getURI(i); + String attPrefix = prefixOf(atts.getQName(i)); + if (foreign(attPrefix, attNamespace)) { + theContentHandler.startPrefixMapping(attPrefix, attNamespace); +// System.err.println("%% Mapping [" + attPrefix + "] for attributes to " + attNamespace); + } + } + theContentHandler.startElement(namespace, localName, name, e.atts()); + e.setNext(theStack); + theStack = e; + virginStack = false; + if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) { + theScanner.startCDATA(); + } + } + + // Get the prefix from a QName + private String prefixOf(String name) { + int i = name.indexOf(':'); + String prefix = ""; + if (i != -1) prefix = name.substring(0, i); +// System.err.println("%% " + prefix + " is prefix of " + name); + return prefix; + } + + // Return true if we have a foreign name + private boolean foreign(String prefix, String namespace) { +// System.err.print("%% Testing " + prefix + " and " + namespace + " for foreignness -- "); + boolean foreign = !(prefix.equals("") || namespace.equals("") || + namespace.equals(theSchema.getURI())); +// System.err.println(foreign); + return foreign; + } + + /** + * Parsing the complete XML Document Type Definition is way too complex, + * but for many simple cases we can extract something useful from it. + * + * doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>' + * DeclSep ::= PEReference | S + * intSubset ::= (markupdecl | DeclSep)* + * markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment + * ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral + */ + public void decl(char[] buff, int offset, int length) throws SAXException { + String s = new String(buff, offset, length); + String name = null; + String systemid = null; + String publicid = null; + String[] v = split(s); + if (v.length > 0 && "DOCTYPE".equals(v[0])) { + if (theDoctypeIsPresent) return; // one doctype only! + theDoctypeIsPresent = true; + if (v.length > 1) { + name = v[1]; + if (v.length>3 && "SYSTEM".equals(v[2])) { + systemid = v[3]; + } + else if (v.length > 3 && "PUBLIC".equals(v[2])) { + publicid = v[3]; + if (v.length > 4) { + systemid = v[4]; + } + else { + systemid = ""; + } + } + } + } + publicid = trimquotes(publicid); + systemid = trimquotes(systemid); + if (name != null) { + publicid = cleanPublicid(publicid); + theLexicalHandler.startDTD(name, publicid, systemid); + theLexicalHandler.endDTD(); + theDoctypeName = name; + theDoctypePublicId = publicid; + if (theScanner instanceof Locator) { // Must resolve systemid + theDoctypeSystemId = ((Locator)theScanner).getSystemId(); + try { + theDoctypeSystemId = new URL(new URL(theDoctypeSystemId), systemid).toString(); + } catch (Exception e) {} + } + } + } + + // If the String is quoted, trim the quotes. + private static String trimquotes(String in) { + if (in == null) return in; + int length = in.length(); + if (length == 0) return in; + char s = in.charAt(0); + char e = in.charAt(length - 1); + if (s == e && (s == '\'' || s == '"')) { + in = in.substring(1, in.length() - 1); + } + return in; + } + + // Split the supplied String into words or phrases seperated by spaces. + // Recognises quotes around a phrase and doesn't split it. + private static String[] split(String val) throws IllegalArgumentException { + val = val.trim(); + if (val.length() == 0) { + return new String[0]; + } + else { + ArrayList l = new ArrayList(); + int s = 0; + int e = 0; + boolean sq = false; // single quote + boolean dq = false; // double quote + char lastc = 0; + int len = val.length(); + for (e=0; e < len; e++) { + char c = val.charAt(e); + if (!dq && c == '\'' && lastc != '\\') { + sq = !sq; + if (s < 0) s = e; + } + else if (!sq && c == '\"' && lastc != '\\') { + dq = !dq; + if (s < 0) s = e; + } + else if (!sq && !dq) { + if (Character.isWhitespace(c)) { + if (s >= 0) l.add(val.substring(s, e)); + s = -1; + } + else if (s < 0 && c != ' ') { + s = e; + } + } + lastc = c; + } + l.add(val.substring(s, e)); + return (String[])l.toArray(new String[0]); + } + } + + // Replace junk in publicids with spaces + private static String legal = + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-'()+,./:=?;!*#@$_%"; + + private String cleanPublicid(String src) { + if (src == null) return null; + int len = src.length(); + StringBuffer dst = new StringBuffer(len); + boolean suppressSpace = true; + for (int i = 0; i < len; i++) { + char ch = src.charAt(i); + if (legal.indexOf(ch) != -1) { // legal but not whitespace + dst.append(ch); + suppressSpace = false; + } + else if (suppressSpace) { // normalizable whitespace or junk + ; + } + else { + dst.append(' '); + suppressSpace = true; + } + } +// System.err.println("%% Publicid [" + dst.toString().trim() + "]"); + return dst.toString().trim(); // trim any final junk whitespace + } + + + public void gi(char[] buff, int offset, int length) throws SAXException { + if (theNewElement != null) return; + String name = makeName(buff, offset, length); + if (name == null) return; + ElementType type = theSchema.getElementType(name); + if (type == null) { + // Suppress unknown elements if ignore-bogons is on + if (ignoreBogons) return; + int bogonModel = bogonsEmpty ? Schema.M_EMPTY : Schema.M_ANY; + int bogonMemberOf = rootBogons ? Schema.M_ANY : (Schema.M_ANY & ~ Schema.M_ROOT); + theSchema.elementType(name, bogonModel, bogonMemberOf, 0); + if (!rootBogons) theSchema.parent(name, theSchema.rootElementType().name()); + type = theSchema.getElementType(name); + } + + theNewElement = new Element(type, defaultAttributes); +// System.err.println("%% Got GI " + theNewElement.name()); + } + + public void cdsect(char[] buff, int offset, int length) throws SAXException { + theLexicalHandler.startCDATA(); + pcdata(buff, offset, length); + theLexicalHandler.endCDATA(); + } + public void pcdata(char[] buff, int offset, int length) throws SAXException { + if (length == 0) return; + boolean allWhite = true; + for (int i = 0; i < length; i++) { + if (!Character.isWhitespace(buff[offset+i])) { + allWhite = false; + } + } + if (allWhite && !theStack.canContain(thePCDATA)) { + if (ignorableWhitespace) { + theContentHandler.ignorableWhitespace(buff, offset, length); + } + } + else { + rectify(thePCDATA); + theContentHandler.characters(buff, offset, length); + } + } + + public void pitarget(char[] buff, int offset, int length) throws SAXException { + if (theNewElement != null) return; + thePITarget = makeName(buff, offset, length).replace(':', '_'); + } + + public void pi(char[] buff, int offset, int length) throws SAXException { + if (theNewElement != null || thePITarget == null) return; + if ("xml".equalsIgnoreCase(thePITarget)) return; +// if (length > 0 && buff[length - 1] == '?') System.err.println("%% Removing ? from PI"); + if (length > 0 && buff[length - 1] == '?') length--; // remove trailing ? + theContentHandler.processingInstruction(thePITarget, + new String(buff, offset, length)); + thePITarget = null; + } + + public void stagc(char[] buff, int offset, int length) throws SAXException { +// System.err.println("%% Start-tag"); + if (theNewElement == null) return; + rectify(theNewElement); + if (theStack.model() == Schema.M_EMPTY) { + // Force an immediate end tag + etag_basic(buff, offset, length); + } + } + + public void stage(char[] buff, int offset, int length) throws SAXException { +// System.err.println("%% Empty-tag"); + if (theNewElement == null) return; + rectify(theNewElement); + // Force an immediate end tag + etag_basic(buff, offset, length); + } + + // Comment buffer is twice the size of the output buffer + private char[] theCommentBuffer = new char[2000]; + public void cmnt(char[] buff, int offset, int length) throws SAXException { + theLexicalHandler.comment(buff, offset, length); + } + + // Rectify the stack, pushing and popping as needed + // so that the argument can be safely pushed + private void rectify(Element e) throws SAXException { + Element sp; + while (true) { + for (sp = theStack; sp != null; sp = sp.next()) { + if (sp.canContain(e)) break; + } + if (sp != null) break; + ElementType parentType = e.parent(); + if (parentType == null) break; + Element parent = new Element(parentType, defaultAttributes); +// System.err.println("%% Ascending from " + e.name() + " to " + parent.name()); + parent.setNext(e); + e = parent; + } + if (sp == null) return; // don't know what to do + while (theStack != sp) { + if (theStack == null || theStack.next() == null || + theStack.next().next() == null) break; + restartablyPop(); + } + while (e != null) { + Element nexte = e.next(); + if (!e.name().equals("<pcdata>")) push(e); + e = nexte; + restart(e); + } + theNewElement = null; + } + + public int getEntity() { + return theEntity; + } + + // Return the argument as a valid XML name + // This no longer lowercases the result: we depend on Schema to + // canonicalize case. + private String makeName(char[] buff, int offset, int length) { + StringBuffer dst = new StringBuffer(length + 2); + boolean seenColon = false; + boolean start = true; +// String src = new String(buff, offset, length); // DEBUG + for (; length-- > 0; offset++) { + char ch = buff[offset]; + if (Character.isLetter(ch) || ch == '_') { + start = false; + dst.append(ch); + } + else if (Character.isDigit(ch) || ch == '-' || ch == '.') { + if (start) dst.append('_'); + start = false; + dst.append(ch); + } + else if (ch == ':' && !seenColon) { + seenColon = true; + if (start) dst.append('_'); + start = true; + dst.append(translateColons ? '_' : ch); + } + } + int dstLength = dst.length(); + if (dstLength == 0 || dst.charAt(dstLength - 1) == ':') dst.append('_'); +// System.err.println("Made name \"" + dst + "\" from \"" + src + "\""); + return dst.toString().intern(); + } + + // Default LexicalHandler implementation + + public void comment(char[] ch, int start, int length) throws SAXException { } + public void endCDATA() throws SAXException { } + public void endDTD() throws SAXException { } + public void endEntity(String name) throws SAXException { } + public void startCDATA() throws SAXException { } + public void startDTD(String name, String publicid, String systemid) throws SAXException { } + public void startEntity(String name) throws SAXException { } + + } |