1 files changed, 1114 insertions, 0 deletions
diff --git a/src/org/ccil/cowan/tagsoup/Parser.java b/src/org/ccil/cowan/tagsoup/Parser.java
new file mode 100644
index 0000000..0997f23
--- /dev/null
+++ b/src/org/ccil/cowan/tagsoup/Parser.java
@@ -0,0 +1,1114 @@
+// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
+//
+// TagSoup is licensed under the Apache License,
+// Version 2.0.  You may obtain a copy of this license at
+// http://www.apache.org/licenses/LICENSE-2.0 .  You may also have
+// additional legal rights not granted by this license.
+//
+// TagSoup is distributed in the hope that it will be useful, but
+// unless required by applicable law or agreed to in writing, TagSoup
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+// OF ANY KIND, either express or implied; not even the implied warranty
+// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+// 
+// 
+// The TagSoup parser
+
+package org.ccil.cowan.tagsoup;
+import java.util.HashMap;
+import java.util.ArrayList;
+import java.io.*;
+import java.net.URL;
+import java.net.URLConnection;
+import org.xml.sax.*;
+import org.xml.sax.helpers.DefaultHandler;
+import org.xml.sax.ext.LexicalHandler;
+
+
+/**
+The SAX parser class.
+**/
+public class Parser extends DefaultHandler implements ScanHandler, XMLReader, LexicalHandler {
+
+	// XMLReader implementation
+
+	private ContentHandler theContentHandler = this;	// "this" stands for "no handler set"; the getter then reports null
+	private LexicalHandler theLexicalHandler = this;	// replaced through the lexical-handler property
+	private DTDHandler theDTDHandler = this;
+	private ErrorHandler theErrorHandler = this;
+	private EntityResolver theEntityResolver = this;	// called by push() with the DOCTYPE's public and system ids
+	private Schema theSchema;	// set through the schema property, else an HTMLSchema from setup()
+	private Scanner theScanner;	// set through the scanner property, else an HTMLScanner from setup()
+	private AutoDetector theAutoDetector;	// turns a byte stream into a Reader when no encoding is declared
+
+	// Default values for feature flags.  They are constants: both the instance
+	// flags and the initial entries of theFeatures are built from them.
+	private static final boolean DEFAULT_NAMESPACES = true;
+	private static final boolean DEFAULT_IGNORE_BOGONS = false;
+	private static final boolean DEFAULT_BOGONS_EMPTY = false;
+	private static final boolean DEFAULT_ROOT_BOGONS = true;
+	private static final boolean DEFAULT_DEFAULT_ATTRIBUTES = true;
+	private static final boolean DEFAULT_TRANSLATE_COLONS = false;
+	private static final boolean DEFAULT_RESTART_ELEMENTS = true;
+	private static final boolean DEFAULT_IGNORABLE_WHITESPACE = false;
+	private static final boolean DEFAULT_CDATA_ELEMENTS = true;
+
+	// Feature flags.  setFeature() updates each one together with its theFeatures entry.
+
+	private boolean namespaces = DEFAULT_NAMESPACES;	// when false, push() and pop() report "" as namespace and local name
+	private boolean ignoreBogons = DEFAULT_IGNORE_BOGONS;
+	private boolean bogonsEmpty = DEFAULT_BOGONS_EMPTY;
+        private boolean rootBogons = DEFAULT_ROOT_BOGONS;
+	private boolean defaultAttributes = DEFAULT_DEFAULT_ATTRIBUTES;	// handed to the Elements built in setup()
+	private boolean translateColons = DEFAULT_TRANSLATE_COLONS;
+	private boolean restartElements = DEFAULT_RESTART_ELEMENTS;	// read by restartablyPop()
+	private boolean ignorableWhitespace = DEFAULT_IGNORABLE_WHITESPACE;
+	private boolean CDATAElements = DEFAULT_CDATA_ELEMENTS;	// read by push() and etag_cdata()
+
+	/**
+	A value of "true" indicates namespace URIs and unprefixed local
+	names for element and attribute names will be available.
+	**/
+	public final static String namespacesFeature =
+		"http://xml.org/sax/features/namespaces";
+
+	/**
+	A value of "true" indicates that XML qualified names (with prefixes)
+	and attributes (including xmlns* attributes) will be available.
+	We don't support this value.
+	**/
+	public final static String namespacePrefixesFeature =
+		"http://xml.org/sax/features/namespace-prefixes";
+
+	/**
+	Reports whether this parser processes external general entities
+	(it doesn't).
+	**/
+	public final static String externalGeneralEntitiesFeature =
+		"http://xml.org/sax/features/external-general-entities";
+
+	/**
+	Reports whether this parser processes external parameter entities
+	(it doesn't).
+	**/
+	public final static String externalParameterEntitiesFeature =
+		"http://xml.org/sax/features/external-parameter-entities";
+
+	/**
+	May be examined only during a parse, after the startDocument()
+	callback has been completed; read-only. The value is true if
+	the document specified standalone="yes" in its XML declaration,
+	and otherwise is false.  (It's always false.)
+	**/
+	public final static String isStandaloneFeature =
+		"http://xml.org/sax/features/is-standalone";
+
+	/**
+	A value of "true" indicates that the LexicalHandler will report
+	the beginning and end of parameter entities (it won't).
+	**/
+	public final static String lexicalHandlerParameterEntitiesFeature =
+		"http://xml.org/sax/features/lexical-handler/parameter-entities";
+
+	/**
+	A value of "true" indicates that system IDs in declarations will
+	be absolutized (relative to their base URIs) before reporting.
+	(This returns true but doesn't actually do anything.)
+	**/
+	public final static String resolveDTDURIsFeature =
+		"http://xml.org/sax/features/resolve-dtd-uris";
+
+	/**
+	Has a value of "true" if all XML names (for elements,
+	prefixes, attributes, entities, notations, and local
+	names), as well as Namespace URIs, will have been interned
+	using java.lang.String.intern. This supports fast testing of
+	equality/inequality against string constants, rather than forcing
+	slower calls to String.equals().  (We always intern.)
+	**/
+	public final static String stringInterningFeature =
+		"http://xml.org/sax/features/string-interning";
+
+	/**
+	Returns "true" if the Attributes objects passed by this
+	parser in ContentHandler.startElement() implement the
+	org.xml.sax.ext.Attributes2 interface.  (They don't.)
+	**/
+
+	public final static String useAttributes2Feature =
+		"http://xml.org/sax/features/use-attributes2";
+
+	/**
+	Returns "true" if the Locator objects passed by this parser
+	in ContentHandler.setDocumentLocator() implement the
+	org.xml.sax.ext.Locator2 interface.  (They don't.)
+	**/
+	public final static String useLocator2Feature =
+		"http://xml.org/sax/features/use-locator2";
+
+	/**
+	Returns "true" if, when setEntityResolver is given an object
+	implementing the org.xml.sax.ext.EntityResolver2 interface,
+	those new methods will be used.  (They won't be.)
+	**/
+	public final static String useEntityResolver2Feature =
+		"http://xml.org/sax/features/use-entity-resolver2";
+
+	/**
+	Controls whether the parser is reporting all validity errors.
+	(We don't report any validity errors.)
+	**/
+	public final static String validationFeature =
+		"http://xml.org/sax/features/validation";
+
+	/**
+	Controls whether the parser reports Unicode normalization
+	errors as described in section 2.13 and Appendix B of the XML
+	1.1 Recommendation.  (We don't normalize.)
+	**/
+	public final static String unicodeNormalizationCheckingFeature =
+"http://xml.org/sax/features/unicode-normalization-checking";
+
+	/**
+	Controls whether, when the namespace-prefixes feature is set,
+	the parser treats namespace declaration attributes as being in
+	the http://www.w3.org/2000/xmlns/ namespace.  (It doesn't.)
+	**/
+	public final static String xmlnsURIsFeature =
+		"http://xml.org/sax/features/xmlns-uris";
+
+	/**
+	Returns "true" if the parser supports both XML 1.1 and XML 1.0.
+	(Always false.)
+	**/
+	public final static String XML11Feature =
+		"http://xml.org/sax/features/xml-1.1";
+
+	/**
+	A value of "true" indicates that the parser will ignore
+	unknown elements.  The default is "false".
+	**/
+	public final static String ignoreBogonsFeature =
+		"http://www.ccil.org/~cowan/tagsoup/features/ignore-bogons";
+
+	/**
+	A value of "true" indicates that the parser will give unknown
+	elements a content model of EMPTY; a value of "false", a
+	content model of ANY.  The default is "false".
+	**/
+	public final static String bogonsEmptyFeature =
+		"http://www.ccil.org/~cowan/tagsoup/features/bogons-empty";
+
+	/**
+	A value of "true" indicates that the parser will allow unknown
+	elements to be the root element.  The default is "true".
+	**/
+	public final static String rootBogonsFeature =
+		"http://www.ccil.org/~cowan/tagsoup/features/root-bogons";
+
+	/**
+	A value of "true" indicates that the parser will return default
+	attribute values for missing attributes that have default values.  The default is "true".
+	**/
+	public final static String defaultAttributesFeature =
+		"http://www.ccil.org/~cowan/tagsoup/features/default-attributes";
+
+	/**
+	A value of "true" indicates that the parser will
+	translate colons into underscores in names.  The default is "false".
+	**/
+	public final static String translateColonsFeature =
+		"http://www.ccil.org/~cowan/tagsoup/features/translate-colons";
+
+	/**
+	A value of "true" indicates that the parser will
+	attempt to restart the restartable elements.  The default is "true".
+	**/
+	public final static String restartElementsFeature =
+		"http://www.ccil.org/~cowan/tagsoup/features/restart-elements";
+
+	/**
+	A value of "true" indicates that the parser will
+	transmit whitespace in element-only content via the SAX
+	ignorableWhitespace callback.  Normally this is not done
+	(the default is "false"), because HTML is an SGML application
+	and SGML suppresses such whitespace.
+	**/
+	public final static String ignorableWhitespaceFeature =
+		"http://www.ccil.org/~cowan/tagsoup/features/ignorable-whitespace";
+
+	/**
+	A value of "true" indicates that the parser will treat CDATA
+	elements specially.  Normally true, since the input is by
+	default HTML.
+	**/
+	public final static String CDATAElementsFeature =
+		"http://www.ccil.org/~cowan/tagsoup/features/cdata-elements";
+
+	/**
+	Used to see some syntax events that are essential in some
+	applications: comments, CDATA delimiters, selected general
+	entity inclusions, and the start and end of the DTD (and
+	declaration of document element name). The Object must implement
+	org.xml.sax.ext.LexicalHandler; setting null removes the handler.
+	**/
+	public final static String lexicalHandlerProperty =
+		"http://xml.org/sax/properties/lexical-handler";
+
+	/**
+	Specifies the Scanner object this Parser uses (an HTMLScanner if none is set).
+	**/
+	public final static String scannerProperty =
+		"http://www.ccil.org/~cowan/tagsoup/properties/scanner";
+
+	/**
+	Specifies the Schema object this Parser uses (an HTMLSchema if none is set).
+	**/
+	public final static String schemaProperty =
+		"http://www.ccil.org/~cowan/tagsoup/properties/schema";
+
+	/**
+	Specifies the AutoDetector (for encoding detection) this Parser uses.
+	**/
+	public final static String autoDetectorProperty =
+		"http://www.ccil.org/~cowan/tagsoup/properties/auto-detector";
+
+	// The initial state of every feature this parser recognizes; a name
+	// missing from this map is rejected by getFeature() and setFeature().
+	// Because of Java's order of initialization, these entries are kept
+	// apart from the instance flags above and must be kept in sync by hand.
+
+	private HashMap theFeatures = new HashMap();
+	{
+		theFeatures.put(namespacesFeature, truthValue(DEFAULT_NAMESPACES));
+		theFeatures.put(namespacePrefixesFeature, Boolean.FALSE);
+		theFeatures.put(externalGeneralEntitiesFeature, Boolean.FALSE);
+		theFeatures.put(externalParameterEntitiesFeature, Boolean.FALSE);
+		theFeatures.put(isStandaloneFeature, Boolean.FALSE);
+		theFeatures.put(lexicalHandlerParameterEntitiesFeature,
+			Boolean.FALSE);
+		theFeatures.put(resolveDTDURIsFeature, Boolean.TRUE);
+		theFeatures.put(stringInterningFeature, Boolean.TRUE);
+		theFeatures.put(useAttributes2Feature, Boolean.FALSE);
+		theFeatures.put(useLocator2Feature, Boolean.FALSE);
+		theFeatures.put(useEntityResolver2Feature, Boolean.FALSE);
+		theFeatures.put(validationFeature, Boolean.FALSE);
+		theFeatures.put(unicodeNormalizationCheckingFeature, Boolean.FALSE);
+		theFeatures.put(xmlnsURIsFeature, Boolean.FALSE);
+		theFeatures.put(XML11Feature, Boolean.FALSE);
+		theFeatures.put(ignoreBogonsFeature, truthValue(DEFAULT_IGNORE_BOGONS));
+		theFeatures.put(bogonsEmptyFeature, truthValue(DEFAULT_BOGONS_EMPTY));
+		theFeatures.put(rootBogonsFeature, truthValue(DEFAULT_ROOT_BOGONS));
+		theFeatures.put(defaultAttributesFeature, truthValue(DEFAULT_DEFAULT_ATTRIBUTES));
+		theFeatures.put(translateColonsFeature, truthValue(DEFAULT_TRANSLATE_COLONS));
+		theFeatures.put(restartElementsFeature, truthValue(DEFAULT_RESTART_ELEMENTS));
+		theFeatures.put(ignorableWhitespaceFeature, truthValue(DEFAULT_IGNORABLE_WHITESPACE));
+		theFeatures.put(CDATAElementsFeature, truthValue(DEFAULT_CDATA_ELEMENTS));
+		}
+
+	// Maps a primitive boolean onto the shared Boolean.TRUE / Boolean.FALSE objects.
+	private static Boolean truthValue(boolean b) {
+		if (b) return Boolean.TRUE;
+		return Boolean.FALSE;
+		}
+
+
+	// Report a feature's current state; a name absent from theFeatures is rejected.
+	public boolean getFeature (String name)
+		throws SAXNotRecognizedException, SAXNotSupportedException {
+		Object state = theFeatures.get(name);
+		if (state == null) throw new SAXNotRecognizedException("Unknown feature " + name);
+		boolean enabled = ((Boolean)state).booleanValue();
+		return enabled;
+		}
+
+	// Store the new state in theFeatures, then copy it into the matching
+	// instance flag when the feature is one of those backed by a flag.
+	public void setFeature (String name, boolean value)
+	throws SAXNotRecognizedException, SAXNotSupportedException {
+		if (theFeatures.get(name) == null) {
+			throw new SAXNotRecognizedException("Unknown feature " + name);
+			}
+		theFeatures.put(name, value ? Boolean.TRUE : Boolean.FALSE);
+
+		if (namespacesFeature.equals(name)) namespaces = value;
+		else if (ignoreBogonsFeature.equals(name)) ignoreBogons = value;
+		else if (bogonsEmptyFeature.equals(name)) bogonsEmpty = value;
+		else if (rootBogonsFeature.equals(name)) rootBogons = value;
+		else if (defaultAttributesFeature.equals(name)) defaultAttributes = value;
+		else if (translateColonsFeature.equals(name)) translateColons = value;
+		else if (restartElementsFeature.equals(name)) restartElements = value;
+		else if (ignorableWhitespaceFeature.equals(name)) ignorableWhitespace = value;
+		else if (CDATAElementsFeature.equals(name)) CDATAElements = value;
+		}
+
+	// Return the value of a supported property; a lexical handler that was never set reads as null.
+	public Object getProperty (String name)
+	throws SAXNotRecognizedException, SAXNotSupportedException {
+		if (name.equals(lexicalHandlerProperty)) {
+			if (theLexicalHandler == this) return null;
+			return theLexicalHandler;
+			}
+		if (name.equals(scannerProperty)) {
+			return theScanner;
+			}
+		if (name.equals(schemaProperty)) {
+			return theSchema;
+			}
+		if (name.equals(autoDetectorProperty)) {
+			return theAutoDetector;
+			}
+		throw new SAXNotRecognizedException("Unknown property " + name);
+		}
+
+	// Install a lexical handler, scanner, schema or auto-detector.  Only the
+	// lexical handler may be reset with null; any other value of the wrong
+	// type (null included) is rejected with SAXNotSupportedException.
+	public void setProperty (String name, Object value)
+	throws SAXNotRecognizedException, SAXNotSupportedException {
+		if (name.equals(lexicalHandlerProperty)) {
+			if (value == null) {
+				theLexicalHandler = this;
+				return;
+				}
+			if (!(value instanceof LexicalHandler)) {
+				throw new SAXNotSupportedException("Your lexical handler is not a LexicalHandler");
+				}
+			theLexicalHandler = (LexicalHandler)value;
+			return;
+			}
+
+		if (name.equals(scannerProperty)) {
+			if (!(value instanceof Scanner)) {
+				throw new SAXNotSupportedException("Your scanner is not a Scanner");
+				}
+			theScanner = (Scanner)value;
+			return;
+			}
+		if (name.equals(schemaProperty)) {
+			if (!(value instanceof Schema)) {
+				throw new SAXNotSupportedException("Your schema is not a Schema");
+				}
+			theSchema = (Schema)value;
+			return;
+			}
+		if (name.equals(autoDetectorProperty)) {
+			if (!(value instanceof AutoDetector)) {
+				throw new SAXNotSupportedException("Your auto-detector is not an AutoDetector");
+				}
+			theAutoDetector = (AutoDetector)value;
+			return;
+			}
+
+		throw new SAXNotRecognizedException("Unknown property " + name);
+		}
+
+	public void setEntityResolver (EntityResolver resolver) {
+		if (resolver == null) theEntityResolver = this; else theEntityResolver = resolver;	// null restores the default (this)
+		}
+
+	public EntityResolver getEntityResolver () {
+		if (theEntityResolver != this) return theEntityResolver; else return null;	// the default (this) is reported as "none"
+		}
+
+	public void setDTDHandler (DTDHandler handler) {
+		if (handler == null) theDTDHandler = this; else theDTDHandler = handler;	// null restores the default (this)
+		}
+
+	public DTDHandler getDTDHandler () {
+		if (theDTDHandler != this) return theDTDHandler; else return null;	// the default (this) is reported as "none"
+		}
+
+	public void setContentHandler (ContentHandler handler) {
+		if (handler == null) theContentHandler = this; else theContentHandler = handler;	// null restores the default (this)
+		}
+
+	public ContentHandler getContentHandler () {
+		if (theContentHandler != this) return theContentHandler; else return null;	// the default (this) is reported as "none"
+		}
+
+	public void setErrorHandler (ErrorHandler handler) {
+		if (handler == null) theErrorHandler = this; else theErrorHandler = handler;	// null restores the default (this)
+		}
+
+	public ErrorHandler getErrorHandler () {
+		if (theErrorHandler != this) return theErrorHandler; else return null;	// the default (this) is reported as "none"
+		}
+
+	// Parse one document.  SAX requires the locator to reach the handler before any other callback.
+	public void parse (InputSource input) throws IOException, SAXException {
+		setup();
+		Reader r = getReader(input);
+		theScanner.resetDocumentLocator(input.getPublicId(), input.getSystemId());
+		if (theScanner instanceof Locator) {
+			theContentHandler.setDocumentLocator((Locator)theScanner);
+			}
+		theContentHandler.startDocument();
+		if (!(theSchema.getURI().equals("")))
+			theContentHandler.startPrefixMapping(theSchema.getPrefix(), theSchema.getURI());
+		theScanner.scan(r, this);
+		}
+
+	public void parse (String systemid) throws IOException, SAXException {
+		parse(new InputSource(systemid));	// the system id itself is opened later, by getReader()
+		}
+
+	// Fill in any collaborator not supplied through setProperty, then reset
+	// all per-document state so that this Parser can be used again.
+	private void setup() {
+		if (theSchema == null) theSchema = new HTMLSchema();
+		if (theScanner == null) theScanner = new HTMLScanner();
+		if (theAutoDetector == null) {
+			theAutoDetector = new AutoDetector() {
+				public Reader autoDetectingReader(InputStream i) {
+					return new InputStreamReader(i);
+					}
+				};
+			}
+		theStack = new Element(theSchema.getElementType("<root>"), defaultAttributes);
+		thePCDATA = new Element(theSchema.getElementType("<pcdata>"), defaultAttributes);
+		theNewElement = theSaved = null;
+		theAttributeName = thePITarget = null;
+		theEntity = 0;
+		virginStack = true;
+		theDoctypeIsPresent = false;	// else decl() ignores the DOCTYPE of every document after the first
+		theDoctypeName = theDoctypePublicId = theDoctypeSystemId = null;
+		}
+
+	// Return a Reader for the contents of an InputSource.  A character
+	// stream is used as is; otherwise the byte stream (opened from the
+	// system id if the source has none) is decoded with the declared
+	// encoding, falling back to the platform default if that encoding is
+	// unsupported, or handed to the auto-detector if none is declared.
+	// No extra buffering is applied here.
+	private Reader getReader(InputSource s) throws SAXException, IOException {
+		Reader chars = s.getCharacterStream();
+		InputStream bytes = s.getByteStream();
+		String encoding = s.getEncoding();
+		String publicid = s.getPublicId();
+		String systemid = s.getSystemId();
+		if (chars != null) return chars;
+
+		if (bytes == null) bytes = getInputStream(publicid, systemid);
+		if (encoding == null) {
+			return theAutoDetector.autoDetectingReader(bytes);
+			}
+
+		try {
+			return new InputStreamReader(bytes, encoding);
+			}
+		catch (UnsupportedEncodingException e) {
+			return new InputStreamReader(bytes);
+			}
+		}
+
+	// Get an InputStream for a systemid, taken relative to the "user.dir"
+	// directory.  We don't process publicids (who uses them anyhow?)
+	private InputStream getInputStream(String publicid, String systemid) throws IOException, SAXException {
+		String here = System.getProperty("user.dir") + "/.";
+		URL basis = new URL("file", "", here);
+		URLConnection c = new URL(basis, systemid).openConnection();
+		return c.getInputStream();
+		}
+
+	// ScanHandler implementation: state shared between the scanner callbacks
+
+	private Element theNewElement = null;	// element that aval() and adup() attach attributes to
+	private String theAttributeName = null;	// set by aname(), consumed by aval() or adup()
+	private boolean theDoctypeIsPresent = false;	// set once decl() has seen a DOCTYPE
+	private String theDoctypePublicId = null;
+	private String theDoctypeSystemId = null;
+	private String theDoctypeName = null;
+	private String thePITarget = null;
+	private Element theStack = null;	// top of the stack of open elements, linked through next()
+	private Element theSaved = null;	// elements set aside by restartablyPop() for restart() to reopen
+	private Element thePCDATA = null;	// built in setup() from the schema's "<pcdata>" element type
+	private int theEntity = 0;	// needs to support chars past U+FFFF
+
+	public void adup(char[] buff, int offset, int length) throws SAXException {
+		if (theNewElement == null) return;	// no element is collecting attributes
+		if (theAttributeName != null) theNewElement.setAttribute(theAttributeName, null, theAttributeName);
+		theAttributeName = null;	// the name doubles as the value and is now used up
+		}
+
+	// Record the name of the attribute being scanned.  We don't rely on
+	// Schema to canonicalize attribute names, so lower-case it here, with a
+	// fixed locale so the result does not depend on the platform default.
+	public void aname(char[] buff, int offset, int length) throws SAXException {
+		if (theNewElement == null) return;
+		theAttributeName = makeName(buff, offset, length).toLowerCase(java.util.Locale.ENGLISH);
+		}
+
+	// Attach the scanned value, with entity references expanded, to the pending attribute name.
+	public void aval(char[] buff, int offset, int length) throws SAXException {
+		if (theNewElement != null && theAttributeName != null) {
+			String raw = new String(buff, offset, length);
+			String expanded = expandEntities(raw);
+			theNewElement.setAttribute(theAttributeName, null, expanded);
+			theAttributeName = null;
+			}
+		}
+
+	// Expand entity references in attribute values selectively.
+	// A reference is expanded iff it is properly terminated with a
+	// semicolon and resolves to a code point no greater than U+10FFFF;
+	// anything else is copied through unchanged.
+	private String expandEntities(String src) {
+		int refStart = -1;	// index in dst just past the '&' of the open reference, or -1
+		int len = src.length();
+		char[] dst = new char[len];	// an expansion is never longer than the reference it replaces
+		int dstlen = 0;
+		for (int i = 0; i < len; i++) {
+			char ch = src.charAt(i);
+			dst[dstlen++] = ch;
+			if (ch == '&' && refStart == -1) {
+				// start of a ref excluding &
+				refStart = dstlen;
+				}
+			else if (refStart == -1) {
+				// not in a ref
+				}
+			else if (Character.isLetter(ch) ||
+					Character.isDigit(ch) ||
+					ch == '#') {
+				// valid entity char
+				}
+			else if (ch == ';') {
+				// properly terminated ref: its name lies between refStart
+				// and the semicolon that was just copied
+				int ent = lookupEntity(dst, refStart, dstlen - refStart - 1);
+				if (ent > 0x10FFFF) {
+					// past the last Unicode code point: no surrogate pair exists, so keep the text
+					}
+				else if (ent > 0xFFFF) {
+					// supplementary character: the pair replaces the '&' and the char after it
+					ent -= 0x10000;
+					dst[refStart - 1] = (char)((ent>>10) + 0xD800);
+					dst[refStart] = (char)((ent&0x3FF) + 0xDC00);
+					dstlen = refStart + 1;
+					}
+				else if (ent != 0) {
+					// single char: it replaces the '&', and the rest of the ref is dropped
+					dst[refStart - 1] = (char)ent;
+					dstlen = refStart;
+					}
+				refStart = -1;
+				}
+			else {
+				// improperly terminated ref
+				refStart = -1;
+				}
+			}
+		return new String(dst, 0, dstlen);
+		}
+
+	public void entity(char[] buff, int offset, int length) throws SAXException {
+		theEntity = lookupEntity(buff, offset, length);	// keep the looked-up value in theEntity
+		}
+
+	// Resolve the name of a character reference (without '&' and ';') to a
+	// code point.  Numeric references ("#..." decimal, "#x..." or "#X..."
+	// hexadecimal) are evaluated here and yield 0 when malformed; all other
+	// names are handed to the schema.  An empty name yields 0.
+	private int lookupEntity(char[] buff, int offset, int length) {
+		if (length < 1) return 0;
+		if (buff[offset] != '#') {
+			return theSchema.getEntity(new String(buff, offset, length));
+			}
+
+		// Numeric reference: pick the radix and the number of marker chars to skip.
+		int radix = 10;
+		int skip = 1;
+		if (length > 1 && (buff[offset+1] == 'x' || buff[offset+1] == 'X')) {
+			radix = 16;
+			skip = 2;
+			}
+		try {
+			return Integer.parseInt(new String(buff, offset + skip, length - skip), radix);
+			}
+		catch (NumberFormatException e) { return 0; }
+		}
+
+	// End of input: pop until only the bottom stack entry is left, end the
+	// schema's prefix mapping if its URI is non-empty, and end the document.
+	public void eof(char[] buff, int offset, int length) throws SAXException {
+		if (virginStack) rectify(thePCDATA);
+		while (theStack.next() != null) pop();
+		boolean mapped = !theSchema.getURI().equals("");
+		if (mapped) theContentHandler.endPrefixMapping(theSchema.getPrefix());
+		theContentHandler.endDocument();
+		}
+
+	public void etag(char[] buff, int offset, int length) throws SAXException {
+		boolean handledAsText = etag_cdata(buff, offset, length);
+		if (!handledAsText) etag_basic(buff, offset, length);
+		}
+
+	private static char[] etagchars = {'<', '/', '>'};
+	// While the open element is a CDATA element (and CDATA handling is
+	// enabled), an end-tag whose name is not exactly the open element's
+	// name, ignoring case, is not a real tag.  It is re-emitted as the
+	// characters "</", name, ">", the scanner is put back into CDATA mode,
+	// and true is returned.  Otherwise nothing happens and false is returned.
+	public boolean etag_cdata(char[] buff, int offset, int length) throws SAXException {
+		String currentName = theStack.name();
+		boolean inCDATA = CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0;
+		if (!inCDATA) return false;
+
+		boolean matches = (length == currentName.length());
+		for (int i = 0; matches && i < length; i++) {
+			char got = Character.toLowerCase(buff[offset + i]);
+			char want = Character.toLowerCase(currentName.charAt(i));
+			matches = (got == want);
+			}
+		if (matches) return false;
+
+		// Not the real end-tag: pass it through as text.
+		theContentHandler.characters(etagchars, 0, 2);
+		theContentHandler.characters(buff, offset, length);
+		theContentHandler.characters(etagchars, 2, 1);
+		theScanner.startCDATA();
+		return true;
+		}
+
+	public void etag_basic(char[] buff, int offset, int length) throws SAXException {
+		theNewElement = null;
+		String name;
+		if (length != 0) {
+			// Canonicalize case of name
+			name = makeName(buff, offset, length);
+			// then replace it with the name of the schema's element type, if there is one
+			ElementType type = theSchema.getElementType(name);
+			if (type == null) return;	// mysterious end-tag
+			name = type.name();
+			}
+		else {
+			name = theStack.name();		// empty end-tag: it names the current element
+			}
+		// Walk down the stack to the nearest element with this name, noting
+
+		Element sp;
+		boolean inNoforce = false;	// whether an F_NOFORCE element lies above the match
+		for (sp = theStack; sp != null; sp = sp.next()) {
+			if (sp.name().equals(name)) break;
+			if ((sp.flags() & Schema.F_NOFORCE) != 0) inNoforce = true;
+			}
+
+		if (sp == null) return;		// Ignore unknown etags
+		if (sp.next() == null || sp.next().next() == null) return;	// the bottom two stack entries are never closed here
+		if (inNoforce) {		// inside an F_NOFORCE element?
+			sp.preclose();		// preclose the matching element
+			}
+		else {			// restartably pop everything above us
+			while (theStack != sp) {
+				restartablyPop();
+				}
+			pop();			// then pop the matching element itself, for good
+			}
+		// pop any preclosed elements now at the top
+		while (theStack.isPreclosed()) {
+			pop();
+			}
+		restart(null);		// reopen saved elements that fit, with no next element known
+		}
+
+	// Push saved restartables back on the stack while the top of stack can
+	// contain the next one and it, in turn, can contain e (when e is known).
+	private void restart(Element e) throws SAXException {
+		while (theSaved != null && theStack.canContain(theSaved)) {
+			if (e != null && !theSaved.canContain(e)) break;
+			Element rest = theSaved.next();
+			push(theSaved);
+			theSaved = rest;
+			}
+		}
+
+	// Pop the stack irrevocably: report the end of the top element, end
+	// the prefix mappings of its foreign name and attributes (attributes
+	// last to first), and unlink it.  Popping an empty stack does nothing.
+	private void pop() throws SAXException {
+		if (theStack == null) return;
+		String qname = theStack.name();
+		String local = theStack.localName();
+		String uri = theStack.namespace();
+		String prefix = prefixOf(qname);
+
+		if (!namespaces) {
+			uri = "";
+			local = "";
+			}
+		theContentHandler.endElement(uri, local, qname);
+		if (foreign(prefix, uri)) theContentHandler.endPrefixMapping(prefix);
+
+		Attributes atts = theStack.atts();
+		int i = atts.getLength();
+		while (--i >= 0) {
+			String attURI = atts.getURI(i);
+			String attPrefix = prefixOf(atts.getQName(i));
+			if (foreign(attPrefix, attURI)) theContentHandler.endPrefixMapping(attPrefix);
+			}
+		theStack = theStack.next();
+		}
+
+	// Pop the stack restartably: an F_RESTART element is kept on theSaved for restart()
+	private void restartablyPop() throws SAXException {
+		Element top = theStack;
+		pop();
+		boolean restartable = restartElements && (top.flags() & Schema.F_RESTART) != 0;
+		if (!restartable) return;
+		top.anonymize();
+		top.setNext(theSaved);
+		theSaved = top;
+		}
+
+	// Push element onto stack, reporting its start (and any prefix mappings) to the content handler
+	private boolean virginStack = true;
+	private void push(Element e) throws SAXException {
+		String name = e.name();
+		String localName = e.localName();
+		String namespace = e.namespace();
+		String prefix = prefixOf(name);
+
+		// The prefix is taken from the qualified name before namespaces are blanked below.
+		e.clean();
+		if (!namespaces) namespace = localName = "";
+                if (virginStack && localName.equalsIgnoreCase(theDoctypeName)) {
+                    try {
+                        theEntityResolver.resolveEntity(theDoctypePublicId, theDoctypeSystemId);
+                    } catch (IOException ew) { }   // Can't be thrown for root I believe; the result is not used either.
+                }
+		if (foreign(prefix, namespace)) {
+			theContentHandler.startPrefixMapping(prefix, namespace);
+			// a prefixed name outside the schema's namespace gets its own mapping
+			}
+		Attributes atts = e.atts();
+		int len = atts.getLength();
+		for (int i = 0; i < len; i++) {
+			String attNamespace = atts.getURI(i);
+			String attPrefix = prefixOf(atts.getQName(i));
+			if (foreign(attPrefix, attNamespace)) {
+				theContentHandler.startPrefixMapping(attPrefix, attNamespace);
+				// likewise for each attribute with a foreign name
+				}
+			}
+		theContentHandler.startElement(namespace, localName, name, e.atts());
+		e.setNext(theStack);
+		theStack = e;
+		virginStack = false;
+		if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) {
+			theScanner.startCDATA();	// a CDATA element puts the scanner into CDATA mode
+			}
+		}
+
+	// Return the part of a QName before its first colon, or "" if it has none
+	private String prefixOf(String name) {
+		int colon = name.indexOf(':');
+		if (colon == -1) {
+			return "";
+			}
+		return name.substring(0, colon);
+		}
+
+	// Return true if we have a foreign name: one with a non-empty prefix
+	// and a non-empty namespace that is not the URI of the schema in
+	// use.
+	private boolean foreign(String prefix, String namespace) {
+		if (prefix.equals("")) return false;
+		if (namespace.equals("")) return false;
+		return !namespace.equals(theSchema.getURI());
+		}
+
+	/**
+	 * Parsing the complete XML Document Type Definition is way too complex, but for
+	 * many simple cases we can extract a DOCTYPE's name, public id and system id.
+	 *
+	 * doctypedecl  ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
+	 *  DeclSep     ::= PEReference | S
+	 *  intSubset   ::= (markupdecl | DeclSep)*
+	 *  markupdecl  ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
+	 *  ExternalID  ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
+	 */
+	public void decl(char[] buff, int offset, int length) throws SAXException {
+		String s = new String(buff, offset, length);
+		String name = null;
+		String systemid = null;
+		String publicid = null;
+		String[] v = split(s);
+		// Keywords are matched ignoring case, as HTML allows "<!doctype html>".
+		if (v.length > 0 && "DOCTYPE".equalsIgnoreCase(v[0])) {
+			if (theDoctypeIsPresent) return;		// one doctype only!
+			theDoctypeIsPresent = true;
+			if (v.length > 1) {
+				name = v[1];
+				if (v.length > 3 && "SYSTEM".equalsIgnoreCase(v[2])) {
+					systemid = v[3];
+					}
+				else if (v.length > 3 && "PUBLIC".equalsIgnoreCase(v[2])) {
+					publicid = v[3];
+					systemid = (v.length > 4) ? v[4] : "";
+					}
+				}
+			}
+		if (name == null) return;		// nothing usable was declared
+
+		publicid = cleanPublicid(trimquotes(publicid));
+		systemid = trimquotes(systemid);
+		theLexicalHandler.startDTD(name, publicid, systemid);
+		theLexicalHandler.endDTD();
+		theDoctypeName = name;
+		theDoctypePublicId = publicid;
+		if (theScanner instanceof Locator) {    // Must resolve systemid
+			// Start from the document's own system id and replace it with
+			// the declared id resolved against it, when that is possible.
+			theDoctypeSystemId = ((Locator)theScanner).getSystemId();
+			try {
+				theDoctypeSystemId = new URL(new URL(theDoctypeSystemId), systemid).toString();
+				}
+			catch (java.net.MalformedURLException e) {
+				// no usable base or declared id (either may be null): keep the document's id
+				}
+			}
+		}
+
+	// If the String is quoted, trim the quotes.  A String shorter than
+	// two characters cannot hold a pair of quotes, so it is returned as
+	// is (as is null); this includes a lone quote mark.
+	private static String trimquotes(String in) {
+		if (in == null) return in;
+		int length = in.length();
+		if (length < 2) return in;
+		char s = in.charAt(0);
+		char e = in.charAt(length - 1);
+		boolean quoted = (s == e) && (s == '\'' || s == '"');
+		return quoted ? in.substring(1, length - 1) : in;
+		}
+
+	// Split the supplied String into words or phrases separated by
+	// whitespace.  A phrase enclosed in single or double quotes is kept
+	// together, quotes included, even when it contains whitespace; a quote
+	// character preceded by a backslash does not open or close a phrase.
+	// Returns an empty array when there is nothing but whitespace.
+	private static String[] split(String val) throws IllegalArgumentException {
+		val = val.trim();
+		ArrayList l = new ArrayList();
+		// Start of the token being collected, or -1 while between tokens.
+		// trim() leaves some characters in place that isWhitespace() accepts,
+		// so the text may still begin or end between tokens.
+		int s = -1;
+		boolean sq = false;	// inside a single-quoted phrase
+		boolean dq = false;	// inside a double-quoted phrase
+		char lastc = 0;
+		int len = val.length();
+		for (int e = 0; e < len; e++) {
+			char c = val.charAt(e);
+			if (!dq && c == '\'' && lastc != '\\') {
+				sq = !sq;
+				if (s < 0) s = e;
+				}
+			else if (!sq && c == '\"' && lastc != '\\') {
+				dq = !dq;
+				if (s < 0) s = e;
+				}
+			else if (!sq && !dq) {
+				if (Character.isWhitespace(c)) {
+					if (s >= 0) l.add(val.substring(s, e));
+					s = -1;
+					}
+				else if (s < 0) {
+					s = e;
+					}
+				}
+			lastc = c;
+			}
+		// Flush the last token, if the text did not end between tokens.
+		if (s >= 0) l.add(val.substring(s));
+		return (String[])l.toArray(new String[l.size()]);
+		}
+
+	// Replace junk in publicids with spaces
+	private static String legal =
+		"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-'()+,./:=?;!*#@$_%";
+
+	private String cleanPublicid(String src) {
+		if (src == null) return null;
+		int len = src.length();
+		StringBuffer dst = new StringBuffer(len);
+		boolean suppressSpace = true;
+		for (int i = 0; i < len; i++) {
+			char ch = src.charAt(i);
+			if (legal.indexOf(ch) != -1) { 	// legal but not whitespace
+				dst.append(ch);
+				suppressSpace = false;
+				}
+			else if (suppressSpace) {	// normalizable whitespace or junk
+				;
+				}
+			else {
+				dst.append(' ');
+				suppressSpace = true;
+				}
+			}
+//		System.err.println("%% Publicid [" + dst.toString().trim() + "]");
+		return dst.toString().trim();	// trim any final junk whitespace
+		}
+
+
+	// Receives an element name (generic identifier) and prepares the
+	// corresponding Element in theNewElement.  Names unknown to the schema
+	// are either dropped or declared on the fly, depending on the
+	// ignoreBogons, bogonsEmpty and rootBogons settings.
+	public void gi(char[] buff, int offset, int length) throws SAXException {
+		// An element is still pending (rectify() has not cleared it yet).
+		if (theNewElement != null) return;
+		final String giName = makeName(buff, offset, length);
+		if (giName == null) return;
+		ElementType elementType = theSchema.getElementType(giName);
+		if (elementType == null) {
+			// Suppress unknown elements if ignore-bogons is on
+			if (ignoreBogons) return;
+			int contentModel;
+			if (bogonsEmpty) {
+				contentModel = Schema.M_EMPTY;
+				}
+			else {
+				contentModel = Schema.M_ANY;
+				}
+			int memberOf = Schema.M_ANY;
+			if (!rootBogons) memberOf = memberOf & ~Schema.M_ROOT;
+			theSchema.elementType(giName, contentModel, memberOf, 0);
+			if (!rootBogons) {
+				theSchema.parent(giName, theSchema.rootElementType().name());
+				}
+			// Fetch the type that was just registered.
+			elementType = theSchema.getElementType(giName);
+			}
+		theNewElement = new Element(elementType, defaultAttributes);
+		}
+
+	// Reports a CDATA section: its text goes through pcdata() like any
+	// other character data, bracketed by startCDATA/endCDATA events on
+	// the lexical handler.
+	public void cdsect(char[] buff, int offset, int length) throws SAXException {
+		theLexicalHandler.startCDATA();
+		this.pcdata(buff, offset, length);
+		theLexicalHandler.endCDATA();
+		}
+	// Reports character data.  Text made up only of whitespace, at a point
+	// where the current stack cannot contain PCDATA, is reported as
+	// ignorable whitespace when the ignorableWhitespace flag is set and is
+	// dropped otherwise.  All other text is reported through characters(),
+	// after the stack has been rectified to accept PCDATA.  Empty text is
+	// ignored.
+	public void pcdata(char[] buff, int offset, int length) throws SAXException {
+		if (length == 0) return;
+		boolean allWhite = true;
+		for (int i = 0; i < length; i++) {
+			if (!Character.isWhitespace(buff[offset+i])) {
+				allWhite = false;
+				break;		// one non-blank character settles the question
+				}
+			}
+		if (allWhite && !theStack.canContain(thePCDATA)) {
+			if (ignorableWhitespace) {
+				theContentHandler.ignorableWhitespace(buff, offset, length);
+				}
+			}
+		else {
+			rectify(thePCDATA);
+			theContentHandler.characters(buff, offset, length);
+			}
+		}
+
+	public void pitarget(char[] buff, int offset, int length) throws SAXException {
+		if (theNewElement != null) return;
+		thePITarget = makeName(buff, offset, length).replace(':', '_');
+		}
+
+	// Reports a processing instruction whose target was recorded earlier
+	// by pitarget().  Nothing is reported while an element is pending,
+	// when no target has been recorded, or when the target is "xml" in any
+	// letter case (an XML declaration).  A trailing '?' in the data is
+	// left out.
+	public void pi(char[] buff, int offset, int length) throws SAXException {
+		if (theNewElement != null || thePITarget == null) return;
+		if ("xml".equalsIgnoreCase(thePITarget)) return;
+		// The data occupies buff[offset] .. buff[offset + length - 1], so
+		// the last character has to be located relative to offset.
+		if (length > 0 && buff[offset + length - 1] == '?') length--;	// remove trailing ?
+		theContentHandler.processingInstruction(thePITarget,
+			new String(buff, offset, length));
+		thePITarget = null;
+		}
+
+	public void stagc(char[] buff, int offset, int length) throws SAXException {
+//		System.err.println("%% Start-tag");
+		if (theNewElement == null) return;
+		rectify(theNewElement);
+		if (theStack.model() == Schema.M_EMPTY) {
+			// Force an immediate end tag
+			etag_basic(buff, offset, length);
+			}
+		}
+
+	// End of an empty-tag: the pending element, if any, is placed on the
+	// stack by rectify() and then closed straight away, whatever its
+	// content model.
+	public void stage(char[] buff, int offset, int length) throws SAXException {
+		if (theNewElement != null) {
+			rectify(theNewElement);
+			// Force an immediate end tag
+			etag_basic(buff, offset, length);
+			}
+		}
+
+	// Comment buffer is twice the size of the output buffer
+	private char[] theCommentBuffer = new char[2000];
+
+	// Passes a comment straight on to the lexical handler; the text is
+	// handed over exactly as received.
+	public void cmnt(char[] buff, int offset, int length) throws SAXException {
+		final LexicalHandler sink = theLexicalHandler;
+		sink.comment(buff, offset, length);
+		}
+
+	// Rectify the stack, pushing and popping as needed
+	// so that the argument can be safely pushed.
+	// Works in three steps: find an open element that can contain the
+	// argument (wrapping the argument in the types given by parent() until
+	// one is found), pop down to that element, then push the argument
+	// together with any wrappers.  If no container can be found, the stack
+	// and theNewElement are left untouched.
+	private void rectify(Element e) throws SAXException {
+		Element container;
+		for (;;) {
+			// Search the stack from the top for an element that accepts e.
+			container = theStack;
+			while (container != null && !container.canContain(e)) {
+				container = container.next();
+				}
+			if (container != null) break;
+			// Nothing accepts e: put an element of e's parent type in
+			// front of it and try again with that one.
+			ElementType wrapperType = e.parent();
+			if (wrapperType == null) break;
+			Element wrapper = new Element(wrapperType, defaultAttributes);
+//			System.err.println("%% Ascending from " + e.name() + " to " + wrapper.name());
+			wrapper.setNext(e);
+			e = wrapper;
+			}
+		if (container == null) return;		// don't know what to do
+		// Pop until the container is on top, but never once two or fewer
+		// elements are left on the stack.
+		while (theStack != container
+				&& theStack != null
+				&& theStack.next() != null
+				&& theStack.next().next() != null) {
+			restartablyPop();
+			}
+		// Push the chain built above, first link first.  The successor is
+		// read before the push, and restart() is given that successor
+		// (null after the last link).
+		while (e != null) {
+			Element following = e.next();
+			if (!e.name().equals("<pcdata>")) push(e);
+			e = following;
+			restart(e);
+			}
+		theNewElement = null;
+		}
+
+	// Returns the current value of theEntity.
+	public int getEntity() {
+		return this.theEntity;
+		}
+
+	// Return the argument as a valid XML name
+	// This no longer lowercases the result: we depend on Schema to
+	// canonicalize case.
+	private String makeName(char[] buff, int offset, int length) {
+		StringBuffer dst = new StringBuffer(length + 2);
+		boolean seenColon = false;
+		boolean start = true;
+//		String src = new String(buff, offset, length); // DEBUG
+		for (; length-- > 0; offset++) {
+			char ch = buff[offset];
+			if (Character.isLetter(ch) || ch == '_') {
+				start = false;
+				dst.append(ch);
+				}
+			else if (Character.isDigit(ch) || ch == '-' || ch == '.') {
+				if (start) dst.append('_');
+				start = false;
+				dst.append(ch);
+				}
+			else if (ch == ':' && !seenColon) {
+				seenColon = true;
+				if (start) dst.append('_');
+				start = true;
+				dst.append(translateColons ? '_' : ch);
+				}
+			}
+		int dstLength = dst.length();
+		if (dstLength == 0 || dst.charAt(dstLength - 1) == ':') dst.append('_');
+//		System.err.println("Made name \"" + dst + "\" from \"" + src + "\"");
+		return dst.toString().intern();
+		}
+
+	// Default LexicalHandler implementation
+
+	public void comment(char[] ch, int start, int length) throws SAXException { }
+	public void endCDATA() throws SAXException { }
+	public void endDTD() throws SAXException { }
+	public void endEntity(String name) throws SAXException { }
+	public void startCDATA() throws SAXException { }
+	public void startDTD(String name, String publicid, String systemid) throws SAXException { }
+	public void startEntity(String name) throws SAXException { }
+
+	}