aboutsummaryrefslogtreecommitdiff
path: root/src/templates/org/ccil
diff options
context:
space:
mode:
Diffstat (limited to 'src/templates/org/ccil')
-rw-r--r--src/templates/org/ccil/cowan/tagsoup/HTMLModels.java31
-rw-r--r--src/templates/org/ccil/cowan/tagsoup/HTMLScanner.java427
-rw-r--r--src/templates/org/ccil/cowan/tagsoup/HTMLSchema.java38
3 files changed, 496 insertions, 0 deletions
diff --git a/src/templates/org/ccil/cowan/tagsoup/HTMLModels.java b/src/templates/org/ccil/cowan/tagsoup/HTMLModels.java
new file mode 100644
index 0000000..e982bcf
--- /dev/null
+++ b/src/templates/org/ccil/cowan/tagsoup/HTMLModels.java
@@ -0,0 +1,31 @@
+// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
+//
+// TagSoup is licensed under the Apache License,
+// Version 2.0. You may obtain a copy of this license at
+// http://www.apache.org/licenses/LICENSE-2.0 . You may also have
+// additional legal rights not granted by this license.
+//
+// TagSoup is distributed in the hope that it will be useful, but
+// unless required by applicable law or agreed to in writing, TagSoup
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+// OF ANY KIND, either express or implied; not even the implied warranty
+// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+//
+//
+// Defines models for HTMLSchema
+
+/**
+This interface contains generated constants representing HTML content
+models. Logically, it is part of HTMLSchema, but it is more
+convenient to generate the constants into a separate interface.
+*/
+
+package org.ccil.cowan.tagsoup;
+public interface HTMLModels {
+
+ // Start of model definitions
+ @@MODEL_DEFINITIONS@@
+
+ // End of model definitions
+
+ }
diff --git a/src/templates/org/ccil/cowan/tagsoup/HTMLScanner.java b/src/templates/org/ccil/cowan/tagsoup/HTMLScanner.java
new file mode 100644
index 0000000..568493a
--- /dev/null
+++ b/src/templates/org/ccil/cowan/tagsoup/HTMLScanner.java
@@ -0,0 +1,427 @@
+// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
+//
+// TagSoup is licensed under the Apache License,
+// Version 2.0. You may obtain a copy of this license at
+// http://www.apache.org/licenses/LICENSE-2.0 . You may also have
+// additional legal rights not granted by this license.
+//
+// TagSoup is distributed in the hope that it will be useful, but
+// unless required by applicable law or agreed to in writing, TagSoup
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+// OF ANY KIND, either express or implied; not even the implied warranty
+// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+//
+//
+package org.ccil.cowan.tagsoup;
+import java.io.*;
+import org.xml.sax.SAXException;
+import org.xml.sax.Locator;
+
+/**
+This class implements a table-driven scanner for HTML, allowing for lots of
+defects. It implements the Scanner interface, which accepts a Reader
+object to fetch characters from and a ScanHandler object to report lexical
+events to.
+*/
+
+public class HTMLScanner implements Scanner, Locator {
+
+ // Start of state table
+ @@STATE_TABLE@@
+ // End of state table
+
+ private String thePublicid; // Locator state
+ private String theSystemid;
+ private int theLastLine;
+ private int theLastColumn;
+ private int theCurrentLine;
+ private int theCurrentColumn;
+
+ int theState; // Current state
+ int theNextState; // Next state
+ char[] theOutputBuffer = new char[200]; // Output buffer
+ int theSize; // Current buffer size
+ int[] theWinMap = { // Windows chars map
+ 0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
+ 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
+ 0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+ 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178};
+
+ // Compensate for bug in PushbackReader that allows
+ // pushing back EOF.
+ private void unread(PushbackReader r, int c) throws IOException {
+ if (c != -1) r.unread(c);
+ }
+
+ // Locator implementation
+
+ public int getLineNumber() {
+ return theLastLine;
+ }
+ public int getColumnNumber() {
+ return theLastColumn;
+ }
+ public String getPublicId() {
+ return thePublicid;
+ }
+ public String getSystemId() {
+ return theSystemid;
+ }
+
+
+ // Scanner implementation
+
+ /**
+ Reset document locator, supplying systemid and publicid.
+ @param systemid System id
+ @param publicid Public id
+ */
+
+ public void resetDocumentLocator(String publicid, String systemid) {
+ thePublicid = publicid;
+ theSystemid = systemid;
+ theLastLine = theLastColumn = theCurrentLine = theCurrentColumn = 0;
+ }
+
+ /**
+ Scan HTML source, reporting lexical events.
+ @param r0 Reader that provides characters
+ @param h ScanHandler that accepts lexical events.
+ */
+
+ public void scan(Reader r0, ScanHandler h) throws IOException, SAXException {
+ theState = S_PCDATA;
+ PushbackReader r;
+ if (r0 instanceof PushbackReader) {
+ r = (PushbackReader)r0;
+ }
+ else if (r0 instanceof BufferedReader) {
+ r = new PushbackReader(r0);
+ }
+ else {
+ r = new PushbackReader(new BufferedReader(r0));
+ }
+
+ int firstChar = r.read(); // Remove any leading BOM
+ if (firstChar != '\uFEFF') unread(r, firstChar);
+
+ while (theState != S_DONE) {
+ int ch = r.read();
+
+ // Process control characters
+ if (ch >= 0x80 && ch <= 0x9F) ch = theWinMap[ch-0x80];
+
+ if (ch == '\r') {
+ ch = r.read(); // expect LF next
+ if (ch != '\n') {
+ unread(r, ch); // nope
+ ch = '\n';
+ }
+ }
+
+ if (ch == '\n') {
+ theCurrentLine++;
+ theCurrentColumn = 0;
+ }
+ else {
+ theCurrentColumn++;
+ }
+
+ if (!(ch >= 0x20 || ch == '\n' || ch == '\t' || ch == -1)) continue;
+
+ // Search state table
+ int action = 0;
+ for (int i = 0; i < statetable.length; i += 4) {
+ if (theState != statetable[i]) {
+ if (action != 0) break;
+ continue;
+ }
+ if (statetable[i+1] == 0) {
+ action = statetable[i+2];
+ theNextState = statetable[i+3];
+ }
+ else if (statetable[i+1] == ch) {
+ action = statetable[i+2];
+ theNextState = statetable[i+3];
+ break;
+ }
+ }
+// System.err.println("In " + debug_statenames[theState] + " got " + nicechar(ch) + " doing " + debug_actionnames[action] + " then " + debug_statenames[theNextState]);
+ switch (action) {
+ case 0:
+ throw new Error(
+"HTMLScanner can't cope with " + Integer.toString(ch) + " in state " +
+Integer.toString(theState));
+ case A_ADUP:
+ h.adup(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ break;
+ case A_ADUP_SAVE:
+ h.adup(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ save(ch, h);
+ break;
+ case A_ADUP_STAGC:
+ h.adup(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ h.stagc(theOutputBuffer, 0, theSize);
+ break;
+ case A_ANAME:
+ h.aname(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ break;
+ case A_ANAME_ADUP:
+ h.aname(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ h.adup(theOutputBuffer, 0, theSize);
+ break;
+ case A_ANAME_ADUP_STAGC:
+ h.aname(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ h.adup(theOutputBuffer, 0, theSize);
+ h.stagc(theOutputBuffer, 0, theSize);
+ break;
+ case A_AVAL:
+ h.aval(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ break;
+ case A_AVAL_STAGC:
+ h.aval(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ h.stagc(theOutputBuffer, 0, theSize);
+ break;
+ case A_CDATA:
+ mark();
+ // suppress the final "]]" in the buffer
+ if (theSize > 1) theSize -= 2;
+ h.pcdata(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ break;
+ case A_ENTITY_START:
+ h.pcdata(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ save(ch, h);
+ break;
+ case A_ENTITY:
+ mark();
+ char ch1 = (char)ch;
+// System.out.println("Got " + ch1 + " in state " + ((theState == S_ENT) ? "S_ENT" : ((theState == S_NCR) ? "S_NCR" : "UNK")));
+ if (theState == S_ENT && ch1 == '#') {
+ theNextState = S_NCR;
+ save(ch, h);
+ break;
+ }
+ else if (theState == S_NCR && (ch1 == 'x' || ch1 == 'X')) {
+ theNextState = S_XNCR;
+ save(ch, h);
+ break;
+ }
+ else if (theState == S_ENT && Character.isLetterOrDigit(ch1)) {
+ save(ch, h);
+ break;
+ }
+ else if (theState == S_NCR && Character.isDigit(ch1)) {
+ save(ch, h);
+ break;
+ }
+ else if (theState == S_XNCR && (Character.isDigit(ch1) || "abcdefABCDEF".indexOf(ch1) != -1)) {
+ save(ch, h);
+ break;
+ }
+
+ // The whole entity reference has been collected
+// System.err.println("%%" + new String(theOutputBuffer, 0, theSize));
+ h.entity(theOutputBuffer, 1, theSize - 1);
+ int ent = h.getEntity();
+// System.err.println("%% value = " + ent);
+ if (ent != 0) {
+ theSize = 0;
+ if (ent >= 0x80 && ent <= 0x9F) {
+ ent = theWinMap[ent-0x80];
+ }
+ if (ent < 0x20) {
+ // Control becomes space
+ ent = 0x20;
+ }
+ else if (ent >= 0xD800 && ent <= 0xDFFF) {
+ // Surrogates get dropped
+ ent = 0;
+ }
+ else if (ent <= 0xFFFF) {
+ // BMP character
+ save(ent, h);
+ }
+ else {
+ // Astral converted to two surrogates
+ ent -= 0x10000;
+ save((ent>>10) + 0xD800, h);
+ save((ent&0x3FF) + 0xDC00, h);
+ }
+ if (ch != ';') {
+ unread(r, ch);
+ theCurrentColumn--;
+ }
+ }
+ else {
+ unread(r, ch);
+ theCurrentColumn--;
+ }
+ theNextState = S_PCDATA;
+ break;
+ case A_ETAG:
+ h.etag(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ break;
+ case A_DECL:
+ h.decl(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ break;
+ case A_GI:
+ h.gi(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ break;
+ case A_GI_STAGC:
+ h.gi(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ h.stagc(theOutputBuffer, 0, theSize);
+ break;
+ case A_LT:
+ mark();
+ save('<', h);
+ save(ch, h);
+ break;
+ case A_LT_PCDATA:
+ mark();
+ save('<', h);
+ h.pcdata(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ break;
+ case A_PCDATA:
+ mark();
+ h.pcdata(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ break;
+ case A_CMNT:
+ mark();
+ h.cmnt(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ break;
+ case A_MINUS3:
+ save('-', h);
+ save(' ', h);
+ break;
+ case A_MINUS2:
+ save('-', h);
+ save(' ', h);
+ // fall through into A_MINUS
+ case A_MINUS:
+ save('-', h);
+ save(ch, h);
+ break;
+ case A_PI:
+ mark();
+ h.pi(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ break;
+ case A_PITARGET:
+ h.pitarget(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ break;
+ case A_PITARGET_PI:
+ h.pitarget(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ h.pi(theOutputBuffer, 0, theSize);
+ break;
+ case A_SAVE:
+ save(ch, h);
+ break;
+ case A_SKIP:
+ break;
+ case A_SP:
+ save(' ', h);
+ break;
+ case A_STAGC:
+ h.stagc(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ break;
+ case A_EMPTYTAG:
+ mark();
+// System.err.println("%%% Empty tag seen");
+ if (theSize > 0) h.gi(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ h.stage(theOutputBuffer, 0, theSize);
+ break;
+ case A_UNGET:
+ unread(r, ch);
+ theCurrentColumn--;
+ break;
+ case A_UNSAVE_PCDATA:
+ if (theSize > 0) theSize--;
+ h.pcdata(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ break;
+ default:
+ throw new Error("Can't process state " + action);
+ }
+ theState = theNextState;
+ }
+ h.eof(theOutputBuffer, 0, 0);
+ }
+
+ /**
+ * Mark the current scan position as a "point of interest" - start of a tag,
+ * cdata, processing instruction etc.
+ */
+
+ private void mark() {
+ theLastColumn = theCurrentColumn;
+ theLastLine = theCurrentLine;
+ }
+
+ /**
+ A callback for the ScanHandler that allows it to force
+ the lexer state to CDATA content (no markup is recognized except
+ the end of element.
+ */
+
+ public void startCDATA() { theNextState = S_CDATA; }
+
+ private void save(int ch, ScanHandler h) throws IOException, SAXException {
+ if (theSize >= theOutputBuffer.length - 20) {
+ if (theState == S_PCDATA || theState == S_CDATA) {
+ // Return a buffer-sized chunk of PCDATA
+ h.pcdata(theOutputBuffer, 0, theSize);
+ theSize = 0;
+ }
+ else {
+ // Grow the buffer size
+ char[] newOutputBuffer = new char[theOutputBuffer.length * 2];
+ System.arraycopy(theOutputBuffer, 0, newOutputBuffer, 0, theSize+1);
+ theOutputBuffer = newOutputBuffer;
+ }
+ }
+ theOutputBuffer[theSize++] = (char)ch;
+ }
+
+ /**
+ Test procedure. Reads HTML from the standard input and writes
+ PYX to the standard output.
+ */
+
+ public static void main(String[] argv) throws IOException, SAXException {
+ Scanner s = new HTMLScanner();
+ Reader r = new InputStreamReader(System.in, "UTF-8");
+ Writer w = new OutputStreamWriter(System.out, "UTF-8");
+ PYXWriter pw = new PYXWriter(w);
+ s.scan(r, pw);
+ w.close();
+ }
+
+
+ private static String nicechar(int in) {
+ if (in == '\n') return "\\n";
+ if (in < 32) return "0x"+Integer.toHexString(in);
+ return "'"+((char)in)+"'";
+ }
+
+ }
diff --git a/src/templates/org/ccil/cowan/tagsoup/HTMLSchema.java b/src/templates/org/ccil/cowan/tagsoup/HTMLSchema.java
new file mode 100644
index 0000000..8def657
--- /dev/null
+++ b/src/templates/org/ccil/cowan/tagsoup/HTMLSchema.java
@@ -0,0 +1,38 @@
+// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
+//
+// TagSoup is licensed under the Apache License,
+// Version 2.0. You may obtain a copy of this license at
+// http://www.apache.org/licenses/LICENSE-2.0 . You may also have
+// additional legal rights not granted by this license.
+//
+// TagSoup is distributed in the hope that it will be useful, but
+// unless required by applicable law or agreed to in writing, TagSoup
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+// OF ANY KIND, either express or implied; not even the implied warranty
+// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+//
+//
+/**
+This class provides a Schema that has been preinitialized with HTML
+elements, attributes, and character entity declarations. All the declarations
+normally provided with HTML 4.01 are given, plus some that are IE-specific
+and NS4-specific. Attribute declarations of type CDATA with no default
+value are not included.
+*/
+
+package org.ccil.cowan.tagsoup;
+public class HTMLSchema extends Schema implements HTMLModels {
+
+ /**
+ Returns a newly constructed HTMLSchema object independent of
+ any existing ones.
+ */
+
+ public HTMLSchema() {
+ // Start of Schema calls
+ @@SCHEMA_CALLS@@
+ // End of Schema calls
+ }
+
+
+ }