diff options
Diffstat (limited to 'src/templates/org/ccil')
-rw-r--r-- | src/templates/org/ccil/cowan/tagsoup/HTMLModels.java | 31 | ||||
-rw-r--r-- | src/templates/org/ccil/cowan/tagsoup/HTMLScanner.java | 427 | ||||
-rw-r--r-- | src/templates/org/ccil/cowan/tagsoup/HTMLSchema.java | 38 |
3 files changed, 496 insertions, 0 deletions
diff --git a/src/templates/org/ccil/cowan/tagsoup/HTMLModels.java b/src/templates/org/ccil/cowan/tagsoup/HTMLModels.java new file mode 100644 index 0000000..e982bcf --- /dev/null +++ b/src/templates/org/ccil/cowan/tagsoup/HTMLModels.java @@ -0,0 +1,31 @@ +// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan. +// +// TagSoup is licensed under the Apache License, +// Version 2.0. You may obtain a copy of this license at +// http://www.apache.org/licenses/LICENSE-2.0 . You may also have +// additional legal rights not granted by this license. +// +// TagSoup is distributed in the hope that it will be useful, but +// unless required by applicable law or agreed to in writing, TagSoup +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS +// OF ANY KIND, either express or implied; not even the implied warranty +// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +// +// +// Defines models for HTMLSchema + +/** +This interface contains generated constants representing HTML content +models. Logically, it is part of HTMLSchema, but it is more +convenient to generate the constants into a separate interface. +*/ + +package org.ccil.cowan.tagsoup; +public interface HTMLModels { + + // Start of model definitions + @@MODEL_DEFINITIONS@@ + + // End of model definitions + + } diff --git a/src/templates/org/ccil/cowan/tagsoup/HTMLScanner.java b/src/templates/org/ccil/cowan/tagsoup/HTMLScanner.java new file mode 100644 index 0000000..568493a --- /dev/null +++ b/src/templates/org/ccil/cowan/tagsoup/HTMLScanner.java @@ -0,0 +1,427 @@ +// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan. +// +// TagSoup is licensed under the Apache License, +// Version 2.0. You may obtain a copy of this license at +// http://www.apache.org/licenses/LICENSE-2.0 . You may also have +// additional legal rights not granted by this license. +// +// TagSoup is distributed in the hope that it will be useful, but +// unless required by applicable law or agreed to in writing, TagSoup +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS +// OF ANY KIND, either express or implied; not even the implied warranty +// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +// +// +package org.ccil.cowan.tagsoup; +import java.io.*; +import org.xml.sax.SAXException; +import org.xml.sax.Locator; + +/** +This class implements a table-driven scanner for HTML, allowing for lots of +defects. It implements the Scanner interface, which accepts a Reader +object to fetch characters from and a ScanHandler object to report lexical +events to. +*/ + +public class HTMLScanner implements Scanner, Locator { + + // Start of state table + @@STATE_TABLE@@ + // End of state table + + private String thePublicid; // Locator state + private String theSystemid; + private int theLastLine; + private int theLastColumn; + private int theCurrentLine; + private int theCurrentColumn; + + int theState; // Current state + int theNextState; // Next state + char[] theOutputBuffer = new char[200]; // Output buffer + int theSize; // Current buffer size + int[] theWinMap = { // Windows chars map + 0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, + 0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, + 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178}; + + // Compensate for bug in PushbackReader that allows + // pushing back EOF. + private void unread(PushbackReader r, int c) throws IOException { + if (c != -1) r.unread(c); + } + + // Locator implementation + + public int getLineNumber() { + return theLastLine; + } + public int getColumnNumber() { + return theLastColumn; + } + public String getPublicId() { + return thePublicid; + } + public String getSystemId() { + return theSystemid; + } + + + // Scanner implementation + + /** + Reset document locator, supplying systemid and publicid. + @param systemid System id + @param publicid Public id + */ + + public void resetDocumentLocator(String publicid, String systemid) { + thePublicid = publicid; + theSystemid = systemid; + theLastLine = theLastColumn = theCurrentLine = theCurrentColumn = 0; + } + + /** + Scan HTML source, reporting lexical events. + @param r0 Reader that provides characters + @param h ScanHandler that accepts lexical events. + */ + + public void scan(Reader r0, ScanHandler h) throws IOException, SAXException { + theState = S_PCDATA; + PushbackReader r; + if (r0 instanceof PushbackReader) { + r = (PushbackReader)r0; + } + else if (r0 instanceof BufferedReader) { + r = new PushbackReader(r0); + } + else { + r = new PushbackReader(new BufferedReader(r0)); + } + + int firstChar = r.read(); // Remove any leading BOM + if (firstChar != '\uFEFF') unread(r, firstChar); + + while (theState != S_DONE) { + int ch = r.read(); + + // Process control characters + if (ch >= 0x80 && ch <= 0x9F) ch = theWinMap[ch-0x80]; + + if (ch == '\r') { + ch = r.read(); // expect LF next + if (ch != '\n') { + unread(r, ch); // nope + ch = '\n'; + } + } + + if (ch == '\n') { + theCurrentLine++; + theCurrentColumn = 0; + } + else { + theCurrentColumn++; + } + + if (!(ch >= 0x20 || ch == '\n' || ch == '\t' || ch == -1)) continue; + + // Search state table + int action = 0; + for (int i = 0; i < statetable.length; i += 4) { + if (theState != statetable[i]) { + if (action != 0) break; + continue; + } + if (statetable[i+1] == 0) { + action = statetable[i+2]; + theNextState = statetable[i+3]; + } + else if (statetable[i+1] == ch) { + action = statetable[i+2]; + theNextState = statetable[i+3]; + break; + } + } +// System.err.println("In " + debug_statenames[theState] + " got " + nicechar(ch) + " doing " + debug_actionnames[action] + " then " + debug_statenames[theNextState]); + switch (action) { + case 0: + throw new Error( +"HTMLScanner can't cope with " + Integer.toString(ch) + " in state " + +Integer.toString(theState)); + case A_ADUP: + h.adup(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_ADUP_SAVE: + h.adup(theOutputBuffer, 0, theSize); + theSize = 0; + save(ch, h); + break; + case A_ADUP_STAGC: + h.adup(theOutputBuffer, 0, theSize); + theSize = 0; + h.stagc(theOutputBuffer, 0, theSize); + break; + case A_ANAME: + h.aname(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_ANAME_ADUP: + h.aname(theOutputBuffer, 0, theSize); + theSize = 0; + h.adup(theOutputBuffer, 0, theSize); + break; + case A_ANAME_ADUP_STAGC: + h.aname(theOutputBuffer, 0, theSize); + theSize = 0; + h.adup(theOutputBuffer, 0, theSize); + h.stagc(theOutputBuffer, 0, theSize); + break; + case A_AVAL: + h.aval(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_AVAL_STAGC: + h.aval(theOutputBuffer, 0, theSize); + theSize = 0; + h.stagc(theOutputBuffer, 0, theSize); + break; + case A_CDATA: + mark(); + // suppress the final "]]" in the buffer + if (theSize > 1) theSize -= 2; + h.pcdata(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_ENTITY_START: + h.pcdata(theOutputBuffer, 0, theSize); + theSize = 0; + save(ch, h); + break; + case A_ENTITY: + mark(); + char ch1 = (char)ch; +// System.out.println("Got " + ch1 + " in state " + ((theState == S_ENT) ? "S_ENT" : ((theState == S_NCR) ? "S_NCR" : "UNK"))); + if (theState == S_ENT && ch1 == '#') { + theNextState = S_NCR; + save(ch, h); + break; + } + else if (theState == S_NCR && (ch1 == 'x' || ch1 == 'X')) { + theNextState = S_XNCR; + save(ch, h); + break; + } + else if (theState == S_ENT && Character.isLetterOrDigit(ch1)) { + save(ch, h); + break; + } + else if (theState == S_NCR && Character.isDigit(ch1)) { + save(ch, h); + break; + } + else if (theState == S_XNCR && (Character.isDigit(ch1) || "abcdefABCDEF".indexOf(ch1) != -1)) { + save(ch, h); + break; + } + + // The whole entity reference has been collected +// System.err.println("%%" + new String(theOutputBuffer, 0, theSize)); + h.entity(theOutputBuffer, 1, theSize - 1); + int ent = h.getEntity(); +// System.err.println("%% value = " + ent); + if (ent != 0) { + theSize = 0; + if (ent >= 0x80 && ent <= 0x9F) { + ent = theWinMap[ent-0x80]; + } + if (ent < 0x20) { + // Control becomes space + ent = 0x20; + } + else if (ent >= 0xD800 && ent <= 0xDFFF) { + // Surrogates get dropped + ent = 0; + } + else if (ent <= 0xFFFF) { + // BMP character + save(ent, h); + } + else { + // Astral converted to two surrogates + ent -= 0x10000; + save((ent>>10) + 0xD800, h); + save((ent&0x3FF) + 0xDC00, h); + } + if (ch != ';') { + unread(r, ch); + theCurrentColumn--; + } + } + else { + unread(r, ch); + theCurrentColumn--; + } + theNextState = S_PCDATA; + break; + case A_ETAG: + h.etag(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_DECL: + h.decl(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_GI: + h.gi(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_GI_STAGC: + h.gi(theOutputBuffer, 0, theSize); + theSize = 0; + h.stagc(theOutputBuffer, 0, theSize); + break; + case A_LT: + mark(); + save('<', h); + save(ch, h); + break; + case A_LT_PCDATA: + mark(); + save('<', h); + h.pcdata(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_PCDATA: + mark(); + h.pcdata(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_CMNT: + mark(); + h.cmnt(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_MINUS3: + save('-', h); + save(' ', h); + break; + case A_MINUS2: + save('-', h); + save(' ', h); + // fall through into A_MINUS + case A_MINUS: + save('-', h); + save(ch, h); + break; + case A_PI: + mark(); + h.pi(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_PITARGET: + h.pitarget(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_PITARGET_PI: + h.pitarget(theOutputBuffer, 0, theSize); + theSize = 0; + h.pi(theOutputBuffer, 0, theSize); + break; + case A_SAVE: + save(ch, h); + break; + case A_SKIP: + break; + case A_SP: + save(' ', h); + break; + case A_STAGC: + h.stagc(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_EMPTYTAG: + mark(); +// System.err.println("%%% Empty tag seen"); + if (theSize > 0) h.gi(theOutputBuffer, 0, theSize); + theSize = 0; + h.stage(theOutputBuffer, 0, theSize); + break; + case A_UNGET: + unread(r, ch); + theCurrentColumn--; + break; + case A_UNSAVE_PCDATA: + if (theSize > 0) theSize--; + h.pcdata(theOutputBuffer, 0, theSize); + theSize = 0; + break; + default: + throw new Error("Can't process state " + action); + } + theState = theNextState; + } + h.eof(theOutputBuffer, 0, 0); + } + + /** + * Mark the current scan position as a "point of interest" - start of a tag, + * cdata, processing instruction etc. + */ + + private void mark() { + theLastColumn = theCurrentColumn; + theLastLine = theCurrentLine; + } + + /** + A callback for the ScanHandler that allows it to force + the lexer state to CDATA content (no markup is recognized except + the end of element. + */ + + public void startCDATA() { theNextState = S_CDATA; } + + private void save(int ch, ScanHandler h) throws IOException, SAXException { + if (theSize >= theOutputBuffer.length - 20) { + if (theState == S_PCDATA || theState == S_CDATA) { + // Return a buffer-sized chunk of PCDATA + h.pcdata(theOutputBuffer, 0, theSize); + theSize = 0; + } + else { + // Grow the buffer size + char[] newOutputBuffer = new char[theOutputBuffer.length * 2]; + System.arraycopy(theOutputBuffer, 0, newOutputBuffer, 0, theSize+1); + theOutputBuffer = newOutputBuffer; + } + } + theOutputBuffer[theSize++] = (char)ch; + } + + /** + Test procedure. Reads HTML from the standard input and writes + PYX to the standard output. + */ + + public static void main(String[] argv) throws IOException, SAXException { + Scanner s = new HTMLScanner(); + Reader r = new InputStreamReader(System.in, "UTF-8"); + Writer w = new OutputStreamWriter(System.out, "UTF-8"); + PYXWriter pw = new PYXWriter(w); + s.scan(r, pw); + w.close(); + } + + + private static String nicechar(int in) { + if (in == '\n') return "\\n"; + if (in < 32) return "0x"+Integer.toHexString(in); + return "'"+((char)in)+"'"; + } + + } diff --git a/src/templates/org/ccil/cowan/tagsoup/HTMLSchema.java b/src/templates/org/ccil/cowan/tagsoup/HTMLSchema.java new file mode 100644 index 0000000..8def657 --- /dev/null +++ b/src/templates/org/ccil/cowan/tagsoup/HTMLSchema.java @@ -0,0 +1,38 @@ +// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan. +// +// TagSoup is licensed under the Apache License, +// Version 2.0. You may obtain a copy of this license at +// http://www.apache.org/licenses/LICENSE-2.0 . You may also have +// additional legal rights not granted by this license. +// +// TagSoup is distributed in the hope that it will be useful, but +// unless required by applicable law or agreed to in writing, TagSoup +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS +// OF ANY KIND, either express or implied; not even the implied warranty +// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +// +// +/** +This class provides a Schema that has been preinitialized with HTML +elements, attributes, and character entity declarations. All the declarations +normally provided with HTML 4.01 are given, plus some that are IE-specific +and NS4-specific. Attribute declarations of type CDATA with no default +value are not included. +*/ + +package org.ccil.cowan.tagsoup; +public class HTMLSchema extends Schema implements HTMLModels { + + /** + Returns a newly constructed HTMLSchema object independent of + any existing ones. + */ + + public HTMLSchema() { + // Start of Schema calls + @@SCHEMA_CALLS@@ + // End of Schema calls + } + + + } |