diff options
author | The Android Open Source Project <initial-contribution@android.com> | 2008-10-21 07:00:00 -0700 |
---|---|---|
committer | The Android Open Source Project <initial-contribution@android.com> | 2008-10-21 07:00:00 -0700 |
commit | 926d907a2fb69573f1c6337f064645dde18b1e5e (patch) | |
tree | 5a3788da7cfdb44d8c15bb91c6605606d8ac4636 /src/org/ccil/cowan/tagsoup/HTMLScanner.java | |
parent | 70e83658cac1d0d766e93853e3698921af269a37 (diff) | |
download | tagsoup-926d907a2fb69573f1c6337f064645dde18b1e5e.tar.gz |
Initial Contribution android-1.0 release-1.0 cdma-import
Diffstat (limited to 'src/org/ccil/cowan/tagsoup/HTMLScanner.java')
-rw-r--r-- | src/org/ccil/cowan/tagsoup/HTMLScanner.java | 648 |
1 files changed, 648 insertions, 0 deletions
diff --git a/src/org/ccil/cowan/tagsoup/HTMLScanner.java b/src/org/ccil/cowan/tagsoup/HTMLScanner.java new file mode 100644 index 0000000..0d73ff7 --- /dev/null +++ b/src/org/ccil/cowan/tagsoup/HTMLScanner.java @@ -0,0 +1,648 @@ +// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan. +// +// TagSoup is licensed under the Apache License, +// Version 2.0. You may obtain a copy of this license at +// http://www.apache.org/licenses/LICENSE-2.0 . You may also have +// additional legal rights not granted by this license. +// +// TagSoup is distributed in the hope that it will be useful, but +// unless required by applicable law or agreed to in writing, TagSoup +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS +// OF ANY KIND, either express or implied; not even the implied warranty +// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +// +// +package org.ccil.cowan.tagsoup; +import java.io.*; +import org.xml.sax.SAXException; +import org.xml.sax.Locator; + +/** +This class implements a table-driven scanner for HTML, allowing for lots of +defects. It implements the Scanner interface, which accepts a Reader +object to fetch characters from and a ScanHandler object to report lexical +events to. 
+*/ + +public class HTMLScanner implements Scanner, Locator { + + // Start of state table + private static final int S_ANAME = 1; + private static final int S_APOS = 2; + private static final int S_AVAL = 3; + private static final int S_BB = 4; + private static final int S_BBC = 5; + private static final int S_BBCD = 6; + private static final int S_BBCDA = 7; + private static final int S_BBCDAT = 8; + private static final int S_BBCDATA = 9; + private static final int S_CDATA = 10; + private static final int S_CDATA2 = 11; + private static final int S_CDSECT = 12; + private static final int S_CDSECT1 = 13; + private static final int S_CDSECT2 = 14; + private static final int S_COM = 15; + private static final int S_COM2 = 16; + private static final int S_COM3 = 17; + private static final int S_COM4 = 18; + private static final int S_DECL = 19; + private static final int S_DECL2 = 20; + private static final int S_DONE = 21; + private static final int S_EMPTYTAG = 22; + private static final int S_ENT = 23; + private static final int S_EQ = 24; + private static final int S_ETAG = 25; + private static final int S_GI = 26; + private static final int S_NCR = 27; + private static final int S_PCDATA = 28; + private static final int S_PI = 29; + private static final int S_PITARGET = 30; + private static final int S_QUOT = 31; + private static final int S_STAGC = 32; + private static final int S_TAG = 33; + private static final int S_TAGWS = 34; + private static final int S_XNCR = 35; + private static final int A_ADUP = 1; + private static final int A_ADUP_SAVE = 2; + private static final int A_ADUP_STAGC = 3; + private static final int A_ANAME = 4; + private static final int A_ANAME_ADUP = 5; + private static final int A_ANAME_ADUP_STAGC = 6; + private static final int A_AVAL = 7; + private static final int A_AVAL_STAGC = 8; + private static final int A_CDATA = 9; + private static final int A_CMNT = 10; + private static final int A_DECL = 11; + private static final int 
A_EMPTYTAG = 12; + private static final int A_ENTITY = 13; + private static final int A_ENTITY_START = 14; + private static final int A_ETAG = 15; + private static final int A_GI = 16; + private static final int A_GI_STAGC = 17; + private static final int A_LT = 18; + private static final int A_LT_PCDATA = 19; + private static final int A_MINUS = 20; + private static final int A_MINUS2 = 21; + private static final int A_MINUS3 = 22; + private static final int A_PCDATA = 23; + private static final int A_PI = 24; + private static final int A_PITARGET = 25; + private static final int A_PITARGET_PI = 26; + private static final int A_SAVE = 27; + private static final int A_SKIP = 28; + private static final int A_SP = 29; + private static final int A_STAGC = 30; + private static final int A_UNGET = 31; + private static final int A_UNSAVE_PCDATA = 32; + private static int[] statetable = { + S_ANAME, '/', A_ANAME_ADUP, S_EMPTYTAG, + S_ANAME, '=', A_ANAME, S_AVAL, + S_ANAME, '>', A_ANAME_ADUP_STAGC, S_PCDATA, + S_ANAME, 0, A_SAVE, S_ANAME, + S_ANAME, -1, A_ANAME_ADUP_STAGC, S_DONE, + S_ANAME, ' ', A_ANAME, S_EQ, + S_ANAME, '\n', A_ANAME, S_EQ, + S_ANAME, '\t', A_ANAME, S_EQ, + S_APOS, '\'', A_AVAL, S_TAGWS, + S_APOS, 0, A_SAVE, S_APOS, + S_APOS, -1, A_AVAL_STAGC, S_DONE, + S_APOS, ' ', A_SP, S_APOS, + S_APOS, '\n', A_SP, S_APOS, + S_APOS, '\t', A_SP, S_APOS, + S_AVAL, '\'', A_SKIP, S_APOS, + S_AVAL, '"', A_SKIP, S_QUOT, + S_AVAL, '>', A_AVAL_STAGC, S_PCDATA, + S_AVAL, 0, A_SAVE, S_STAGC, + S_AVAL, -1, A_AVAL_STAGC, S_DONE, + S_AVAL, ' ', A_SKIP, S_AVAL, + S_AVAL, '\n', A_SKIP, S_AVAL, + S_AVAL, '\t', A_SKIP, S_AVAL, + S_BB, 'C', A_SKIP, S_BBC, + S_BB, 0, A_SKIP, S_DECL, + S_BB, -1, A_SKIP, S_DONE, + S_BBC, 'D', A_SKIP, S_BBCD, + S_BBC, 0, A_SKIP, S_DECL, + S_BBC, -1, A_SKIP, S_DONE, + S_BBCD, 'A', A_SKIP, S_BBCDA, + S_BBCD, 0, A_SKIP, S_DECL, + S_BBCD, -1, A_SKIP, S_DONE, + S_BBCDA, 'T', A_SKIP, S_BBCDAT, + S_BBCDA, 0, A_SKIP, S_DECL, + S_BBCDA, -1, A_SKIP, S_DONE, + 
S_BBCDAT, 'A', A_SKIP, S_BBCDATA, + S_BBCDAT, 0, A_SKIP, S_DECL, + S_BBCDAT, -1, A_SKIP, S_DONE, + S_BBCDATA, '[', A_SKIP, S_CDSECT, + S_BBCDATA, 0, A_SKIP, S_DECL, + S_BBCDATA, -1, A_SKIP, S_DONE, + S_CDATA, '<', A_SAVE, S_CDATA2, + S_CDATA, 0, A_SAVE, S_CDATA, + S_CDATA, -1, A_PCDATA, S_DONE, + S_CDATA2, '/', A_UNSAVE_PCDATA, S_ETAG, + S_CDATA2, 0, A_SAVE, S_CDATA, + S_CDATA2, -1, A_UNSAVE_PCDATA, S_DONE, + S_CDSECT, ']', A_SAVE, S_CDSECT1, + S_CDSECT, 0, A_SAVE, S_CDSECT, + S_CDSECT, -1, A_SKIP, S_DONE, + S_CDSECT1, ']', A_SAVE, S_CDSECT2, + S_CDSECT1, 0, A_SAVE, S_CDSECT, + S_CDSECT1, -1, A_SKIP, S_DONE, + S_CDSECT2, '>', A_CDATA, S_PCDATA, + S_CDSECT2, 0, A_SAVE, S_CDSECT, + S_CDSECT2, -1, A_SKIP, S_DONE, + S_COM, '-', A_SKIP, S_COM2, + S_COM, 0, A_SAVE, S_COM2, + S_COM, -1, A_CMNT, S_DONE, + S_COM2, '-', A_SKIP, S_COM3, + S_COM2, 0, A_SAVE, S_COM2, + S_COM2, -1, A_CMNT, S_DONE, + S_COM3, '-', A_SKIP, S_COM4, + S_COM3, 0, A_MINUS, S_COM2, + S_COM3, -1, A_CMNT, S_DONE, + S_COM4, '-', A_MINUS3, S_COM4, + S_COM4, '>', A_CMNT, S_PCDATA, + S_COM4, 0, A_MINUS2, S_COM2, + S_COM4, -1, A_CMNT, S_DONE, + S_DECL, '-', A_SKIP, S_COM, + S_DECL, '[', A_SKIP, S_BB, + S_DECL, '>', A_SKIP, S_PCDATA, + S_DECL, 0, A_SAVE, S_DECL2, + S_DECL, -1, A_SKIP, S_DONE, + S_DECL2, '>', A_DECL, S_PCDATA, + S_DECL2, 0, A_SAVE, S_DECL2, + S_DECL2, -1, A_SKIP, S_DONE, + S_EMPTYTAG, '>', A_EMPTYTAG, S_PCDATA, + S_EMPTYTAG, 0, A_SAVE, S_ANAME, + S_EMPTYTAG, ' ', A_SKIP, S_TAGWS, + S_EMPTYTAG, '\n', A_SKIP, S_TAGWS, + S_EMPTYTAG, '\t', A_SKIP, S_TAGWS, + S_ENT, 0, A_ENTITY, S_ENT, + S_ENT, -1, A_ENTITY, S_DONE, + S_EQ, '=', A_SKIP, S_AVAL, + S_EQ, '>', A_ADUP_STAGC, S_PCDATA, + S_EQ, 0, A_ADUP_SAVE, S_ANAME, + S_EQ, -1, A_ADUP_STAGC, S_DONE, + S_EQ, ' ', A_SKIP, S_EQ, + S_EQ, '\n', A_SKIP, S_EQ, + S_EQ, '\t', A_SKIP, S_EQ, + S_ETAG, '>', A_ETAG, S_PCDATA, + S_ETAG, 0, A_SAVE, S_ETAG, + S_ETAG, -1, A_ETAG, S_DONE, + S_ETAG, ' ', A_SKIP, S_ETAG, + S_ETAG, '\n', A_SKIP, S_ETAG, + S_ETAG, '\t', 
A_SKIP, S_ETAG, + S_GI, '/', A_SKIP, S_EMPTYTAG, + S_GI, '>', A_GI_STAGC, S_PCDATA, + S_GI, 0, A_SAVE, S_GI, + S_GI, -1, A_SKIP, S_DONE, + S_GI, ' ', A_GI, S_TAGWS, + S_GI, '\n', A_GI, S_TAGWS, + S_GI, '\t', A_GI, S_TAGWS, + S_NCR, 0, A_ENTITY, S_NCR, + S_NCR, -1, A_ENTITY, S_DONE, + S_PCDATA, '&', A_ENTITY_START, S_ENT, + S_PCDATA, '<', A_PCDATA, S_TAG, + S_PCDATA, 0, A_SAVE, S_PCDATA, + S_PCDATA, -1, A_PCDATA, S_DONE, + S_PI, '>', A_PI, S_PCDATA, + S_PI, 0, A_SAVE, S_PI, + S_PI, -1, A_PI, S_DONE, + S_PITARGET, '>', A_PITARGET_PI, S_PCDATA, + S_PITARGET, 0, A_SAVE, S_PITARGET, + S_PITARGET, -1, A_PITARGET_PI, S_DONE, + S_PITARGET, ' ', A_PITARGET, S_PI, + S_PITARGET, '\n', A_PITARGET, S_PI, + S_PITARGET, '\t', A_PITARGET, S_PI, + S_QUOT, '"', A_AVAL, S_TAGWS, + S_QUOT, 0, A_SAVE, S_QUOT, + S_QUOT, -1, A_AVAL_STAGC, S_DONE, + S_QUOT, ' ', A_SP, S_QUOT, + S_QUOT, '\n', A_SP, S_QUOT, + S_QUOT, '\t', A_SP, S_QUOT, + S_STAGC, '>', A_AVAL_STAGC, S_PCDATA, + S_STAGC, 0, A_SAVE, S_STAGC, + S_STAGC, -1, A_AVAL_STAGC, S_DONE, + S_STAGC, ' ', A_AVAL, S_TAGWS, + S_STAGC, '\n', A_AVAL, S_TAGWS, + S_STAGC, '\t', A_AVAL, S_TAGWS, + S_TAG, '!', A_SKIP, S_DECL, + S_TAG, '?', A_SKIP, S_PITARGET, + S_TAG, '/', A_SKIP, S_ETAG, + S_TAG, '<', A_SAVE, S_TAG, + S_TAG, 0, A_SAVE, S_GI, + S_TAG, -1, A_LT_PCDATA, S_DONE, + S_TAG, ' ', A_LT, S_PCDATA, + S_TAG, '\n', A_LT, S_PCDATA, + S_TAG, '\t', A_LT, S_PCDATA, + S_TAGWS, '/', A_SKIP, S_EMPTYTAG, + S_TAGWS, '>', A_STAGC, S_PCDATA, + S_TAGWS, 0, A_SAVE, S_ANAME, + S_TAGWS, -1, A_STAGC, S_DONE, + S_TAGWS, ' ', A_SKIP, S_TAGWS, + S_TAGWS, '\n', A_SKIP, S_TAGWS, + S_TAGWS, '\t', A_SKIP, S_TAGWS, + S_XNCR, 0, A_ENTITY, S_XNCR, + S_XNCR, -1, A_ENTITY, S_DONE, + + }; + private static final String[] debug_actionnames = { "", "A_ADUP", "A_ADUP_SAVE", "A_ADUP_STAGC", "A_ANAME", "A_ANAME_ADUP", "A_ANAME_ADUP_STAGC", "A_AVAL", "A_AVAL_STAGC", "A_CDATA", "A_CMNT", "A_DECL", "A_EMPTYTAG", "A_ENTITY", "A_ENTITY_START", "A_ETAG", "A_GI", "A_GI_STAGC", 
"A_LT", "A_LT_PCDATA", "A_MINUS", "A_MINUS2", "A_MINUS3", "A_PCDATA", "A_PI", "A_PITARGET", "A_PITARGET_PI", "A_SAVE", "A_SKIP", "A_SP", "A_STAGC", "A_UNGET", "A_UNSAVE_PCDATA"}; + private static final String[] debug_statenames = { "", "S_ANAME", "S_APOS", "S_AVAL", "S_BB", "S_BBC", "S_BBCD", "S_BBCDA", "S_BBCDAT", "S_BBCDATA", "S_CDATA", "S_CDATA2", "S_CDSECT", "S_CDSECT1", "S_CDSECT2", "S_COM", "S_COM2", "S_COM3", "S_COM4", "S_DECL", "S_DECL2", "S_DONE", "S_EMPTYTAG", "S_ENT", "S_EQ", "S_ETAG", "S_GI", "S_NCR", "S_PCDATA", "S_PI", "S_PITARGET", "S_QUOT", "S_STAGC", "S_TAG", "S_TAGWS", "S_XNCR"}; + + + // End of state table + + private String thePublicid; // Locator state + private String theSystemid; + private int theLastLine; + private int theLastColumn; + private int theCurrentLine; + private int theCurrentColumn; + + int theState; // Current state + int theNextState; // Next state + char[] theOutputBuffer = new char[200]; // Output buffer + int theSize; // Current buffer size + int[] theWinMap = { // Windows chars map + 0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, + 0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, + 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178}; + + // Compensate for bug in PushbackReader that allows + // pushing back EOF. + private void unread(PushbackReader r, int c) throws IOException { + if (c != -1) r.unread(c); + } + + // Locator implementation + + public int getLineNumber() { + return theLastLine; + } + public int getColumnNumber() { + return theLastColumn; + } + public String getPublicId() { + return thePublicid; + } + public String getSystemId() { + return theSystemid; + } + + + // Scanner implementation + + /** + Reset document locator, supplying systemid and publicid. 
+ @param systemid System id + @param publicid Public id + */ + + public void resetDocumentLocator(String publicid, String systemid) { + thePublicid = publicid; + theSystemid = systemid; + theLastLine = theLastColumn = theCurrentLine = theCurrentColumn = 0; + } + + /** + Scan HTML source, reporting lexical events. + @param r0 Reader that provides characters + @param h ScanHandler that accepts lexical events. + */ + + public void scan(Reader r0, ScanHandler h) throws IOException, SAXException { + theState = S_PCDATA; + PushbackReader r; + if (r0 instanceof PushbackReader) { + r = (PushbackReader)r0; + } + else if (r0 instanceof BufferedReader) { + r = new PushbackReader(r0); + } + else { + r = new PushbackReader(new BufferedReader(r0)); + } + + int firstChar = r.read(); // Remove any leading BOM + if (firstChar != '\uFEFF') unread(r, firstChar); + + while (theState != S_DONE) { + int ch = r.read(); + + // Process control characters + if (ch >= 0x80 && ch <= 0x9F) ch = theWinMap[ch-0x80]; + + if (ch == '\r') { + ch = r.read(); // expect LF next + if (ch != '\n') { + unread(r, ch); // nope + ch = '\n'; + } + } + + if (ch == '\n') { + theCurrentLine++; + theCurrentColumn = 0; + } + else { + theCurrentColumn++; + } + + if (!(ch >= 0x20 || ch == '\n' || ch == '\t' || ch == -1)) continue; + + // Search state table + int action = 0; + for (int i = 0; i < statetable.length; i += 4) { + if (theState != statetable[i]) { + if (action != 0) break; + continue; + } + if (statetable[i+1] == 0) { + action = statetable[i+2]; + theNextState = statetable[i+3]; + } + else if (statetable[i+1] == ch) { + action = statetable[i+2]; + theNextState = statetable[i+3]; + break; + } + } +// System.err.println("In " + debug_statenames[theState] + " got " + nicechar(ch) + " doing " + debug_actionnames[action] + " then " + debug_statenames[theNextState]); + switch (action) { + case 0: + throw new Error( +"HTMLScanner can't cope with " + Integer.toString(ch) + " in state " + 
+Integer.toString(theState)); + case A_ADUP: + h.adup(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_ADUP_SAVE: + h.adup(theOutputBuffer, 0, theSize); + theSize = 0; + save(ch, h); + break; + case A_ADUP_STAGC: + h.adup(theOutputBuffer, 0, theSize); + theSize = 0; + h.stagc(theOutputBuffer, 0, theSize); + break; + case A_ANAME: + h.aname(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_ANAME_ADUP: + h.aname(theOutputBuffer, 0, theSize); + theSize = 0; + h.adup(theOutputBuffer, 0, theSize); + break; + case A_ANAME_ADUP_STAGC: + h.aname(theOutputBuffer, 0, theSize); + theSize = 0; + h.adup(theOutputBuffer, 0, theSize); + h.stagc(theOutputBuffer, 0, theSize); + break; + case A_AVAL: + h.aval(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_AVAL_STAGC: + h.aval(theOutputBuffer, 0, theSize); + theSize = 0; + h.stagc(theOutputBuffer, 0, theSize); + break; + case A_CDATA: + mark(); + // suppress the final "]]" in the buffer + if (theSize > 1) theSize -= 2; + h.pcdata(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_ENTITY_START: + h.pcdata(theOutputBuffer, 0, theSize); + theSize = 0; + save(ch, h); + break; + case A_ENTITY: + mark(); + char ch1 = (char)ch; +// System.out.println("Got " + ch1 + " in state " + ((theState == S_ENT) ? "S_ENT" : ((theState == S_NCR) ? 
"S_NCR" : "UNK"))); + if (theState == S_ENT && ch1 == '#') { + theNextState = S_NCR; + save(ch, h); + break; + } + else if (theState == S_NCR && (ch1 == 'x' || ch1 == 'X')) { + theNextState = S_XNCR; + save(ch, h); + break; + } + else if (theState == S_ENT && Character.isLetterOrDigit(ch1)) { + save(ch, h); + break; + } + else if (theState == S_NCR && Character.isDigit(ch1)) { + save(ch, h); + break; + } + else if (theState == S_XNCR && (Character.isDigit(ch1) || "abcdefABCDEF".indexOf(ch1) != -1)) { + save(ch, h); + break; + } + + // The whole entity reference has been collected +// System.err.println("%%" + new String(theOutputBuffer, 0, theSize)); + h.entity(theOutputBuffer, 1, theSize - 1); + int ent = h.getEntity(); +// System.err.println("%% value = " + ent); + if (ent != 0) { + theSize = 0; + if (ent >= 0x80 && ent <= 0x9F) { + ent = theWinMap[ent-0x80]; + } + if (ent < 0x20) { + // Control becomes space + ent = 0x20; + } + else if (ent >= 0xD800 && ent <= 0xDFFF) { + // Surrogates get dropped + ent = 0; + } + else if (ent <= 0xFFFF) { + // BMP character + save(ent, h); + } + else { + // Astral converted to two surrogates + ent -= 0x10000; + save((ent>>10) + 0xD800, h); + save((ent&0x3FF) + 0xDC00, h); + } + if (ch != ';') { + unread(r, ch); + theCurrentColumn--; + } + } + else { + unread(r, ch); + theCurrentColumn--; + } + theNextState = S_PCDATA; + break; + case A_ETAG: + h.etag(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_DECL: + h.decl(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_GI: + h.gi(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_GI_STAGC: + h.gi(theOutputBuffer, 0, theSize); + theSize = 0; + h.stagc(theOutputBuffer, 0, theSize); + break; + case A_LT: + mark(); + save('<', h); + save(ch, h); + break; + case A_LT_PCDATA: + mark(); + save('<', h); + h.pcdata(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_PCDATA: + mark(); + h.pcdata(theOutputBuffer, 0, theSize); + theSize = 0; + 
break; + case A_CMNT: + mark(); + h.cmnt(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_MINUS3: + save('-', h); + save(' ', h); + break; + case A_MINUS2: + save('-', h); + save(' ', h); + // fall through into A_MINUS + case A_MINUS: + save('-', h); + save(ch, h); + break; + case A_PI: + mark(); + h.pi(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_PITARGET: + h.pitarget(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_PITARGET_PI: + h.pitarget(theOutputBuffer, 0, theSize); + theSize = 0; + h.pi(theOutputBuffer, 0, theSize); + break; + case A_SAVE: + save(ch, h); + break; + case A_SKIP: + break; + case A_SP: + save(' ', h); + break; + case A_STAGC: + h.stagc(theOutputBuffer, 0, theSize); + theSize = 0; + break; + case A_EMPTYTAG: + mark(); +// System.err.println("%%% Empty tag seen"); + if (theSize > 0) h.gi(theOutputBuffer, 0, theSize); + theSize = 0; + h.stage(theOutputBuffer, 0, theSize); + break; + case A_UNGET: + unread(r, ch); + theCurrentColumn--; + break; + case A_UNSAVE_PCDATA: + if (theSize > 0) theSize--; + h.pcdata(theOutputBuffer, 0, theSize); + theSize = 0; + break; + default: + throw new Error("Can't process state " + action); + } + theState = theNextState; + } + h.eof(theOutputBuffer, 0, 0); + } + + /** + * Mark the current scan position as a "point of interest" - start of a tag, + * cdata, processing instruction etc. + */ + + private void mark() { + theLastColumn = theCurrentColumn; + theLastLine = theCurrentLine; + } + + /** + A callback for the ScanHandler that allows it to force + the lexer state to CDATA content (no markup is recognized except + the end of element. 
+ */ + + public void startCDATA() { theNextState = S_CDATA; } + + private void save(int ch, ScanHandler h) throws IOException, SAXException { + if (theSize >= theOutputBuffer.length - 20) { + if (theState == S_PCDATA || theState == S_CDATA) { + // Return a buffer-sized chunk of PCDATA + h.pcdata(theOutputBuffer, 0, theSize); + theSize = 0; + } + else { + // Grow the buffer size + char[] newOutputBuffer = new char[theOutputBuffer.length * 2]; + System.arraycopy(theOutputBuffer, 0, newOutputBuffer, 0, theSize+1); + theOutputBuffer = newOutputBuffer; + } + } + theOutputBuffer[theSize++] = (char)ch; + } + + /** + Test procedure. Reads HTML from the standard input and writes + PYX to the standard output. + */ + + public static void main(String[] argv) throws IOException, SAXException { + Scanner s = new HTMLScanner(); + Reader r = new InputStreamReader(System.in, "UTF-8"); + Writer w = new OutputStreamWriter(System.out, "UTF-8"); + PYXWriter pw = new PYXWriter(w); + s.scan(r, pw); + w.close(); + } + + + private static String nicechar(int in) { + if (in == '\n') return "\\n"; + if (in < 32) return "0x"+Integer.toHexString(in); + return "'"+((char)in)+"'"; + } + + } |