diff options
author | The Android Open Source Project <initial-contribution@android.com> | 2009-03-03 18:28:39 -0800 |
---|---|---|
committer | The Android Open Source Project <initial-contribution@android.com> | 2009-03-03 18:28:39 -0800 |
commit | b0e041b91e19d554585bc4423075929013f05f22 (patch) | |
tree | 4b825dc642cb6eb9a060e54bf8d69288fbee4904 /src/org/ccil/cowan/tagsoup/HTMLScanner.java | |
parent | 4bb395b502d0c2495f7a5d226ccf7f06f53dea38 (diff) | |
download | tagsoup-b0e041b91e19d554585bc4423075929013f05f22.tar.gz |
auto import from //depot/cupcake/@135843
Diffstat (limited to 'src/org/ccil/cowan/tagsoup/HTMLScanner.java')
-rw-r--r-- | src/org/ccil/cowan/tagsoup/HTMLScanner.java | 648 |
1 files changed, 0 insertions, 648 deletions
diff --git a/src/org/ccil/cowan/tagsoup/HTMLScanner.java b/src/org/ccil/cowan/tagsoup/HTMLScanner.java deleted file mode 100644 index f5980ee..0000000 --- a/src/org/ccil/cowan/tagsoup/HTMLScanner.java +++ /dev/null @@ -1,648 +0,0 @@ -// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan. -// -// TagSoup is licensed under the Apache License, -// Version 2.0. You may obtain a copy of this license at -// http://www.apache.org/licenses/LICENSE-2.0 . You may also have -// additional legal rights not granted by this license. -// -// TagSoup is distributed in the hope that it will be useful, but -// unless required by applicable law or agreed to in writing, TagSoup -// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS -// OF ANY KIND, either express or implied; not even the implied warranty -// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -// -// -package org.ccil.cowan.tagsoup; -import java.io.*; -import org.xml.sax.SAXException; -import org.xml.sax.Locator; - -/** -This class implements a table-driven scanner for HTML, allowing for lots of -defects. It implements the Scanner interface, which accepts a Reader -object to fetch characters from and a ScanHandler object to report lexical -events to. -*/ - -public class HTMLScanner implements Scanner, Locator { - - // Start of state table - private static final int S_ANAME = 1; - private static final int S_APOS = 2; - private static final int S_AVAL = 3; - private static final int S_BB = 4; - private static final int S_BBC = 5; - private static final int S_BBCD = 6; - private static final int S_BBCDA = 7; - private static final int S_BBCDAT = 8; - private static final int S_BBCDATA = 9; - private static final int S_CDATA = 10; - private static final int S_CDATA2 = 11; - private static final int S_CDSECT = 12; - private static final int S_CDSECT1 = 13; - private static final int S_CDSECT2 = 14; - private static final int S_COM = 15; - private static final int S_COM2 = 16; - private static final int S_COM3 = 17; - private static final int S_COM4 = 18; - private static final int S_DECL = 19; - private static final int S_DECL2 = 20; - private static final int S_DONE = 21; - private static final int S_EMPTYTAG = 22; - private static final int S_ENT = 23; - private static final int S_EQ = 24; - private static final int S_ETAG = 25; - private static final int S_GI = 26; - private static final int S_NCR = 27; - private static final int S_PCDATA = 28; - private static final int S_PI = 29; - private static final int S_PITARGET = 30; - private static final int S_QUOT = 31; - private static final int S_STAGC = 32; - private static final int S_TAG = 33; - private static final int S_TAGWS = 34; - private static final int S_XNCR = 35; - private static final int A_ADUP = 1; - private static final int A_ADUP_SAVE = 2; - private static final int A_ADUP_STAGC = 3; - private static final int A_ANAME = 4; - private static final int A_ANAME_ADUP = 5; - private static final int A_ANAME_ADUP_STAGC = 6; - private static final int A_AVAL = 7; - private static final int A_AVAL_STAGC = 8; - private static final int A_CDATA = 9; - private static final int A_CMNT = 10; - private static final int A_DECL = 11; - private static final int A_EMPTYTAG = 12; - private static final int A_ENTITY = 13; - private static final int A_ENTITY_START = 14; - private static final int A_ETAG = 15; - private static final int A_GI = 16; - private static final int A_GI_STAGC = 17; - private static final int A_LT = 18; - private static final int A_LT_PCDATA = 19; - private static final int A_MINUS = 20; - private static final int A_MINUS2 = 21; - private static final int A_MINUS3 = 22; - private static final int A_PCDATA = 23; - private static final int A_PI = 24; - private static final int A_PITARGET = 25; - private static final int A_PITARGET_PI = 26; - private static final int A_SAVE = 27; - private static final int A_SKIP = 28; - private static final int A_SP = 29; - private static final int A_STAGC = 30; - private static final int A_UNGET = 31; - private static final int A_UNSAVE_PCDATA = 32; - private static int[] statetable = { - S_ANAME, '/', A_ANAME_ADUP, S_EMPTYTAG, - S_ANAME, '=', A_ANAME, S_AVAL, - S_ANAME, '>', A_ANAME_ADUP_STAGC, S_PCDATA, - S_ANAME, 0, A_SAVE, S_ANAME, - S_ANAME, -1, A_ANAME_ADUP_STAGC, S_DONE, - S_ANAME, ' ', A_ANAME, S_EQ, - S_ANAME, '\n', A_ANAME, S_EQ, - S_ANAME, '\t', A_ANAME, S_EQ, - S_APOS, '\'', A_AVAL, S_TAGWS, - S_APOS, 0, A_SAVE, S_APOS, - S_APOS, -1, A_AVAL_STAGC, S_DONE, - S_APOS, ' ', A_SP, S_APOS, - S_APOS, '\n', A_SP, S_APOS, - S_APOS, '\t', A_SP, S_APOS, - S_AVAL, '\'', A_SKIP, S_APOS, - S_AVAL, '"', A_SKIP, S_QUOT, - S_AVAL, '>', A_AVAL_STAGC, S_PCDATA, - S_AVAL, 0, A_SAVE, S_STAGC, - S_AVAL, -1, A_AVAL_STAGC, S_DONE, - S_AVAL, ' ', A_SKIP, S_AVAL, - S_AVAL, '\n', A_SKIP, S_AVAL, - S_AVAL, '\t', A_SKIP, S_AVAL, - S_BB, 'C', A_SKIP, S_BBC, - S_BB, 0, A_SKIP, S_DECL, - S_BB, -1, A_SKIP, S_DONE, - S_BBC, 'D', A_SKIP, S_BBCD, - S_BBC, 0, A_SKIP, S_DECL, - S_BBC, -1, A_SKIP, S_DONE, - S_BBCD, 'A', A_SKIP, S_BBCDA, - S_BBCD, 0, A_SKIP, S_DECL, - S_BBCD, -1, A_SKIP, S_DONE, - S_BBCDA, 'T', A_SKIP, S_BBCDAT, - S_BBCDA, 0, A_SKIP, S_DECL, - S_BBCDA, -1, A_SKIP, S_DONE, - S_BBCDAT, 'A', A_SKIP, S_BBCDATA, - S_BBCDAT, 0, A_SKIP, S_DECL, - S_BBCDAT, -1, A_SKIP, S_DONE, - S_BBCDATA, '[', A_SKIP, S_CDSECT, - S_BBCDATA, 0, A_SKIP, S_DECL, - S_BBCDATA, -1, A_SKIP, S_DONE, - S_CDATA, '<', A_SAVE, S_CDATA2, - S_CDATA, 0, A_SAVE, S_CDATA, - S_CDATA, -1, A_PCDATA, S_DONE, - S_CDATA2, '/', A_UNSAVE_PCDATA, S_ETAG, - S_CDATA2, 0, A_SAVE, S_CDATA, - S_CDATA2, -1, A_UNSAVE_PCDATA, S_DONE, - S_CDSECT, ']', A_SAVE, S_CDSECT1, - S_CDSECT, 0, A_SAVE, S_CDSECT, - S_CDSECT, -1, A_SKIP, S_DONE, - S_CDSECT1, ']', A_SAVE, S_CDSECT2, - S_CDSECT1, 0, A_SAVE, S_CDSECT, - S_CDSECT1, -1, A_SKIP, S_DONE, - S_CDSECT2, '>', A_CDATA, S_PCDATA, - S_CDSECT2, 0, A_SAVE, S_CDSECT, - S_CDSECT2, -1, A_SKIP, S_DONE, - S_COM, '-', A_SKIP, S_COM2, - S_COM, 0, A_SAVE, S_COM2, - S_COM, -1, A_CMNT, S_DONE, - S_COM2, '-', A_SKIP, S_COM3, - S_COM2, 0, A_SAVE, S_COM2, - S_COM2, -1, A_CMNT, S_DONE, - S_COM3, '-', A_SKIP, S_COM4, - S_COM3, 0, A_MINUS, S_COM2, - S_COM3, -1, A_CMNT, S_DONE, - S_COM4, '-', A_MINUS3, S_COM4, - S_COM4, '>', A_CMNT, S_PCDATA, - S_COM4, 0, A_MINUS2, S_COM2, - S_COM4, -1, A_CMNT, S_DONE, - S_DECL, '-', A_SKIP, S_COM, - S_DECL, '[', A_SKIP, S_BB, - S_DECL, '>', A_SKIP, S_PCDATA, - S_DECL, 0, A_SAVE, S_DECL2, - S_DECL, -1, A_SKIP, S_DONE, - S_DECL2, '>', A_DECL, S_PCDATA, - S_DECL2, 0, A_SAVE, S_DECL2, - S_DECL2, -1, A_SKIP, S_DONE, - S_EMPTYTAG, '>', A_EMPTYTAG, S_PCDATA, - S_EMPTYTAG, 0, A_SAVE, S_ANAME, - S_EMPTYTAG, ' ', A_SKIP, S_TAGWS, - S_EMPTYTAG, '\n', A_SKIP, S_TAGWS, - S_EMPTYTAG, '\t', A_SKIP, S_TAGWS, - S_ENT, 0, A_ENTITY, S_ENT, - S_ENT, -1, A_ENTITY, S_DONE, - S_EQ, '=', A_SKIP, S_AVAL, - S_EQ, '>', A_ADUP_STAGC, S_PCDATA, - S_EQ, 0, A_ADUP_SAVE, S_ANAME, - S_EQ, -1, A_ADUP_STAGC, S_DONE, - S_EQ, ' ', A_SKIP, S_EQ, - S_EQ, '\n', A_SKIP, S_EQ, - S_EQ, '\t', A_SKIP, S_EQ, - S_ETAG, '>', A_ETAG, S_PCDATA, - S_ETAG, 0, A_SAVE, S_ETAG, - S_ETAG, -1, A_ETAG, S_DONE, - S_ETAG, ' ', A_SKIP, S_ETAG, - S_ETAG, '\n', A_SKIP, S_ETAG, - S_ETAG, '\t', A_SKIP, S_ETAG, - S_GI, '/', A_SKIP, S_EMPTYTAG, - S_GI, '>', A_GI_STAGC, S_PCDATA, - S_GI, 0, A_SAVE, S_GI, - S_GI, -1, A_SKIP, S_DONE, - S_GI, ' ', A_GI, S_TAGWS, - S_GI, '\n', A_GI, S_TAGWS, - S_GI, '\t', A_GI, S_TAGWS, - S_NCR, 0, A_ENTITY, S_NCR, - S_NCR, -1, A_ENTITY, S_DONE, - S_PCDATA, '&', A_ENTITY_START, S_ENT, - S_PCDATA, '<', A_PCDATA, S_TAG, - S_PCDATA, 0, A_SAVE, S_PCDATA, - S_PCDATA, -1, A_PCDATA, S_DONE, - S_PI, '>', A_PI, S_PCDATA, - S_PI, 0, A_SAVE, S_PI, - S_PI, -1, A_PI, S_DONE, - S_PITARGET, '>', A_PITARGET_PI, S_PCDATA, - S_PITARGET, 0, A_SAVE, S_PITARGET, - S_PITARGET, -1, A_PITARGET_PI, S_DONE, - S_PITARGET, ' ', A_PITARGET, S_PI, - S_PITARGET, '\n', A_PITARGET, S_PI, - S_PITARGET, '\t', A_PITARGET, S_PI, - S_QUOT, '"', A_AVAL, S_TAGWS, - S_QUOT, 0, A_SAVE, S_QUOT, - S_QUOT, -1, A_AVAL_STAGC, S_DONE, - S_QUOT, ' ', A_SP, S_QUOT, - S_QUOT, '\n', A_SP, S_QUOT, - S_QUOT, '\t', A_SP, S_QUOT, - S_STAGC, '>', A_AVAL_STAGC, S_PCDATA, - S_STAGC, 0, A_SAVE, S_STAGC, - S_STAGC, -1, A_AVAL_STAGC, S_DONE, - S_STAGC, ' ', A_AVAL, S_TAGWS, - S_STAGC, '\n', A_AVAL, S_TAGWS, - S_STAGC, '\t', A_AVAL, S_TAGWS, - S_TAG, '!', A_SKIP, S_DECL, - S_TAG, '?', A_SKIP, S_PITARGET, - S_TAG, '/', A_SKIP, S_ETAG, - S_TAG, '<', A_SAVE, S_TAG, - S_TAG, 0, A_SAVE, S_GI, - S_TAG, -1, A_LT_PCDATA, S_DONE, - S_TAG, ' ', A_LT, S_PCDATA, - S_TAG, '\n', A_LT, S_PCDATA, - S_TAG, '\t', A_LT, S_PCDATA, - S_TAGWS, '/', A_SKIP, S_EMPTYTAG, - S_TAGWS, '>', A_STAGC, S_PCDATA, - S_TAGWS, 0, A_SAVE, S_ANAME, - S_TAGWS, -1, A_STAGC, S_DONE, - S_TAGWS, ' ', A_SKIP, S_TAGWS, - S_TAGWS, '\n', A_SKIP, S_TAGWS, - S_TAGWS, '\t', A_SKIP, S_TAGWS, - S_XNCR, 0, A_ENTITY, S_XNCR, - S_XNCR, -1, A_ENTITY, S_DONE, - - }; - private static final String[] debug_actionnames = { "", "A_ADUP", "A_ADUP_SAVE", "A_ADUP_STAGC", "A_ANAME", "A_ANAME_ADUP", "A_ANAME_ADUP_STAGC", "A_AVAL", "A_AVAL_STAGC", "A_CDATA", "A_CMNT", "A_DECL", "A_EMPTYTAG", "A_ENTITY", "A_ENTITY_START", "A_ETAG", "A_GI", "A_GI_STAGC", "A_LT", "A_LT_PCDATA", "A_MINUS", "A_MINUS2", "A_MINUS3", "A_PCDATA", "A_PI", "A_PITARGET", "A_PITARGET_PI", "A_SAVE", "A_SKIP", "A_SP", "A_STAGC", "A_UNGET", "A_UNSAVE_PCDATA"}; - private static final String[] debug_statenames = { "", "S_ANAME", "S_APOS", "S_AVAL", "S_BB", "S_BBC", "S_BBCD", "S_BBCDA", "S_BBCDAT", "S_BBCDATA", "S_CDATA", "S_CDATA2", "S_CDSECT", "S_CDSECT1", "S_CDSECT2", "S_COM", "S_COM2", "S_COM3", "S_COM4", "S_DECL", "S_DECL2", "S_DONE", "S_EMPTYTAG", "S_ENT", "S_EQ", "S_ETAG", "S_GI", "S_NCR", "S_PCDATA", "S_PI", "S_PITARGET", "S_QUOT", "S_STAGC", "S_TAG", "S_TAGWS", "S_XNCR"}; - - - // End of state table - - private String thePublicid; // Locator state - private String theSystemid; - private int theLastLine; - private int theLastColumn; - private int theCurrentLine; - private int theCurrentColumn; - - int theState; // Current state - int theNextState; // Next state - char[] theOutputBuffer = new char[200]; // Output buffer - int theSize; // Current buffer size - int[] theWinMap = { // Windows chars map - 0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, - 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, - 0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, - 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178}; - - // Compensate for bug in PushbackReader that allows - // pushing back EOF. - private void unread(PushbackReader r, int c) throws IOException { - if (c != -1) r.unread(c); - } - - // Locator implementation - - public int getLineNumber() { - return theLastLine; - } - public int getColumnNumber() { - return theLastColumn; - } - public String getPublicId() { - return thePublicid; - } - public String getSystemId() { - return theSystemid; - } - - - // Scanner implementation - - /** - Reset document locator, supplying systemid and publicid. - @param systemid System id - @param publicid Public id - */ - - public void resetDocumentLocator(String publicid, String systemid) { - thePublicid = publicid; - theSystemid = systemid; - theLastLine = theLastColumn = theCurrentLine = theCurrentColumn = 0; - } - - /** - Scan HTML source, reporting lexical events. - @param r0 Reader that provides characters - @param h ScanHandler that accepts lexical events. - */ - - public void scan(Reader r0, ScanHandler h) throws IOException, SAXException { - theState = S_PCDATA; - PushbackReader r; - if (r0 instanceof PushbackReader) { - r = (PushbackReader)r0; - } - else if (r0 instanceof BufferedReader) { - r = new PushbackReader(r0); - } - else { - r = new PushbackReader(new BufferedReader(r0, 200)); - } - - int firstChar = r.read(); // Remove any leading BOM - if (firstChar != '\uFEFF') unread(r, firstChar); - - while (theState != S_DONE) { - int ch = r.read(); - - // Process control characters - if (ch >= 0x80 && ch <= 0x9F) ch = theWinMap[ch-0x80]; - - if (ch == '\r') { - ch = r.read(); // expect LF next - if (ch != '\n') { - unread(r, ch); // nope - ch = '\n'; - } - } - - if (ch == '\n') { - theCurrentLine++; - theCurrentColumn = 0; - } - else { - theCurrentColumn++; - } - - if (!(ch >= 0x20 || ch == '\n' || ch == '\t' || ch == -1)) continue; - - // Search state table - int action = 0; - for (int i = 0; i < statetable.length; i += 4) { - if (theState != statetable[i]) { - if (action != 0) break; - continue; - } - if (statetable[i+1] == 0) { - action = statetable[i+2]; - theNextState = statetable[i+3]; - } - else if (statetable[i+1] == ch) { - action = statetable[i+2]; - theNextState = statetable[i+3]; - break; - } - } -// System.err.println("In " + debug_statenames[theState] + " got " + nicechar(ch) + " doing " + debug_actionnames[action] + " then " + debug_statenames[theNextState]); - switch (action) { - case 0: - throw new Error( -"HTMLScanner can't cope with " + Integer.toString(ch) + " in state " + -Integer.toString(theState)); - case A_ADUP: - h.adup(theOutputBuffer, 0, theSize); - theSize = 0; - break; - case A_ADUP_SAVE: - h.adup(theOutputBuffer, 0, theSize); - theSize = 0; - save(ch, h); - break; - case A_ADUP_STAGC: - h.adup(theOutputBuffer, 0, theSize); - theSize = 0; - h.stagc(theOutputBuffer, 0, theSize); - break; - case A_ANAME: - h.aname(theOutputBuffer, 0, theSize); - theSize = 0; - break; - case A_ANAME_ADUP: - h.aname(theOutputBuffer, 0, theSize); - theSize = 0; - h.adup(theOutputBuffer, 0, theSize); - break; - case A_ANAME_ADUP_STAGC: - h.aname(theOutputBuffer, 0, theSize); - theSize = 0; - h.adup(theOutputBuffer, 0, theSize); - h.stagc(theOutputBuffer, 0, theSize); - break; - case A_AVAL: - h.aval(theOutputBuffer, 0, theSize); - theSize = 0; - break; - case A_AVAL_STAGC: - h.aval(theOutputBuffer, 0, theSize); - theSize = 0; - h.stagc(theOutputBuffer, 0, theSize); - break; - case A_CDATA: - mark(); - // suppress the final "]]" in the buffer - if (theSize > 1) theSize -= 2; - h.pcdata(theOutputBuffer, 0, theSize); - theSize = 0; - break; - case A_ENTITY_START: - h.pcdata(theOutputBuffer, 0, theSize); - theSize = 0; - save(ch, h); - break; - case A_ENTITY: - mark(); - char ch1 = (char)ch; -// System.out.println("Got " + ch1 + " in state " + ((theState == S_ENT) ? "S_ENT" : ((theState == S_NCR) ? "S_NCR" : "UNK"))); - if (theState == S_ENT && ch1 == '#') { - theNextState = S_NCR; - save(ch, h); - break; - } - else if (theState == S_NCR && (ch1 == 'x' || ch1 == 'X')) { - theNextState = S_XNCR; - save(ch, h); - break; - } - else if (theState == S_ENT && Character.isLetterOrDigit(ch1)) { - save(ch, h); - break; - } - else if (theState == S_NCR && Character.isDigit(ch1)) { - save(ch, h); - break; - } - else if (theState == S_XNCR && (Character.isDigit(ch1) || "abcdefABCDEF".indexOf(ch1) != -1)) { - save(ch, h); - break; - } - - // The whole entity reference has been collected -// System.err.println("%%" + new String(theOutputBuffer, 0, theSize)); - h.entity(theOutputBuffer, 1, theSize - 1); - int ent = h.getEntity(); -// System.err.println("%% value = " + ent); - if (ent != 0) { - theSize = 0; - if (ent >= 0x80 && ent <= 0x9F) { - ent = theWinMap[ent-0x80]; - } - if (ent < 0x20) { - // Control becomes space - ent = 0x20; - } - else if (ent >= 0xD800 && ent <= 0xDFFF) { - // Surrogates get dropped - ent = 0; - } - else if (ent <= 0xFFFF) { - // BMP character - save(ent, h); - } - else { - // Astral converted to two surrogates - ent -= 0x10000; - save((ent>>10) + 0xD800, h); - save((ent&0x3FF) + 0xDC00, h); - } - if (ch != ';') { - unread(r, ch); - theCurrentColumn--; - } - } - else { - unread(r, ch); - theCurrentColumn--; - } - theNextState = S_PCDATA; - break; - case A_ETAG: - h.etag(theOutputBuffer, 0, theSize); - theSize = 0; - break; - case A_DECL: - h.decl(theOutputBuffer, 0, theSize); - theSize = 0; - break; - case A_GI: - h.gi(theOutputBuffer, 0, theSize); - theSize = 0; - break; - case A_GI_STAGC: - h.gi(theOutputBuffer, 0, theSize); - theSize = 0; - h.stagc(theOutputBuffer, 0, theSize); - break; - case A_LT: - mark(); - save('<', h); - save(ch, h); - break; - case A_LT_PCDATA: - mark(); - save('<', h); - h.pcdata(theOutputBuffer, 0, theSize); - theSize = 0; - break; - case A_PCDATA: - mark(); - h.pcdata(theOutputBuffer, 0, theSize); - theSize = 0; - break; - case A_CMNT: - mark(); - h.cmnt(theOutputBuffer, 0, theSize); - theSize = 0; - break; - case A_MINUS3: - save('-', h); - save(' ', h); - break; - case A_MINUS2: - save('-', h); - save(' ', h); - // fall through into A_MINUS - case A_MINUS: - save('-', h); - save(ch, h); - break; - case A_PI: - mark(); - h.pi(theOutputBuffer, 0, theSize); - theSize = 0; - break; - case A_PITARGET: - h.pitarget(theOutputBuffer, 0, theSize); - theSize = 0; - break; - case A_PITARGET_PI: - h.pitarget(theOutputBuffer, 0, theSize); - theSize = 0; - h.pi(theOutputBuffer, 0, theSize); - break; - case A_SAVE: - save(ch, h); - break; - case A_SKIP: - break; - case A_SP: - save(' ', h); - break; - case A_STAGC: - h.stagc(theOutputBuffer, 0, theSize); - theSize = 0; - break; - case A_EMPTYTAG: - mark(); -// System.err.println("%%% Empty tag seen"); - if (theSize > 0) h.gi(theOutputBuffer, 0, theSize); - theSize = 0; - h.stage(theOutputBuffer, 0, theSize); - break; - case A_UNGET: - unread(r, ch); - theCurrentColumn--; - break; - case A_UNSAVE_PCDATA: - if (theSize > 0) theSize--; - h.pcdata(theOutputBuffer, 0, theSize); - theSize = 0; - break; - default: - throw new Error("Can't process state " + action); - } - theState = theNextState; - } - h.eof(theOutputBuffer, 0, 0); - } - - /** - * Mark the current scan position as a "point of interest" - start of a tag, - * cdata, processing instruction etc. - */ - - private void mark() { - theLastColumn = theCurrentColumn; - theLastLine = theCurrentLine; - } - - /** - A callback for the ScanHandler that allows it to force - the lexer state to CDATA content (no markup is recognized except - the end of element. - */ - - public void startCDATA() { theNextState = S_CDATA; } - - private void save(int ch, ScanHandler h) throws IOException, SAXException { - if (theSize >= theOutputBuffer.length - 20) { - if (theState == S_PCDATA || theState == S_CDATA) { - // Return a buffer-sized chunk of PCDATA - h.pcdata(theOutputBuffer, 0, theSize); - theSize = 0; - } - else { - // Grow the buffer size - char[] newOutputBuffer = new char[theOutputBuffer.length * 2]; - System.arraycopy(theOutputBuffer, 0, newOutputBuffer, 0, theSize+1); - theOutputBuffer = newOutputBuffer; - } - } - theOutputBuffer[theSize++] = (char)ch; - } - - /** - Test procedure. Reads HTML from the standard input and writes - PYX to the standard output. - */ - - public static void main(String[] argv) throws IOException, SAXException { - Scanner s = new HTMLScanner(); - Reader r = new InputStreamReader(System.in, "UTF-8"); - Writer w = new OutputStreamWriter(System.out, "UTF-8"); - PYXWriter pw = new PYXWriter(w); - s.scan(r, pw); - w.close(); - } - - - private static String nicechar(int in) { - if (in == '\n') return "\\n"; - if (in < 32) return "0x"+Integer.toHexString(in); - return "'"+((char)in)+"'"; - } - - } |