aboutsummaryrefslogtreecommitdiff
path: root/src/org/ccil/cowan/tagsoup/HTMLScanner.java
diff options
context:
space:
mode:
authorThe Android Open Source Project <initial-contribution@android.com>2009-03-03 18:28:39 -0800
committerThe Android Open Source Project <initial-contribution@android.com>2009-03-03 18:28:39 -0800
commitb0e041b91e19d554585bc4423075929013f05f22 (patch)
tree4b825dc642cb6eb9a060e54bf8d69288fbee4904 /src/org/ccil/cowan/tagsoup/HTMLScanner.java
parent4bb395b502d0c2495f7a5d226ccf7f06f53dea38 (diff)
downloadtagsoup-b0e041b91e19d554585bc4423075929013f05f22.tar.gz
auto import from //depot/cupcake/@135843
Diffstat (limited to 'src/org/ccil/cowan/tagsoup/HTMLScanner.java')
-rw-r--r--src/org/ccil/cowan/tagsoup/HTMLScanner.java648
1 files changed, 0 insertions, 648 deletions
diff --git a/src/org/ccil/cowan/tagsoup/HTMLScanner.java b/src/org/ccil/cowan/tagsoup/HTMLScanner.java
deleted file mode 100644
index f5980ee..0000000
--- a/src/org/ccil/cowan/tagsoup/HTMLScanner.java
+++ /dev/null
@@ -1,648 +0,0 @@
-// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
-//
-// TagSoup is licensed under the Apache License,
-// Version 2.0. You may obtain a copy of this license at
-// http://www.apache.org/licenses/LICENSE-2.0 . You may also have
-// additional legal rights not granted by this license.
-//
-// TagSoup is distributed in the hope that it will be useful, but
-// unless required by applicable law or agreed to in writing, TagSoup
-// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
-// OF ANY KIND, either express or implied; not even the implied warranty
-// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-//
-//
-package org.ccil.cowan.tagsoup;
-import java.io.*;
-import org.xml.sax.SAXException;
-import org.xml.sax.Locator;
-
-/**
-This class implements a table-driven scanner for HTML, allowing for lots of
-defects. It implements the Scanner interface, which accepts a Reader
-object to fetch characters from and a ScanHandler object to report lexical
-events to.
-*/
-
-public class HTMLScanner implements Scanner, Locator {
-
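- // Start of state table.
- // statetable is a flat array read four ints at a time by scan():
- // current state (S_*), input character, action (A_*), next state (S_*).
- // An input character of 0 is the default row for a state (used when no
- // other row for that state matches) and -1 is end of input.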
- private static final int S_ANAME = 1;
- private static final int S_APOS = 2;
- private static final int S_AVAL = 3;
- private static final int S_BB = 4;
- private static final int S_BBC = 5;
- private static final int S_BBCD = 6;
- private static final int S_BBCDA = 7;
- private static final int S_BBCDAT = 8;
- private static final int S_BBCDATA = 9;
- private static final int S_CDATA = 10;
- private static final int S_CDATA2 = 11;
- private static final int S_CDSECT = 12;
- private static final int S_CDSECT1 = 13;
- private static final int S_CDSECT2 = 14;
- private static final int S_COM = 15;
- private static final int S_COM2 = 16;
- private static final int S_COM3 = 17;
- private static final int S_COM4 = 18;
- private static final int S_DECL = 19;
- private static final int S_DECL2 = 20;
- private static final int S_DONE = 21;
- private static final int S_EMPTYTAG = 22;
- private static final int S_ENT = 23;
- private static final int S_EQ = 24;
- private static final int S_ETAG = 25;
- private static final int S_GI = 26;
- private static final int S_NCR = 27;
- private static final int S_PCDATA = 28;
- private static final int S_PI = 29;
- private static final int S_PITARGET = 30;
- private static final int S_QUOT = 31;
- private static final int S_STAGC = 32;
- private static final int S_TAG = 33;
- private static final int S_TAGWS = 34;
- private static final int S_XNCR = 35;
- private static final int A_ADUP = 1;
- private static final int A_ADUP_SAVE = 2;
- private static final int A_ADUP_STAGC = 3;
- private static final int A_ANAME = 4;
- private static final int A_ANAME_ADUP = 5;
- private static final int A_ANAME_ADUP_STAGC = 6;
- private static final int A_AVAL = 7;
- private static final int A_AVAL_STAGC = 8;
- private static final int A_CDATA = 9;
- private static final int A_CMNT = 10;
- private static final int A_DECL = 11;
- private static final int A_EMPTYTAG = 12;
- private static final int A_ENTITY = 13;
- private static final int A_ENTITY_START = 14;
- private static final int A_ETAG = 15;
- private static final int A_GI = 16;
- private static final int A_GI_STAGC = 17;
- private static final int A_LT = 18;
- private static final int A_LT_PCDATA = 19;
- private static final int A_MINUS = 20;
- private static final int A_MINUS2 = 21;
- private static final int A_MINUS3 = 22;
- private static final int A_PCDATA = 23;
- private static final int A_PI = 24;
- private static final int A_PITARGET = 25;
- private static final int A_PITARGET_PI = 26;
- private static final int A_SAVE = 27;
- private static final int A_SKIP = 28;
- private static final int A_SP = 29;
- private static final int A_STAGC = 30;
- private static final int A_UNGET = 31;
- private static final int A_UNSAVE_PCDATA = 32;
- private static int[] statetable = {
- S_ANAME, '/', A_ANAME_ADUP, S_EMPTYTAG,
- S_ANAME, '=', A_ANAME, S_AVAL,
- S_ANAME, '>', A_ANAME_ADUP_STAGC, S_PCDATA,
- S_ANAME, 0, A_SAVE, S_ANAME,
- S_ANAME, -1, A_ANAME_ADUP_STAGC, S_DONE,
- S_ANAME, ' ', A_ANAME, S_EQ,
- S_ANAME, '\n', A_ANAME, S_EQ,
- S_ANAME, '\t', A_ANAME, S_EQ,
- S_APOS, '\'', A_AVAL, S_TAGWS,
- S_APOS, 0, A_SAVE, S_APOS,
- S_APOS, -1, A_AVAL_STAGC, S_DONE,
- S_APOS, ' ', A_SP, S_APOS,
- S_APOS, '\n', A_SP, S_APOS,
- S_APOS, '\t', A_SP, S_APOS,
- S_AVAL, '\'', A_SKIP, S_APOS,
- S_AVAL, '"', A_SKIP, S_QUOT,
- S_AVAL, '>', A_AVAL_STAGC, S_PCDATA,
- S_AVAL, 0, A_SAVE, S_STAGC,
- S_AVAL, -1, A_AVAL_STAGC, S_DONE,
- S_AVAL, ' ', A_SKIP, S_AVAL,
- S_AVAL, '\n', A_SKIP, S_AVAL,
- S_AVAL, '\t', A_SKIP, S_AVAL,
- S_BB, 'C', A_SKIP, S_BBC,
- S_BB, 0, A_SKIP, S_DECL,
- S_BB, -1, A_SKIP, S_DONE,
- S_BBC, 'D', A_SKIP, S_BBCD,
- S_BBC, 0, A_SKIP, S_DECL,
- S_BBC, -1, A_SKIP, S_DONE,
- S_BBCD, 'A', A_SKIP, S_BBCDA,
- S_BBCD, 0, A_SKIP, S_DECL,
- S_BBCD, -1, A_SKIP, S_DONE,
- S_BBCDA, 'T', A_SKIP, S_BBCDAT,
- S_BBCDA, 0, A_SKIP, S_DECL,
- S_BBCDA, -1, A_SKIP, S_DONE,
- S_BBCDAT, 'A', A_SKIP, S_BBCDATA,
- S_BBCDAT, 0, A_SKIP, S_DECL,
- S_BBCDAT, -1, A_SKIP, S_DONE,
- S_BBCDATA, '[', A_SKIP, S_CDSECT,
- S_BBCDATA, 0, A_SKIP, S_DECL,
- S_BBCDATA, -1, A_SKIP, S_DONE,
- S_CDATA, '<', A_SAVE, S_CDATA2,
- S_CDATA, 0, A_SAVE, S_CDATA,
- S_CDATA, -1, A_PCDATA, S_DONE,
- S_CDATA2, '/', A_UNSAVE_PCDATA, S_ETAG,
- S_CDATA2, 0, A_SAVE, S_CDATA,
- S_CDATA2, -1, A_UNSAVE_PCDATA, S_DONE,
- S_CDSECT, ']', A_SAVE, S_CDSECT1,
- S_CDSECT, 0, A_SAVE, S_CDSECT,
- S_CDSECT, -1, A_SKIP, S_DONE,
- S_CDSECT1, ']', A_SAVE, S_CDSECT2,
- S_CDSECT1, 0, A_SAVE, S_CDSECT,
- S_CDSECT1, -1, A_SKIP, S_DONE,
- S_CDSECT2, '>', A_CDATA, S_PCDATA,
- S_CDSECT2, 0, A_SAVE, S_CDSECT,
- S_CDSECT2, -1, A_SKIP, S_DONE,
- S_COM, '-', A_SKIP, S_COM2,
- S_COM, 0, A_SAVE, S_COM2,
- S_COM, -1, A_CMNT, S_DONE,
- S_COM2, '-', A_SKIP, S_COM3,
- S_COM2, 0, A_SAVE, S_COM2,
- S_COM2, -1, A_CMNT, S_DONE,
- S_COM3, '-', A_SKIP, S_COM4,
- S_COM3, 0, A_MINUS, S_COM2,
- S_COM3, -1, A_CMNT, S_DONE,
- S_COM4, '-', A_MINUS3, S_COM4,
- S_COM4, '>', A_CMNT, S_PCDATA,
- S_COM4, 0, A_MINUS2, S_COM2,
- S_COM4, -1, A_CMNT, S_DONE,
- S_DECL, '-', A_SKIP, S_COM,
- S_DECL, '[', A_SKIP, S_BB,
- S_DECL, '>', A_SKIP, S_PCDATA,
- S_DECL, 0, A_SAVE, S_DECL2,
- S_DECL, -1, A_SKIP, S_DONE,
- S_DECL2, '>', A_DECL, S_PCDATA,
- S_DECL2, 0, A_SAVE, S_DECL2,
- S_DECL2, -1, A_SKIP, S_DONE,
- S_EMPTYTAG, '>', A_EMPTYTAG, S_PCDATA,
- S_EMPTYTAG, 0, A_SAVE, S_ANAME,
- S_EMPTYTAG, ' ', A_SKIP, S_TAGWS,
- S_EMPTYTAG, '\n', A_SKIP, S_TAGWS,
- S_EMPTYTAG, '\t', A_SKIP, S_TAGWS,
- S_ENT, 0, A_ENTITY, S_ENT,
- S_ENT, -1, A_ENTITY, S_DONE,
- S_EQ, '=', A_SKIP, S_AVAL,
- S_EQ, '>', A_ADUP_STAGC, S_PCDATA,
- S_EQ, 0, A_ADUP_SAVE, S_ANAME,
- S_EQ, -1, A_ADUP_STAGC, S_DONE,
- S_EQ, ' ', A_SKIP, S_EQ,
- S_EQ, '\n', A_SKIP, S_EQ,
- S_EQ, '\t', A_SKIP, S_EQ,
- S_ETAG, '>', A_ETAG, S_PCDATA,
- S_ETAG, 0, A_SAVE, S_ETAG,
- S_ETAG, -1, A_ETAG, S_DONE,
- S_ETAG, ' ', A_SKIP, S_ETAG,
- S_ETAG, '\n', A_SKIP, S_ETAG,
- S_ETAG, '\t', A_SKIP, S_ETAG,
- S_GI, '/', A_SKIP, S_EMPTYTAG,
- S_GI, '>', A_GI_STAGC, S_PCDATA,
- S_GI, 0, A_SAVE, S_GI,
- S_GI, -1, A_SKIP, S_DONE,
- S_GI, ' ', A_GI, S_TAGWS,
- S_GI, '\n', A_GI, S_TAGWS,
- S_GI, '\t', A_GI, S_TAGWS,
- S_NCR, 0, A_ENTITY, S_NCR,
- S_NCR, -1, A_ENTITY, S_DONE,
- S_PCDATA, '&', A_ENTITY_START, S_ENT,
- S_PCDATA, '<', A_PCDATA, S_TAG,
- S_PCDATA, 0, A_SAVE, S_PCDATA,
- S_PCDATA, -1, A_PCDATA, S_DONE,
- S_PI, '>', A_PI, S_PCDATA,
- S_PI, 0, A_SAVE, S_PI,
- S_PI, -1, A_PI, S_DONE,
- S_PITARGET, '>', A_PITARGET_PI, S_PCDATA,
- S_PITARGET, 0, A_SAVE, S_PITARGET,
- S_PITARGET, -1, A_PITARGET_PI, S_DONE,
- S_PITARGET, ' ', A_PITARGET, S_PI,
- S_PITARGET, '\n', A_PITARGET, S_PI,
- S_PITARGET, '\t', A_PITARGET, S_PI,
- S_QUOT, '"', A_AVAL, S_TAGWS,
- S_QUOT, 0, A_SAVE, S_QUOT,
- S_QUOT, -1, A_AVAL_STAGC, S_DONE,
- S_QUOT, ' ', A_SP, S_QUOT,
- S_QUOT, '\n', A_SP, S_QUOT,
- S_QUOT, '\t', A_SP, S_QUOT,
- S_STAGC, '>', A_AVAL_STAGC, S_PCDATA,
- S_STAGC, 0, A_SAVE, S_STAGC,
- S_STAGC, -1, A_AVAL_STAGC, S_DONE,
- S_STAGC, ' ', A_AVAL, S_TAGWS,
- S_STAGC, '\n', A_AVAL, S_TAGWS,
- S_STAGC, '\t', A_AVAL, S_TAGWS,
- S_TAG, '!', A_SKIP, S_DECL,
- S_TAG, '?', A_SKIP, S_PITARGET,
- S_TAG, '/', A_SKIP, S_ETAG,
- S_TAG, '<', A_SAVE, S_TAG,
- S_TAG, 0, A_SAVE, S_GI,
- S_TAG, -1, A_LT_PCDATA, S_DONE,
- S_TAG, ' ', A_LT, S_PCDATA,
- S_TAG, '\n', A_LT, S_PCDATA,
- S_TAG, '\t', A_LT, S_PCDATA,
- S_TAGWS, '/', A_SKIP, S_EMPTYTAG,
- S_TAGWS, '>', A_STAGC, S_PCDATA,
- S_TAGWS, 0, A_SAVE, S_ANAME,
- S_TAGWS, -1, A_STAGC, S_DONE,
- S_TAGWS, ' ', A_SKIP, S_TAGWS,
- S_TAGWS, '\n', A_SKIP, S_TAGWS,
- S_TAGWS, '\t', A_SKIP, S_TAGWS,
- S_XNCR, 0, A_ENTITY, S_XNCR,
- S_XNCR, -1, A_ENTITY, S_DONE,
-
- };
- private static final String[] debug_actionnames = { "", "A_ADUP", "A_ADUP_SAVE", "A_ADUP_STAGC", "A_ANAME", "A_ANAME_ADUP", "A_ANAME_ADUP_STAGC", "A_AVAL", "A_AVAL_STAGC", "A_CDATA", "A_CMNT", "A_DECL", "A_EMPTYTAG", "A_ENTITY", "A_ENTITY_START", "A_ETAG", "A_GI", "A_GI_STAGC", "A_LT", "A_LT_PCDATA", "A_MINUS", "A_MINUS2", "A_MINUS3", "A_PCDATA", "A_PI", "A_PITARGET", "A_PITARGET_PI", "A_SAVE", "A_SKIP", "A_SP", "A_STAGC", "A_UNGET", "A_UNSAVE_PCDATA"};
- private static final String[] debug_statenames = { "", "S_ANAME", "S_APOS", "S_AVAL", "S_BB", "S_BBC", "S_BBCD", "S_BBCDA", "S_BBCDAT", "S_BBCDATA", "S_CDATA", "S_CDATA2", "S_CDSECT", "S_CDSECT1", "S_CDSECT2", "S_COM", "S_COM2", "S_COM3", "S_COM4", "S_DECL", "S_DECL2", "S_DONE", "S_EMPTYTAG", "S_ENT", "S_EQ", "S_ETAG", "S_GI", "S_NCR", "S_PCDATA", "S_PI", "S_PITARGET", "S_QUOT", "S_STAGC", "S_TAG", "S_TAGWS", "S_XNCR"};
-
-
- // End of state table
-
- private String thePublicid; // Locator state: public id given to resetDocumentLocator(), returned by getPublicId()
- private String theSystemid; // system id given to resetDocumentLocator(), returned by getSystemId()
- private int theLastLine; // line at the most recent mark(); returned by getLineNumber()
- private int theLastColumn; // column at the most recent mark(); returned by getColumnNumber()
- private int theCurrentLine; // incremented by scan() for every '\n' (after CR / CR LF normalization)
- private int theCurrentColumn; // reset to 0 on '\n', incremented for any other character read, decremented on pushback
-
- int theState; // Current state (one of the S_* constants)
- int theNextState; // Next state; copied into theState at the end of each scan() iteration, may be overridden by an action or by startCDATA()
- char[] theOutputBuffer = new char[200]; // Output buffer; save() flushes or doubles it when it is nearly full
- int theSize; // Current buffer size: number of valid chars at the start of theOutputBuffer
- int[] theWinMap = { // Windows chars map: replacements for code points 0x80-0x9F, indexed by (code point - 0x80); presumably the Windows-1252 assignments, with 0xFFFD for unassigned slots -- TODO confirm
- 0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
- 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
- 0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
- 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178};
-
-    // PushbackReader has a bug: it will accept the end-of-input marker
-    // (-1) when asked to push it back.  This wrapper pushes back real
-    // characters only and silently ignores EOF.
-    private void unread(PushbackReader r, int c) throws IOException {
-        if (c == -1) {
-            return;
-        }
-        r.unread(c);
-    }
-
- // Locator implementation
-
-    /**
-    Report the line of the last marked scan position.
-    @return the line recorded by the most recent mark(); lines are
-    counted from 0 after resetDocumentLocator()
-    */
-    public int getLineNumber() {
-        return this.theLastLine;
-    }
-    /**
-    Report the column of the last marked scan position.
-    @return the column recorded by the most recent mark()
-    */
-    public int getColumnNumber() {
-        return this.theLastColumn;
-    }
-    /**
-    Report the public id of the document being scanned.
-    @return the public id last passed to resetDocumentLocator()
-    */
-    public String getPublicId() {
-        return this.thePublicid;
-    }
-    /**
-    Report the system id of the document being scanned.
-    @return the system id last passed to resetDocumentLocator()
-    */
-    public String getSystemId() {
-        return this.theSystemid;
-    }
-
-
- // Scanner implementation
-
-    /**
-    Reset document locator, supplying publicid and systemid.
-    Both the current position and the last marked position are set
-    back to line 0, column 0.
-    @param publicid Public id
-    @param systemid System id
-    */
-
-    public void resetDocumentLocator(String publicid, String systemid) {
-        thePublicid = publicid;
-        theSystemid = systemid;
-        theCurrentLine = 0;
-        theCurrentColumn = 0;
-        theLastLine = 0;
-        theLastColumn = 0;
-    }
-
-    /**
-    Scan HTML source, reporting lexical events.
-    Characters are read one at a time.  For each one the state table is
-    searched for the row matching the current state and the character;
-    the row supplies an action, which is carried out against the
-    ScanHandler, and the next state.  Scanning ends when the state
-    becomes S_DONE, after which h.eof() is called.
-    @param r0 Reader that provides characters
-    @param h ScanHandler that accepts lexical events.
-    @throws IOException if reading from or pushing back to the Reader fails
-    @throws SAXException as declared; presumably raised by the
-    ScanHandler callbacks
-    */
-
-    public void scan(Reader r0, ScanHandler h) throws IOException, SAXException {
-        theState = S_PCDATA;
-        // One character of pushback is needed for lookahead, so wrap the
-        // caller's Reader unless it can already push back.
-        PushbackReader r;
-        if (r0 instanceof PushbackReader) {
-            r = (PushbackReader)r0;
-        }
-        else if (r0 instanceof BufferedReader) {
-            r = new PushbackReader(r0);
-        }
-        else {
-            r = new PushbackReader(new BufferedReader(r0, 200));
-        }
-
-        int firstChar = r.read();    // Remove any leading BOM
-        if (firstChar != '\uFEFF') unread(r, firstChar);
-
-        while (theState != S_DONE) {
-            int ch = r.read();
-
-            // Process control characters
-            if (ch >= 0x80 && ch <= 0x9F) ch = theWinMap[ch-0x80];
-
-            // Treat CR and CR LF as a single LF
-            if (ch == '\r') {
-                ch = r.read();    // expect LF next
-                if (ch != '\n') {
-                    unread(r, ch);    // nope
-                    ch = '\n';
-                }
-            }
-
-            if (ch == '\n') {
-                theCurrentLine++;
-                theCurrentColumn = 0;
-            }
-            else {
-                theCurrentColumn++;
-            }
-
-            // Ignore characters below 0x20 other than LF and TAB
-            // (end of input, -1, is let through to the state table)
-            if (!(ch >= 0x20 || ch == '\n' || ch == '\t' || ch == -1)) continue;
-
-            // Search state table.  A row whose character is 0 is the
-            // default for its state: it is remembered, but the search
-            // goes on in case a later row of the same state names this
-            // exact character.  The search stops at an exact match, or
-            // at the first row of another state once an action is known.
-            int action = 0;
-            for (int i = 0; i < statetable.length; i += 4) {
-                if (theState != statetable[i]) {
-                    if (action != 0) break;
-                    continue;
-                }
-                if (statetable[i+1] == 0) {
-                    action = statetable[i+2];
-                    theNextState = statetable[i+3];
-                }
-                else if (statetable[i+1] == ch) {
-                    action = statetable[i+2];
-                    theNextState = statetable[i+3];
-                    break;
-                }
-            }
-//            System.err.println("In " + debug_statenames[theState] + " got " + nicechar(ch) + " doing " + debug_actionnames[action] + " then " + debug_statenames[theNextState]);
-            switch (action) {
-            case 0:
-                // No row matched: the table has no entry for this input
-                throw new Error(
-                    "HTMLScanner can't cope with " + Integer.toString(ch) + " in state " +
-                    Integer.toString(theState));
-            case A_ADUP:
-                h.adup(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                break;
-            case A_ADUP_SAVE:
-                h.adup(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                save(ch, h);
-                break;
-            case A_ADUP_STAGC:
-                h.adup(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                h.stagc(theOutputBuffer, 0, theSize);
-                break;
-            case A_ANAME:
-                h.aname(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                break;
-            case A_ANAME_ADUP:
-                h.aname(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                h.adup(theOutputBuffer, 0, theSize);
-                break;
-            case A_ANAME_ADUP_STAGC:
-                h.aname(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                h.adup(theOutputBuffer, 0, theSize);
-                h.stagc(theOutputBuffer, 0, theSize);
-                break;
-            case A_AVAL:
-                h.aval(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                break;
-            case A_AVAL_STAGC:
-                h.aval(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                h.stagc(theOutputBuffer, 0, theSize);
-                break;
-            case A_CDATA:
-                mark();
-                // suppress the final "]]" in the buffer
-                if (theSize > 1) theSize -= 2;
-                h.pcdata(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                break;
-            case A_ENTITY_START:
-                // Flush the text seen so far, then start collecting the
-                // reference beginning with the '&' just read
-                h.pcdata(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                save(ch, h);
-                break;
-            case A_ENTITY:
-                mark();
-                char ch1 = (char)ch;
-//                System.out.println("Got " + ch1 + " in state " + ((theState == S_ENT) ? "S_ENT" : ((theState == S_NCR) ? "S_NCR" : "UNK")));
-                // While the character can still belong to the reference,
-                // keep collecting it: '#' switches to a numeric reference
-                // and 'x'/'X' after that to a hexadecimal one.
-                if (theState == S_ENT && ch1 == '#') {
-                    theNextState = S_NCR;
-                    save(ch, h);
-                    break;
-                }
-                else if (theState == S_NCR && (ch1 == 'x' || ch1 == 'X')) {
-                    theNextState = S_XNCR;
-                    save(ch, h);
-                    break;
-                }
-                else if (theState == S_ENT && Character.isLetterOrDigit(ch1)) {
-                    save(ch, h);
-                    break;
-                }
-                else if (theState == S_NCR && Character.isDigit(ch1)) {
-                    save(ch, h);
-                    break;
-                }
-                else if (theState == S_XNCR && (Character.isDigit(ch1) || "abcdefABCDEF".indexOf(ch1) != -1)) {
-                    save(ch, h);
-                    break;
-                }
-
-                // The whole entity reference has been collected
-                // (offset 1 skips the leading '&' in the buffer)
-//                System.err.println("%%" + new String(theOutputBuffer, 0, theSize));
-                h.entity(theOutputBuffer, 1, theSize - 1);
-                int ent = h.getEntity();
-//                System.err.println("%% value = " + ent);
-                if (ent != 0) {
-                    // Replace the collected reference text by its value
-                    theSize = 0;
-                    if (ent >= 0x80 && ent <= 0x9F) {
-                        ent = theWinMap[ent-0x80];
-                    }
-                    if (ent < 0x20) {
-                        // Control becomes space.  The space has to be
-                        // saved explicitly; without this the reference
-                        // would vanish from the output.
-                        ent = 0x20;
-                        save(ent, h);
-                    }
-                    else if (ent >= 0xD800 && ent <= 0xDFFF) {
-                        // Surrogates get dropped
-                        ent = 0;
-                    }
-                    else if (ent <= 0xFFFF) {
-                        // BMP character
-                        save(ent, h);
-                    }
-                    else {
-                        // Astral converted to two surrogates
-                        ent -= 0x10000;
-                        save((ent>>10) + 0xD800, h);
-                        save((ent&0x3FF) + 0xDC00, h);
-                    }
-                    // A terminating ';' is consumed; anything else is
-                    // rescanned as ordinary input
-                    if (ch != ';') {
-                        unread(r, ch);
-                        theCurrentColumn--;
-                    }
-                }
-                else {
-                    // Handler reported no value: the collected text stays
-                    // in the buffer and the terminator is rescanned
-                    unread(r, ch);
-                    theCurrentColumn--;
-                }
-                theNextState = S_PCDATA;
-                break;
-            case A_ETAG:
-                h.etag(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                break;
-            case A_DECL:
-                h.decl(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                break;
-            case A_GI:
-                h.gi(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                break;
-            case A_GI_STAGC:
-                h.gi(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                h.stagc(theOutputBuffer, 0, theSize);
-                break;
-            case A_LT:
-                // '<' followed by whitespace is plain text, not a tag
-                mark();
-                save('<', h);
-                save(ch, h);
-                break;
-            case A_LT_PCDATA:
-                mark();
-                save('<', h);
-                h.pcdata(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                break;
-            case A_PCDATA:
-                mark();
-                h.pcdata(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                break;
-            case A_CMNT:
-                mark();
-                h.cmnt(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                break;
-            case A_MINUS3:
-                save('-', h);
-                save(' ', h);
-                break;
-            case A_MINUS2:
-                save('-', h);
-                save(' ', h);
-                // fall through into A_MINUS
-            case A_MINUS:
-                save('-', h);
-                save(ch, h);
-                break;
-            case A_PI:
-                mark();
-                h.pi(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                break;
-            case A_PITARGET:
-                h.pitarget(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                break;
-            case A_PITARGET_PI:
-                h.pitarget(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                h.pi(theOutputBuffer, 0, theSize);
-                break;
-            case A_SAVE:
-                save(ch, h);
-                break;
-            case A_SKIP:
-                break;
-            case A_SP:
-                save(' ', h);
-                break;
-            case A_STAGC:
-                h.stagc(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                break;
-            case A_EMPTYTAG:
-                mark();
-//                System.err.println("%%% Empty tag seen");
-                if (theSize > 0) h.gi(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                h.stage(theOutputBuffer, 0, theSize);
-                break;
-            case A_UNGET:
-                unread(r, ch);
-                theCurrentColumn--;
-                break;
-            case A_UNSAVE_PCDATA:
-                // Drop the last saved character before reporting the text
-                if (theSize > 0) theSize--;
-                h.pcdata(theOutputBuffer, 0, theSize);
-                theSize = 0;
-                break;
-            default:
-                throw new Error("Can't process state " + action);
-            }
-            theState = theNextState;
-        }
-        h.eof(theOutputBuffer, 0, 0);
-    }
-
-    /**
-     * Remember the current line and column as the last "point of
-     * interest" (start of a tag, cdata, processing instruction etc.).
-     * These remembered values are what getLineNumber() and
-     * getColumnNumber() report.
-     */
-
-    private void mark() {
-        theLastLine = theCurrentLine;
-        theLastColumn = theCurrentColumn;
-    }
-
- /**
- A callback for the ScanHandler that allows it to force
- the lexer state to CDATA content (no markup is recognized except
- the end of element.
- */
-
- public void startCDATA() { theNextState = S_CDATA; }
-
-    /**
-    Append one character to the output buffer.
-    When fewer than 20 free slots would remain, room is made first:
-    in the S_PCDATA and S_CDATA states the buffered text is handed to
-    h.pcdata() and the buffer is emptied; in every other state the
-    buffer is replaced by one of twice the length.
-    @param ch character to append (narrowed to char)
-    @param h ScanHandler that receives flushed PCDATA
-    */
-    private void save(int ch, ScanHandler h) throws IOException, SAXException {
-        if (theSize >= theOutputBuffer.length - 20) {
-            if (theState == S_PCDATA || theState == S_CDATA) {
-                // Return a buffer-sized chunk of PCDATA
-                h.pcdata(theOutputBuffer, 0, theSize);
-                theSize = 0;
-            }
-            else {
-                // Grow the buffer size.  Only the first theSize chars
-                // hold data, so exactly that many are copied.
-                char[] newOutputBuffer = new char[theOutputBuffer.length * 2];
-                System.arraycopy(theOutputBuffer, 0, newOutputBuffer, 0, theSize);
-                theOutputBuffer = newOutputBuffer;
-            }
-        }
-        theOutputBuffer[theSize++] = (char)ch;
-    }
-
-    /**
-    Test procedure.  Reads HTML from the standard input and writes
-    PYX to the standard output.  Both streams are treated as UTF-8.
-    @param argv ignored
-    */
-
-    public static void main(String[] argv) throws IOException, SAXException {
-        Reader in = new InputStreamReader(System.in, "UTF-8");
-        Writer out = new OutputStreamWriter(System.out, "UTF-8");
-        Scanner scanner = new HTMLScanner();
-        scanner.scan(in, new PYXWriter(out));
-        out.close();
-    }
-
-
-    // Render a character code for debug output: newline as \n, any other
-    // value below 32 as 0x followed by its hex digits, everything else
-    // as the character itself in single quotes.
-    private static String nicechar(int in) {
-        if (in == '\n') {
-            return "\\n";
-        }
-        return (in < 32)
-            ? "0x" + Integer.toHexString(in)
-            : "'" + ((char)in) + "'";
-    }
-
- }