about summary refs log tree commit diff
diff options
context:
space:
mode:
author joehw <unknown> 2019-03-28 15:18:20 +0000
committer bell-sw <liberica@bell-sw.com> 2019-07-22 19:22:03 +0300
commit 7a3c0cdc66248b6fcc825e76425c4ecd19007ef5 (patch)
tree ffd4f3597c10b58d169897381f44b3fbb1b04c30
parent 9e0241d90097aa681d96ff6573e147caf9a2db39 (diff)
download jdk8u_jaxp-7a3c0cdc66248b6fcc825e76425c4ecd19007ef5.tar.gz
8207760: SAXException: Invalid UTF-16 surrogate detected: d83c ?
Summary: Properly handle unicode16 characters split across buffer chunks.
Reviewed-by: lancea, dfuchs
-rw-r--r-- src/com/sun/org/apache/xml/internal/serializer/ToHTMLStream.java 23
-rw-r--r-- src/com/sun/org/apache/xml/internal/serializer/ToStream.java 244
-rw-r--r-- src/com/sun/org/apache/xml/internal/serializer/ToTextStream.java 43
3 files changed, 164 insertions, 146 deletions
diff --git a/src/com/sun/org/apache/xml/internal/serializer/ToHTMLStream.java b/src/com/sun/org/apache/xml/internal/serializer/ToHTMLStream.java
index a68ec6d..633d0e9 100644
--- a/src/com/sun/org/apache/xml/internal/serializer/ToHTMLStream.java
+++ b/src/com/sun/org/apache/xml/internal/serializer/ToHTMLStream.java
@@ -1,6 +1,5 @@
/*
- * reserved comment block
- * DO NOT REMOVE OR ALTER!
+ * Copyright (c) 2014, 2018, Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright 2001-2004 The Apache Software Foundation.
@@ -43,6 +42,7 @@ import org.xml.sax.SAXException;
* because it is used from another package.
*
* @xsl.usage internal
+ * @LastModified: Sept 2018
*/
public final class ToHTMLStream extends ToStream
{
@@ -1021,7 +1021,7 @@ public final class ToHTMLStream extends ToStream
String name,
String value,
ElemDesc elemDesc)
- throws IOException
+ throws IOException, SAXException
{
writer.write(' ');
@@ -1345,7 +1345,7 @@ public final class ToHTMLStream extends ToStream
*/
public void writeAttrString(
final java.io.Writer writer, String string, String encoding)
- throws IOException
+ throws IOException, SAXException
{
final int end = string.length();
if (end > m_attrBuff.length)
@@ -1397,13 +1397,16 @@ public final class ToHTMLStream extends ToStream
}
else
{
- if (Encodings.isHighUTF16Surrogate(ch))
+ if (Encodings.isHighUTF16Surrogate(ch) ||
+ Encodings.isLowUTF16Surrogate(ch))
{
-
- writeUTF16Surrogate(ch, chars, i, end);
- i++; // two input characters processed
- // this increments by one and the for()
- // loop itself increments by another one.
+ if (writeUTF16Surrogate(ch, chars, i, end) >= 0) {
+ // move the index if the low surrogate is consumed
+ // as writeUTF16Surrogate has written the pair
+ if (Encodings.isHighUTF16Surrogate(ch)) {
+ i++;
+ }
+ }
}
// The next is kind of a hack to keep from escaping in the case
diff --git a/src/com/sun/org/apache/xml/internal/serializer/ToStream.java b/src/com/sun/org/apache/xml/internal/serializer/ToStream.java
index 2301763..b4da7bb 100644
--- a/src/com/sun/org/apache/xml/internal/serializer/ToStream.java
+++ b/src/com/sun/org/apache/xml/internal/serializer/ToStream.java
@@ -1,6 +1,5 @@
/*
- * reserved comment block
- * DO NOT REMOVE OR ALTER!
+ * Copyright (c) 2006, 2018, Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright 2001-2004 The Apache Software Foundation.
@@ -51,6 +50,7 @@ import org.xml.sax.SAXException;
* serializers (xml, html, text ...) that write output to a stream.
*
* @xsl.usage internal
+ * @LastModified: Sept 2018
*/
abstract public class ToStream extends SerializerBase
{
@@ -200,6 +200,7 @@ abstract public class ToStream extends SerializerBase
*/
private boolean m_expandDTDEntities = true;
+ private char m_highSurrogate = 0;
/**
* Default constructor
@@ -947,45 +948,46 @@ abstract public class ToStream extends SerializerBase
* @param ch Character array.
* @param i position Where the surrogate was detected.
* @param end The end index of the significant characters.
- * @return 0 if the pair of characters was written out as-is,
- * the unicode code point of the character represented by
- * the surrogate pair if an entity reference with that value
- * was written out.
+ * @return the status of writing a surrogate pair.
+ * -1 -- nothing is written
+ * 0 -- the pair is written as-is
+ * code point -- the pair is written as an entity reference
*
* @throws IOException
* @throws org.xml.sax.SAXException if invalid UTF-16 surrogate detected.
*/
protected int writeUTF16Surrogate(char c, char ch[], int i, int end)
- throws IOException
+ throws IOException, SAXException
{
- int codePoint = 0;
+ int status = -1;
if (i + 1 >= end)
{
- throw new IOException(
- Utils.messages.createMessage(
- MsgKey.ER_INVALID_UTF16_SURROGATE,
- new Object[] { Integer.toHexString((int) c)}));
+ m_highSurrogate = c;
+ return status;
+ }
+
+ char high, low;
+ if (m_highSurrogate == 0) {
+ high = c;
+ low = ch[i+1];
+ status = 0;
+ } else {
+ high = m_highSurrogate;
+ low = c;
+ m_highSurrogate = 0;
}
- final char high = c;
- final char low = ch[i+1];
if (!Encodings.isLowUTF16Surrogate(low)) {
- throw new IOException(
- Utils.messages.createMessage(
- MsgKey.ER_INVALID_UTF16_SURROGATE,
- new Object[] {
- Integer.toHexString((int) c)
- + " "
- + Integer.toHexString(low)}));
+ throwIOE(high, low);
}
final java.io.Writer writer = m_writer;
// If we make it to here we have a valid high, low surrogate pair
- if (m_encodingInfo.isInEncoding(c,low)) {
+ if (m_encodingInfo.isInEncoding(high,low)) {
// If the character formed by the surrogate pair
// is in the encoding, so just write it out
- writer.write(ch,i,2);
+ writer.write(new char[]{high, low}, 0, 2);
}
else {
// Don't know what to do with this char, it is
@@ -993,24 +995,16 @@ abstract public class ToStream extends SerializerBase
// a surrogate pair, so write out as an entity ref
final String encoding = getEncoding();
if (encoding != null) {
- /* The output encoding is known,
- * so somthing is wrong.
- */
- codePoint = Encodings.toCodePoint(high, low);
- // not in the encoding, so write out a character reference
- writer.write('&');
- writer.write('#');
- writer.write(Integer.toString(codePoint));
- writer.write(';');
+ status = writeCharRef(writer, high, low);
} else {
/* The output encoding is not known,
* so just write it out as-is.
*/
- writer.write(ch, i, 2);
+ writer.write(new char[]{high, low}, 0, 2);
}
}
// non-zero only if character reference was written out.
- return codePoint;
+ return status;
}
/**
@@ -1100,32 +1094,7 @@ abstract public class ToStream extends SerializerBase
}
else if (isCData && (!escapingNotNeeded(c)))
{
- // if (i != 0)
- if (m_cdataTagOpen)
- closeCDATA();
-
- // This needs to go into a function...
- if (Encodings.isHighUTF16Surrogate(c))
- {
- writeUTF16Surrogate(c, ch, i, end);
- i++ ; // process two input characters
- }
- else
- {
- writer.write("&#");
-
- String intStr = Integer.toString((int) c);
-
- writer.write(intStr);
- writer.write(';');
- }
-
- // if ((i != 0) && (i < (end - 1)))
- // if (!m_cdataTagOpen && (i < (end - 1)))
- // {
- // writer.write(CDATA_DELIMITER_OPEN);
- // m_cdataTagOpen = true;
- // }
+ i = handleEscaping(writer, c, ch, i, end);
}
else if (
isCData
@@ -1149,29 +1118,44 @@ abstract public class ToStream extends SerializerBase
}
writer.write(c);
}
-
- // This needs to go into a function...
- else if (Encodings.isHighUTF16Surrogate(c))
- {
- if (m_cdataTagOpen)
- closeCDATA();
- writeUTF16Surrogate(c, ch, i, end);
- i++; // process two input characters
+ else {
+ i = handleEscaping(writer, c, ch, i, end);
}
- else
- {
- if (m_cdataTagOpen)
- closeCDATA();
- writer.write("&#");
+ }
+ }
- String intStr = Integer.toString((int) c);
+ }
- writer.write(intStr);
- writer.write(';');
+ /**
+ * Handles escaping, writes either with a surrogate pair or a character
+ * reference.
+ *
+ * @param c the current char
+ * @param ch the character array
+ * @param i the current position
+ * @param end the end index of the array
+ * @return the next index
+ *
+ * @throws IOException
+ * @throws org.xml.sax.SAXException if invalid UTF-16 surrogate detected.
+ */
+ private int handleEscaping(Writer writer, char c, char ch[], int i, int end)
+ throws IOException, SAXException {
+ if (Encodings.isHighUTF16Surrogate(c) || Encodings.isLowUTF16Surrogate(c))
+ {
+ if (writeUTF16Surrogate(c, ch, i, end) >= 0) {
+ // move the index if the low surrogate is consumed
+ // as writeUTF16Surrogate has written the pair
+ if (Encodings.isHighUTF16Surrogate(c)) {
+ i++ ;
}
}
}
-
+ else
+ {
+ writeCharRef(writer, c);
+ }
+ return i;
}
/**
@@ -1242,7 +1226,7 @@ abstract public class ToStream extends SerializerBase
}
m_ispreserve = true;
- if (shouldIndent())
+ if (!m_cdataTagOpen && shouldIndent())
indent();
boolean writeCDataBrackets =
@@ -1564,7 +1548,7 @@ abstract public class ToStream extends SerializerBase
int i,
char ch,
int lastDirty,
- boolean fromTextNode) throws IOException
+ boolean fromTextNode) throws IOException, SAXException
{
int startClean = lastDirty + 1;
// if we have some clean characters accumulated
@@ -1643,54 +1627,41 @@ abstract public class ToStream extends SerializerBase
int len,
boolean fromTextNode,
boolean escLF)
- throws IOException
+ throws IOException, SAXException
{
int pos = accumDefaultEntity(writer, ch, i, chars, len, fromTextNode, escLF);
if (i == pos)
{
+ if (m_highSurrogate != 0) {
+ if (!(Encodings.isLowUTF16Surrogate(ch))) {
+ throwIOE(m_highSurrogate, ch);
+ }
+ writeCharRef(writer, m_highSurrogate, ch);
+ m_highSurrogate = 0;
+ return ++pos;
+ }
+
if (Encodings.isHighUTF16Surrogate(ch))
{
-
- // Should be the UTF-16 low surrogate of the hig/low pair.
- char next;
- // Unicode code point formed from the high/low pair.
- int codePoint = 0;
-
if (i + 1 >= len)
{
- throw new IOException(
- Utils.messages.createMessage(
- MsgKey.ER_INVALID_UTF16_SURROGATE,
- new Object[] { Integer.toHexString(ch)}));
- //"Invalid UTF-16 surrogate detected: "
-
- //+Integer.toHexString(ch)+ " ?");
+ // save for the next read
+ m_highSurrogate = ch;
+ pos++;
}
else
{
- next = chars[++i];
+ // the next should be the UTF-16 low surrogate of the hig/low pair.
+ char next = chars[++i];
if (!(Encodings.isLowUTF16Surrogate(next)))
- throw new IOException(
- Utils.messages.createMessage(
- MsgKey
- .ER_INVALID_UTF16_SURROGATE,
- new Object[] {
- Integer.toHexString(ch)
- + " "
- + Integer.toHexString(next)}));
- //"Invalid UTF-16 surrogate detected: "
-
- //+Integer.toHexString(ch)+" "+Integer.toHexString(next));
- codePoint = Encodings.toCodePoint(ch,next);
- }
+ throwIOE(ch, next);
- writer.write("&#");
- writer.write(Integer.toString(codePoint));
- writer.write(';');
- pos += 2; // count the two characters that went into writing out this entity
+ writeCharRef(writer, ch, next);
+ pos += 2; // count the two characters that went into writing out this entity
+ }
}
else
{
@@ -1702,18 +1673,14 @@ abstract public class ToStream extends SerializerBase
if (isCharacterInC0orC1Range(ch) ||
(XMLVERSION11.equals(getVersion()) && isNELorLSEPCharacter(ch)))
{
- writer.write("&#");
- writer.write(Integer.toString(ch));
- writer.write(';');
+ writeCharRef(writer, ch);
}
else if ((!escapingNotNeeded(ch) ||
( (fromTextNode && m_charInfo.isSpecialTextChar(ch))
|| (!fromTextNode && m_charInfo.isSpecialAttrChar(ch))))
- && m_elemContext.m_currentElemDepth > 0)
+ && m_elemContext.m_currentElemDepth > 0)
{
- writer.write("&#");
- writer.write(Integer.toString(ch));
- writer.write(';');
+ writeCharRef(writer, ch);
}
else
{
@@ -1727,6 +1694,45 @@ abstract public class ToStream extends SerializerBase
}
/**
+ * Writes out a character reference.
+ * @param writer the writer
+ * @param c the character
+ * @throws IOException
+ */
+ private void writeCharRef(Writer writer, char c) throws IOException, SAXException {
+ if (m_cdataTagOpen)
+ closeCDATA();
+ writer.write("&#");
+ writer.write(Integer.toString(c));
+ writer.write(';');
+ }
+
+ /**
+ * Writes out a pair of surrogates as a character reference
+ * @param writer the writer
+ * @param high the high surrogate
+ * @param low the low surrogate
+ * @throws IOException
+ */
+ private int writeCharRef(Writer writer, char high, char low) throws IOException, SAXException {
+ if (m_cdataTagOpen)
+ closeCDATA();
+ // Unicode code point formed from the high/low pair.
+ int codePoint = Encodings.toCodePoint(high, low);
+ writer.write("&#");
+ writer.write(Integer.toString(codePoint));
+ writer.write(';');
+ return codePoint;
+ }
+
+ private void throwIOE(char ch, char next) throws IOException {
+ throw new IOException(Utils.messages.createMessage(
+ MsgKey.ER_INVALID_UTF16_SURROGATE,
+ new Object[] {Integer.toHexString(ch) + " "
+ + Integer.toHexString(next)}));
+ }
+
+ /**
* Receive notification of the beginning of an element, although this is a
* SAX method additional namespace or attribute information can occur before
* or after this call, that is associated with this element.
@@ -1962,7 +1968,7 @@ abstract public class ToStream extends SerializerBase
Writer writer,
String string,
String encoding)
- throws IOException
+ throws IOException, SAXException
{
final int len = string.length();
if (len > m_attrBuff.length)
diff --git a/src/com/sun/org/apache/xml/internal/serializer/ToTextStream.java b/src/com/sun/org/apache/xml/internal/serializer/ToTextStream.java
index 9e39b89..aecf377 100644
--- a/src/com/sun/org/apache/xml/internal/serializer/ToTextStream.java
+++ b/src/com/sun/org/apache/xml/internal/serializer/ToTextStream.java
@@ -1,6 +1,5 @@
/*
- * reserved comment block
- * DO NOT REMOVE OR ALTER!
+ * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright 2001-2004 The Apache Software Foundation.
@@ -35,6 +34,7 @@ import org.xml.sax.SAXException;
* This class converts SAX or SAX-like calls to a
* serialized document for xsl:output method of "text".
* @xsl.usage internal
+ * @LastModified: Sept 2018
*/
public final class ToTextStream extends ToStream
{
@@ -296,23 +296,32 @@ public final class ToTextStream extends ToStream
} else if (m_encodingInfo.isInEncoding(c)) {
writer.write(c);
// one input char processed
- } else if (Encodings.isHighUTF16Surrogate(c)) {
+ } else if (Encodings.isHighUTF16Surrogate(c) ||
+ Encodings.isLowUTF16Surrogate(c)) {
final int codePoint = writeUTF16Surrogate(c, ch, i, end);
- if (codePoint != 0) {
- // I think we can just emit the message,
- // not crash and burn.
- final String integralValue = Integer.toString(codePoint);
- final String msg = Utils.messages.createMessage(
- MsgKey.ER_ILLEGAL_CHARACTER,
- new Object[] { integralValue, encoding });
-
- //Older behavior was to throw the message,
- //but newer gentler behavior is to write a message to System.err
- //throw new SAXException(msg);
- System.err.println(msg);
-
+ if (codePoint >= 0) {
+ // move the index if the low surrogate is consumed
+ // as writeUTF16Surrogate has written the pair
+ if (Encodings.isHighUTF16Surrogate(c)) {
+ i++;
+ }
+
+ // printing to the console is not appropriate, but will leave
+ // it as is for compatibility.
+ if (codePoint >0) {
+ // I think we can just emit the message,
+ // not crash and burn.
+ final String integralValue = Integer.toString(codePoint);
+ final String msg = Utils.messages.createMessage(
+ MsgKey.ER_ILLEGAL_CHARACTER,
+ new Object[] { integralValue, encoding });
+
+ //Older behavior was to throw the message,
+ //but newer gentler behavior is to write a message to System.err
+ //throw new SAXException(msg);
+ System.err.println(msg);
+ }
}
- i++; // two input chars processed
} else {
// Don't know what to do with this char, it is
// not in the encoding and not a high char in