1 files changed, 140 insertions, 74 deletions
diff --git a/tinyxmlparser.cpp b/tinyxmlparser.cpp
index 67d0a9e..81b7eae 100644
--- a/tinyxmlparser.cpp
+++ b/tinyxmlparser.cpp
@@ -1,6 +1,6 @@
 /*
 www.sourceforge.net/projects/tinyxml
-Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
+Original code by Lee Thomason (www.grinninglizard.com)
 
 This software is provided 'as-is', without any express or implied 
 warranty. In no event will the authors be held liable for any 
@@ -22,16 +22,25 @@ must not be misrepresented as being the original software.
 distribution.
 */
 
-#include "tinyxml.h"
 #include <ctype.h>
 #include <stddef.h>
 
+#include "tinyxml.h"
+
 //#define DEBUG_PARSER
+#if defined( DEBUG_PARSER )
+#	if defined( DEBUG ) && defined( _MSC_VER )
+#		include <windows.h>
+#		define TIXML_LOG OutputDebugString
+#	else
+#		define TIXML_LOG printf
+#	endif
+#endif
 
 // Note tha "PutString" hardcodes the same list. This
 // is less flexible than it appears. Changing the entries
 // or order will break putstring.	
-TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] = 
+TiXmlBase::Entity TiXmlBase::entity[ TiXmlBase::NUM_ENTITY ] = 
 {
 	{ "&amp;",  5, '&' },
 	{ "&lt;",   4, '<' },
@@ -165,7 +174,7 @@ class TiXmlParsingData
   public:
 	void Stamp( const char* now, TiXmlEncoding encoding );
 
-	const TiXmlCursor& Cursor()	{ return cursor; }
+	const TiXmlCursor& Cursor() const	{ return cursor; }
 
   private:
 	// Only used by the document!
@@ -277,7 +286,7 @@ void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
 				if ( encoding == TIXML_ENCODING_UTF8 )
 				{
 					// Eat the 1 to 4 byte utf8 character.
-					int step = TiXmlBase::utf8ByteTable[*((unsigned char*)p)];
+					int step = TiXmlBase::utf8ByteTable[*((const unsigned char*)p)];
 					if ( step == 0 )
 						step = 1;		// Error case from bad encoding, but handle gracefully.
 					p += step;
@@ -337,7 +346,7 @@ const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
 				continue;
 			}
 
-			if ( IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' )		// Still using old rules for white space.
+			if ( IsWhiteSpace( *p ) )		// Still using old rules for white space.
 				++p;
 			else
 				break;
@@ -345,7 +354,7 @@ const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
 	}
 	else
 	{
-		while ( *p && IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' )
+		while ( *p && IsWhiteSpace( *p ) )
 			++p;
 	}
 
@@ -353,7 +362,7 @@ const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
 }
 
 #ifdef TIXML_USE_STL
-/*static*/ bool TiXmlBase::StreamWhiteSpace( TIXML_ISTREAM * in, TIXML_STRING * tag )
+/*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag )
 {
 	for( ;; )
 	{
@@ -368,7 +377,7 @@ const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
 	}
 }
 
-/*static*/ bool TiXmlBase::StreamTo( TIXML_ISTREAM * in, int character, TIXML_STRING * tag )
+/*static*/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag )
 {
 	//assert( character > 0 && character < 128 );	// else it won't work in utf-8
 	while ( in->good() )
@@ -386,8 +395,14 @@ const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
 }
 #endif
 
+// One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The
+// "assign" optimization removes over 10% of the execution time.
+//
 const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
 {
+	// Oddly, not supported on some compilers,
+	//name->clear();
+	// So use this:
 	*name = "";
 	assert( p );
 
@@ -401,6 +416,7 @@ const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncodi
 	if (    p && *p 
 		 && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) )
 	{
+		const char* start = p;
 		while(		p && *p
 				&&	(		IsAlphaNum( (unsigned char ) *p, encoding ) 
 						 || *p == '_'
@@ -408,9 +424,12 @@ const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncodi
 						 || *p == '.'
 						 || *p == ':' ) )
 		{
-			(*name) += *p;
+			//(*name) += *p; // expensive
 			++p;
 		}
+		if ( p-start > 0 ) {
+			name->assign( start, p-start );
+		}
 		return p;
 	}
 	return 0;
@@ -506,6 +525,8 @@ const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXml
 
 	// So it wasn't an entity, its unrecognized, or something like that.
 	*value = *p;	// Don't put back the last one, since we return it!
+	//*length = 1;	// Leave unrecognized entities - this doesn't really work.
+					// Just writes strange XML.
 	return p+1;
 }
 
@@ -610,12 +631,14 @@ const char* TiXmlBase::ReadText(	const char* p,
 			}
 		}
 	}
-	return p + strlen( endTag );
+	if ( p && *p )
+		p += strlen( endTag );
+	return ( p && *p ) ? p : 0;
 }
 
 #ifdef TIXML_USE_STL
 
-void TiXmlDocument::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
+void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag )
 {
 	// The basic issue with a document is that we don't know what we're
 	// streaming. Read something presumed to be a tag (and hope), then
@@ -802,7 +825,6 @@ TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
 		return 0;
 	}
 
-	TiXmlDocument* doc = GetDocument();
 	p = SkipWhiteSpace( p, encoding );
 
 	if ( !p || !*p )
@@ -873,17 +895,12 @@ TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
 		// Set the parent, so it can report errors
 		returnNode->parent = this;
 	}
-	else
-	{
-		if ( doc )
-			doc->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, TIXML_ENCODING_UNKNOWN );
-	}
 	return returnNode;
 }
 
 #ifdef TIXML_USE_STL
 
-void TiXmlElement::StreamIn (TIXML_ISTREAM * in, TIXML_STRING * tag)
+void TiXmlElement::StreamIn (std::istream * in, TIXML_STRING * tag)
 {
 	// We're called with some amount of pre-parsing. That is, some of "this"
 	// element is in "tag". Go ahead and stream to the closing ">"
@@ -918,6 +935,7 @@ void TiXmlElement::StreamIn (TIXML_ISTREAM * in, TIXML_STRING * tag)
 	{
 		// There is more. Could be:
 		//		text
+		//		cdata text (which looks like another node)
 		//		closing tag
 		//		another node.
 		for ( ;; )
@@ -965,6 +983,17 @@ void TiXmlElement::StreamIn (TIXML_ISTREAM * in, TIXML_STRING * tag)
 				*tag += (char) c;
 				in->get();
 
+				// Early out if we find the CDATA id.
+				if ( c == '[' && tag->size() >= 9 )
+				{
+					size_t len = tag->size();
+					const char* start = tag->c_str() + len - 9;
+					if ( strcmp( start, "<![CDATA[" ) == 0 ) {
+						assert( !closingTag );
+						break;
+					}
+				}
+
 				if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
 				{
 					firstCharFound = true;
@@ -1048,7 +1077,6 @@ const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEnc
 
     TIXML_STRING endTag ("</");
 	endTag += value;
-	endTag += ">";
 
 	// Check for and read attributes. Also look for an empty
 	// tag or an end tag.
@@ -1079,14 +1107,28 @@ const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEnc
 			// elements -- read the end tag, and return.
 			++p;
 			p = ReadValue( p, data, encoding );		// Note this is an Element method, and will set the error if one happens.
-			if ( !p || !*p )
+			if ( !p || !*p ) {
+				// We were looking for the end tag, but found nothing.
+				// Fix for [ 1663758 ] Failure to report error on bad XML
+				if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
 				return 0;
+			}
 
 			// We should find the end tag now
+			// note that:
+			// </foo > and
+			// </foo> 
+			// are both valid end tags.
 			if ( StringEqual( p, endTag.c_str(), false, encoding ) )
 			{
 				p += endTag.length();
-				return p;
+				p = SkipWhiteSpace( p, encoding );
+				if ( p && *p && *p == '>' ) {
+					++p;
+					return p;
+				}
+				if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
+				return 0;
 			}
 			else
 			{
@@ -1100,12 +1142,11 @@ const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEnc
 			TiXmlAttribute* attrib = new TiXmlAttribute();
 			if ( !attrib )
 			{
-				if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, pErr, data, encoding );
 				return 0;
 			}
 
 			attrib->SetDocument( document );
-			const char* pErr = p;
+			pErr = p;
 			p = attrib->Parse( p, data, encoding );
 
 			if ( !p || !*p )
@@ -1116,10 +1157,14 @@ const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEnc
 			}
 
 			// Handle the strange case of double attributes:
+			#ifdef TIXML_USE_STL
+			TiXmlAttribute* node = attributeSet.Find( attrib->NameTStr() );
+			#else
 			TiXmlAttribute* node = attributeSet.Find( attrib->Name() );
+			#endif
 			if ( node )
 			{
-				node->SetValue( attrib->Value() );
+				if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
 				delete attrib;
 				return 0;
 			}
@@ -1148,8 +1193,7 @@ const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXm
 
 			if ( !textNode )
 			{
-				if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, encoding );
-				    return 0;
+			    return 0;
 			}
 
 			if ( TiXmlBase::IsWhiteSpaceCondensed() )
@@ -1204,7 +1248,7 @@ const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXm
 
 
 #ifdef TIXML_USE_STL
-void TiXmlUnknown::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
+void TiXmlUnknown::StreamIn( std::istream * in, TIXML_STRING * tag )
 {
 	while ( in->good() )
 	{
@@ -1254,15 +1298,16 @@ const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEnc
 
 	if ( !p )
 	{
-		if ( document )	document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
+		if ( document )	
+			document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
 	}
-	if ( *p == '>' )
+	if ( p && *p == '>' )
 		return p+1;
 	return p;
 }
 
 #ifdef TIXML_USE_STL
-void TiXmlComment::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
+void TiXmlComment::StreamIn( std::istream * in, TIXML_STRING * tag )
 {
 	while ( in->good() )
 	{
@@ -1306,11 +1351,40 @@ const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEnc
 
 	if ( !StringEqual( p, startTag, false, encoding ) )
 	{
-		document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
+		if ( document )
+			document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
 		return 0;
 	}
 	p += strlen( startTag );
-	p = ReadText( p, &value, false, endTag, false, encoding );
+
+	// [ 1475201 ] TinyXML parses entities in comments
+	// Oops - ReadText doesn't work, because we don't want to parse the entities.
+	// p = ReadText( p, &value, false, endTag, false, encoding );
+	//
+	// from the XML spec:
+	/*
+	 [Definition: Comments may appear anywhere in a document outside other markup; in addition, 
+	              they may appear within the document type declaration at places allowed by the grammar. 
+				  They are not part of the document's character data; an XML processor MAY, but need not, 
+				  make it possible for an application to retrieve the text of comments. For compatibility, 
+				  the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity 
+				  references MUST NOT be recognized within comments.
+
+				  An example of a comment:
+
+				  <!-- declarations for <head> & <body> -->
+	*/
+
+    value = "";
+	// Keep all the white space.
+	while (	p && *p && !StringEqual( p, endTag, false, encoding ) )
+	{
+		value.append( p, 1 );
+		++p;
+	}
+	if ( p && *p ) 
+		p += strlen( endTag );
+
 	return p;
 }
 
@@ -1320,10 +1394,6 @@ const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlE
 	p = SkipWhiteSpace( p, encoding );
 	if ( !p || !*p ) return 0;
 
-	int tabsize = 4;
-	if ( document )
-		tabsize = document->TabSize();
-
 	if ( data )
 	{
 		data->Stamp( p, encoding );
@@ -1353,17 +1423,19 @@ const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlE
 	}
 	
 	const char* end;
+	const char SINGLE_QUOTE = '\'';
+	const char DOUBLE_QUOTE = '\"';
 
-	if ( *p == '\'' )
+	if ( *p == SINGLE_QUOTE )
 	{
 		++p;
-		end = "\'";
+		end = "\'";		// single quote in string
 		p = ReadText( p, &value, false, end, false, encoding );
 	}
-	else if ( *p == '"' )
+	else if ( *p == DOUBLE_QUOTE )
 	{
 		++p;
-		end = "\"";
+		end = "\"";		// double quote in string
 		p = ReadText( p, &value, false, end, false, encoding );
 	}
 	else
@@ -1372,10 +1444,17 @@ const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlE
 		// But this is such a common error that the parser will try
 		// its best, even without them.
 		value = "";
-		while (    p && *p										// existence
-				&& !IsWhiteSpace( *p ) && *p != '\n' && *p != '\r'	// whitespace
-				&& *p != '/' && *p != '>' )						// tag end
+		while (    p && *p											// existence
+				&& !IsWhiteSpace( *p )								// whitespace
+				&& *p != '/' && *p != '>' )							// tag end
 		{
+			if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE ) {
+				// [ 1451649 ] Attribute values with trailing quotes not handled correctly
+				// We did not have an opening quote but seem to have a 
+				// closing one. Give up and throw an error.
+				if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
+				return 0;
+			}
 			value += *p;
 			++p;
 		}
@@ -1384,11 +1463,15 @@ const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlE
 }
 
 #ifdef TIXML_USE_STL
-void TiXmlText::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
+void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag )
 {
-	if ( cdata )
+	while ( in->good() )
 	{
-		int c = in->get();	
+		int c = in->peek();	
+		if ( !cdata && (c == '<' ) ) 
+		{
+			return;
+		}
 		if ( c <= 0 )
 		{
 			TiXmlDocument* document = GetDocument();
@@ -1398,33 +1481,15 @@ void TiXmlText::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
 		}
 
 		(*tag) += (char) c;
+		in->get();	// "commits" the peek made above
 
-		if ( c == '>' 
-			 && tag->at( tag->length() - 2 ) == ']'
-			 && tag->at( tag->length() - 3 ) == ']' )
-		{
-			// All is well.
-			return;		
-		}
-	}
-	else
-	{
-		while ( in->good() )
-		{
-			int c = in->peek();	
-			if ( c == '<' )
-				return;
-			if ( c <= 0 )
-			{
-				TiXmlDocument* document = GetDocument();
-				if ( document )
-					document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
+		if ( cdata && c == '>' && tag->size() >= 3 ) {
+			size_t len = tag->size();
+			if ( (*tag)[len-2] == ']' && (*tag)[len-3] == ']' ) {
+				// terminator of cdata.
 				return;
 			}
-
-			(*tag) += (char) c;
-			in->get();
-		}
+		}    
 	}
 }
 #endif
@@ -1449,7 +1514,8 @@ const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncodi
 
 		if ( !StringEqual( p, startTag, false, encoding ) )
 		{
-			document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
+			if ( document )
+				document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
 			return 0;
 		}
 		p += strlen( startTag );
@@ -1473,14 +1539,14 @@ const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncodi
 
 		const char* end = "<";
 		p = ReadText( p, &value, ignoreWhite, end, false, encoding );
-		if ( p )
+		if ( p && *p )
 			return p-1;	// don't truncate the '<'
 		return 0;
 	}
 }
 
 #ifdef TIXML_USE_STL
-void TiXmlDeclaration::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
+void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag )
 {
 	while ( in->good() )
 	{