diff options
Diffstat (limited to 'tinyxmlparser.cpp')
-rw-r--r-- | tinyxmlparser.cpp | 214 |
1 files changed, 140 insertions, 74 deletions
diff --git a/tinyxmlparser.cpp b/tinyxmlparser.cpp index 67d0a9e..81b7eae 100644 --- a/tinyxmlparser.cpp +++ b/tinyxmlparser.cpp @@ -1,6 +1,6 @@ /* www.sourceforge.net/projects/tinyxml -Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com) +Original code by Lee Thomason (www.grinninglizard.com) This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any @@ -22,16 +22,25 @@ must not be misrepresented as being the original software. distribution. */ -#include "tinyxml.h" #include <ctype.h> #include <stddef.h> +#include "tinyxml.h" + //#define DEBUG_PARSER +#if defined( DEBUG_PARSER ) +# if defined( DEBUG ) && defined( _MSC_VER ) +# include <windows.h> +# define TIXML_LOG OutputDebugString +# else +# define TIXML_LOG printf +# endif +#endif // Note tha "PutString" hardcodes the same list. This // is less flexible than it appears. Changing the entries // or order will break putstring. -TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] = +TiXmlBase::Entity TiXmlBase::entity[ TiXmlBase::NUM_ENTITY ] = { { "&", 5, '&' }, { "<", 4, '<' }, @@ -165,7 +174,7 @@ class TiXmlParsingData public: void Stamp( const char* now, TiXmlEncoding encoding ); - const TiXmlCursor& Cursor() { return cursor; } + const TiXmlCursor& Cursor() const { return cursor; } private: // Only used by the document! @@ -277,7 +286,7 @@ void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding ) if ( encoding == TIXML_ENCODING_UTF8 ) { // Eat the 1 to 4 byte utf8 character. - int step = TiXmlBase::utf8ByteTable[*((unsigned char*)p)]; + int step = TiXmlBase::utf8ByteTable[*((const unsigned char*)p)]; if ( step == 0 ) step = 1; // Error case from bad encoding, but handle gracefully. p += step; @@ -337,7 +346,7 @@ const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding ) continue; } - if ( IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' ) // Still using old rules for white space. + if ( IsWhiteSpace( *p ) ) // Still using old rules for white space. ++p; else break; @@ -345,7 +354,7 @@ const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding ) } else { - while ( *p && IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' ) + while ( *p && IsWhiteSpace( *p ) ) ++p; } @@ -353,7 +362,7 @@ const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding ) } #ifdef TIXML_USE_STL -/*static*/ bool TiXmlBase::StreamWhiteSpace( TIXML_ISTREAM * in, TIXML_STRING * tag ) +/*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag ) { for( ;; ) { @@ -368,7 +377,7 @@ const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding ) } } -/*static*/ bool TiXmlBase::StreamTo( TIXML_ISTREAM * in, int character, TIXML_STRING * tag ) +/*static*/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag ) { //assert( character > 0 && character < 128 ); // else it won't work in utf-8 while ( in->good() ) @@ -386,8 +395,14 @@ const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding ) } #endif +// One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The +// "assign" optimization removes over 10% of the execution time. +// const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding ) { + // Oddly, not supported on some comilers, + //name->clear(); + // So use this: *name = ""; assert( p ); @@ -401,6 +416,7 @@ const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncodi if ( p && *p && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) ) { + const char* start = p; while( p && *p && ( IsAlphaNum( (unsigned char ) *p, encoding ) || *p == '_' @@ -408,9 +424,12 @@ const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncodi || *p == '.' || *p == ':' ) ) { - (*name) += *p; + //(*name) += *p; // expensive ++p; } + if ( p-start > 0 ) { + name->assign( start, p-start ); + } return p; } return 0; @@ -506,6 +525,8 @@ const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXml // So it wasn't an entity, its unrecognized, or something like that. *value = *p; // Don't put back the last one, since we return it! + //*length = 1; // Leave unrecognized entities - this doesn't really work. + // Just writes strange XML. return p+1; } @@ -610,12 +631,14 @@ const char* TiXmlBase::ReadText( const char* p, } } } - return p + strlen( endTag ); + if ( p && *p ) + p += strlen( endTag ); + return ( p && *p ) ? p : 0; } #ifdef TIXML_USE_STL -void TiXmlDocument::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag ) +void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag ) { // The basic issue with a document is that we don't know what we're // streaming. Read something presumed to be a tag (and hope), then @@ -802,7 +825,6 @@ TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding ) return 0; } - TiXmlDocument* doc = GetDocument(); p = SkipWhiteSpace( p, encoding ); if ( !p || !*p ) @@ -873,17 +895,12 @@ TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding ) // Set the parent, so it can report errors returnNode->parent = this; } - else - { - if ( doc ) - doc->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, TIXML_ENCODING_UNKNOWN ); - } return returnNode; } #ifdef TIXML_USE_STL -void TiXmlElement::StreamIn (TIXML_ISTREAM * in, TIXML_STRING * tag) +void TiXmlElement::StreamIn (std::istream * in, TIXML_STRING * tag) { // We're called with some amount of pre-parsing. That is, some of "this" // element is in "tag". Go ahead and stream to the closing ">" @@ -918,6 +935,7 @@ void TiXmlElement::StreamIn (TIXML_ISTREAM * in, TIXML_STRING * tag) { // There is more. Could be: // text + // cdata text (which looks like another node) // closing tag // another node. for ( ;; ) @@ -965,6 +983,17 @@ void TiXmlElement::StreamIn (TIXML_ISTREAM * in, TIXML_STRING * tag) *tag += (char) c; in->get(); + // Early out if we find the CDATA id. + if ( c == '[' && tag->size() >= 9 ) + { + size_t len = tag->size(); + const char* start = tag->c_str() + len - 9; + if ( strcmp( start, "<![CDATA[" ) == 0 ) { + assert( !closingTag ); + break; + } + } + if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) ) { firstCharFound = true; @@ -1048,7 +1077,6 @@ const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEnc TIXML_STRING endTag ("</"); endTag += value; - endTag += ">"; // Check for and read attributes. Also look for an empty // tag or an end tag. @@ -1079,14 +1107,28 @@ const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEnc // elements -- read the end tag, and return. ++p; p = ReadValue( p, data, encoding ); // Note this is an Element method, and will set the error if one happens. - if ( !p || !*p ) + if ( !p || !*p ) { + // We were looking for the end tag, but found nothing. + // Fix for [ 1663758 ] Failure to report error on bad XML + if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding ); return 0; + } // We should find the end tag now + // note that: + // </foo > and + // </foo> + // are both valid end tags. if ( StringEqual( p, endTag.c_str(), false, encoding ) ) { p += endTag.length(); - return p; + p = SkipWhiteSpace( p, encoding ); + if ( p && *p && *p == '>' ) { + ++p; + return p; + } + if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding ); + return 0; } else { @@ -1100,12 +1142,11 @@ const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEnc TiXmlAttribute* attrib = new TiXmlAttribute(); if ( !attrib ) { - if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, pErr, data, encoding ); return 0; } attrib->SetDocument( document ); - const char* pErr = p; + pErr = p; p = attrib->Parse( p, data, encoding ); if ( !p || !*p ) @@ -1116,10 +1157,14 @@ const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEnc } // Handle the strange case of double attributes: + #ifdef TIXML_USE_STL + TiXmlAttribute* node = attributeSet.Find( attrib->NameTStr() ); + #else TiXmlAttribute* node = attributeSet.Find( attrib->Name() ); + #endif if ( node ) { - node->SetValue( attrib->Value() ); + if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding ); delete attrib; return 0; } @@ -1148,8 +1193,7 @@ const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXm if ( !textNode ) { - if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, encoding ); - return 0; + return 0; } if ( TiXmlBase::IsWhiteSpaceCondensed() ) @@ -1204,7 +1248,7 @@ const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXm #ifdef TIXML_USE_STL -void TiXmlUnknown::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag ) +void TiXmlUnknown::StreamIn( std::istream * in, TIXML_STRING * tag ) { while ( in->good() ) { @@ -1254,15 +1298,16 @@ const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEnc if ( !p ) { - if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding ); + if ( document ) + document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding ); } - if ( *p == '>' ) + if ( p && *p == '>' ) return p+1; return p; } #ifdef TIXML_USE_STL -void TiXmlComment::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag ) +void TiXmlComment::StreamIn( std::istream * in, TIXML_STRING * tag ) { while ( in->good() ) { @@ -1306,11 +1351,40 @@ const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEnc if ( !StringEqual( p, startTag, false, encoding ) ) { - document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding ); + if ( document ) + document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding ); return 0; } p += strlen( startTag ); - p = ReadText( p, &value, false, endTag, false, encoding ); + + // [ 1475201 ] TinyXML parses entities in comments + // Oops - ReadText doesn't work, because we don't want to parse the entities. + // p = ReadText( p, &value, false, endTag, false, encoding ); + // + // from the XML spec: + /* + [Definition: Comments may appear anywhere in a document outside other markup; in addition, + they may appear within the document type declaration at places allowed by the grammar. + They are not part of the document's character data; an XML processor MAY, but need not, + make it possible for an application to retrieve the text of comments. For compatibility, + the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity + references MUST NOT be recognized within comments. + + An example of a comment: + + <!-- declarations for <head> & <body> --> + */ + + value = ""; + // Keep all the white space. + while ( p && *p && !StringEqual( p, endTag, false, encoding ) ) + { + value.append( p, 1 ); + ++p; + } + if ( p && *p ) + p += strlen( endTag ); + return p; } @@ -1320,10 +1394,6 @@ const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlE p = SkipWhiteSpace( p, encoding ); if ( !p || !*p ) return 0; - int tabsize = 4; - if ( document ) - tabsize = document->TabSize(); - if ( data ) { data->Stamp( p, encoding ); @@ -1353,17 +1423,19 @@ const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlE } const char* end; + const char SINGLE_QUOTE = '\''; + const char DOUBLE_QUOTE = '\"'; - if ( *p == '\'' ) + if ( *p == SINGLE_QUOTE ) { ++p; - end = "\'"; + end = "\'"; // single quote in string p = ReadText( p, &value, false, end, false, encoding ); } - else if ( *p == '"' ) + else if ( *p == DOUBLE_QUOTE ) { ++p; - end = "\""; + end = "\""; // double quote in string p = ReadText( p, &value, false, end, false, encoding ); } else @@ -1372,10 +1444,17 @@ const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlE // But this is such a common error that the parser will try // its best, even without them. value = ""; - while ( p && *p // existence - && !IsWhiteSpace( *p ) && *p != '\n' && *p != '\r' // whitespace - && *p != '/' && *p != '>' ) // tag end + while ( p && *p // existence + && !IsWhiteSpace( *p ) // whitespace + && *p != '/' && *p != '>' ) // tag end { + if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE ) { + // [ 1451649 ] Attribute values with trailing quotes not handled correctly + // We did not have an opening quote but seem to have a + // closing one. Give up and throw an error. + if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding ); + return 0; + } value += *p; ++p; } @@ -1384,11 +1463,15 @@ const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlE } #ifdef TIXML_USE_STL -void TiXmlText::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag ) +void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag ) { - if ( cdata ) + while ( in->good() ) { - int c = in->get(); + int c = in->peek(); + if ( !cdata && (c == '<' ) ) + { + return; + } if ( c <= 0 ) { TiXmlDocument* document = GetDocument(); @@ -1398,33 +1481,15 @@ void TiXmlText::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag ) } (*tag) += (char) c; + in->get(); // "commits" the peek made above - if ( c == '>' - && tag->at( tag->length() - 2 ) == ']' - && tag->at( tag->length() - 3 ) == ']' ) - { - // All is well. - return; - } - } - else - { - while ( in->good() ) - { - int c = in->peek(); - if ( c == '<' ) - return; - if ( c <= 0 ) - { - TiXmlDocument* document = GetDocument(); - if ( document ) - document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); + if ( cdata && c == '>' && tag->size() >= 3 ) { + size_t len = tag->size(); + if ( (*tag)[len-2] == ']' && (*tag)[len-3] == ']' ) { + // terminator of cdata. return; } - - (*tag) += (char) c; - in->get(); - } + } } } #endif @@ -1449,7 +1514,8 @@ const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncodi if ( !StringEqual( p, startTag, false, encoding ) ) { - document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding ); + if ( document ) + document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding ); return 0; } p += strlen( startTag ); @@ -1473,14 +1539,14 @@ const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncodi const char* end = "<"; p = ReadText( p, &value, ignoreWhite, end, false, encoding ); - if ( p ) + if ( p && *p ) return p-1; // don't truncate the '<' return 0; } } #ifdef TIXML_USE_STL -void TiXmlDeclaration::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag ) +void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag ) { while ( in->good() ) { |