diff options
Diffstat (limited to 'src/reader/parser/inside_doctype.rs')
-rw-r--r-- | src/reader/parser/inside_doctype.rs | 237 |
1 files changed, 228 insertions, 9 deletions
diff --git a/src/reader/parser/inside_doctype.rs b/src/reader/parser/inside_doctype.rs index 8dcf367..93ea470 100644 --- a/src/reader/parser/inside_doctype.rs +++ b/src/reader/parser/inside_doctype.rs @@ -1,16 +1,235 @@ -use reader::lexer::Token; +use crate::reader::error::SyntaxError; +use crate::common::{is_name_char, is_name_start_char, is_whitespace_char}; +use crate::reader::lexer::Token; -use super::{Result, PullParser, State}; +use super::{DoctypeSubstate, PullParser, QuoteToken, Result, State}; impl PullParser { - pub fn inside_doctype(&mut self, t: Token) -> Option<Result> { - match t { - Token::TagEnd => { - self.lexer.enable_errors(); - self.into_state_continue(State::OutsideTag) - } + pub fn inside_doctype(&mut self, t: Token, substate: DoctypeSubstate) -> Option<Result> { + match substate { + DoctypeSubstate::Outside => match t { + Token::TagEnd => self.into_state_continue(State::OutsideTag), + Token::MarkupDeclarationStart => { + self.buf.clear(); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::InsideName)) + }, + Token::Character('%') => { + self.data.ref_data.clear(); + self.data.ref_data.push('%'); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceInDtd)) + }, + Token::CommentStart => { + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Comment)) + }, + Token::SingleQuote | Token::DoubleQuote => { + // just discard string literals + self.data.quote = Some(super::QuoteToken::from_token(&t)); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::String)) + }, + Token::CDataEnd | Token::CDataStart => Some(self.error(SyntaxError::UnexpectedToken(t))), + // TODO: parse SYSTEM, and [ + _ => None, + }, + DoctypeSubstate::String => match t { + Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => { None }, + Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => { None }, + Token::SingleQuote | Token::DoubleQuote => { + self.data.quote = None; + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) + }, + _ => None, + }, + DoctypeSubstate::Comment => match t { + Token::CommentEnd => { + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) + }, + _ => None, + }, + DoctypeSubstate::InsideName => match t { + Token::Character(c @ 'A'..='Z') => { + self.buf.push(c); + None + }, + Token::Character(c) if is_whitespace_char(c) => { + match self.buf.as_str() { + "ENTITY" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityName)), + "NOTATION" | "ELEMENT" | "ATTLIST" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)), + s => Some(self.error(SyntaxError::UnknownMarkupDeclaration(s.into()))), + } - _ => None + }, + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), + }, + DoctypeSubstate::BeforeEntityName => { + self.data.name.clear(); + match t { + Token::Character(c) if is_whitespace_char(c) => None, + Token::Character('%') => { // % is for PEDecl + self.data.name.push('%'); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceDefinitionStart)) + }, + Token::Character(c) if is_name_start_char(c) => { + self.data.name.push(c); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityName)) + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + } + }, + DoctypeSubstate::EntityName => match t { + Token::Character(c) if is_whitespace_char(c) => { + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue)) + }, + Token::Character(c) if is_name_char(c) => { + self.data.name.push(c); + None + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::BeforeEntityValue => { + self.buf.clear(); + match t { + Token::Character(c) if is_whitespace_char(c) => None, + // SYSTEM/PUBLIC not supported + Token::Character('S' | 'P') => { + let name = self.data.take_name(); + self.entities.entry(name).or_insert_with(String::new); // Dummy value, but at least the name is recognized + + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)) + }, + Token::SingleQuote | Token::DoubleQuote => { + self.data.quote = Some(super::QuoteToken::from_token(&t)); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue)) + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + } + }, + DoctypeSubstate::EntityValue => match t { + Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => { self.buf.push('\''); None }, + Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => { self.buf.push('"'); None }, + Token::SingleQuote | Token::DoubleQuote => { + self.data.quote = None; + let name = self.data.take_name(); + let val = self.take_buf(); + self.entities.entry(name).or_insert(val); // First wins + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)) // FIXME + }, + Token::ReferenceStart | Token::Character('&') => { + self.data.ref_data.clear(); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::NumericReferenceStart)) + }, + Token::Character('%') => { + self.data.ref_data.clear(); + self.data.ref_data.push('%'); // include literal % in the name to distinguish from regular entities + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceInValue)) + }, + Token::Character(c) if !self.is_valid_xml_char(c) => { + Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) + }, + Token::Character(c) => { + self.buf.push(c); + None + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::PEReferenceDefinitionStart => match t { + Token::Character(c) if is_whitespace_char(c) => { + None + }, + Token::Character(c) if is_name_start_char(c) => { + debug_assert_eq!(self.data.name, "%"); + self.data.name.push(c); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceDefinition)) + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::PEReferenceDefinition => match t { + Token::Character(c) if is_name_char(c) => { + self.data.name.push(c); + None + }, + Token::Character(c) if is_whitespace_char(c) => { + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue)) + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::PEReferenceInDtd => match t { + Token::Character(c) if is_name_char(c) => { + self.data.ref_data.push(c); + None + }, + Token::ReferenceEnd | Token::Character(';') => { + let name = self.data.take_ref_data(); + match self.entities.get(&name) { + Some(ent) => { + if let Err(e) = self.lexer.reparse(ent) { + return Some(Err(e)); + } + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) + }, + None => Some(self.error(SyntaxError::UndefinedEntity(name.into()))), + } + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::PEReferenceInValue => match t { + Token::Character(c) if is_name_char(c) => { + self.data.ref_data.push(c); + None + }, + Token::ReferenceEnd | Token::Character(';') => { + let name = self.data.take_ref_data(); + match self.entities.get(&name) { + Some(ent) => { + self.buf.push_str(ent); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue)) + }, + None => Some(self.error(SyntaxError::UndefinedEntity(name.into()))), + } + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::NumericReferenceStart => match t { + Token::Character('#') => { + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::NumericReference)) + }, + Token::Character(c) if !self.is_valid_xml_char(c) => { + Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) + }, + Token::Character(c) => { + self.buf.push('&'); + self.buf.push(c); + // named entities are not expanded inside doctype + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue)) + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::NumericReference => match t { + Token::ReferenceEnd | Token::Character(';') => { + let r = self.data.take_ref_data(); + // https://www.w3.org/TR/xml/#sec-entexpand + match self.numeric_reference_from_str(&r) { + Ok(c) => { + self.buf.push(c); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue)) + } + Err(e) => Some(self.error(e)), + } + }, + Token::Character(c) if !self.is_valid_xml_char(c) => { + Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) + }, + Token::Character(c) => { + self.data.ref_data.push(c); + None + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::SkipDeclaration => match t { + Token::TagEnd => { + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) + }, + _ => None, + }, } } } |