diff options
Diffstat (limited to 'src/machine.rs')
-rw-r--r-- | src/machine.rs | 1987 |
1 files changed, 0 insertions, 1987 deletions
diff --git a/src/machine.rs b/src/machine.rs deleted file mode 100644 index fd4b36b..0000000 --- a/src/machine.rs +++ /dev/null @@ -1,1987 +0,0 @@ -use crate::entities::try_read_character_reference; -use crate::offset::{Offset, Position}; -use crate::token::AttrValueSyntax; -use crate::tokenizer::CdataAction; -use crate::utils::{ - ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat, State, -}; -use crate::{reader::Reader, Emitter, Error, Tokenizer}; - -pub enum ControlToken { - Eof, - Continue, - CdataOpen, -} - -#[inline] -pub fn consume<O, R, E>(slf: &mut Tokenizer<R, O, E>) -> Result<ControlToken, R::Error> -where - O: Offset, - R: Reader + Position<O>, - E: Emitter<O>, -{ - macro_rules! mutate_character_reference { - (* $mul:literal + $x:ident - $sub:literal) => { - match slf - .character_reference_code - .checked_mul($mul) - .and_then(|cr| cr.checked_add($x as u32 - $sub)) - { - Some(cr) => slf.character_reference_code = cr, - None => { - // provoke err - slf.character_reference_code = 0x110000; - } - }; - }; - } - - slf.position_before_match = slf.reader.position(); - - match slf.state { - State::Data => match slf.read_char()? { - Some('&') => { - slf.return_state = Some(slf.state); - slf.state = State::CharacterReference; - Ok(ControlToken::Continue) - } - Some('<') => { - slf.some_offset = slf.position_before_match; - slf.state = State::TagOpen; - Ok(ControlToken::Continue) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.emitter.emit_string("\0"); - Ok(ControlToken::Continue) - } - Some(x) => { - slf.emitter.emit_string(ctostr!(x)); - Ok(ControlToken::Continue) - } - None => Ok(ControlToken::Eof), - }, - State::RcData => match slf.read_char()? { - Some('&') => { - slf.return_state = Some(State::RcData); - slf.state = State::CharacterReference; - Ok(ControlToken::Continue) - } - Some('<') => { - slf.state = State::RcDataLessThanSign; - Ok(ControlToken::Continue) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.emitter.emit_string("\u{fffd}"); - Ok(ControlToken::Continue) - } - Some(x) => { - slf.emitter.emit_string(ctostr!(x)); - Ok(ControlToken::Continue) - } - None => Ok(ControlToken::Eof), - }, - State::RawText => match slf.read_char()? { - Some('<') => { - slf.state = State::RawTextLessThanSign; - Ok(ControlToken::Continue) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.emitter.emit_string("\u{fffd}"); - Ok(ControlToken::Continue) - } - Some(x) => { - slf.emitter.emit_string(ctostr!(x)); - Ok(ControlToken::Continue) - } - None => Ok(ControlToken::Eof), - }, - State::ScriptData => match slf.read_char()? { - Some('<') => { - slf.state = State::ScriptDataLessThanSign; - Ok(ControlToken::Continue) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.emitter.emit_string("\u{fffd}"); - Ok(ControlToken::Continue) - } - Some(x) => { - slf.emitter.emit_string(ctostr!(x)); - Ok(ControlToken::Continue) - } - None => Ok(ControlToken::Eof), - }, - State::PlainText => match slf.read_char()? { - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.emitter.emit_string("\u{fffd}"); - Ok(ControlToken::Continue) - } - Some(x) => { - slf.emitter.emit_string(ctostr!(x)); - Ok(ControlToken::Continue) - } - None => Ok(ControlToken::Eof), - }, - State::TagOpen => match slf.read_char()? { - Some('!') => { - slf.state = State::MarkupDeclarationOpen; - Ok(ControlToken::Continue) - } - Some('/') => { - slf.state = State::EndTagOpen; - Ok(ControlToken::Continue) - } - Some(x) if x.is_ascii_alphabetic() => { - slf.init_start_tag(); - slf.state = State::TagName; - slf.unread_char(Some(x)); - Ok(ControlToken::Continue) - } - c @ Some('?') => { - slf.emit_error(Error::UnexpectedQuestionMarkInsteadOfTagName); - slf.emitter.init_comment(slf.reader.position()); - slf.state = State::BogusComment; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofBeforeTagName); - slf.emitter.emit_string("<"); - Ok(ControlToken::Eof) - } - c @ Some(_) => { - slf.emit_error(Error::InvalidFirstCharacterOfTagName); - slf.state = State::Data; - slf.emitter.emit_string("<"); - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::EndTagOpen => match slf.read_char()? { - Some(x) if x.is_ascii_alphabetic() => { - slf.init_end_tag(); - slf.state = State::TagName; - slf.unread_char(Some(x)); - Ok(ControlToken::Continue) - } - Some('>') => { - slf.emit_error(Error::MissingEndTagName); - slf.state = State::Data; - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofBeforeTagName); - slf.emitter.emit_string("</"); - Ok(ControlToken::Eof) - } - Some(x) => { - slf.emit_error(Error::InvalidFirstCharacterOfTagName); - slf.emitter.init_comment(slf.reader.position()); - slf.state = State::BogusComment; - slf.unread_char(Some(x)); - Ok(ControlToken::Continue) - } - }, - State::TagName => match slf.read_char()? { - Some(whitespace_pat!()) => { - slf.emitter.terminate_tag_name(slf.position_before_match); - slf.state = State::BeforeAttributeName; - Ok(ControlToken::Continue) - } - Some('/') => { - slf.emitter.terminate_tag_name(slf.position_before_match); - slf.state = State::SelfClosingStartTag; - Ok(ControlToken::Continue) - } - Some('>') => { - slf.emitter.terminate_tag_name(slf.position_before_match); - slf.state = State::Data; - slf.emit_current_tag(); - Ok(ControlToken::Continue) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.push_tag_name("\u{fffd}"); - Ok(ControlToken::Continue) - } - Some(x) => { - slf.push_tag_name(ctostr!(x.to_ascii_lowercase())); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInTag); - Ok(ControlToken::Eof) - } - }, - State::RcDataLessThanSign => match slf.read_char()? { - Some('/') => { - slf.temporary_buffer.clear(); - slf.state = State::RcDataEndTagOpen; - Ok(ControlToken::Continue) - } - c => { - slf.emitter.emit_string("<"); - slf.state = State::RcData; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::RcDataEndTagOpen => match slf.read_char()? { - Some(x) if x.is_ascii_alphabetic() => { - slf.init_end_tag(); - slf.state = State::RcDataEndTagName; - slf.unread_char(Some(x)); - Ok(ControlToken::Continue) - } - c => { - slf.emitter.emit_string("</"); - slf.state = State::RcData; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::RcDataEndTagName => match slf.read_char()? { - Some(whitespace_pat!()) if slf.current_end_tag_is_appropriate() => { - slf.state = State::BeforeAttributeName; - Ok(ControlToken::Continue) - } - Some('/') if slf.current_end_tag_is_appropriate() => { - slf.state = State::SelfClosingStartTag; - Ok(ControlToken::Continue) - } - Some('>') if slf.current_end_tag_is_appropriate() => { - slf.state = State::Data; - slf.emit_current_tag(); - Ok(ControlToken::Continue) - } - Some(x) if x.is_ascii_alphabetic() => { - slf.push_tag_name(ctostr!(x.to_ascii_lowercase())); - slf.temporary_buffer.push(x); - Ok(ControlToken::Continue) - } - c => { - slf.emitter.emit_string("</"); - slf.flush_buffer_characters(); - - slf.state = State::RcData; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::RawTextLessThanSign => match slf.read_char()? { - Some('/') => { - slf.temporary_buffer.clear(); - slf.state = State::RawTextEndTagOpen; - Ok(ControlToken::Continue) - } - c => { - slf.emitter.emit_string("<"); - slf.state = State::RawText; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::RawTextEndTagOpen => match slf.read_char()? { - Some(x) if x.is_ascii_alphabetic() => { - slf.init_end_tag(); - slf.state = State::RawTextEndTagName; - slf.unread_char(Some(x)); - Ok(ControlToken::Continue) - } - c => { - slf.emitter.emit_string("</"); - slf.state = State::RawText; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::RawTextEndTagName => match slf.read_char()? { - Some(whitespace_pat!()) if slf.current_end_tag_is_appropriate() => { - slf.state = State::BeforeAttributeName; - Ok(ControlToken::Continue) - } - Some('/') if slf.current_end_tag_is_appropriate() => { - slf.state = State::SelfClosingStartTag; - Ok(ControlToken::Continue) - } - Some('>') if slf.current_end_tag_is_appropriate() => { - slf.state = State::Data; - slf.emit_current_tag(); - Ok(ControlToken::Continue) - } - Some(x) if x.is_ascii_alphabetic() => { - slf.push_tag_name(ctostr!(x.to_ascii_lowercase())); - slf.temporary_buffer.push(x); - Ok(ControlToken::Continue) - } - c => { - slf.emitter.emit_string("</"); - slf.flush_buffer_characters(); - - slf.state = State::RawText; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::ScriptDataLessThanSign => match slf.read_char()? { - Some('/') => { - slf.temporary_buffer.clear(); - slf.state = State::ScriptDataEndTagOpen; - Ok(ControlToken::Continue) - } - Some('!') => { - slf.state = State::ScriptDataEscapeStart; - slf.emitter.emit_string("<!"); - Ok(ControlToken::Continue) - } - c => { - slf.emitter.emit_string("<"); - slf.state = State::ScriptData; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::ScriptDataEndTagOpen => match slf.read_char()? { - Some(x) if x.is_ascii_alphabetic() => { - slf.init_end_tag(); - slf.state = State::ScriptDataEndTagName; - slf.unread_char(Some(x)); - Ok(ControlToken::Continue) - } - c => { - slf.emitter.emit_string("</"); - slf.state = State::ScriptData; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::ScriptDataEndTagName => match slf.read_char()? { - Some(whitespace_pat!()) if slf.current_end_tag_is_appropriate() => { - slf.state = State::BeforeAttributeName; - Ok(ControlToken::Continue) - } - Some('/') if slf.current_end_tag_is_appropriate() => { - slf.state = State::SelfClosingStartTag; - Ok(ControlToken::Continue) - } - Some('>') if slf.current_end_tag_is_appropriate() => { - slf.state = State::Data; - slf.emit_current_tag(); - Ok(ControlToken::Continue) - } - Some(x) if x.is_ascii_alphabetic() => { - slf.push_tag_name(ctostr!(x.to_ascii_lowercase())); - slf.temporary_buffer.push(x.to_ascii_lowercase()); - Ok(ControlToken::Continue) - } - c => { - slf.emitter.emit_string("</"); - slf.flush_buffer_characters(); - slf.state = State::Data; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::ScriptDataEscapeStart => match slf.read_char()? { - Some('-') => { - slf.state = State::ScriptDataEscapeStartDash; - slf.emitter.emit_string("-"); - Ok(ControlToken::Continue) - } - c => { - slf.state = State::ScriptData; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::ScriptDataEscapeStartDash => match slf.read_char()? { - Some('-') => { - slf.state = State::ScriptDataEscapedDashDash; - slf.emitter.emit_string("-"); - Ok(ControlToken::Continue) - } - c => { - slf.state = State::ScriptData; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::ScriptDataEscaped => match slf.read_char()? { - Some('-') => { - slf.state = State::ScriptDataEscapedDash; - slf.emitter.emit_string("-"); - Ok(ControlToken::Continue) - } - Some('<') => { - slf.state = State::ScriptDataEscapedLessThanSign; - Ok(ControlToken::Continue) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.emitter.emit_string("\u{fffd}"); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInScriptHtmlCommentLikeText); - Ok(ControlToken::Eof) - } - Some(x) => { - slf.emitter.emit_string(ctostr!(x)); - Ok(ControlToken::Continue) - } - }, - State::ScriptDataEscapedDash => match slf.read_char()? { - Some('-') => { - slf.state = State::ScriptDataEscapedDashDash; - slf.emitter.emit_string("-"); - Ok(ControlToken::Continue) - } - Some('<') => { - slf.state = State::ScriptDataEscapedLessThanSign; - Ok(ControlToken::Continue) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.state = State::ScriptDataEscaped; - slf.emitter.emit_string("\u{fffd}"); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInScriptHtmlCommentLikeText); - Ok(ControlToken::Eof) - } - Some(x) => { - slf.state = State::ScriptDataEscaped; - slf.emitter.emit_string(ctostr!(x)); - Ok(ControlToken::Continue) - } - }, - State::ScriptDataEscapedDashDash => match slf.read_char()? { - Some('-') => { - slf.emitter.emit_string("-"); - Ok(ControlToken::Continue) - } - Some('<') => { - slf.state = State::ScriptDataEscapedLessThanSign; - Ok(ControlToken::Continue) - } - Some('>') => { - slf.state = State::ScriptData; - slf.emitter.emit_string(">"); - Ok(ControlToken::Continue) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.state = State::ScriptDataEscaped; - slf.emitter.emit_string("\u{fffd}"); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInScriptHtmlCommentLikeText); - Ok(ControlToken::Eof) - } - Some(x) => { - slf.state = State::ScriptDataEscaped; - slf.emitter.emit_string(ctostr!(x)); - Ok(ControlToken::Continue) - } - }, - State::ScriptDataEscapedLessThanSign => match slf.read_char()? { - Some('/') => { - slf.temporary_buffer.clear(); - slf.state = State::ScriptDataEscapedEndTagOpen; - Ok(ControlToken::Continue) - } - Some(x) if x.is_ascii_alphabetic() => { - slf.temporary_buffer.clear(); - slf.emitter.emit_string("<"); - slf.state = State::ScriptDataDoubleEscapeStart; - slf.unread_char(Some(x)); - Ok(ControlToken::Continue) - } - c => { - slf.emitter.emit_string("<"); - slf.state = State::ScriptDataEscaped; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::ScriptDataEscapedEndTagOpen => match slf.read_char()? { - Some(x) if x.is_ascii_alphabetic() => { - slf.init_end_tag(); - slf.state = State::ScriptDataEscapedEndTagName; - slf.unread_char(Some(x)); - Ok(ControlToken::Continue) - } - c => { - slf.emitter.emit_string("</"); - slf.unread_char(c); - slf.state = State::ScriptDataEscaped; - Ok(ControlToken::Continue) - } - }, - State::ScriptDataEscapedEndTagName => match slf.read_char()? { - Some(whitespace_pat!()) if slf.current_end_tag_is_appropriate() => { - slf.state = State::BeforeAttributeName; - Ok(ControlToken::Continue) - } - Some('/') if slf.current_end_tag_is_appropriate() => { - slf.state = State::SelfClosingStartTag; - Ok(ControlToken::Continue) - } - Some('>') if slf.current_end_tag_is_appropriate() => { - slf.state = State::Data; - slf.emit_current_tag(); - Ok(ControlToken::Continue) - } - Some(x) if x.is_ascii_alphabetic() => { - slf.push_tag_name(ctostr!(x.to_ascii_lowercase())); - slf.temporary_buffer.push(x); - Ok(ControlToken::Continue) - } - c => { - slf.emitter.emit_string("</"); - slf.flush_buffer_characters(); - slf.state = State::ScriptDataEscaped; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::ScriptDataDoubleEscapeStart => match slf.read_char()? { - Some(x @ whitespace_pat!() | x @ '/' | x @ '>') => { - if slf.temporary_buffer == "script" { - slf.state = State::ScriptDataDoubleEscaped; - } else { - slf.state = State::ScriptDataEscaped; - } - slf.emitter.emit_string(ctostr!(x)); - Ok(ControlToken::Continue) - } - Some(x) if x.is_ascii_alphabetic() => { - slf.temporary_buffer.push(x.to_ascii_lowercase()); - slf.emitter.emit_string(ctostr!(x)); - Ok(ControlToken::Continue) - } - c => { - slf.state = State::ScriptDataEscaped; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::ScriptDataDoubleEscaped => match slf.read_char()? { - Some('-') => { - slf.state = State::ScriptDataDoubleEscapedDash; - slf.emitter.emit_string("-"); - Ok(ControlToken::Continue) - } - Some('<') => { - slf.state = State::ScriptDataDoubleEscapedLessThanSign; - slf.emitter.emit_string("<"); - Ok(ControlToken::Continue) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.emitter.emit_string("\u{fffd}"); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInScriptHtmlCommentLikeText); - Ok(ControlToken::Eof) - } - Some(x) => { - slf.emitter.emit_string(ctostr!(x)); - Ok(ControlToken::Continue) - } - }, - State::ScriptDataDoubleEscapedDash => match slf.read_char()? { - Some('-') => { - slf.state = State::ScriptDataDoubleEscapedDashDash; - slf.emitter.emit_string("-"); - Ok(ControlToken::Continue) - } - Some('<') => { - slf.state = State::ScriptDataDoubleEscapedLessThanSign; - slf.emitter.emit_string("<"); - Ok(ControlToken::Continue) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.state = State::ScriptDataDoubleEscaped; - slf.emitter.emit_string("\u{fffd}"); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInScriptHtmlCommentLikeText); - Ok(ControlToken::Eof) - } - Some(x) => { - slf.state = State::ScriptDataDoubleEscaped; - slf.emitter.emit_string(ctostr!(x)); - Ok(ControlToken::Continue) - } - }, - State::ScriptDataDoubleEscapedDashDash => match slf.read_char()? { - Some('-') => { - slf.emitter.emit_string("-"); - Ok(ControlToken::Continue) - } - Some('<') => { - slf.emitter.emit_string("<"); - slf.state = State::ScriptDataDoubleEscapedLessThanSign; - Ok(ControlToken::Continue) - } - Some('>') => { - slf.emitter.emit_string(">"); - slf.state = State::ScriptData; - Ok(ControlToken::Continue) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.state = State::ScriptDataDoubleEscaped; - slf.emitter.emit_string("\u{fffd}"); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInScriptHtmlCommentLikeText); - Ok(ControlToken::Eof) - } - Some(x) => { - slf.state = State::ScriptDataDoubleEscaped; - slf.emitter.emit_string(ctostr!(x)); - Ok(ControlToken::Continue) - } - }, - State::ScriptDataDoubleEscapedLessThanSign => match slf.read_char()? { - Some('/') => { - slf.temporary_buffer.clear(); - slf.state = State::ScriptDataDoubleEscapeEnd; - slf.emitter.emit_string("/"); - Ok(ControlToken::Continue) - } - c => { - slf.state = State::ScriptDataDoubleEscaped; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::ScriptDataDoubleEscapeEnd => match slf.read_char()? { - Some(x @ whitespace_pat!() | x @ '/' | x @ '>') => { - if slf.temporary_buffer == "script" { - slf.state = State::ScriptDataEscaped; - } else { - slf.state = State::ScriptDataDoubleEscaped; - } - - slf.emitter.emit_string(ctostr!(x)); - Ok(ControlToken::Continue) - } - Some(x) if x.is_ascii_alphabetic() => { - slf.temporary_buffer.push(x.to_ascii_lowercase()); - slf.emitter.emit_string(ctostr!(x)); - Ok(ControlToken::Continue) - } - c => { - slf.state = State::ScriptDataDoubleEscaped; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::BeforeAttributeName => match slf.read_char()? { - Some(whitespace_pat!()) => Ok(ControlToken::Continue), - c @ Some('/' | '>') | c @ None => { - slf.state = State::AfterAttributeName; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - Some('=') => { - slf.emit_error(Error::UnexpectedEqualsSignBeforeAttributeName); - slf.emitter.init_attribute_name(slf.reader.position()); - slf.emitter.push_attribute_name("="); - slf.state = State::AttributeName; - Ok(ControlToken::Continue) - } - Some(x) => { - slf.emitter.init_attribute_name(slf.position_before_match); - slf.state = State::AttributeName; - slf.unread_char(Some(x)); - Ok(ControlToken::Continue) - } - }, - State::AttributeName => match slf.read_char()? { - c @ Some(whitespace_pat!() | '/' | '>') | c @ None => { - slf.emitter - .terminate_attribute_name(slf.position_before_match); - slf.state = State::AfterAttributeName; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - Some('=') => { - slf.emitter - .terminate_attribute_name(slf.position_before_match); - slf.state = State::BeforeAttributeValue; - Ok(ControlToken::Continue) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.emitter.push_attribute_name("\u{fffd}"); - Ok(ControlToken::Continue) - } - Some(x @ '"' | x @ '\'' | x @ '<') => { - slf.emit_error(Error::UnexpectedCharacterInAttributeName); - slf.emitter - .push_attribute_name(ctostr!(x.to_ascii_lowercase())); - Ok(ControlToken::Continue) - } - Some(x) => { - slf.emitter - .push_attribute_name(ctostr!(x.to_ascii_lowercase())); - Ok(ControlToken::Continue) - } - }, - State::AfterAttributeName => match slf.read_char()? { - Some(whitespace_pat!()) => Ok(ControlToken::Continue), - Some('/') => { - slf.state = State::SelfClosingStartTag; - Ok(ControlToken::Continue) - } - Some('=') => { - slf.state = State::BeforeAttributeValue; - Ok(ControlToken::Continue) - } - Some('>') => { - slf.state = State::Data; - slf.emit_current_tag(); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInTag); - Ok(ControlToken::Eof) - } - Some(x) => { - slf.emitter.init_attribute_name(slf.position_before_match); - slf.state = State::AttributeName; - slf.unread_char(Some(x)); - Ok(ControlToken::Continue) - } - }, - State::BeforeAttributeValue => match slf.read_char()? { - Some(whitespace_pat!()) => Ok(ControlToken::Continue), - Some('"') => { - slf.emitter - .init_attribute_value(AttrValueSyntax::DoubleQuoted, slf.reader.position()); - slf.state = State::AttributeValueDoubleQuoted; - Ok(ControlToken::Continue) - } - Some('\'') => { - slf.emitter - .init_attribute_value(AttrValueSyntax::SingleQuoted, slf.reader.position()); - slf.state = State::AttributeValueSingleQuoted; - Ok(ControlToken::Continue) - } - Some('>') => { - slf.emit_error(Error::MissingAttributeValue); - slf.state = State::Data; - slf.emit_current_tag(); - Ok(ControlToken::Continue) - } - c => { - slf.emitter - .init_attribute_value(AttrValueSyntax::Unquoted, slf.position_before_match); - slf.state = State::AttributeValueUnquoted; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::AttributeValueDoubleQuoted => match slf.read_char()? { - Some('"') => { - slf.emitter.terminate_attribute_value( - // We cannot simply pass slf.position_before_match because - // State::NamedCharacterReference calls Tokenizer::unread_char - // which Reader::position doesn't account for. - // TODO: pass slf.position_before_match once CharacterReference has been converted to a function call - slf.reader.position() - slf.reader.len_of_char_in_current_encoding('"'), - ); - slf.state = State::AfterAttributeValueQuoted; - Ok(ControlToken::Continue) - } - Some('&') => { - slf.return_state = Some(State::AttributeValueDoubleQuoted); - slf.state = State::CharacterReference; - Ok(ControlToken::Continue) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.emitter.push_attribute_value("\u{fffd}"); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInTag); - Ok(ControlToken::Eof) - } - Some(x) => { - slf.emitter.push_attribute_value(ctostr!(x)); - Ok(ControlToken::Continue) - } - }, - State::AttributeValueSingleQuoted => match slf.read_char()? { - Some('\'') => { - slf.emitter.terminate_attribute_value( - // We cannot simply pass slf.position_before_match because - // State::NamedCharacterReference calls Tokenizer::unread_char - // which Reader::position doesn't account for. - // TODO: pass slf.position_before_match once CharacterReference has been converted to a function call - slf.reader.position() - slf.reader.len_of_char_in_current_encoding('\''), - ); - slf.state = State::AfterAttributeValueQuoted; - Ok(ControlToken::Continue) - } - Some('&') => { - slf.return_state = Some(State::AttributeValueSingleQuoted); - slf.state = State::CharacterReference; - Ok(ControlToken::Continue) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.emitter.push_attribute_value("\u{fffd}"); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInTag); - Ok(ControlToken::Eof) - } - Some(x) => { - slf.emitter.push_attribute_value(ctostr!(x)); - Ok(ControlToken::Continue) - } - }, - State::AttributeValueUnquoted => match slf.read_char()? { - Some(whitespace_pat!()) => { - slf.emitter.terminate_attribute_value( - // We cannot simply pass slf.position_before_match because - // State::NamedCharacterReference calls Tokenizer::unread_char - // which Reader::position doesn't account for. - // TODO: pass slf.position_before_match once CharacterReference has been converted to a function call - slf.reader.position() - slf.reader.len_of_char_in_current_encoding(' '), - ); - slf.state = State::BeforeAttributeName; - Ok(ControlToken::Continue) - } - Some('&') => { - slf.return_state = Some(State::AttributeValueUnquoted); - slf.state = State::CharacterReference; - Ok(ControlToken::Continue) - } - Some('>') => { - slf.state = State::Data; - slf.emit_current_tag(); - Ok(ControlToken::Continue) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.emitter.push_attribute_value("\u{fffd}"); - Ok(ControlToken::Continue) - } - Some(x @ '"' | x @ '\'' | x @ '<' | x @ '=' | x @ '\u{60}') => { - slf.emit_error(Error::UnexpectedCharacterInUnquotedAttributeValue); - slf.emitter.push_attribute_value(ctostr!(x)); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInTag); - Ok(ControlToken::Eof) - } - Some(x) => { - slf.emitter.push_attribute_value(ctostr!(x)); - Ok(ControlToken::Continue) - } - }, - State::AfterAttributeValueQuoted => match slf.read_char()? { - Some(whitespace_pat!()) => { - slf.state = State::BeforeAttributeName; - Ok(ControlToken::Continue) - } - Some('/') => { - slf.state = State::SelfClosingStartTag; - Ok(ControlToken::Continue) - } - Some('>') => { - slf.state = State::Data; - slf.emit_current_tag(); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInTag); - Ok(ControlToken::Eof) - } - Some(x) => { - slf.emit_error(Error::MissingWhitespaceBetweenAttributes); - slf.state = State::BeforeAttributeName; - slf.unread_char(Some(x)); - Ok(ControlToken::Continue) - } - }, - State::SelfClosingStartTag => match slf.read_char()? { - Some('>') => { - slf.emitter.set_self_closing( - slf.position_before_match - slf.reader.len_of_char_in_current_encoding('/') - ..slf.position_before_match, - ); - slf.state = State::Data; - slf.emit_current_tag(); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInTag); - Ok(ControlToken::Eof) - } - Some(x) => { - slf.emit_error(Error::UnexpectedSolidusInTag); - slf.state = State::BeforeAttributeName; - slf.unread_char(Some(x)); - Ok(ControlToken::Continue) - } - }, - State::BogusComment => match slf.read_char()? { - Some('>') => { - slf.state = State::Data; - slf.emitter.emit_current_comment(slf.position_before_match); - Ok(ControlToken::Continue) - } - None => { - slf.emitter.emit_current_comment(slf.position_before_match); - Ok(ControlToken::Eof) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.emitter.push_comment("\u{fffd}"); - Ok(ControlToken::Continue) - } - Some(x) => { - slf.emitter.push_comment(ctostr!(x)); - Ok(ControlToken::Continue) - } - }, - State::MarkupDeclarationOpen => match slf.read_char()? { - Some('-') if slf.try_read_string("-", true)? => { - slf.emitter.init_comment(slf.reader.position()); - slf.state = State::CommentStart; - Ok(ControlToken::Continue) - } - Some('d' | 'D') if slf.try_read_string("octype", false)? => { - slf.state = State::Doctype; - Ok(ControlToken::Continue) - } - Some('[') if slf.try_read_string("CDATA[", true)? => Ok(ControlToken::CdataOpen), - c => { - slf.emit_error(Error::IncorrectlyOpenedComment); - slf.emitter.init_comment(slf.position_before_match); - slf.state = State::BogusComment; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::CommentStart => match slf.read_char()? { - Some('-') => { - slf.state = State::CommentStartDash; - Ok(ControlToken::Continue) - } - Some('>') => { - slf.emit_error(Error::AbruptClosingOfEmptyComment); - slf.state = State::Data; - slf.emitter.emit_current_comment(slf.position_before_match); - Ok(ControlToken::Continue) - } - c => { - slf.unread_char(c); - slf.state = State::Comment; - Ok(ControlToken::Continue) - } - }, - State::CommentStartDash => match slf.read_char()? { - Some('-') => { - slf.state = State::CommentEnd; - Ok(ControlToken::Continue) - } - Some(c @ '>') => { - slf.emit_error(Error::AbruptClosingOfEmptyComment); - slf.state = State::Data; - slf.emitter.emit_current_comment( - slf.position_before_match - slf.reader.len_of_char_in_current_encoding(c), - ); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInComment); - slf.emitter.emit_current_comment( - slf.position_before_match - slf.reader.len_of_char_in_current_encoding('-'), - ); - Ok(ControlToken::Eof) - } - c @ Some(_) => { - slf.emitter.push_comment("-"); - slf.unread_char(c); - slf.state = State::Comment; - Ok(ControlToken::Continue) - } - }, - State::Comment => match slf.read_char()? { - Some('<') => { - slf.emitter.push_comment("<"); - slf.state = State::CommentLessThanSign; - Ok(ControlToken::Continue) - } - Some('-') => { - slf.some_offset = slf.position_before_match; - slf.state = State::CommentEndDash; - Ok(ControlToken::Continue) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.emitter.push_comment("\u{fffd}"); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInComment); - slf.emitter.emit_current_comment(slf.reader.position()); - Ok(ControlToken::Eof) - } - Some(x) => { - slf.emitter.push_comment(ctostr!(x)); - Ok(ControlToken::Continue) - } - }, - State::CommentLessThanSign => match slf.read_char()? { - Some('!') => { - slf.emitter.push_comment("!"); - slf.state = State::CommentLessThanSignBang; - Ok(ControlToken::Continue) - } - Some('<') => { - slf.emitter.push_comment("<"); - Ok(ControlToken::Continue) - } - c => { - slf.unread_char(c); - slf.state = State::Comment; - Ok(ControlToken::Continue) - } - }, - State::CommentLessThanSignBang => match slf.read_char()? { - Some('-') => { - slf.state = State::CommentLessThanSignBangDash; - Ok(ControlToken::Continue) - } - c => { - slf.unread_char(c); - slf.state = State::Comment; - Ok(ControlToken::Continue) - } - }, - State::CommentLessThanSignBangDash => match slf.read_char()? { - Some('-') => { - slf.state = State::CommentLessThanSignBangDashDash; - Ok(ControlToken::Continue) - } - c => { - slf.unread_char(c); - slf.state = State::CommentEndDash; - Ok(ControlToken::Continue) - } - }, - State::CommentLessThanSignBangDashDash => match slf.read_char()? { - c @ Some('>') | c @ None => { - slf.unread_char(c); - slf.state = State::CommentEnd; - Ok(ControlToken::Continue) - } - c => { - slf.emit_error(Error::NestedComment); - slf.unread_char(c); - slf.state = State::CommentEnd; - Ok(ControlToken::Continue) - } - }, - State::CommentEndDash => match slf.read_char()? { - Some('-') => { - slf.state = State::CommentEnd; - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInComment); - slf.emitter.emit_current_comment(slf.some_offset); - Ok(ControlToken::Eof) - } - c => { - slf.emitter.push_comment("-"); - slf.unread_char(c); - slf.state = State::Comment; - Ok(ControlToken::Continue) - } - }, - State::CommentEnd => match slf.read_char()? { - Some('>') => { - slf.state = State::Data; - slf.emitter.emit_current_comment(slf.some_offset); - Ok(ControlToken::Continue) - } - Some('!') => { - slf.state = State::CommentEndBang; - Ok(ControlToken::Continue) - } - Some('-') => { - slf.emitter.push_comment("-"); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInComment); - slf.emitter.emit_current_comment(slf.some_offset); - Ok(ControlToken::Eof) - } - c @ Some(_) => { - slf.emitter.push_comment("-"); - slf.emitter.push_comment("-"); - slf.unread_char(c); - slf.state = State::Comment; - Ok(ControlToken::Continue) - } - }, - State::CommentEndBang => match slf.read_char()? { - Some('-') => { - slf.emitter.push_comment("-"); - slf.emitter.push_comment("-"); - slf.emitter.push_comment("!"); - slf.state = State::CommentEndDash; - Ok(ControlToken::Continue) - } - Some('>') => { - slf.emit_error(Error::IncorrectlyClosedComment); - slf.state = State::Data; - slf.emitter.emit_current_comment(slf.some_offset); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInComment); - slf.emitter.emit_current_comment(slf.some_offset); - Ok(ControlToken::Eof) - } - c @ Some(_) => { - slf.emitter.push_comment("-"); - slf.emitter.push_comment("-"); - slf.emitter.push_comment("!"); - slf.state = State::Comment; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::Doctype => match slf.read_char()? { - Some(whitespace_pat!()) => { - slf.state = State::BeforeDoctypeName; - Ok(ControlToken::Continue) - } - c @ Some('>') => { - slf.unread_char(c); - slf.state = State::BeforeDoctypeName; - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInDoctype); - slf.init_doctype(); - slf.emitter.set_force_quirks(); - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Eof) - } - c @ Some(_) => { - slf.emit_error(Error::MissingWhitespaceBeforeDoctypeName); - slf.unread_char(c); - slf.state = State::BeforeDoctypeName; - Ok(ControlToken::Continue) - } - }, - State::BeforeDoctypeName => match slf.read_char()? { - Some(whitespace_pat!()) => Ok(ControlToken::Continue), - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.init_doctype(); - slf.emitter.init_doctype_name(slf.position_before_match); - slf.emitter.push_doctype_name("\u{fffd}"); - slf.state = State::DoctypeName; - Ok(ControlToken::Continue) - } - Some('>') => { - slf.emit_error(Error::MissingDoctypeName); - slf.init_doctype(); - slf.emitter.set_force_quirks(); - slf.state = State::Data; - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInDoctype); - slf.init_doctype(); - slf.emitter.set_force_quirks(); - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Eof) - } - Some(x) => { - slf.init_doctype(); - slf.emitter.init_doctype_name(slf.position_before_match); - slf.emitter - .push_doctype_name(ctostr!(x.to_ascii_lowercase())); - slf.state = State::DoctypeName; - Ok(ControlToken::Continue) - } - }, - State::DoctypeName => match slf.read_char()? { - Some(whitespace_pat!()) => { - slf.emitter - .terminate_doctype_name(slf.position_before_match); - slf.state = State::AfterDoctypeName; - Ok(ControlToken::Continue) - } - Some('>') => { - slf.emitter - .terminate_doctype_name(slf.position_before_match); - slf.state = State::Data; - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Continue) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.emitter.push_doctype_name("\u{fffd}"); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInDoctype); - slf.emitter - .terminate_doctype_name(slf.position_before_match); - slf.emitter.set_force_quirks(); - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Eof) - } - Some(x) => { - slf.emitter - .push_doctype_name(ctostr!(x.to_ascii_lowercase())); - Ok(ControlToken::Continue) - } - }, - State::AfterDoctypeName => match slf.read_char()? { - Some(whitespace_pat!()) => Ok(ControlToken::Continue), - Some('>') => { - slf.state = State::Data; - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInDoctype); - slf.emitter.set_force_quirks(); - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Eof) - } - Some('p' | 'P') if slf.try_read_string("ublic", false)? => { - slf.state = State::AfterDoctypePublicKeyword; - Ok(ControlToken::Continue) - } - Some('s' | 'S') if slf.try_read_string("ystem", false)? => { - slf.state = State::AfterDoctypeSystemKeyword; - Ok(ControlToken::Continue) - } - c @ Some(_) => { - slf.emit_error(Error::InvalidCharacterSequenceAfterDoctypeName); - slf.emitter.set_force_quirks(); - slf.unread_char(c); - slf.state = State::BogusDoctype; - Ok(ControlToken::Continue) - } - }, - State::AfterDoctypePublicKeyword => match slf.read_char()? { - Some(whitespace_pat!()) => { - slf.state = State::BeforeDoctypePublicIdentifier; - Ok(ControlToken::Continue) - } - Some('"') => { - slf.emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword); - slf.emitter.init_doctype_public_id(slf.reader.position()); - slf.state = State::DoctypePublicIdentifierDoubleQuoted; - Ok(ControlToken::Continue) - } - Some('\'') => { - slf.emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword); - slf.emitter.init_doctype_public_id(slf.reader.position()); - slf.state = State::DoctypePublicIdentifierSingleQuoted; - Ok(ControlToken::Continue) - } - Some('>') => { - slf.emit_error(Error::MissingDoctypePublicIdentifier); - slf.emitter.set_force_quirks(); - slf.state = State::Data; - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInDoctype); - slf.emitter.set_force_quirks(); - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Eof) - } - c @ Some(_) => { - slf.emit_error(Error::MissingQuoteBeforeDoctypePublicIdentifier); - slf.emitter.set_force_quirks(); - slf.unread_char(c); - slf.state = State::BogusDoctype; - Ok(ControlToken::Continue) - } - }, - State::BeforeDoctypePublicIdentifier => match slf.read_char()? { - Some(whitespace_pat!()) => Ok(ControlToken::Continue), - Some('"') => { - slf.emitter.init_doctype_public_id(slf.reader.position()); - slf.state = State::DoctypePublicIdentifierDoubleQuoted; - Ok(ControlToken::Continue) - } - Some('\'') => { - slf.emitter.init_doctype_public_id(slf.reader.position()); - slf.state = State::DoctypePublicIdentifierSingleQuoted; - Ok(ControlToken::Continue) - } - Some('>') => { - slf.emit_error(Error::MissingDoctypePublicIdentifier); - slf.emitter.set_force_quirks(); - slf.state = State::Data; - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInDoctype); - slf.emitter.set_force_quirks(); - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Eof) - } - c @ Some(_) => { - slf.emit_error(Error::MissingQuoteBeforeDoctypePublicIdentifier); - slf.emitter.set_force_quirks(); - slf.unread_char(c); - slf.state = State::BogusDoctype; - Ok(ControlToken::Continue) - } - }, - State::DoctypePublicIdentifierDoubleQuoted => match slf.read_char()? { - Some('"') => { - slf.emitter - .terminate_doctype_public_id(slf.position_before_match); - slf.state = State::AfterDoctypePublicIdentifier; - Ok(ControlToken::Continue) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.emitter.push_doctype_public_id("\u{fffd}"); - Ok(ControlToken::Continue) - } - Some('>') => { - slf.emitter - .terminate_doctype_public_id(slf.position_before_match); - slf.emit_error(Error::AbruptDoctypePublicIdentifier); - slf.emitter.set_force_quirks(); - slf.state = State::Data; - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Continue) - } - None => { - slf.emitter - .terminate_doctype_public_id(slf.reader.position()); - slf.emit_error(Error::EofInDoctype); - slf.emitter.set_force_quirks(); - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Eof) - } - Some(x) => { - slf.emitter.push_doctype_public_id(ctostr!(x)); - Ok(ControlToken::Continue) - } - }, - State::DoctypePublicIdentifierSingleQuoted => match slf.read_char()? { - Some('\'') => { - slf.emitter - .terminate_doctype_public_id(slf.position_before_match); - slf.state = State::AfterDoctypePublicIdentifier; - Ok(ControlToken::Continue) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.emitter.push_doctype_public_id("\u{fffd}"); - Ok(ControlToken::Continue) - } - Some('>') => { - slf.emitter - .terminate_doctype_public_id(slf.position_before_match); - slf.emit_error(Error::AbruptDoctypePublicIdentifier); - slf.emitter.set_force_quirks(); - slf.state = State::Data; - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Continue) - } - None => { - slf.emitter - .terminate_doctype_public_id(slf.reader.position()); - slf.emit_error(Error::EofInDoctype); - slf.emitter.set_force_quirks(); - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Eof) - } - Some(x) => { - slf.emitter.push_doctype_public_id(ctostr!(x)); - Ok(ControlToken::Continue) - } - }, - State::AfterDoctypePublicIdentifier => match slf.read_char()? { - Some(whitespace_pat!()) => { - slf.state = State::BetweenDoctypePublicAndSystemIdentifiers; - Ok(ControlToken::Continue) - } - Some('>') => { - slf.state = State::Data; - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Continue) - } - Some('"') => { - slf.emit_error(Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers); - slf.emitter.init_doctype_system_id(slf.reader.position()); - slf.state = State::DoctypeSystemIdentifierDoubleQuoted; - Ok(ControlToken::Continue) - } - Some('\'') => { - slf.emit_error(Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers); - slf.emitter.init_doctype_system_id(slf.reader.position()); - slf.state = State::DoctypeSystemIdentifierSingleQuoted; - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInDoctype); - slf.emitter.set_force_quirks(); - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Eof) - } - c @ Some(_) => { - slf.emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); - slf.emitter.set_force_quirks(); - slf.unread_char(c); - slf.state = State::BogusDoctype; - Ok(ControlToken::Continue) - } - }, - State::BetweenDoctypePublicAndSystemIdentifiers => match slf.read_char()? { - Some(whitespace_pat!()) => Ok(ControlToken::Continue), - Some('>') => { - slf.state = State::Data; - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Continue) - } - Some('"') => { - slf.emitter.init_doctype_system_id(slf.reader.position()); - slf.state = State::DoctypeSystemIdentifierDoubleQuoted; - Ok(ControlToken::Continue) - } - Some('\'') => { - slf.emitter.init_doctype_system_id(slf.reader.position()); - slf.state = State::DoctypeSystemIdentifierSingleQuoted; - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInDoctype); - slf.emitter.set_force_quirks(); - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Eof) - } - c @ Some(_) => { - slf.emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); - slf.emitter.set_force_quirks(); - slf.state = State::BogusDoctype; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::AfterDoctypeSystemKeyword => match slf.read_char()? { - Some(whitespace_pat!()) => { - slf.state = State::BeforeDoctypeSystemIdentifier; - Ok(ControlToken::Continue) - } - Some('"') => { - slf.emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword); - slf.emitter.init_doctype_system_id(slf.reader.position()); - slf.state = State::DoctypeSystemIdentifierDoubleQuoted; - Ok(ControlToken::Continue) - } - Some('\'') => { - slf.emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword); - slf.emitter.init_doctype_system_id(slf.reader.position()); - slf.state = State::DoctypeSystemIdentifierSingleQuoted; - Ok(ControlToken::Continue) - } - Some('>') => { - slf.emit_error(Error::MissingDoctypeSystemIdentifier); - slf.emitter.set_force_quirks(); - slf.state = State::Data; - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInDoctype); - slf.emitter.set_force_quirks(); - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Eof) - } - c @ Some(_) => { - slf.emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); - slf.emitter.set_force_quirks(); - slf.state = State::BogusDoctype; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::BeforeDoctypeSystemIdentifier => match slf.read_char()? { - Some(whitespace_pat!()) => Ok(ControlToken::Continue), - Some('"') => { - slf.emitter.init_doctype_system_id(slf.reader.position()); - slf.state = State::DoctypeSystemIdentifierDoubleQuoted; - Ok(ControlToken::Continue) - } - Some('\'') => { - slf.emitter.init_doctype_system_id(slf.reader.position()); - slf.state = State::DoctypeSystemIdentifierSingleQuoted; - Ok(ControlToken::Continue) - } - Some('>') => { - slf.emit_error(Error::MissingDoctypeSystemIdentifier); - slf.emitter.set_force_quirks(); - slf.state = State::Data; - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInDoctype); - slf.emitter.set_force_quirks(); - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Eof) - } - c @ Some(_) => { - slf.emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); - slf.emitter.set_force_quirks(); - slf.state = State::BogusDoctype; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::DoctypeSystemIdentifierDoubleQuoted => match slf.read_char()? { - Some('"') => { - slf.emitter - .terminate_doctype_system_id(slf.position_before_match); - slf.state = State::AfterDoctypeSystemIdentifier; - Ok(ControlToken::Continue) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.emitter.push_doctype_system_id("\u{fffd}"); - Ok(ControlToken::Continue) - } - Some('>') => { - slf.emitter - .terminate_doctype_system_id(slf.position_before_match); - slf.emit_error(Error::AbruptDoctypeSystemIdentifier); - slf.emitter.set_force_quirks(); - slf.state = State::Data; - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Continue) - } - None => { - slf.emitter - .terminate_doctype_system_id(slf.reader.position()); - slf.emit_error(Error::EofInDoctype); - slf.emitter.set_force_quirks(); - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Eof) - } - Some(x) => { - slf.emitter.push_doctype_system_id(ctostr!(x)); - Ok(ControlToken::Continue) - } - }, - State::DoctypeSystemIdentifierSingleQuoted => match slf.read_char()? { - Some('\'') => { - slf.emitter - .terminate_doctype_system_id(slf.position_before_match); - slf.state = State::AfterDoctypeSystemIdentifier; - Ok(ControlToken::Continue) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - slf.emitter.push_doctype_system_id("\u{fffd}"); - Ok(ControlToken::Continue) - } - Some('>') => { - slf.emitter - .terminate_doctype_system_id(slf.position_before_match); - slf.emit_error(Error::AbruptDoctypeSystemIdentifier); - slf.emitter.set_force_quirks(); - slf.state = State::Data; - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Continue) - } - None => { - slf.emitter - .terminate_doctype_system_id(slf.reader.position()); - slf.emit_error(Error::EofInDoctype); - slf.emitter.set_force_quirks(); - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Eof) - } - Some(x) => { - slf.emitter.push_doctype_system_id(ctostr!(x)); - Ok(ControlToken::Continue) - } - }, - State::AfterDoctypeSystemIdentifier => match slf.read_char()? { - Some(whitespace_pat!()) => Ok(ControlToken::Continue), - Some('>') => { - slf.state = State::Data; - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInDoctype); - slf.emitter.set_force_quirks(); - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Eof) - } - c @ Some(_) => { - slf.emit_error(Error::UnexpectedCharacterAfterDoctypeSystemIdentifier); - slf.unread_char(c); - slf.state = State::BogusDoctype; - Ok(ControlToken::Continue) - } - }, - State::BogusDoctype => match slf.read_char()? { - Some('>') => { - slf.state = State::Data; - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Continue) - } - Some('\0') => { - slf.emit_error(Error::UnexpectedNullCharacter); - Ok(ControlToken::Continue) - } - None => { - slf.emitter.emit_current_doctype(slf.reader.position()); - Ok(ControlToken::Eof) - } - Some(_) => Ok(ControlToken::Continue), - }, - State::CdataSection => match slf.read_char()? { - Some(']') => { - slf.state = State::CdataSectionBracket; - Ok(ControlToken::Continue) - } - None => { - slf.emit_error(Error::EofInCdata); - Ok(ControlToken::Eof) - } - Some(x) => { - slf.emitter.emit_string(ctostr!(x)); - Ok(ControlToken::Continue) - } - }, - State::CdataSectionBracket => match slf.read_char()? { - Some(']') => { - slf.state = State::CdataSectionEnd; - Ok(ControlToken::Continue) - } - c => { - slf.emitter.emit_string("]"); - slf.state = State::CdataSection; - slf.unread_char(c); - Ok(ControlToken::Continue) - } - }, - State::CdataSectionEnd => match slf.read_char()? { - Some(']') => { - slf.emitter.emit_string("]"); - Ok(ControlToken::Continue) - } - Some('>') => { - slf.state = State::Data; - Ok(ControlToken::Continue) - } - c => { - slf.emitter.emit_string("]]"); - slf.unread_char(c); - slf.state = State::CdataSection; - Ok(ControlToken::Continue) - } - }, - State::CharacterReference => { - // TODO: we can avoid these Reader method calls by changing CharacterReference to be a function instead of a state - slf.some_offset = - slf.reader.position() - slf.reader.len_of_char_in_current_encoding('&'); - slf.temporary_buffer.clear(); - slf.temporary_buffer.push('&'); - match slf.read_char()? { - Some(x) if x.is_ascii_alphanumeric() => { - slf.unread_char(Some(x)); - slf.state = State::NamedCharacterReference; - Ok(ControlToken::Continue) - } - Some('#') => { - slf.temporary_buffer.push('#'); - slf.state = State::NumericCharacterReference; - Ok(ControlToken::Continue) - } - c => { - slf.flush_code_points_consumed_as_character_reference(); - slf.state = slf.return_state.take().unwrap(); - slf.unread_char(c); - Ok(ControlToken::Continue) - } - } - } - State::NamedCharacterReference => { - let c = slf.read_char()?; - - let char_ref = match c { - Some(x) => try_read_character_reference(x, |x| slf.try_read_string(x, true))? - .map(|char_ref| (x, char_ref)), - - None => None, - }; - - if let Some((x, char_ref)) = char_ref { - slf.temporary_buffer.push(x); - slf.temporary_buffer.push_str(char_ref.name); - let char_ref_name_last_character = char_ref.name.chars().last(); - - let next_character = slf.read_char()?; - slf.unread_char(next_character); - - if slf.is_consumed_as_part_of_an_attribute() - && char_ref_name_last_character != Some(';') - && matches!(next_character, Some(x) if x == '=' || x.is_ascii_alphanumeric()) - { - slf.flush_code_points_consumed_as_character_reference(); - slf.state = slf.return_state.take().unwrap(); - Ok(ControlToken::Continue) - } else { - if char_ref_name_last_character != Some(';') { - slf.emit_error(Error::MissingSemicolonAfterCharacterReference); - } - - slf.temporary_buffer.clear(); - slf.temporary_buffer.push_str(char_ref.characters); - slf.flush_code_points_consumed_as_character_reference(); - slf.state = slf.return_state.take().unwrap(); - Ok(ControlToken::Continue) - } - } else { - slf.unread_char(c); - slf.flush_code_points_consumed_as_character_reference(); - slf.state = State::AmbiguousAmpersand; - Ok(ControlToken::Continue) - } - } - State::AmbiguousAmpersand => match slf.read_char()? { - Some(x) if x.is_ascii_alphanumeric() => { - if slf.is_consumed_as_part_of_an_attribute() { - slf.emitter.push_attribute_value(ctostr!(x)); - } else { - slf.emitter.emit_string(ctostr!(x)); - } - - Ok(ControlToken::Continue) - } - c @ Some(';') => { - slf.emit_error(Error::UnknownNamedCharacterReference); - slf.unread_char(c); - slf.state = slf.return_state.take().unwrap(); - Ok(ControlToken::Continue) - } - c => { - slf.unread_char(c); - slf.state = slf.return_state.take().unwrap(); - Ok(ControlToken::Continue) - } - }, - State::NumericCharacterReference => { - slf.character_reference_code = 0; - match slf.read_char()? { - Some(x @ 'x' | x @ 'X') => { - slf.temporary_buffer.push(x); - slf.state = State::HexadecimalCharacterReferenceStart; - Ok(ControlToken::Continue) - } - c => { - slf.unread_char(c); - slf.state = State::DecimalCharacterReferenceStart; - Ok(ControlToken::Continue) - } - } - } - State::HexadecimalCharacterReferenceStart => match slf.read_char()? { - c @ Some('0'..='9' | 'A'..='F' | 'a'..='f') => { - slf.unread_char(c); - slf.state = State::HexadecimalCharacterReference; - Ok(ControlToken::Continue) - } - c => { - slf.emit_error(Error::AbsenceOfDigitsInNumericCharacterReference); - slf.flush_code_points_consumed_as_character_reference(); - slf.unread_char(c); - slf.state = slf.return_state.take().unwrap(); - Ok(ControlToken::Continue) - } - }, - State::DecimalCharacterReferenceStart => match slf.read_char()? { - Some(x @ ascii_digit_pat!()) => { - slf.unread_char(Some(x)); - slf.state = State::DecimalCharacterReference; - Ok(ControlToken::Continue) - } - c => { - slf.emit_error(Error::AbsenceOfDigitsInNumericCharacterReference); - slf.flush_code_points_consumed_as_character_reference(); - slf.unread_char(c); - slf.state = slf.return_state.take().unwrap(); - Ok(ControlToken::Continue) - } - }, - State::HexadecimalCharacterReference => match slf.read_char()? { - Some(x @ ascii_digit_pat!()) => { - mutate_character_reference!(*16 + x - 0x0030); - Ok(ControlToken::Continue) - } - Some(x @ 'A'..='F') => { - mutate_character_reference!(*16 + x - 0x0037); - Ok(ControlToken::Continue) - } - Some(x @ 'a'..='f') => { - mutate_character_reference!(*16 + x - 0x0057); - Ok(ControlToken::Continue) - } - Some(';') => { - slf.state = State::NumericCharacterReferenceEnd; - Ok(ControlToken::Continue) - } - c => { - slf.emit_error(Error::MissingSemicolonAfterCharacterReference); - slf.unread_char(c); - slf.state = State::NumericCharacterReferenceEnd; - Ok(ControlToken::Continue) - } - }, - State::DecimalCharacterReference => match slf.read_char()? { - Some(x @ ascii_digit_pat!()) => { - mutate_character_reference!(*10 + x - 0x0030); - Ok(ControlToken::Continue) - } - Some(';') => { - slf.state = State::NumericCharacterReferenceEnd; - Ok(ControlToken::Continue) - } - c => { - slf.emit_error(Error::MissingSemicolonAfterCharacterReference); - slf.unread_char(c); - slf.state = State::NumericCharacterReferenceEnd; - Ok(ControlToken::Continue) - } - }, - State::NumericCharacterReferenceEnd => { - match slf.character_reference_code { - 0x00 => { - slf.emit_error(Error::NullCharacterReference); - slf.character_reference_code = 0xfffd; - } - 0x110000.. => { - slf.emit_error(Error::CharacterReferenceOutsideUnicodeRange); - slf.character_reference_code = 0xfffd; - } - surrogate_pat!() => { - slf.emit_error(Error::SurrogateCharacterReference); - slf.character_reference_code = 0xfffd; - } - // noncharacter - noncharacter_pat!() => { - slf.emit_error(Error::NoncharacterCharacterReference); - } - // 0x000d, or a control that is not whitespace - x @ 0x000d | x @ control_pat!() - if !matches!(x, 0x0009 | 0x000a | 0x000c | 0x0020) => - { - slf.emit_error(Error::ControlCharacterReference); - slf.character_reference_code = match x { - 0x80 => 0x20AC, // EURO SIGN (€) - 0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK (‚) - 0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK (ƒ) - 0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK („) - 0x85 => 0x2026, // HORIZONTAL ELLIPSIS (…) - 0x86 => 0x2020, // DAGGER (†) - 0x87 => 0x2021, // DOUBLE DAGGER (‡) - 0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ) - 0x89 => 0x2030, // PER MILLE SIGN (‰) - 0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON (Š) - 0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹) - 0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE (Œ) - 0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON (Ž) - 0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK (‘) - 0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK (’) - 0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK (“) - 0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK (”) - 0x95 => 0x2022, // BULLET (•) - 0x96 => 0x2013, // EN DASH (–) - 0x97 => 0x2014, // EM DASH (—) - 0x98 => 0x02DC, // SMALL TILDE (˜) - 0x99 => 0x2122, // TRADE MARK SIGN (™) - 0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON (š) - 0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›) - 0x9C => 0x0153, // LATIN SMALL LIGATURE OE (œ) - 0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON (ž) - 0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ) - _ => slf.character_reference_code, - }; - } - _ => (), - } - - slf.temporary_buffer.clear(); - slf.temporary_buffer - .push(std::char::from_u32(slf.character_reference_code).unwrap()); - slf.flush_code_points_consumed_as_character_reference(); - slf.state = slf.return_state.take().unwrap(); - Ok(ControlToken::Continue) - } - } -} - -impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { - #[inline] - fn init_doctype(&mut self) { - self.emitter.init_doctype(self.some_offset); - } -} - -#[inline] -pub fn handle_cdata_open<O, R, E>(slf: &mut Tokenizer<R, O, E>, action: CdataAction) -where - O: Offset, - R: Reader + Position<O>, - E: Emitter<O>, -{ - match action { - CdataAction::Cdata => slf.state = State::CdataSection, - CdataAction::BogusComment => { - slf.emit_error(Error::CdataInHtmlContent); - - slf.emitter.init_comment(slf.reader.position()); - slf.emitter.push_comment("[CDATA["); - slf.state = State::BogusComment; - } - } -} |