diff options
| author | Martin Fischer <martin@push-f.com> | 2023-09-09 20:38:51 +0200 | 
|---|---|---|
| committer | Martin Fischer <martin@push-f.com> | 2023-09-28 10:36:08 +0200 | 
| commit | a4846482707412f3cbacabeb082f25762a2636d5 (patch) | |
| tree | bbde44d58c0e1f63110e008ac3001acc11de84cf /src/tokenizer | |
| parent | 595985f5e17fafedc1cf6a72691ea7eb7dc20174 (diff) | |
refactor: move machine module under tokenizer
Diffstat (limited to 'src/tokenizer')
| -rw-r--r-- | src/tokenizer/machine.rs | 1987 | 
1 file changed, 1987 insertions, 0 deletions
| diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs new file mode 100644 index 0000000..fd4b36b --- /dev/null +++ b/src/tokenizer/machine.rs @@ -0,0 +1,1987 @@ +use crate::entities::try_read_character_reference; +use crate::offset::{Offset, Position}; +use crate::token::AttrValueSyntax; +use crate::tokenizer::CdataAction; +use crate::utils::{ +    ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat, State, +}; +use crate::{reader::Reader, Emitter, Error, Tokenizer}; + +pub enum ControlToken { +    Eof, +    Continue, +    CdataOpen, +} + +#[inline] +pub fn consume<O, R, E>(slf: &mut Tokenizer<R, O, E>) -> Result<ControlToken, R::Error> +where +    O: Offset, +    R: Reader + Position<O>, +    E: Emitter<O>, +{ +    macro_rules! mutate_character_reference { +        (* $mul:literal + $x:ident - $sub:literal) => { +            match slf +                .character_reference_code +                .checked_mul($mul) +                .and_then(|cr| cr.checked_add($x as u32 - $sub)) +            { +                Some(cr) => slf.character_reference_code = cr, +                None => { +                    // provoke err +                    slf.character_reference_code = 0x110000; +                } +            }; +        }; +    } + +    slf.position_before_match = slf.reader.position(); + +    match slf.state { +        State::Data => match slf.read_char()? 
{ +            Some('&') => { +                slf.return_state = Some(slf.state); +                slf.state = State::CharacterReference; +                Ok(ControlToken::Continue) +            } +            Some('<') => { +                slf.some_offset = slf.position_before_match; +                slf.state = State::TagOpen; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.emit_string("\0"); +                Ok(ControlToken::Continue) +            } +            Some(x) => { +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +            None => Ok(ControlToken::Eof), +        }, +        State::RcData => match slf.read_char()? { +            Some('&') => { +                slf.return_state = Some(State::RcData); +                slf.state = State::CharacterReference; +                Ok(ControlToken::Continue) +            } +            Some('<') => { +                slf.state = State::RcDataLessThanSign; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.emit_string("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some(x) => { +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +            None => Ok(ControlToken::Eof), +        }, +        State::RawText => match slf.read_char()? 
{ +            Some('<') => { +                slf.state = State::RawTextLessThanSign; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.emit_string("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some(x) => { +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +            None => Ok(ControlToken::Eof), +        }, +        State::ScriptData => match slf.read_char()? { +            Some('<') => { +                slf.state = State::ScriptDataLessThanSign; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.emit_string("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some(x) => { +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +            None => Ok(ControlToken::Eof), +        }, +        State::PlainText => match slf.read_char()? { +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.emit_string("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some(x) => { +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +            None => Ok(ControlToken::Eof), +        }, +        State::TagOpen => match slf.read_char()? 
{ +            Some('!') => { +                slf.state = State::MarkupDeclarationOpen; +                Ok(ControlToken::Continue) +            } +            Some('/') => { +                slf.state = State::EndTagOpen; +                Ok(ControlToken::Continue) +            } +            Some(x) if x.is_ascii_alphabetic() => { +                slf.init_start_tag(); +                slf.state = State::TagName; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +            c @ Some('?') => { +                slf.emit_error(Error::UnexpectedQuestionMarkInsteadOfTagName); +                slf.emitter.init_comment(slf.reader.position()); +                slf.state = State::BogusComment; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofBeforeTagName); +                slf.emitter.emit_string("<"); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emit_error(Error::InvalidFirstCharacterOfTagName); +                slf.state = State::Data; +                slf.emitter.emit_string("<"); +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::EndTagOpen => match slf.read_char()? 
{ +            Some(x) if x.is_ascii_alphabetic() => { +                slf.init_end_tag(); +                slf.state = State::TagName; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emit_error(Error::MissingEndTagName); +                slf.state = State::Data; +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofBeforeTagName); +                slf.emitter.emit_string("</"); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emit_error(Error::InvalidFirstCharacterOfTagName); +                slf.emitter.init_comment(slf.reader.position()); +                slf.state = State::BogusComment; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::TagName => match slf.read_char()? { +            Some(whitespace_pat!()) => { +                slf.emitter.terminate_tag_name(slf.position_before_match); +                slf.state = State::BeforeAttributeName; +                Ok(ControlToken::Continue) +            } +            Some('/') => { +                slf.emitter.terminate_tag_name(slf.position_before_match); +                slf.state = State::SelfClosingStartTag; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emitter.terminate_tag_name(slf.position_before_match); +                slf.state = State::Data; +                slf.emit_current_tag(); +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.push_tag_name("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some(x) => { +                slf.push_tag_name(ctostr!(x.to_ascii_lowercase())); +             
   Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInTag); +                Ok(ControlToken::Eof) +            } +        }, +        State::RcDataLessThanSign => match slf.read_char()? { +            Some('/') => { +                slf.temporary_buffer.clear(); +                slf.state = State::RcDataEndTagOpen; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("<"); +                slf.state = State::RcData; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::RcDataEndTagOpen => match slf.read_char()? { +            Some(x) if x.is_ascii_alphabetic() => { +                slf.init_end_tag(); +                slf.state = State::RcDataEndTagName; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("</"); +                slf.state = State::RcData; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::RcDataEndTagName => match slf.read_char()? 
{ +            Some(whitespace_pat!()) if slf.current_end_tag_is_appropriate() => { +                slf.state = State::BeforeAttributeName; +                Ok(ControlToken::Continue) +            } +            Some('/') if slf.current_end_tag_is_appropriate() => { +                slf.state = State::SelfClosingStartTag; +                Ok(ControlToken::Continue) +            } +            Some('>') if slf.current_end_tag_is_appropriate() => { +                slf.state = State::Data; +                slf.emit_current_tag(); +                Ok(ControlToken::Continue) +            } +            Some(x) if x.is_ascii_alphabetic() => { +                slf.push_tag_name(ctostr!(x.to_ascii_lowercase())); +                slf.temporary_buffer.push(x); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("</"); +                slf.flush_buffer_characters(); + +                slf.state = State::RcData; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::RawTextLessThanSign => match slf.read_char()? { +            Some('/') => { +                slf.temporary_buffer.clear(); +                slf.state = State::RawTextEndTagOpen; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("<"); +                slf.state = State::RawText; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::RawTextEndTagOpen => match slf.read_char()? 
{ +            Some(x) if x.is_ascii_alphabetic() => { +                slf.init_end_tag(); +                slf.state = State::RawTextEndTagName; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("</"); +                slf.state = State::RawText; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::RawTextEndTagName => match slf.read_char()? { +            Some(whitespace_pat!()) if slf.current_end_tag_is_appropriate() => { +                slf.state = State::BeforeAttributeName; +                Ok(ControlToken::Continue) +            } +            Some('/') if slf.current_end_tag_is_appropriate() => { +                slf.state = State::SelfClosingStartTag; +                Ok(ControlToken::Continue) +            } +            Some('>') if slf.current_end_tag_is_appropriate() => { +                slf.state = State::Data; +                slf.emit_current_tag(); +                Ok(ControlToken::Continue) +            } +            Some(x) if x.is_ascii_alphabetic() => { +                slf.push_tag_name(ctostr!(x.to_ascii_lowercase())); +                slf.temporary_buffer.push(x); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("</"); +                slf.flush_buffer_characters(); + +                slf.state = State::RawText; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataLessThanSign => match slf.read_char()? 
{ +            Some('/') => { +                slf.temporary_buffer.clear(); +                slf.state = State::ScriptDataEndTagOpen; +                Ok(ControlToken::Continue) +            } +            Some('!') => { +                slf.state = State::ScriptDataEscapeStart; +                slf.emitter.emit_string("<!"); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("<"); +                slf.state = State::ScriptData; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataEndTagOpen => match slf.read_char()? { +            Some(x) if x.is_ascii_alphabetic() => { +                slf.init_end_tag(); +                slf.state = State::ScriptDataEndTagName; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("</"); +                slf.state = State::ScriptData; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataEndTagName => match slf.read_char()? 
{ +            Some(whitespace_pat!()) if slf.current_end_tag_is_appropriate() => { +                slf.state = State::BeforeAttributeName; +                Ok(ControlToken::Continue) +            } +            Some('/') if slf.current_end_tag_is_appropriate() => { +                slf.state = State::SelfClosingStartTag; +                Ok(ControlToken::Continue) +            } +            Some('>') if slf.current_end_tag_is_appropriate() => { +                slf.state = State::Data; +                slf.emit_current_tag(); +                Ok(ControlToken::Continue) +            } +            Some(x) if x.is_ascii_alphabetic() => { +                slf.push_tag_name(ctostr!(x.to_ascii_lowercase())); +                slf.temporary_buffer.push(x.to_ascii_lowercase()); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("</"); +                slf.flush_buffer_characters(); +                slf.state = State::Data; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataEscapeStart => match slf.read_char()? { +            Some('-') => { +                slf.state = State::ScriptDataEscapeStartDash; +                slf.emitter.emit_string("-"); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.state = State::ScriptData; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataEscapeStartDash => match slf.read_char()? 
{ +            Some('-') => { +                slf.state = State::ScriptDataEscapedDashDash; +                slf.emitter.emit_string("-"); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.state = State::ScriptData; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataEscaped => match slf.read_char()? { +            Some('-') => { +                slf.state = State::ScriptDataEscapedDash; +                slf.emitter.emit_string("-"); +                Ok(ControlToken::Continue) +            } +            Some('<') => { +                slf.state = State::ScriptDataEscapedLessThanSign; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.emit_string("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInScriptHtmlCommentLikeText); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataEscapedDash => match slf.read_char()? 
{ +            Some('-') => { +                slf.state = State::ScriptDataEscapedDashDash; +                slf.emitter.emit_string("-"); +                Ok(ControlToken::Continue) +            } +            Some('<') => { +                slf.state = State::ScriptDataEscapedLessThanSign; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.state = State::ScriptDataEscaped; +                slf.emitter.emit_string("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInScriptHtmlCommentLikeText); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.state = State::ScriptDataEscaped; +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataEscapedDashDash => match slf.read_char()? 
{ +            Some('-') => { +                slf.emitter.emit_string("-"); +                Ok(ControlToken::Continue) +            } +            Some('<') => { +                slf.state = State::ScriptDataEscapedLessThanSign; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.state = State::ScriptData; +                slf.emitter.emit_string(">"); +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.state = State::ScriptDataEscaped; +                slf.emitter.emit_string("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInScriptHtmlCommentLikeText); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.state = State::ScriptDataEscaped; +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataEscapedLessThanSign => match slf.read_char()? { +            Some('/') => { +                slf.temporary_buffer.clear(); +                slf.state = State::ScriptDataEscapedEndTagOpen; +                Ok(ControlToken::Continue) +            } +            Some(x) if x.is_ascii_alphabetic() => { +                slf.temporary_buffer.clear(); +                slf.emitter.emit_string("<"); +                slf.state = State::ScriptDataDoubleEscapeStart; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("<"); +                slf.state = State::ScriptDataEscaped; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataEscapedEndTagOpen => match slf.read_char()? 
{ +            Some(x) if x.is_ascii_alphabetic() => { +                slf.init_end_tag(); +                slf.state = State::ScriptDataEscapedEndTagName; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("</"); +                slf.unread_char(c); +                slf.state = State::ScriptDataEscaped; +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataEscapedEndTagName => match slf.read_char()? { +            Some(whitespace_pat!()) if slf.current_end_tag_is_appropriate() => { +                slf.state = State::BeforeAttributeName; +                Ok(ControlToken::Continue) +            } +            Some('/') if slf.current_end_tag_is_appropriate() => { +                slf.state = State::SelfClosingStartTag; +                Ok(ControlToken::Continue) +            } +            Some('>') if slf.current_end_tag_is_appropriate() => { +                slf.state = State::Data; +                slf.emit_current_tag(); +                Ok(ControlToken::Continue) +            } +            Some(x) if x.is_ascii_alphabetic() => { +                slf.push_tag_name(ctostr!(x.to_ascii_lowercase())); +                slf.temporary_buffer.push(x); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("</"); +                slf.flush_buffer_characters(); +                slf.state = State::ScriptDataEscaped; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataDoubleEscapeStart => match slf.read_char()? 
{ +            Some(x @ whitespace_pat!() | x @ '/' | x @ '>') => { +                if slf.temporary_buffer == "script" { +                    slf.state = State::ScriptDataDoubleEscaped; +                } else { +                    slf.state = State::ScriptDataEscaped; +                } +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +            Some(x) if x.is_ascii_alphabetic() => { +                slf.temporary_buffer.push(x.to_ascii_lowercase()); +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.state = State::ScriptDataEscaped; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataDoubleEscaped => match slf.read_char()? { +            Some('-') => { +                slf.state = State::ScriptDataDoubleEscapedDash; +                slf.emitter.emit_string("-"); +                Ok(ControlToken::Continue) +            } +            Some('<') => { +                slf.state = State::ScriptDataDoubleEscapedLessThanSign; +                slf.emitter.emit_string("<"); +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.emit_string("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInScriptHtmlCommentLikeText); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataDoubleEscapedDash => match slf.read_char()? 
{ +            Some('-') => { +                slf.state = State::ScriptDataDoubleEscapedDashDash; +                slf.emitter.emit_string("-"); +                Ok(ControlToken::Continue) +            } +            Some('<') => { +                slf.state = State::ScriptDataDoubleEscapedLessThanSign; +                slf.emitter.emit_string("<"); +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.state = State::ScriptDataDoubleEscaped; +                slf.emitter.emit_string("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInScriptHtmlCommentLikeText); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.state = State::ScriptDataDoubleEscaped; +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataDoubleEscapedDashDash => match slf.read_char()? 
{ +            Some('-') => { +                slf.emitter.emit_string("-"); +                Ok(ControlToken::Continue) +            } +            Some('<') => { +                slf.emitter.emit_string("<"); +                slf.state = State::ScriptDataDoubleEscapedLessThanSign; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emitter.emit_string(">"); +                slf.state = State::ScriptData; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.state = State::ScriptDataDoubleEscaped; +                slf.emitter.emit_string("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInScriptHtmlCommentLikeText); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.state = State::ScriptDataDoubleEscaped; +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataDoubleEscapedLessThanSign => match slf.read_char()? { +            Some('/') => { +                slf.temporary_buffer.clear(); +                slf.state = State::ScriptDataDoubleEscapeEnd; +                slf.emitter.emit_string("/"); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.state = State::ScriptDataDoubleEscaped; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataDoubleEscapeEnd => match slf.read_char()? 
{ +            Some(x @ whitespace_pat!() | x @ '/' | x @ '>') => { +                if slf.temporary_buffer == "script" { +                    slf.state = State::ScriptDataEscaped; +                } else { +                    slf.state = State::ScriptDataDoubleEscaped; +                } + +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +            Some(x) if x.is_ascii_alphabetic() => { +                slf.temporary_buffer.push(x.to_ascii_lowercase()); +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.state = State::ScriptDataDoubleEscaped; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::BeforeAttributeName => match slf.read_char()? { +            Some(whitespace_pat!()) => Ok(ControlToken::Continue), +            c @ Some('/' | '>') | c @ None => { +                slf.state = State::AfterAttributeName; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +            Some('=') => { +                slf.emit_error(Error::UnexpectedEqualsSignBeforeAttributeName); +                slf.emitter.init_attribute_name(slf.reader.position()); +                slf.emitter.push_attribute_name("="); +                slf.state = State::AttributeName; +                Ok(ControlToken::Continue) +            } +            Some(x) => { +                slf.emitter.init_attribute_name(slf.position_before_match); +                slf.state = State::AttributeName; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::AttributeName => match slf.read_char()? 
{ +            c @ Some(whitespace_pat!() | '/' | '>') | c @ None => { +                slf.emitter +                    .terminate_attribute_name(slf.position_before_match); +                slf.state = State::AfterAttributeName; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +            Some('=') => { +                slf.emitter +                    .terminate_attribute_name(slf.position_before_match); +                slf.state = State::BeforeAttributeValue; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_attribute_name("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some(x @ '"' | x @ '\'' | x @ '<') => { +                slf.emit_error(Error::UnexpectedCharacterInAttributeName); +                slf.emitter +                    .push_attribute_name(ctostr!(x.to_ascii_lowercase())); +                Ok(ControlToken::Continue) +            } +            Some(x) => { +                slf.emitter +                    .push_attribute_name(ctostr!(x.to_ascii_lowercase())); +                Ok(ControlToken::Continue) +            } +        }, +        State::AfterAttributeName => match slf.read_char()? 
{ +            Some(whitespace_pat!()) => Ok(ControlToken::Continue), +            Some('/') => { +                slf.state = State::SelfClosingStartTag; +                Ok(ControlToken::Continue) +            } +            Some('=') => { +                slf.state = State::BeforeAttributeValue; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.state = State::Data; +                slf.emit_current_tag(); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInTag); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.init_attribute_name(slf.position_before_match); +                slf.state = State::AttributeName; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::BeforeAttributeValue => match slf.read_char()? { +            Some(whitespace_pat!()) => Ok(ControlToken::Continue), +            Some('"') => { +                slf.emitter +                    .init_attribute_value(AttrValueSyntax::DoubleQuoted, slf.reader.position()); +                slf.state = State::AttributeValueDoubleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('\'') => { +                slf.emitter +                    .init_attribute_value(AttrValueSyntax::SingleQuoted, slf.reader.position()); +                slf.state = State::AttributeValueSingleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emit_error(Error::MissingAttributeValue); +                slf.state = State::Data; +                slf.emit_current_tag(); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter +                    .init_attribute_value(AttrValueSyntax::Unquoted, slf.position_before_match); +     
           slf.state = State::AttributeValueUnquoted; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::AttributeValueDoubleQuoted => match slf.read_char()? { +            Some('"') => { +                slf.emitter.terminate_attribute_value( +                    // We cannot simply pass slf.position_before_match because +                    // State::NamedCharacterReference calls Tokenizer::unread_char +                    // which Reader::position doesn't account for. +                    // TODO: pass slf.position_before_match once CharacterReference has been converted to a function call +                    slf.reader.position() - slf.reader.len_of_char_in_current_encoding('"'), +                ); +                slf.state = State::AfterAttributeValueQuoted; +                Ok(ControlToken::Continue) +            } +            Some('&') => { +                slf.return_state = Some(State::AttributeValueDoubleQuoted); +                slf.state = State::CharacterReference; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_attribute_value("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInTag); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.push_attribute_value(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::AttributeValueSingleQuoted => match slf.read_char()? 
{ +            Some('\'') => { +                slf.emitter.terminate_attribute_value( +                    // We cannot simply pass slf.position_before_match because +                    // State::NamedCharacterReference calls Tokenizer::unread_char +                    // which Reader::position doesn't account for. +                    // TODO: pass slf.position_before_match once CharacterReference has been converted to a function call +                    slf.reader.position() - slf.reader.len_of_char_in_current_encoding('\''), +                ); +                slf.state = State::AfterAttributeValueQuoted; +                Ok(ControlToken::Continue) +            } +            Some('&') => { +                slf.return_state = Some(State::AttributeValueSingleQuoted); +                slf.state = State::CharacterReference; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_attribute_value("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInTag); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.push_attribute_value(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::AttributeValueUnquoted => match slf.read_char()? { +            Some(whitespace_pat!()) => { +                slf.emitter.terminate_attribute_value( +                    // We cannot simply pass slf.position_before_match because +                    // State::NamedCharacterReference calls Tokenizer::unread_char +                    // which Reader::position doesn't account for. 
+                    // TODO: pass slf.position_before_match once CharacterReference has been converted to a function call +                    slf.reader.position() - slf.reader.len_of_char_in_current_encoding(' '), +                ); +                slf.state = State::BeforeAttributeName; +                Ok(ControlToken::Continue) +            } +            Some('&') => { +                slf.return_state = Some(State::AttributeValueUnquoted); +                slf.state = State::CharacterReference; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.state = State::Data; +                slf.emit_current_tag(); +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_attribute_value("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some(x @ '"' | x @ '\'' | x @ '<' | x @ '=' | x @ '\u{60}') => { +                slf.emit_error(Error::UnexpectedCharacterInUnquotedAttributeValue); +                slf.emitter.push_attribute_value(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInTag); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.push_attribute_value(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::AfterAttributeValueQuoted => match slf.read_char()? 
{ +            Some(whitespace_pat!()) => { +                slf.state = State::BeforeAttributeName; +                Ok(ControlToken::Continue) +            } +            Some('/') => { +                slf.state = State::SelfClosingStartTag; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.state = State::Data; +                slf.emit_current_tag(); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInTag); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emit_error(Error::MissingWhitespaceBetweenAttributes); +                slf.state = State::BeforeAttributeName; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::SelfClosingStartTag => match slf.read_char()? { +            Some('>') => { +                slf.emitter.set_self_closing( +                    slf.position_before_match - slf.reader.len_of_char_in_current_encoding('/') +                        ..slf.position_before_match, +                ); +                slf.state = State::Data; +                slf.emit_current_tag(); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInTag); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emit_error(Error::UnexpectedSolidusInTag); +                slf.state = State::BeforeAttributeName; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::BogusComment => match slf.read_char()? 
{ +            Some('>') => { +                slf.state = State::Data; +                slf.emitter.emit_current_comment(slf.position_before_match); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_current_comment(slf.position_before_match); +                Ok(ControlToken::Eof) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_comment("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some(x) => { +                slf.emitter.push_comment(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::MarkupDeclarationOpen => match slf.read_char()? { +            Some('-') if slf.try_read_string("-", true)? => { +                slf.emitter.init_comment(slf.reader.position()); +                slf.state = State::CommentStart; +                Ok(ControlToken::Continue) +            } +            Some('d' | 'D') if slf.try_read_string("octype", false)? => { +                slf.state = State::Doctype; +                Ok(ControlToken::Continue) +            } +            Some('[') if slf.try_read_string("CDATA[", true)? => Ok(ControlToken::CdataOpen), +            c => { +                slf.emit_error(Error::IncorrectlyOpenedComment); +                slf.emitter.init_comment(slf.position_before_match); +                slf.state = State::BogusComment; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::CommentStart => match slf.read_char()? 
{ +            Some('-') => { +                slf.state = State::CommentStartDash; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emit_error(Error::AbruptClosingOfEmptyComment); +                slf.state = State::Data; +                slf.emitter.emit_current_comment(slf.position_before_match); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.unread_char(c); +                slf.state = State::Comment; +                Ok(ControlToken::Continue) +            } +        }, +        State::CommentStartDash => match slf.read_char()? { +            Some('-') => { +                slf.state = State::CommentEnd; +                Ok(ControlToken::Continue) +            } +            Some(c @ '>') => { +                slf.emit_error(Error::AbruptClosingOfEmptyComment); +                slf.state = State::Data; +                slf.emitter.emit_current_comment( +                    slf.position_before_match - slf.reader.len_of_char_in_current_encoding(c), +                ); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInComment); +                slf.emitter.emit_current_comment( +                    slf.position_before_match - slf.reader.len_of_char_in_current_encoding('-'), +                ); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emitter.push_comment("-"); +                slf.unread_char(c); +                slf.state = State::Comment; +                Ok(ControlToken::Continue) +            } +        }, +        State::Comment => match slf.read_char()? 
{ +            Some('<') => { +                slf.emitter.push_comment("<"); +                slf.state = State::CommentLessThanSign; +                Ok(ControlToken::Continue) +            } +            Some('-') => { +                slf.some_offset = slf.position_before_match; +                slf.state = State::CommentEndDash; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_comment("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInComment); +                slf.emitter.emit_current_comment(slf.reader.position()); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.push_comment(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::CommentLessThanSign => match slf.read_char()? { +            Some('!') => { +                slf.emitter.push_comment("!"); +                slf.state = State::CommentLessThanSignBang; +                Ok(ControlToken::Continue) +            } +            Some('<') => { +                slf.emitter.push_comment("<"); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.unread_char(c); +                slf.state = State::Comment; +                Ok(ControlToken::Continue) +            } +        }, +        State::CommentLessThanSignBang => match slf.read_char()? { +            Some('-') => { +                slf.state = State::CommentLessThanSignBangDash; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.unread_char(c); +                slf.state = State::Comment; +                Ok(ControlToken::Continue) +            } +        }, +        State::CommentLessThanSignBangDash => match slf.read_char()? 
{ +            Some('-') => { +                slf.state = State::CommentLessThanSignBangDashDash; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.unread_char(c); +                slf.state = State::CommentEndDash; +                Ok(ControlToken::Continue) +            } +        }, +        State::CommentLessThanSignBangDashDash => match slf.read_char()? { +            c @ Some('>') | c @ None => { +                slf.unread_char(c); +                slf.state = State::CommentEnd; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emit_error(Error::NestedComment); +                slf.unread_char(c); +                slf.state = State::CommentEnd; +                Ok(ControlToken::Continue) +            } +        }, +        State::CommentEndDash => match slf.read_char()? { +            Some('-') => { +                slf.state = State::CommentEnd; +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInComment); +                slf.emitter.emit_current_comment(slf.some_offset); +                Ok(ControlToken::Eof) +            } +            c => { +                slf.emitter.push_comment("-"); +                slf.unread_char(c); +                slf.state = State::Comment; +                Ok(ControlToken::Continue) +            } +        }, +        State::CommentEnd => match slf.read_char()? 
{ +            Some('>') => { +                slf.state = State::Data; +                slf.emitter.emit_current_comment(slf.some_offset); +                Ok(ControlToken::Continue) +            } +            Some('!') => { +                slf.state = State::CommentEndBang; +                Ok(ControlToken::Continue) +            } +            Some('-') => { +                slf.emitter.push_comment("-"); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInComment); +                slf.emitter.emit_current_comment(slf.some_offset); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emitter.push_comment("-"); +                slf.emitter.push_comment("-"); +                slf.unread_char(c); +                slf.state = State::Comment; +                Ok(ControlToken::Continue) +            } +        }, +        State::CommentEndBang => match slf.read_char()? 
{ +            Some('-') => { +                slf.emitter.push_comment("-"); +                slf.emitter.push_comment("-"); +                slf.emitter.push_comment("!"); +                slf.state = State::CommentEndDash; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emit_error(Error::IncorrectlyClosedComment); +                slf.state = State::Data; +                slf.emitter.emit_current_comment(slf.some_offset); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInComment); +                slf.emitter.emit_current_comment(slf.some_offset); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emitter.push_comment("-"); +                slf.emitter.push_comment("-"); +                slf.emitter.push_comment("!"); +                slf.state = State::Comment; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::Doctype => match slf.read_char()? 
{ +            Some(whitespace_pat!()) => { +                slf.state = State::BeforeDoctypeName; +                Ok(ControlToken::Continue) +            } +            c @ Some('>') => { +                slf.unread_char(c); +                slf.state = State::BeforeDoctypeName; +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInDoctype); +                slf.init_doctype(); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emit_error(Error::MissingWhitespaceBeforeDoctypeName); +                slf.unread_char(c); +                slf.state = State::BeforeDoctypeName; +                Ok(ControlToken::Continue) +            } +        }, +        State::BeforeDoctypeName => match slf.read_char()? { +            Some(whitespace_pat!()) => Ok(ControlToken::Continue), +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.init_doctype(); +                slf.emitter.init_doctype_name(slf.position_before_match); +                slf.emitter.push_doctype_name("\u{fffd}"); +                slf.state = State::DoctypeName; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emit_error(Error::MissingDoctypeName); +                slf.init_doctype(); +                slf.emitter.set_force_quirks(); +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInDoctype); +                slf.init_doctype(); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(slf.reader.position()); +        
        Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.init_doctype(); +                slf.emitter.init_doctype_name(slf.position_before_match); +                slf.emitter +                    .push_doctype_name(ctostr!(x.to_ascii_lowercase())); +                slf.state = State::DoctypeName; +                Ok(ControlToken::Continue) +            } +        }, +        State::DoctypeName => match slf.read_char()? { +            Some(whitespace_pat!()) => { +                slf.emitter +                    .terminate_doctype_name(slf.position_before_match); +                slf.state = State::AfterDoctypeName; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emitter +                    .terminate_doctype_name(slf.position_before_match); +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_doctype_name("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInDoctype); +                slf.emitter +                    .terminate_doctype_name(slf.position_before_match); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter +                    .push_doctype_name(ctostr!(x.to_ascii_lowercase())); +                Ok(ControlToken::Continue) +            } +        }, +        State::AfterDoctypeName => match slf.read_char()? 
{ +            Some(whitespace_pat!()) => Ok(ControlToken::Continue), +            Some('>') => { +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Eof) +            } +            Some('p' | 'P') if slf.try_read_string("ublic", false)? => { +                slf.state = State::AfterDoctypePublicKeyword; +                Ok(ControlToken::Continue) +            } +            Some('s' | 'S') if slf.try_read_string("ystem", false)? => { +                slf.state = State::AfterDoctypeSystemKeyword; +                Ok(ControlToken::Continue) +            } +            c @ Some(_) => { +                slf.emit_error(Error::InvalidCharacterSequenceAfterDoctypeName); +                slf.emitter.set_force_quirks(); +                slf.unread_char(c); +                slf.state = State::BogusDoctype; +                Ok(ControlToken::Continue) +            } +        }, +        State::AfterDoctypePublicKeyword => match slf.read_char()? 
{ +            Some(whitespace_pat!()) => { +                slf.state = State::BeforeDoctypePublicIdentifier; +                Ok(ControlToken::Continue) +            } +            Some('"') => { +                slf.emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword); +                slf.emitter.init_doctype_public_id(slf.reader.position()); +                slf.state = State::DoctypePublicIdentifierDoubleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('\'') => { +                slf.emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword); +                slf.emitter.init_doctype_public_id(slf.reader.position()); +                slf.state = State::DoctypePublicIdentifierSingleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emit_error(Error::MissingDoctypePublicIdentifier); +                slf.emitter.set_force_quirks(); +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emit_error(Error::MissingQuoteBeforeDoctypePublicIdentifier); +                slf.emitter.set_force_quirks(); +                slf.unread_char(c); +                slf.state = State::BogusDoctype; +                Ok(ControlToken::Continue) +            } +        }, +        State::BeforeDoctypePublicIdentifier => match slf.read_char()? 
{ +            Some(whitespace_pat!()) => Ok(ControlToken::Continue), +            Some('"') => { +                slf.emitter.init_doctype_public_id(slf.reader.position()); +                slf.state = State::DoctypePublicIdentifierDoubleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('\'') => { +                slf.emitter.init_doctype_public_id(slf.reader.position()); +                slf.state = State::DoctypePublicIdentifierSingleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emit_error(Error::MissingDoctypePublicIdentifier); +                slf.emitter.set_force_quirks(); +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emit_error(Error::MissingQuoteBeforeDoctypePublicIdentifier); +                slf.emitter.set_force_quirks(); +                slf.unread_char(c); +                slf.state = State::BogusDoctype; +                Ok(ControlToken::Continue) +            } +        }, +        State::DoctypePublicIdentifierDoubleQuoted => match slf.read_char()? 
{ +            Some('"') => { +                slf.emitter +                    .terminate_doctype_public_id(slf.position_before_match); +                slf.state = State::AfterDoctypePublicIdentifier; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_doctype_public_id("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emitter +                    .terminate_doctype_public_id(slf.position_before_match); +                slf.emit_error(Error::AbruptDoctypePublicIdentifier); +                slf.emitter.set_force_quirks(); +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter +                    .terminate_doctype_public_id(slf.reader.position()); +                slf.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.push_doctype_public_id(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::DoctypePublicIdentifierSingleQuoted => match slf.read_char()? 
{ +            Some('\'') => { +                slf.emitter +                    .terminate_doctype_public_id(slf.position_before_match); +                slf.state = State::AfterDoctypePublicIdentifier; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_doctype_public_id("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emitter +                    .terminate_doctype_public_id(slf.position_before_match); +                slf.emit_error(Error::AbruptDoctypePublicIdentifier); +                slf.emitter.set_force_quirks(); +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter +                    .terminate_doctype_public_id(slf.reader.position()); +                slf.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.push_doctype_public_id(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::AfterDoctypePublicIdentifier => match slf.read_char()? 
{ +            Some(whitespace_pat!()) => { +                slf.state = State::BetweenDoctypePublicAndSystemIdentifiers; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Continue) +            } +            Some('"') => { +                slf.emit_error(Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers); +                slf.emitter.init_doctype_system_id(slf.reader.position()); +                slf.state = State::DoctypeSystemIdentifierDoubleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('\'') => { +                slf.emit_error(Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers); +                slf.emitter.init_doctype_system_id(slf.reader.position()); +                slf.state = State::DoctypeSystemIdentifierSingleQuoted; +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); +                slf.emitter.set_force_quirks(); +                slf.unread_char(c); +                slf.state = State::BogusDoctype; +                Ok(ControlToken::Continue) +            } +        }, +        State::BetweenDoctypePublicAndSystemIdentifiers => match slf.read_char()? 
{ +            Some(whitespace_pat!()) => Ok(ControlToken::Continue), +            Some('>') => { +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Continue) +            } +            Some('"') => { +                slf.emitter.init_doctype_system_id(slf.reader.position()); +                slf.state = State::DoctypeSystemIdentifierDoubleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('\'') => { +                slf.emitter.init_doctype_system_id(slf.reader.position()); +                slf.state = State::DoctypeSystemIdentifierSingleQuoted; +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); +                slf.emitter.set_force_quirks(); +                slf.state = State::BogusDoctype; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::AfterDoctypeSystemKeyword => match slf.read_char()? 
{ +            Some(whitespace_pat!()) => { +                slf.state = State::BeforeDoctypeSystemIdentifier; +                Ok(ControlToken::Continue) +            } +            Some('"') => { +                slf.emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword); +                slf.emitter.init_doctype_system_id(slf.reader.position()); +                slf.state = State::DoctypeSystemIdentifierDoubleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('\'') => { +                slf.emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword); +                slf.emitter.init_doctype_system_id(slf.reader.position()); +                slf.state = State::DoctypeSystemIdentifierSingleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emit_error(Error::MissingDoctypeSystemIdentifier); +                slf.emitter.set_force_quirks(); +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); +                slf.emitter.set_force_quirks(); +                slf.state = State::BogusDoctype; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::BeforeDoctypeSystemIdentifier => match slf.read_char()? 
{ +            Some(whitespace_pat!()) => Ok(ControlToken::Continue), +            Some('"') => { +                slf.emitter.init_doctype_system_id(slf.reader.position()); +                slf.state = State::DoctypeSystemIdentifierDoubleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('\'') => { +                slf.emitter.init_doctype_system_id(slf.reader.position()); +                slf.state = State::DoctypeSystemIdentifierSingleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emit_error(Error::MissingDoctypeSystemIdentifier); +                slf.emitter.set_force_quirks(); +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); +                slf.emitter.set_force_quirks(); +                slf.state = State::BogusDoctype; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::DoctypeSystemIdentifierDoubleQuoted => match slf.read_char()? 
{ +            Some('"') => { +                slf.emitter +                    .terminate_doctype_system_id(slf.position_before_match); +                slf.state = State::AfterDoctypeSystemIdentifier; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_doctype_system_id("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emitter +                    .terminate_doctype_system_id(slf.position_before_match); +                slf.emit_error(Error::AbruptDoctypeSystemIdentifier); +                slf.emitter.set_force_quirks(); +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter +                    .terminate_doctype_system_id(slf.reader.position()); +                slf.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.push_doctype_system_id(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::DoctypeSystemIdentifierSingleQuoted => match slf.read_char()? 
{ +            Some('\'') => { +                slf.emitter +                    .terminate_doctype_system_id(slf.position_before_match); +                slf.state = State::AfterDoctypeSystemIdentifier; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_doctype_system_id("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emitter +                    .terminate_doctype_system_id(slf.position_before_match); +                slf.emit_error(Error::AbruptDoctypeSystemIdentifier); +                slf.emitter.set_force_quirks(); +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter +                    .terminate_doctype_system_id(slf.reader.position()); +                slf.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.push_doctype_system_id(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::AfterDoctypeSystemIdentifier => match slf.read_char()? 
{ +            Some(whitespace_pat!()) => Ok(ControlToken::Continue), +            Some('>') => { +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emit_error(Error::UnexpectedCharacterAfterDoctypeSystemIdentifier); +                slf.unread_char(c); +                slf.state = State::BogusDoctype; +                Ok(ControlToken::Continue) +            } +        }, +        State::BogusDoctype => match slf.read_char()? { +            Some('>') => { +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emit_error(Error::UnexpectedNullCharacter); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_current_doctype(slf.reader.position()); +                Ok(ControlToken::Eof) +            } +            Some(_) => Ok(ControlToken::Continue), +        }, +        State::CdataSection => match slf.read_char()? { +            Some(']') => { +                slf.state = State::CdataSectionBracket; +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emit_error(Error::EofInCdata); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::CdataSectionBracket => match slf.read_char()? 
{ +            Some(']') => { +                slf.state = State::CdataSectionEnd; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("]"); +                slf.state = State::CdataSection; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::CdataSectionEnd => match slf.read_char()? { +            Some(']') => { +                slf.emitter.emit_string("]"); +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.state = State::Data; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("]]"); +                slf.unread_char(c); +                slf.state = State::CdataSection; +                Ok(ControlToken::Continue) +            } +        }, +        State::CharacterReference => { +            // TODO: we can avoid these Reader method calls by changing CharacterReference to be a function instead of a state +            slf.some_offset = +                slf.reader.position() - slf.reader.len_of_char_in_current_encoding('&'); +            slf.temporary_buffer.clear(); +            slf.temporary_buffer.push('&'); +            match slf.read_char()? 
{ +                Some(x) if x.is_ascii_alphanumeric() => { +                    slf.unread_char(Some(x)); +                    slf.state = State::NamedCharacterReference; +                    Ok(ControlToken::Continue) +                } +                Some('#') => { +                    slf.temporary_buffer.push('#'); +                    slf.state = State::NumericCharacterReference; +                    Ok(ControlToken::Continue) +                } +                c => { +                    slf.flush_code_points_consumed_as_character_reference(); +                    slf.state = slf.return_state.take().unwrap(); +                    slf.unread_char(c); +                    Ok(ControlToken::Continue) +                } +            } +        } +        State::NamedCharacterReference => { +            let c = slf.read_char()?; + +            let char_ref = match c { +                Some(x) => try_read_character_reference(x, |x| slf.try_read_string(x, true))? +                    .map(|char_ref| (x, char_ref)), + +                None => None, +            }; + +            if let Some((x, char_ref)) = char_ref { +                slf.temporary_buffer.push(x); +                slf.temporary_buffer.push_str(char_ref.name); +                let char_ref_name_last_character = char_ref.name.chars().last(); + +                let next_character = slf.read_char()?; +                slf.unread_char(next_character); + +                if slf.is_consumed_as_part_of_an_attribute() +                    && char_ref_name_last_character != Some(';') +                    && matches!(next_character, Some(x) if x == '=' || x.is_ascii_alphanumeric()) +                { +                    slf.flush_code_points_consumed_as_character_reference(); +                    slf.state = slf.return_state.take().unwrap(); +                    Ok(ControlToken::Continue) +                } else { +                    if char_ref_name_last_character != Some(';') { +                        
slf.emit_error(Error::MissingSemicolonAfterCharacterReference); +                    } + +                    slf.temporary_buffer.clear(); +                    slf.temporary_buffer.push_str(char_ref.characters); +                    slf.flush_code_points_consumed_as_character_reference(); +                    slf.state = slf.return_state.take().unwrap(); +                    Ok(ControlToken::Continue) +                } +            } else { +                slf.unread_char(c); +                slf.flush_code_points_consumed_as_character_reference(); +                slf.state = State::AmbiguousAmpersand; +                Ok(ControlToken::Continue) +            } +        } +        State::AmbiguousAmpersand => match slf.read_char()? { +            Some(x) if x.is_ascii_alphanumeric() => { +                if slf.is_consumed_as_part_of_an_attribute() { +                    slf.emitter.push_attribute_value(ctostr!(x)); +                } else { +                    slf.emitter.emit_string(ctostr!(x)); +                } + +                Ok(ControlToken::Continue) +            } +            c @ Some(';') => { +                slf.emit_error(Error::UnknownNamedCharacterReference); +                slf.unread_char(c); +                slf.state = slf.return_state.take().unwrap(); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.unread_char(c); +                slf.state = slf.return_state.take().unwrap(); +                Ok(ControlToken::Continue) +            } +        }, +        State::NumericCharacterReference => { +            slf.character_reference_code = 0; +            match slf.read_char()? 
{ +                Some(x @ 'x' | x @ 'X') => { +                    slf.temporary_buffer.push(x); +                    slf.state = State::HexadecimalCharacterReferenceStart; +                    Ok(ControlToken::Continue) +                } +                c => { +                    slf.unread_char(c); +                    slf.state = State::DecimalCharacterReferenceStart; +                    Ok(ControlToken::Continue) +                } +            } +        } +        State::HexadecimalCharacterReferenceStart => match slf.read_char()? { +            c @ Some('0'..='9' | 'A'..='F' | 'a'..='f') => { +                slf.unread_char(c); +                slf.state = State::HexadecimalCharacterReference; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emit_error(Error::AbsenceOfDigitsInNumericCharacterReference); +                slf.flush_code_points_consumed_as_character_reference(); +                slf.unread_char(c); +                slf.state = slf.return_state.take().unwrap(); +                Ok(ControlToken::Continue) +            } +        }, +        State::DecimalCharacterReferenceStart => match slf.read_char()? { +            Some(x @ ascii_digit_pat!()) => { +                slf.unread_char(Some(x)); +                slf.state = State::DecimalCharacterReference; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emit_error(Error::AbsenceOfDigitsInNumericCharacterReference); +                slf.flush_code_points_consumed_as_character_reference(); +                slf.unread_char(c); +                slf.state = slf.return_state.take().unwrap(); +                Ok(ControlToken::Continue) +            } +        }, +        State::HexadecimalCharacterReference => match slf.read_char()? 
{ +            Some(x @ ascii_digit_pat!()) => { +                mutate_character_reference!(*16 + x - 0x0030); +                Ok(ControlToken::Continue) +            } +            Some(x @ 'A'..='F') => { +                mutate_character_reference!(*16 + x - 0x0037); +                Ok(ControlToken::Continue) +            } +            Some(x @ 'a'..='f') => { +                mutate_character_reference!(*16 + x - 0x0057); +                Ok(ControlToken::Continue) +            } +            Some(';') => { +                slf.state = State::NumericCharacterReferenceEnd; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emit_error(Error::MissingSemicolonAfterCharacterReference); +                slf.unread_char(c); +                slf.state = State::NumericCharacterReferenceEnd; +                Ok(ControlToken::Continue) +            } +        }, +        State::DecimalCharacterReference => match slf.read_char()? { +            Some(x @ ascii_digit_pat!()) => { +                mutate_character_reference!(*10 + x - 0x0030); +                Ok(ControlToken::Continue) +            } +            Some(';') => { +                slf.state = State::NumericCharacterReferenceEnd; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emit_error(Error::MissingSemicolonAfterCharacterReference); +                slf.unread_char(c); +                slf.state = State::NumericCharacterReferenceEnd; +                Ok(ControlToken::Continue) +            } +        }, +        State::NumericCharacterReferenceEnd => { +            match slf.character_reference_code { +                0x00 => { +                    slf.emit_error(Error::NullCharacterReference); +                    slf.character_reference_code = 0xfffd; +                } +                0x110000.. 
=> { +                    slf.emit_error(Error::CharacterReferenceOutsideUnicodeRange); +                    slf.character_reference_code = 0xfffd; +                } +                surrogate_pat!() => { +                    slf.emit_error(Error::SurrogateCharacterReference); +                    slf.character_reference_code = 0xfffd; +                } +                // noncharacter +                noncharacter_pat!() => { +                    slf.emit_error(Error::NoncharacterCharacterReference); +                } +                // 0x000d, or a control that is not whitespace +                x @ 0x000d | x @ control_pat!() +                    if !matches!(x, 0x0009 | 0x000a | 0x000c | 0x0020) => +                { +                    slf.emit_error(Error::ControlCharacterReference); +                    slf.character_reference_code = match x { +                        0x80 => 0x20AC, // EURO SIGN (€) +                        0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK (‚) +                        0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK (ƒ) +                        0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK („) +                        0x85 => 0x2026, // HORIZONTAL ELLIPSIS (…) +                        0x86 => 0x2020, // DAGGER (†) +                        0x87 => 0x2021, // DOUBLE DAGGER (‡) +                        0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ) +                        0x89 => 0x2030, // PER MILLE SIGN (‰) +                        0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON (Š) +                        0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹) +                        0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE (Œ) +                        0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON (Ž) +                        0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK (‘) +                        0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK (’) +                        0x93 => 
0x201C, // LEFT DOUBLE QUOTATION MARK (“) +                        0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK (”) +                        0x95 => 0x2022, // BULLET (•) +                        0x96 => 0x2013, // EN DASH (–) +                        0x97 => 0x2014, // EM DASH (—) +                        0x98 => 0x02DC, // SMALL TILDE (˜) +                        0x99 => 0x2122, // TRADE MARK SIGN (™) +                        0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON (š) +                        0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›) +                        0x9C => 0x0153, // LATIN SMALL LIGATURE OE (œ) +                        0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON (ž) +                        0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ) +                        _ => slf.character_reference_code, +                    }; +                } +                _ => (), +            } + +            slf.temporary_buffer.clear(); +            slf.temporary_buffer +                .push(std::char::from_u32(slf.character_reference_code).unwrap()); +            slf.flush_code_points_consumed_as_character_reference(); +            slf.state = slf.return_state.take().unwrap(); +            Ok(ControlToken::Continue) +        } +    } +} + +impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { +    /// Initializes a doctype in the emitter, passing it the offset stored in +    /// `self.some_offset`. +    #[inline] +    fn init_doctype(&mut self) { +        self.emitter.init_doctype(self.some_offset); +    } +} + +/// Applies the given `CdataAction` to the tokenizer. +/// +/// * `CdataAction::Cdata` switches the state to `State::CdataSection`. +/// * `CdataAction::BogusComment` emits `Error::CdataInHtmlContent`, initializes a comment at +///   the reader's current position, pushes the literal text `[CDATA[` onto it and switches +///   the state to `State::BogusComment`. +/// +/// NOTE(review): presumably meant to be called after `consume` returned +/// `ControlToken::CdataOpen`; confirm against the caller. +#[inline] +pub fn handle_cdata_open<O, R, E>(slf: &mut Tokenizer<R, O, E>, action: CdataAction) +where +    O: Offset, +    R: Reader + Position<O>, +    E: Emitter<O>, +{ +    match action { +        CdataAction::Cdata => slf.state = State::CdataSection, +        CdataAction::BogusComment => { +            slf.emit_error(Error::CdataInHtmlContent); + +            slf.emitter.init_comment(slf.reader.position()); +            // Seed the freshly initialized comment with the literal `[CDATA[` text. +            slf.emitter.push_comment("[CDATA["); +            slf.state 
= State::BogusComment; +        } +    } +} | 
