diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/lib.rs | 2173 | ||||
| -rw-r--r-- | src/machine.rs | 2062 | ||||
| -rw-r--r-- | src/tokenizer.rs | 244 | ||||
| -rw-r--r-- | src/utils.rs | 164 | 
4 files changed, 2324 insertions, 2319 deletions
| @@ -9,2179 +9,14 @@ mod error;  mod machine;  mod never;  mod reader; +mod tokenizer; +mod utils;  #[cfg(feature = "integration-tests")] -pub use machine::State; -#[cfg(not(feature = "integration-tests"))] -use machine::State; - -use machine::{ -    ascii_digit_pat, control_pat, noncharacter_pat, surrogate_pat, whitespace_pat, ControlToken, -}; +pub use utils::State;  pub use emitter::{DefaultEmitter, Doctype, Emitter, EndTag, StartTag, Token};  pub use error::Error;  pub use never::Never;  pub use reader::{BufReadReader, Readable, Reader, StringReader}; - -macro_rules! ctostr { -    ($c:expr) => { -        &*$c.encode_utf8(&mut [0; 4]) -    }; -} - -// this is a stack that can hold 0 to 2 Ts -#[derive(Debug, Default)] -struct Stack2<T: Copy>(Option<(T, Option<T>)>); - -impl<T: Copy> Stack2<T> { -    #[inline] -    fn push(&mut self, c: T) { -        self.0 = match self.0 { -            None => Some((c, None)), -            Some((c1, None)) => Some((c1, Some(c))), -            Some((_c1, Some(_c2))) => panic!("stack full!"), -        } -    } - -    #[inline] -    fn pop(&mut self) -> Option<T> { -        let (new_self, rv) = match self.0 { -            Some((c1, Some(c2))) => (Some((c1, None)), Some(c2)), -            Some((c1, None)) => (None, Some(c1)), -            None => (None, None), -        }; -        self.0 = new_self; -        rv -    } - -    #[inline] -    fn is_empty(&self) -> bool { -        matches!(self.0, None) -    } -} - -/// A HTML tokenizer. See crate-level docs for basic usage. -pub struct Tokenizer<R: Reader, E: Emitter = DefaultEmitter> { -    eof: bool, -    state: State, -    emitter: E, -    temporary_buffer: String, -    reader: R, -    to_reconsume: Stack2<Option<char>>, -    character_reference_code: u32, -    return_state: Option<State>, -} - -impl<R: Reader> Tokenizer<R> { -    /// Create a new tokenizer from some input. -    /// -    /// `input` can be `&String` or `&str` at the moment, as those are the types for which -    /// [`crate::Readable`] is implemented, but you can implement that trait on your own types. -    /// -    /// Patches are welcome for providing an efficient implementation over async streams, -    /// iterators, files, etc, as long as any dependencies come behind featureflags. -    pub fn new<'a, S: Readable<'a, Reader = R>>(input: S) -> Self { -        Tokenizer::<S::Reader>::new_with_emitter(input, DefaultEmitter::default()) -    } -} - -impl<R: Reader, E: Emitter> Tokenizer<R, E> { -    /// Construct a new tokenizer from some input and a custom emitter. -    /// -    /// Use this method over [`Tokenizer::new`] when you want to have more control over string allocation for -    /// tokens. -    pub fn new_with_emitter<'a, S: Readable<'a, Reader = R>>(input: S, emitter: E) -> Self { -        Tokenizer { -            eof: false, -            state: State::Data, -            emitter, -            temporary_buffer: String::new(), -            to_reconsume: Stack2::default(), -            reader: input.to_reader(), -            character_reference_code: 0, -            return_state: None, -        } -    } - -    #[cfg(feature = "integration-tests")] -    /// Test-internal function to override internal state. -    /// -    /// Only available with the `integration-tests` feature which is not public API. -    pub fn set_state(&mut self, state: State) { -        self.state = state; -    } - -    /// Set the statemachine to start/continue in [plaintext -    /// state](https://html.spec.whatwg.org/#plaintext-state). -    /// -    /// This tokenizer never gets into that state naturally. -    pub fn set_plaintext_state(&mut self) { -        self.state = State::PlainText; -    } - -    #[cfg(feature = "integration-tests")] -    /// Test-internal function to override internal state. -    /// -    /// Only available with the `integration-tests` feature which is not public API. -    pub fn set_last_start_tag(&mut self, last_start_tag: Option<&str>) { -        self.emitter.set_last_start_tag(last_start_tag); -    } - -    #[inline] -    fn unread_char(&mut self, c: Option<char>) { -        self.to_reconsume.push(c); -    } - -    #[inline] -    fn validate_char(&mut self, c: char) { -        match c as u32 { -            surrogate_pat!() => { -                self.emitter.emit_error(Error::SurrogateInInputStream); -            } -            noncharacter_pat!() => { -                self.emitter.emit_error(Error::NoncharacterInInputStream); -            } -            // control without whitespace or nul -            x @ control_pat!() -                if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) => -            { -                self.emitter -                    .emit_error(Error::ControlCharacterInInputStream); -            } -            _ => (), -        } -    } - -    fn read_char(&mut self) -> Result<Option<char>, R::Error> { -        let (c_res, reconsumed) = match self.to_reconsume.pop() { -            Some(c) => (Ok(c), true), -            None => (self.reader.read_char(), false), -        }; - -        let mut c = match c_res { -            Ok(Some(c)) => c, -            res => return res, -        }; - -        if c == '\r' { -            c = '\n'; -            let c2 = self.reader.read_char()?; -            if c2 != Some('\n') { -                self.unread_char(c2); -            } -        } - -        if !reconsumed { -            self.validate_char(c); -        } - -        Ok(Some(c)) -    } - -    #[inline] -    fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result<bool, R::Error> { -        debug_assert!(!s.is_empty()); -        debug_assert!(self.to_reconsume.is_empty()); -        self.reader.try_read_string(s, case_sensitive) -    } - -    fn is_consumed_as_part_of_an_attribute(&self) -> bool { -        matches!( -            self.return_state, -            Some( -                State::AttributeValueDoubleQuoted -                    | State::AttributeValueSingleQuoted -                    | State::AttributeValueUnquoted -            ) -        ) -    } - -    fn flush_code_points_consumed_as_character_reference(&mut self) { -        if self.is_consumed_as_part_of_an_attribute() { -            self.emitter.push_attribute_value(&self.temporary_buffer); -            self.temporary_buffer.clear(); -        } else { -            self.flush_buffer_characters(); -        } -    } - -    fn next_input_character(&mut self) -> Result<Option<char>, R::Error> { -        let rv = self.read_char()?; -        self.unread_char(rv); -        Ok(rv) -    } - -    fn flush_buffer_characters(&mut self) { -        self.emitter.emit_string(&self.temporary_buffer); -        self.temporary_buffer.clear(); -    } - -    fn consume(&mut self) -> Result<ControlToken, R::Error> { -        macro_rules! mutate_character_reference { -            (* $mul:literal + $x:ident - $sub:literal) => { -                match self -                    .character_reference_code -                    .checked_mul($mul) -                    .and_then(|cr| cr.checked_add($x as u32 - $sub)) -                { -                    Some(cr) => self.character_reference_code = cr, -                    None => { -                        // provoke err -                        self.character_reference_code = 0x110000; -                    } -                }; -            }; -        } - -        match self.state { -            State::Data => match self.read_char()? { -                Some('&') => { -                    self.return_state = Some(self.state); -                    self.state = State::CharacterReference; -                    Ok(ControlToken::Continue) -                } -                Some('<') => { -                    self.state = State::TagOpen; -                    Ok(ControlToken::Continue) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.emitter.emit_string("\0"); -                    Ok(ControlToken::Continue) -                } -                Some(x) => { -                    self.emitter.emit_string(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -                None => Ok(ControlToken::Eof), -            }, -            State::RcData => match self.read_char()? { -                Some('&') => { -                    self.return_state = Some(State::RcData); -                    self.state = State::CharacterReference; -                    Ok(ControlToken::Continue) -                } -                Some('<') => { -                    self.state = State::RcDataLessThanSign; -                    Ok(ControlToken::Continue) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.emitter.emit_string("\u{fffd}"); -                    Ok(ControlToken::Continue) -                } -                Some(x) => { -                    self.emitter.emit_string(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -                None => Ok(ControlToken::Eof), -            }, -            State::RawText => match self.read_char()? { -                Some('<') => { -                    self.state = State::RawTextLessThanSign; -                    Ok(ControlToken::Continue) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.emitter.emit_string("\u{fffd}"); -                    Ok(ControlToken::Continue) -                } -                Some(x) => { -                    self.emitter.emit_string(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -                None => Ok(ControlToken::Eof), -            }, -            State::ScriptData => match self.read_char()? { -                Some('<') => { -                    self.state = State::ScriptDataLessThanSign; -                    Ok(ControlToken::Continue) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.emitter.emit_string("\u{fffd}"); -                    Ok(ControlToken::Continue) -                } -                Some(x) => { -                    self.emitter.emit_string(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -                None => Ok(ControlToken::Eof), -            }, -            State::PlainText => match self.read_char()? { -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.emitter.emit_string("\u{fffd}"); -                    Ok(ControlToken::Continue) -                } -                Some(x) => { -                    self.emitter.emit_string(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -                None => Ok(ControlToken::Eof), -            }, -            State::TagOpen => match self.read_char()? { -                Some('!') => { -                    self.state = State::MarkupDeclarationOpen; -                    Ok(ControlToken::Continue) -                } -                Some('/') => { -                    self.state = State::EndTagOpen; -                    Ok(ControlToken::Continue) -                } -                Some(x) if x.is_ascii_alphabetic() => { -                    self.emitter.init_start_tag(); -                    self.state = State::TagName; -                    self.unread_char(Some(x)); -                    Ok(ControlToken::Continue) -                } -                c @ Some('?') => { -                    self.emitter -                        .emit_error(Error::UnexpectedQuestionMarkInsteadOfTagName); -                    self.emitter.init_comment(); -                    self.state = State::BogusComment; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofBeforeTagName); -                    self.emitter.emit_string("<"); -                    Ok(ControlToken::Eof) -                } -                c @ Some(_) => { -                    self.emitter -                        .emit_error(Error::InvalidFirstCharacterOfTagName); -                    self.state = State::Data; -                    self.emitter.emit_string("<"); -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::EndTagOpen => match self.read_char()? { -                Some(x) if x.is_ascii_alphabetic() => { -                    self.emitter.init_end_tag(); -                    self.state = State::TagName; -                    self.unread_char(Some(x)); -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.emitter.emit_error(Error::MissingEndTagName); -                    self.state = State::Data; -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofBeforeTagName); -                    self.emitter.emit_string("</"); -                    Ok(ControlToken::Eof) -                } -                Some(x) => { -                    self.emitter -                        .emit_error(Error::InvalidFirstCharacterOfTagName); -                    self.emitter.init_comment(); -                    self.state = State::BogusComment; -                    self.unread_char(Some(x)); -                    Ok(ControlToken::Continue) -                } -            }, -            State::TagName => match self.read_char()? { -                Some(whitespace_pat!()) => { -                    self.state = State::BeforeAttributeName; -                    Ok(ControlToken::Continue) -                } -                Some('/') => { -                    self.state = State::SelfClosingStartTag; -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.state = State::Data; -                    self.emitter.emit_current_tag(); -                    Ok(ControlToken::Continue) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.emitter.push_tag_name("\u{fffd}"); -                    Ok(ControlToken::Continue) -                } -                Some(x) => { -                    self.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase())); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInTag); -                    Ok(ControlToken::Eof) -                } -            }, -            State::RcDataLessThanSign => match self.read_char()? { -                Some('/') => { -                    self.temporary_buffer.clear(); -                    self.state = State::RcDataEndTagOpen; -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.emitter.emit_string("<"); -                    self.state = State::RcData; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::RcDataEndTagOpen => match self.read_char()? { -                Some(x) if x.is_ascii_alphabetic() => { -                    self.emitter.init_end_tag(); -                    self.state = State::RcDataEndTagName; -                    self.unread_char(Some(x)); -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.emitter.emit_string("</"); -                    self.state = State::RcData; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::RcDataEndTagName => match self.read_char()? { -                Some(whitespace_pat!()) if self.emitter.current_is_appropriate_end_tag_token() => { -                    self.state = State::BeforeAttributeName; -                    Ok(ControlToken::Continue) -                } -                Some('/') if self.emitter.current_is_appropriate_end_tag_token() => { -                    self.state = State::SelfClosingStartTag; -                    Ok(ControlToken::Continue) -                } -                Some('>') if self.emitter.current_is_appropriate_end_tag_token() => { -                    self.state = State::Data; -                    self.emitter.emit_current_tag(); -                    Ok(ControlToken::Continue) -                } -                Some(x) if x.is_ascii_alphabetic() => { -                    self.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase())); -                    self.temporary_buffer.push(x); -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.emitter.emit_string("</"); -                    self.flush_buffer_characters(); - -                    self.state = State::RcData; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::RawTextLessThanSign => match self.read_char()? { -                Some('/') => { -                    self.temporary_buffer.clear(); -                    self.state = State::RawTextEndTagOpen; -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.emitter.emit_string("<"); -                    self.state = State::RawText; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::RawTextEndTagOpen => match self.read_char()? { -                Some(x) if x.is_ascii_alphabetic() => { -                    self.emitter.init_end_tag(); -                    self.state = State::RawTextEndTagName; -                    self.unread_char(Some(x)); -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.emitter.emit_string("</"); -                    self.state = State::RawText; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::RawTextEndTagName => match self.read_char()? { -                Some(whitespace_pat!()) if self.emitter.current_is_appropriate_end_tag_token() => { -                    self.state = State::BeforeAttributeName; -                    Ok(ControlToken::Continue) -                } -                Some('/') if self.emitter.current_is_appropriate_end_tag_token() => { -                    self.state = State::SelfClosingStartTag; -                    Ok(ControlToken::Continue) -                } -                Some('>') if self.emitter.current_is_appropriate_end_tag_token() => { -                    self.state = State::Data; -                    self.emitter.emit_current_tag(); -                    Ok(ControlToken::Continue) -                } -                Some(x) if x.is_ascii_alphabetic() => { -                    self.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase())); -                    self.temporary_buffer.push(x); -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.emitter.emit_string("</"); -                    self.flush_buffer_characters(); - -                    self.state = State::RawText; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::ScriptDataLessThanSign => match self.read_char()? { -                Some('/') => { -                    self.temporary_buffer.clear(); -                    self.state = State::ScriptDataEndTagOpen; -                    Ok(ControlToken::Continue) -                } -                Some('!') => { -                    self.state = State::ScriptDataEscapeStart; -                    self.emitter.emit_string("<!"); -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.emitter.emit_string("<"); -                    self.state = State::Data; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::ScriptDataEndTagOpen => match self.read_char()? { -                Some(x) if x.is_ascii_alphabetic() => { -                    self.emitter.init_end_tag(); -                    self.state = State::ScriptDataEndTagName; -                    self.unread_char(Some(x)); -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.emitter.emit_string("</"); -                    self.state = State::ScriptData; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::ScriptDataEndTagName => match self.read_char()? { -                Some(whitespace_pat!()) if self.emitter.current_is_appropriate_end_tag_token() => { -                    self.state = State::BeforeAttributeName; -                    Ok(ControlToken::Continue) -                } -                Some('/') if self.emitter.current_is_appropriate_end_tag_token() => { -                    self.state = State::SelfClosingStartTag; -                    Ok(ControlToken::Continue) -                } -                Some('>') if self.emitter.current_is_appropriate_end_tag_token() => { -                    self.state = State::Data; -                    self.emitter.emit_current_tag(); -                    Ok(ControlToken::Continue) -                } -                Some(x) if x.is_ascii_alphabetic() => { -                    self.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase())); -                    self.temporary_buffer.push(x.to_ascii_lowercase()); -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.emitter.emit_string("</"); -                    self.flush_buffer_characters(); -                    self.state = State::Data; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::ScriptDataEscapeStart => match self.read_char()? { -                Some('-') => { -                    self.state = State::ScriptDataEscapeStartDash; -                    self.emitter.emit_string("-"); -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.state = State::ScriptData; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::ScriptDataEscapeStartDash => match self.read_char()? { -                Some('-') => { -                    self.state = State::ScriptDataEscapedDashDash; -                    self.emitter.emit_string("-"); -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.state = State::ScriptData; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::ScriptDataEscaped => match self.read_char()? { -                Some('-') => { -                    self.state = State::ScriptDataEscapedDash; -                    self.emitter.emit_string("-"); -                    Ok(ControlToken::Continue) -                } -                Some('<') => { -                    self.state = State::ScriptDataEscapedLessThanSign; -                    Ok(ControlToken::Continue) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.emitter.emit_string("\u{fffd}"); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter -                        .emit_error(Error::EofInScriptHtmlCommentLikeText); -                    Ok(ControlToken::Eof) -                } -                Some(x) => { -                    self.emitter.emit_string(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -            }, -            State::ScriptDataEscapedDash => match self.read_char()? { -                Some('-') => { -                    self.state = State::ScriptDataEscapedDashDash; -                    self.emitter.emit_string("-"); -                    Ok(ControlToken::Continue) -                } -                Some('<') => { -                    self.state = State::ScriptDataEscapedLessThanSign; -                    Ok(ControlToken::Continue) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.state = State::ScriptDataEscaped; -                    self.emitter.emit_string("\u{fffd}"); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter -                        .emit_error(Error::EofInScriptHtmlCommentLikeText); -                    Ok(ControlToken::Eof) -                } -                Some(x) => { -                    self.state = State::ScriptDataEscaped; -                    self.emitter.emit_string(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -            }, -            State::ScriptDataEscapedDashDash => match self.read_char()? { -                Some('-') => { -                    self.emitter.emit_string("-"); -                    Ok(ControlToken::Continue) -                } -                Some('<') => { -                    self.state = State::ScriptDataEscapedLessThanSign; -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.state = State::ScriptData; -                    self.emitter.emit_string(">"); -                    Ok(ControlToken::Continue) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.state = State::ScriptDataEscaped; -                    self.emitter.emit_string("\u{fffd}"); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter -                        .emit_error(Error::EofInScriptHtmlCommentLikeText); -                    Ok(ControlToken::Eof) -                } -                Some(x) => { -                    self.state = State::ScriptDataEscaped; -                    self.emitter.emit_string(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -            }, -            State::ScriptDataEscapedLessThanSign => match self.read_char()? { -                Some('/') => { -                    self.temporary_buffer.clear(); -                    self.state = State::ScriptDataEscapedEndTagOpen; -                    Ok(ControlToken::Continue) -                } -                Some(x) if x.is_ascii_alphabetic() => { -                    self.temporary_buffer.clear(); -                    self.emitter.emit_string("<"); -                    self.state = State::ScriptDataDoubleEscapeStart; -                    self.unread_char(Some(x)); -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.emitter.emit_string("<"); -                    self.state = State::ScriptDataEscaped; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::ScriptDataEscapedEndTagOpen => match self.read_char()? { -                Some(x) if x.is_ascii_alphabetic() => { -                    self.emitter.init_end_tag(); -                    self.state = State::ScriptDataEscapedEndTagName; -                    self.unread_char(Some(x)); -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.emitter.emit_string("</"); -                    self.unread_char(c); -                    self.state = State::ScriptDataEscaped; -                    Ok(ControlToken::Continue) -                } -            }, -            State::ScriptDataEscapedEndTagName => match self.read_char()? { -                Some(whitespace_pat!()) if self.emitter.current_is_appropriate_end_tag_token() => { -                    self.state = State::BeforeAttributeName; -                    Ok(ControlToken::Continue) -                } -                Some('/') if self.emitter.current_is_appropriate_end_tag_token() => { -                    self.state = State::SelfClosingStartTag; -                    Ok(ControlToken::Continue) -                } -                Some('>') if self.emitter.current_is_appropriate_end_tag_token() => { -                    self.state = State::Data; -                    self.emitter.emit_current_tag(); -                    Ok(ControlToken::Continue) -                } -                Some(x) if x.is_ascii_alphabetic() => { -                    self.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase())); -                    self.temporary_buffer.push(x); -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.emitter.emit_string("</"); -                    self.flush_buffer_characters(); -                    self.state = State::ScriptDataEscaped; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::ScriptDataDoubleEscapeStart => match self.read_char()? { -                Some(x @ whitespace_pat!() | x @ '/' | x @ '>') => { -                    if self.temporary_buffer == "script" { -                        self.state = State::ScriptDataDoubleEscaped; -                    } else { -                        self.state = State::ScriptDataEscaped; -                    } -                    self.emitter.emit_string(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -                Some(x) if x.is_ascii_alphabetic() => { -                    self.temporary_buffer.push(x.to_ascii_lowercase()); -                    self.emitter.emit_string(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.state = State::ScriptDataEscaped; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::ScriptDataDoubleEscaped => match self.read_char()? { -                Some('-') => { -                    self.state = State::ScriptDataDoubleEscapedDash; -                    self.emitter.emit_string("-"); -                    Ok(ControlToken::Continue) -                } -                Some('<') => { -                    self.state = State::ScriptDataDoubleEscapedLessThanSign; -                    self.emitter.emit_string("<"); -                    Ok(ControlToken::Continue) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.emitter.emit_string("\u{fffd}"); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter -                        .emit_error(Error::EofInScriptHtmlCommentLikeText); -                    Ok(ControlToken::Eof) -                } -                Some(x) => { -                    self.emitter.emit_string(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -            }, -            State::ScriptDataDoubleEscapedDash => match self.read_char()? { -                Some('-') => { -                    self.state = State::ScriptDataDoubleEscapedDashDash; -                    self.emitter.emit_string("-"); -                    Ok(ControlToken::Continue) -                } -                Some('<') => { -                    self.state = State::ScriptDataDoubleEscapedLessThanSign; -                    self.emitter.emit_string("<"); -                    Ok(ControlToken::Continue) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.state = State::ScriptDataDoubleEscaped; -                    self.emitter.emit_string("\u{fffd}"); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter -                        .emit_error(Error::EofInScriptHtmlCommentLikeText); -                    Ok(ControlToken::Eof) -                } -                Some(x) => { -                    self.state = State::ScriptDataDoubleEscaped; -                    self.emitter.emit_string(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -            }, -            State::ScriptDataDoubleEscapedDashDash => match self.read_char()? { -                Some('-') => { -                    self.emitter.emit_string("-"); -                    Ok(ControlToken::Continue) -                } -                Some('<') => { -                    self.emitter.emit_string("<"); -                    self.state = State::ScriptDataDoubleEscapedLessThanSign; -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.emitter.emit_string(">"); -                    self.state = State::ScriptData; -                    Ok(ControlToken::Continue) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.state = State::ScriptDataDoubleEscaped; -                    self.emitter.emit_string("\u{fffd}"); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter -                        .emit_error(Error::EofInScriptHtmlCommentLikeText); -                    Ok(ControlToken::Eof) -                } -                Some(x) => { -                    self.state = State::ScriptDataDoubleEscaped; -                    self.emitter.emit_string(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -            }, -            State::ScriptDataDoubleEscapedLessThanSign => match self.read_char()? { -                Some('/') => { -                    self.temporary_buffer.clear(); -                    self.state = State::ScriptDataDoubleEscapeEnd; -                    self.emitter.emit_string("/"); -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.state = State::ScriptDataDoubleEscaped; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::ScriptDataDoubleEscapeEnd => match self.read_char()? { -                Some(x @ whitespace_pat!() | x @ '/' | x @ '>') => { -                    if self.temporary_buffer == "script" { -                        self.state = State::ScriptDataEscaped; -                    } else { -                        self.state = State::ScriptDataDoubleEscaped; -                    } - -                    self.emitter.emit_string(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -                Some(x) if x.is_ascii_alphabetic() => { -                    self.temporary_buffer.push(x.to_ascii_lowercase()); -                    self.emitter.emit_string(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.state = State::ScriptDataDoubleEscaped; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::BeforeAttributeName => match self.read_char()? { -                Some(whitespace_pat!()) => Ok(ControlToken::Continue), -                c @ Some('/' | '>') | c @ None => { -                    self.state = State::AfterAttributeName; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -                Some('=') => { -                    self.emitter -                        .emit_error(Error::UnexpectedEqualsSignBeforeAttributeName); -                    self.emitter.init_attribute(); -                    self.emitter.push_attribute_name("="); -                    self.state = State::AttributeName; -                    Ok(ControlToken::Continue) -                } -                Some(x) => { -                    self.emitter.init_attribute(); -                    self.state = State::AttributeName; -                    self.unread_char(Some(x)); -                    Ok(ControlToken::Continue) -                } -            }, -            State::AttributeName => match self.read_char()? { -                c @ Some(whitespace_pat!() | '/' | '>') | c @ None => { -                    self.state = State::AfterAttributeName; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -                Some('=') => { -                    self.state = State::BeforeAttributeValue; -                    Ok(ControlToken::Continue) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.emitter.push_attribute_name("\u{fffd}"); -                    Ok(ControlToken::Continue) -                } -                Some(x @ '"' | x @ '\'' | x @ '<') => { -                    self.emitter -                        .emit_error(Error::UnexpectedCharacterInAttributeName); -                    self.emitter -                        .push_attribute_name(ctostr!(x.to_ascii_lowercase())); -                    Ok(ControlToken::Continue) -                } -                Some(x) => { -                    self.emitter -                        .push_attribute_name(ctostr!(x.to_ascii_lowercase())); -                    Ok(ControlToken::Continue) -                } -            }, -            State::AfterAttributeName => match self.read_char()? { -                Some(whitespace_pat!()) => Ok(ControlToken::Continue), -                Some('/') => { -                    self.state = State::SelfClosingStartTag; -                    Ok(ControlToken::Continue) -                } -                Some('=') => { -                    self.state = State::BeforeAttributeValue; -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.state = State::Data; -                    self.emitter.emit_current_tag(); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInTag); -                    Ok(ControlToken::Eof) -                } -                Some(x) => { -                    self.emitter.init_attribute(); -                    self.state = State::AttributeName; -                    self.unread_char(Some(x)); -                    Ok(ControlToken::Continue) -                } -            }, -            State::BeforeAttributeValue => match self.read_char()? { -                Some(whitespace_pat!()) => Ok(ControlToken::Continue), -                Some('"') => { -                    self.state = State::AttributeValueDoubleQuoted; -                    Ok(ControlToken::Continue) -                } -                Some('\'') => { -                    self.state = State::AttributeValueSingleQuoted; -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.emitter.emit_error(Error::MissingAttributeValue); -                    self.state = State::Data; -                    self.emitter.emit_current_tag(); -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.state = State::AttributeValueUnquoted; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::AttributeValueDoubleQuoted => match self.read_char()? { -                Some('"') => { -                    self.state = State::AfterAttributeValueQuoted; -                    Ok(ControlToken::Continue) -                } -                Some('&') => { -                    self.return_state = Some(State::AttributeValueDoubleQuoted); -                    self.state = State::CharacterReference; -                    Ok(ControlToken::Continue) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.emitter.push_attribute_value("\u{fffd}"); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInTag); -                    Ok(ControlToken::Eof) -                } -                Some(x) => { -                    self.emitter.push_attribute_value(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -            }, -            State::AttributeValueSingleQuoted => match self.read_char()? { -                Some('\'') => { -                    self.state = State::AfterAttributeValueQuoted; -                    Ok(ControlToken::Continue) -                } -                Some('&') => { -                    self.return_state = Some(State::AttributeValueSingleQuoted); -                    self.state = State::CharacterReference; -                    Ok(ControlToken::Continue) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.emitter.push_attribute_value("\u{fffd}"); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInTag); -                    Ok(ControlToken::Eof) -                } -                Some(x) => { -                    self.emitter.push_attribute_value(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -            }, -            State::AttributeValueUnquoted => match self.read_char()? { -                Some(whitespace_pat!()) => { -                    self.state = State::BeforeAttributeName; -                    Ok(ControlToken::Continue) -                } -                Some('&') => { -                    self.return_state = Some(State::AttributeValueUnquoted); -                    self.state = State::CharacterReference; -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.state = State::Data; -                    self.emitter.emit_current_tag(); -                    Ok(ControlToken::Continue) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.emitter.push_attribute_value("\u{fffd}"); -                    Ok(ControlToken::Continue) -                } -                Some(x @ '"' | x @ '\'' | x @ '<' | x @ '=' | x @ '\u{60}') => { -                    self.emitter -                        .emit_error(Error::UnexpectedCharacterInUnquotedAttributeValue); -                    self.emitter.push_attribute_value(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInTag); -                    Ok(ControlToken::Eof) -                } -                Some(x) => { -                    self.emitter.push_attribute_value(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -            }, -            State::AfterAttributeValueQuoted => match self.read_char()? { -                Some(whitespace_pat!()) => { -                    self.state = State::BeforeAttributeName; -                    Ok(ControlToken::Continue) -                } -                Some('/') => { -                    self.state = State::SelfClosingStartTag; -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.state = State::Data; -                    self.emitter.emit_current_tag(); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInTag); -                    Ok(ControlToken::Eof) -                } -                Some(x) => { -                    self.emitter -                        .emit_error(Error::MissingWhitespaceBetweenAttributes); -                    self.state = State::BeforeAttributeName; -                    self.unread_char(Some(x)); -                    Ok(ControlToken::Continue) -                } -            }, -            State::SelfClosingStartTag => match self.read_char()? { -                Some('>') => { -                    self.emitter.set_self_closing(); -                    self.state = State::Data; -                    self.emitter.emit_current_tag(); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInTag); -                    Ok(ControlToken::Eof) -                } -                Some(x) => { -                    self.emitter.emit_error(Error::UnexpectedSolidusInTag); -                    self.state = State::BeforeAttributeName; -                    self.unread_char(Some(x)); -                    Ok(ControlToken::Continue) -                } -            }, -            State::BogusComment => match self.read_char()? { -                Some('>') => { -                    self.state = State::Data; -                    self.emitter.emit_current_comment(); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_current_comment(); -                    Ok(ControlToken::Eof) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.emitter.push_comment("\u{fffd}"); -                    Ok(ControlToken::Continue) -                } -                Some(x) => { -                    self.emitter.push_comment(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -            }, -            State::MarkupDeclarationOpen => match self.read_char()? { -                Some('-') if self.try_read_string("-", true)? => { -                    self.emitter.init_comment(); -                    self.state = State::CommentStart; -                    Ok(ControlToken::Continue) -                } -                Some('d' | 'D') if self.try_read_string("octype", false)? => { -                    self.state = State::Doctype; -                    Ok(ControlToken::Continue) -                } -                Some('[') if self.try_read_string("CDATA[", true)? => { -                    // missing: check for adjusted current element: we don't have an element stack -                    // at all -                    // -                    // missing: cdata transition -                    // -                    // let's hope that bogus comment can just sort of skip over cdata -                    self.emitter.emit_error(Error::CdataInHtmlContent); - -                    self.emitter.init_comment(); -                    self.emitter.push_comment("[CDATA["); -                    self.state = State::BogusComment; -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.emitter.emit_error(Error::IncorrectlyOpenedComment); -                    self.emitter.init_comment(); -                    self.state = State::BogusComment; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::CommentStart => match self.read_char()? { -                Some('-') => { -                    self.state = State::CommentStartDash; -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.emitter.emit_error(Error::AbruptClosingOfEmptyComment); -                    self.state = State::Data; -                    self.emitter.emit_current_comment(); -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.unread_char(c); -                    self.state = State::Comment; -                    Ok(ControlToken::Continue) -                } -            }, -            State::CommentStartDash => match self.read_char()? { -                Some('-') => { -                    self.state = State::CommentEnd; -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.emitter.emit_error(Error::AbruptClosingOfEmptyComment); -                    self.state = State::Data; -                    self.emitter.emit_current_comment(); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInComment); -                    self.emitter.emit_current_comment(); -                    Ok(ControlToken::Eof) -                } -                c @ Some(_) => { -                    self.emitter.push_comment("-"); -                    self.unread_char(c); -                    self.state = State::Comment; -                    Ok(ControlToken::Continue) -                } -            }, -            State::Comment => match self.read_char()? { -                Some('<') => { -                    self.emitter.push_comment("<"); -                    self.state = State::CommentLessThanSign; -                    Ok(ControlToken::Continue) -                } -                Some('-') => { -                    self.state = State::CommentEndDash; -                    Ok(ControlToken::Continue) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.emitter.push_comment("\u{fffd}"); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInComment); -                    self.emitter.emit_current_comment(); -                    Ok(ControlToken::Eof) -                } -                Some(x) => { -                    self.emitter.push_comment(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -            }, -            State::CommentLessThanSign => match self.read_char()? { -                Some('!') => { -                    self.emitter.push_comment("!"); -                    self.state = State::CommentLessThanSignBang; -                    Ok(ControlToken::Continue) -                } -                Some('<') => { -                    self.emitter.push_comment("<"); -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.unread_char(c); -                    self.state = State::Comment; -                    Ok(ControlToken::Continue) -                } -            }, -            State::CommentLessThanSignBang => match self.read_char()? { -                Some('-') => { -                    self.state = State::CommentLessThanSignBangDash; -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.unread_char(c); -                    self.state = State::Comment; -                    Ok(ControlToken::Continue) -                } -            }, -            State::CommentLessThanSignBangDash => match self.read_char()? { -                Some('-') => { -                    self.state = State::CommentLessThanSignBangDashDash; -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.unread_char(c); -                    self.state = State::CommentEndDash; -                    Ok(ControlToken::Continue) -                } -            }, -            State::CommentLessThanSignBangDashDash => match self.read_char()? { -                c @ Some('>') | c @ None => { -                    self.unread_char(c); -                    self.state = State::CommentEnd; -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.emitter.emit_error(Error::NestedComment); -                    self.unread_char(c); -                    self.state = State::CommentEnd; -                    Ok(ControlToken::Continue) -                } -            }, -            State::CommentEndDash => match self.read_char()? { -                Some('-') => { -                    self.state = State::CommentEnd; -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInComment); -                    self.emitter.emit_current_comment(); -                    Ok(ControlToken::Eof) -                } -                c => { -                    self.emitter.push_comment("-"); -                    self.unread_char(c); -                    self.state = State::Comment; -                    Ok(ControlToken::Continue) -                } -            }, -            State::CommentEnd => match self.read_char()? { -                Some('>') => { -                    self.state = State::Data; -                    self.emitter.emit_current_comment(); -                    Ok(ControlToken::Continue) -                } -                Some('!') => { -                    self.state = State::CommentEndBang; -                    Ok(ControlToken::Continue) -                } -                Some('-') => { -                    self.emitter.push_comment("-"); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInComment); -                    self.emitter.emit_current_comment(); -                    Ok(ControlToken::Eof) -                } -                c @ Some(_) => { -                    self.emitter.push_comment("-"); -                    self.emitter.push_comment("-"); -                    self.unread_char(c); -                    self.state = State::Comment; -                    Ok(ControlToken::Continue) -                } -            }, -            State::CommentEndBang => match self.read_char()? { -                Some('-') => { -                    self.emitter.push_comment("-"); -                    self.emitter.push_comment("-"); -                    self.emitter.push_comment("!"); -                    self.state = State::CommentEndDash; -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.emitter.emit_error(Error::IncorrectlyClosedComment); -                    self.state = State::Data; -                    self.emitter.emit_current_comment(); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInComment); -                    self.emitter.emit_current_comment(); -                    Ok(ControlToken::Eof) -                } -                c @ Some(_) => { -                    self.emitter.push_comment("-"); -                    self.emitter.push_comment("-"); -                    self.emitter.push_comment("!"); -                    self.state = State::Comment; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::Doctype => match self.read_char()? { -                Some(whitespace_pat!()) => { -                    self.state = State::BeforeDoctypeName; -                    Ok(ControlToken::Continue) -                } -                c @ Some('>') => { -                    self.unread_char(c); -                    self.state = State::BeforeDoctypeName; -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInDoctype); -                    self.emitter.init_doctype(); -                    self.emitter.set_force_quirks(); -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Eof) -                } -                c @ Some(_) => { -                    self.emitter -                        .emit_error(Error::MissingWhitespaceBeforeDoctypeName); -                    self.unread_char(c); -                    self.state = State::BeforeDoctypeName; -                    Ok(ControlToken::Continue) -                } -            }, -            State::BeforeDoctypeName => match self.read_char()? { -                Some(whitespace_pat!()) => Ok(ControlToken::Continue), -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.emitter.init_doctype(); -                    self.emitter.push_doctype_name("\u{fffd}"); -                    self.state = State::DoctypeName; -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.emitter.emit_error(Error::MissingDoctypeName); -                    self.emitter.init_doctype(); -                    self.emitter.set_force_quirks(); -                    self.state = State::Data; -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInDoctype); -                    self.emitter.init_doctype(); -                    self.emitter.set_force_quirks(); -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Eof) -                } -                Some(x) => { -                    self.emitter.init_doctype(); -                    self.emitter -                        .push_doctype_name(ctostr!(x.to_ascii_lowercase())); -                    self.state = State::DoctypeName; -                    Ok(ControlToken::Continue) -                } -            }, -            State::DoctypeName => match self.read_char()? { -                Some(whitespace_pat!()) => { -                    self.state = State::AfterDoctypeName; -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.state = State::Data; -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Continue) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.emitter.push_doctype_name("\u{fffd}"); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInDoctype); -                    self.emitter.set_force_quirks(); -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Eof) -                } -                Some(x) => { -                    self.emitter -                        .push_doctype_name(ctostr!(x.to_ascii_lowercase())); -                    Ok(ControlToken::Continue) -                } -            }, -            State::AfterDoctypeName => match self.read_char()? { -                Some(whitespace_pat!()) => Ok(ControlToken::Continue), -                Some('>') => { -                    self.state = State::Data; -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInDoctype); -                    self.emitter.set_force_quirks(); -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Eof) -                } -                Some('p' | 'P') if self.try_read_string("ublic", false)? => { -                    self.state = State::AfterDoctypePublicKeyword; -                    Ok(ControlToken::Continue) -                } -                Some('s' | 'S') if self.try_read_string("ystem", false)? => { -                    self.state = State::AfterDoctypeSystemKeyword; -                    Ok(ControlToken::Continue) -                } -                c @ Some(_) => { -                    self.emitter -                        .emit_error(Error::InvalidCharacterSequenceAfterDoctypeName); -                    self.emitter.set_force_quirks(); -                    self.unread_char(c); -                    self.state = State::BogusDoctype; -                    Ok(ControlToken::Continue) -                } -            }, -            State::AfterDoctypePublicKeyword => match self.read_char()? { -                Some(whitespace_pat!()) => { -                    self.state = State::BeforeDoctypePublicIdentifier; -                    Ok(ControlToken::Continue) -                } -                Some('"') => { -                    self.emitter -                        .emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword); -                    self.emitter.set_doctype_public_identifier(""); -                    self.state = State::DoctypePublicIdentifierDoubleQuoted; -                    Ok(ControlToken::Continue) -                } -                Some('\'') => { -                    self.emitter -                        .emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword); -                    self.emitter.set_doctype_public_identifier(""); -                    self.state = State::DoctypePublicIdentifierSingleQuoted; -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.emitter -                        .emit_error(Error::MissingDoctypePublicIdentifier); -                    self.emitter.set_force_quirks(); -                    self.state = State::Data; -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInDoctype); -                    self.emitter.set_force_quirks(); -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Eof) -                } -                c @ Some(_) => { -                    self.emitter -                        .emit_error(Error::MissingQuoteBeforeDoctypePublicIdentifier); -                    self.emitter.set_force_quirks(); -                    self.unread_char(c); -                    self.state = State::BogusDoctype; -                    Ok(ControlToken::Continue) -                } -            }, -            State::BeforeDoctypePublicIdentifier => match self.read_char()? { -                Some(whitespace_pat!()) => Ok(ControlToken::Continue), -                Some('"') => { -                    self.emitter.set_doctype_public_identifier(""); -                    self.state = State::DoctypePublicIdentifierDoubleQuoted; -                    Ok(ControlToken::Continue) -                } -                Some('\'') => { -                    self.emitter.set_doctype_public_identifier(""); -                    self.state = State::DoctypePublicIdentifierSingleQuoted; -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.emitter -                        .emit_error(Error::MissingDoctypePublicIdentifier); -                    self.emitter.set_force_quirks(); -                    self.state = State::Data; -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInDoctype); -                    self.emitter.set_force_quirks(); -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Eof) -                } -                c @ Some(_) => { -                    self.emitter -                        .emit_error(Error::MissingQuoteBeforeDoctypePublicIdentifier); -                    self.emitter.set_force_quirks(); -                    self.unread_char(c); -                    self.state = State::BogusDoctype; -                    Ok(ControlToken::Continue) -                } -            }, -            State::DoctypePublicIdentifierDoubleQuoted => match self.read_char()? { -                Some('"') => { -                    self.state = State::AfterDoctypePublicIdentifier; -                    Ok(ControlToken::Continue) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.emitter.push_doctype_public_identifier("\u{fffd}"); -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.emitter -                        .emit_error(Error::AbruptDoctypePublicIdentifier); -                    self.emitter.set_force_quirks(); -                    self.state = State::Data; -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInDoctype); -                    self.emitter.set_force_quirks(); -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Eof) -                } -                Some(x) => { -                    self.emitter.push_doctype_public_identifier(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -            }, -            State::DoctypePublicIdentifierSingleQuoted => match self.read_char()? { -                Some('\'') => { -                    self.state = State::AfterDoctypePublicIdentifier; -                    Ok(ControlToken::Continue) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.emitter.push_doctype_public_identifier("\u{fffd}"); -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.emitter -                        .emit_error(Error::AbruptDoctypePublicIdentifier); -                    self.emitter.set_force_quirks(); -                    self.state = State::Data; -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInDoctype); -                    self.emitter.set_force_quirks(); -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Eof) -                } -                Some(x) => { -                    self.emitter.push_doctype_public_identifier(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -            }, -            State::AfterDoctypePublicIdentifier => match self.read_char()? { -                Some(whitespace_pat!()) => { -                    self.state = State::BetweenDoctypePublicAndSystemIdentifiers; -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.state = State::Data; -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Continue) -                } -                Some('"') => { -                    self.emitter.emit_error( -                        Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers, -                    ); -                    self.emitter.set_doctype_system_identifier(""); -                    self.state = State::DoctypeSystemIdentifierDoubleQuoted; -                    Ok(ControlToken::Continue) -                } -                Some('\'') => { -                    self.emitter.emit_error( -                        Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers, -                    ); -                    self.emitter.set_doctype_system_identifier(""); -                    self.state = State::DoctypeSystemIdentifierSingleQuoted; -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInDoctype); -                    self.emitter.set_force_quirks(); -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Eof) -                } -                c @ Some(_) => { -                    self.emitter -                        .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); -                    self.emitter.set_force_quirks(); -                    self.unread_char(c); -                    self.state = State::BogusDoctype; -                    Ok(ControlToken::Continue) -                } -            }, -            State::BetweenDoctypePublicAndSystemIdentifiers => match self.read_char()? { -                Some(whitespace_pat!()) => Ok(ControlToken::Continue), -                Some('>') => { -                    self.state = State::Data; -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Continue) -                } -                Some('"') => { -                    self.emitter.set_doctype_system_identifier(""); -                    self.state = State::DoctypeSystemIdentifierDoubleQuoted; -                    Ok(ControlToken::Continue) -                } -                Some('\'') => { -                    self.emitter.set_doctype_system_identifier(""); -                    self.state = State::DoctypeSystemIdentifierSingleQuoted; -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInDoctype); -                    self.emitter.set_force_quirks(); -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Eof) -                } -                c @ Some(_) => { -                    self.emitter -                        .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); -                    self.emitter.set_force_quirks(); -                    self.state = State::BogusDoctype; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::AfterDoctypeSystemKeyword => match self.read_char()? { -                Some(whitespace_pat!()) => { -                    self.state = State::BeforeDoctypeSystemIdentifier; -                    Ok(ControlToken::Continue) -                } -                Some('"') => { -                    self.emitter -                        .emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword); -                    self.emitter.set_doctype_system_identifier(""); -                    self.state = State::DoctypeSystemIdentifierDoubleQuoted; -                    Ok(ControlToken::Continue) -                } -                Some('\'') => { -                    self.emitter -                        .emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword); -                    self.emitter.set_doctype_system_identifier(""); -                    self.state = State::DoctypeSystemIdentifierSingleQuoted; -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.emitter -                        .emit_error(Error::MissingDoctypeSystemIdentifier); -                    self.emitter.set_force_quirks(); -                    self.state = State::Data; -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInDoctype); -                    self.emitter.set_force_quirks(); -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Eof) -                } -                c @ Some(_) => { -                    self.emitter -                        .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); -                    self.emitter.set_force_quirks(); -                    self.state = State::BogusDoctype; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::BeforeDoctypeSystemIdentifier => match self.read_char()? { -                Some(whitespace_pat!()) => Ok(ControlToken::Continue), -                Some('"') => { -                    self.emitter.set_doctype_system_identifier(""); -                    self.state = State::DoctypeSystemIdentifierDoubleQuoted; -                    Ok(ControlToken::Continue) -                } -                Some('\'') => { -                    self.emitter.set_doctype_system_identifier(""); -                    self.state = State::DoctypeSystemIdentifierSingleQuoted; -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.emitter -                        .emit_error(Error::MissingDoctypeSystemIdentifier); -                    self.emitter.set_force_quirks(); -                    self.state = State::Data; -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInDoctype); -                    self.emitter.set_force_quirks(); -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Eof) -                } -                c @ Some(_) => { -                    self.emitter -                        .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); -                    self.emitter.set_force_quirks(); -                    self.state = State::BogusDoctype; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::DoctypeSystemIdentifierDoubleQuoted => match self.read_char()? { -                Some('"') => { -                    self.state = State::AfterDoctypeSystemIdentifier; -                    Ok(ControlToken::Continue) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.emitter.push_doctype_system_identifier("\u{fffd}"); -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.emitter -                        .emit_error(Error::AbruptDoctypeSystemIdentifier); -                    self.emitter.set_force_quirks(); -                    self.state = State::Data; -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInDoctype); -                    self.emitter.set_force_quirks(); -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Eof) -                } -                Some(x) => { -                    self.emitter.push_doctype_system_identifier(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -            }, -            State::DoctypeSystemIdentifierSingleQuoted => match self.read_char()? { -                Some('\'') => { -                    self.state = State::AfterDoctypeSystemIdentifier; -                    Ok(ControlToken::Continue) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    self.emitter.push_doctype_system_identifier("\u{fffd}"); -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.emitter -                        .emit_error(Error::AbruptDoctypeSystemIdentifier); -                    self.emitter.set_force_quirks(); -                    self.state = State::Data; -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInDoctype); -                    self.emitter.set_force_quirks(); -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Eof) -                } -                Some(x) => { -                    self.emitter.push_doctype_system_identifier(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -            }, -            State::AfterDoctypeSystemIdentifier => match self.read_char()? { -                Some(whitespace_pat!()) => Ok(ControlToken::Continue), -                Some('>') => { -                    self.state = State::Data; -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInDoctype); -                    self.emitter.set_force_quirks(); -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Eof) -                } -                c @ Some(_) => { -                    self.emitter -                        .emit_error(Error::UnexpectedCharacterAfterDoctypeSystemIdentifier); -                    self.unread_char(c); -                    self.state = State::BogusDoctype; -                    Ok(ControlToken::Continue) -                } -            }, -            State::BogusDoctype => match self.read_char()? { -                Some('>') => { -                    self.state = State::Data; -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Continue) -                } -                Some('\0') => { -                    self.emitter.emit_error(Error::UnexpectedNullCharacter); -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_current_doctype(); -                    Ok(ControlToken::Eof) -                } -                Some(_) => Ok(ControlToken::Continue), -            }, -            State::CdataSection => match self.read_char()? { -                Some(']') => { -                    self.state = State::CdataSectionBracket; -                    Ok(ControlToken::Continue) -                } -                None => { -                    self.emitter.emit_error(Error::EofInCdata); -                    Ok(ControlToken::Eof) -                } -                Some(x) => { -                    self.emitter.emit_string(ctostr!(x)); -                    Ok(ControlToken::Continue) -                } -            }, -            State::CdataSectionBracket => match self.read_char()? { -                Some(']') => { -                    self.state = State::CdataSectionEnd; -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.emitter.emit_string("]"); -                    self.state = State::CdataSection; -                    self.unread_char(c); -                    Ok(ControlToken::Continue) -                } -            }, -            State::CdataSectionEnd => match self.read_char()? { -                Some(']') => { -                    self.emitter.emit_string("]"); -                    Ok(ControlToken::Continue) -                } -                Some('>') => { -                    self.state = State::Data; -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.emitter.emit_string("]]"); -                    self.unread_char(c); -                    self.state = State::CdataSection; -                    Ok(ControlToken::Continue) -                } -            }, -            State::CharacterReference => { -                self.temporary_buffer.clear(); -                self.temporary_buffer.push('&'); -                match self.read_char()? { -                    Some(x) if x.is_ascii_alphanumeric() => { -                        self.unread_char(Some(x)); -                        self.state = State::NamedCharacterReference; -                        Ok(ControlToken::Continue) -                    } -                    Some('#') => { -                        self.temporary_buffer.push('#'); -                        self.state = State::NumericCharacterReference; -                        Ok(ControlToken::Continue) -                    } -                    c => { -                        self.flush_code_points_consumed_as_character_reference(); -                        self.state = self.return_state.take().unwrap(); -                        self.unread_char(c); -                        Ok(ControlToken::Continue) -                    } -                } -            } -            State::NamedCharacterReference => { -                let c = self.read_char()?; - -                let char_ref = match c { -                    Some(x) => entities::try_read_character_reference(x, |x| { -                        self.try_read_string(x, true) -                    })? -                    .map(|char_ref| (x, char_ref)), - -                    None => None, -                }; - -                if let Some((x, char_ref)) = char_ref { -                    self.temporary_buffer.push(x); -                    self.temporary_buffer.push_str(char_ref.name); -                    let char_ref_name_last_character = char_ref.name.chars().last(); -                    let next_character = self.next_input_character()?; -                    if self.is_consumed_as_part_of_an_attribute() -                        && char_ref_name_last_character != Some(';') -                        && matches!(next_character, Some(x) if x == '=' || x.is_ascii_alphanumeric()) -                    { -                        self.flush_code_points_consumed_as_character_reference(); -                        self.state = self.return_state.take().unwrap(); -                        Ok(ControlToken::Continue) -                    } else { -                        if char_ref_name_last_character != Some(';') { -                            self.emitter -                                .emit_error(Error::MissingSemicolonAfterCharacterReference); -                        } - -                        self.temporary_buffer.clear(); -                        self.temporary_buffer.push_str(char_ref.characters); -                        self.flush_code_points_consumed_as_character_reference(); -                        self.state = self.return_state.take().unwrap(); -                        Ok(ControlToken::Continue) -                    } -                } else { -                    self.unread_char(c); -                    self.flush_code_points_consumed_as_character_reference(); -                    self.state = State::AmbiguousAmpersand; -                    Ok(ControlToken::Continue) -                } -            } -            State::AmbiguousAmpersand => match self.read_char()? { -                Some(x) if x.is_ascii_alphanumeric() => { -                    if self.is_consumed_as_part_of_an_attribute() { -                        self.emitter.push_attribute_value(ctostr!(x)); -                    } else { -                        self.emitter.emit_string(ctostr!(x)); -                    } - -                    Ok(ControlToken::Continue) -                } -                c @ Some(';') => { -                    self.emitter -                        .emit_error(Error::UnknownNamedCharacterReference); -                    self.unread_char(c); -                    self.state = self.return_state.take().unwrap(); -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.unread_char(c); -                    self.state = self.return_state.take().unwrap(); -                    Ok(ControlToken::Continue) -                } -            }, -            State::NumericCharacterReference => { -                self.character_reference_code = 0; -                match self.read_char()? { -                    Some(x @ 'x' | x @ 'X') => { -                        self.temporary_buffer.push(x); -                        self.state = State::HexadecimalCharacterReferenceStart; -                        Ok(ControlToken::Continue) -                    } -                    c => { -                        self.unread_char(c); -                        self.state = State::DecimalCharacterReferenceStart; -                        Ok(ControlToken::Continue) -                    } -                } -            } -            State::HexadecimalCharacterReferenceStart => match self.read_char()? { -                c @ Some('0'..='9' | 'A'..='F' | 'a'..='f') => { -                    self.unread_char(c); -                    self.state = State::HexadecimalCharacterReference; -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.emitter -                        .emit_error(Error::AbsenceOfDigitsInNumericCharacterReference); -                    self.flush_code_points_consumed_as_character_reference(); -                    self.unread_char(c); -                    self.state = self.return_state.take().unwrap(); -                    Ok(ControlToken::Continue) -                } -            }, -            State::DecimalCharacterReferenceStart => match self.read_char()? { -                Some(x @ ascii_digit_pat!()) => { -                    self.unread_char(Some(x)); -                    self.state = State::DecimalCharacterReference; -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.emitter -                        .emit_error(Error::AbsenceOfDigitsInNumericCharacterReference); -                    self.flush_code_points_consumed_as_character_reference(); -                    self.unread_char(c); -                    self.state = self.return_state.take().unwrap(); -                    Ok(ControlToken::Continue) -                } -            }, -            State::HexadecimalCharacterReference => match self.read_char()? { -                Some(x @ ascii_digit_pat!()) => { -                    mutate_character_reference!(*16 + x - 0x0030); -                    Ok(ControlToken::Continue) -                } -                Some(x @ 'A'..='F') => { -                    mutate_character_reference!(*16 + x - 0x0037); -                    Ok(ControlToken::Continue) -                } -                Some(x @ 'a'..='f') => { -                    mutate_character_reference!(*16 + x - 0x0057); -                    Ok(ControlToken::Continue) -                } -                Some(';') => { -                    self.state = State::NumericCharacterReferenceEnd; -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.emitter -                        .emit_error(Error::MissingSemicolonAfterCharacterReference); -                    self.unread_char(c); -                    self.state = State::NumericCharacterReferenceEnd; -                    Ok(ControlToken::Continue) -                } -            }, -            State::DecimalCharacterReference => match self.read_char()? { -                Some(x @ ascii_digit_pat!()) => { -                    mutate_character_reference!(*10 + x - 0x0030); -                    Ok(ControlToken::Continue) -                } -                Some(';') => { -                    self.state = State::NumericCharacterReferenceEnd; -                    Ok(ControlToken::Continue) -                } -                c => { -                    self.emitter -                        .emit_error(Error::MissingSemicolonAfterCharacterReference); -                    self.unread_char(c); -                    self.state = State::NumericCharacterReferenceEnd; -                    Ok(ControlToken::Continue) -                } -            }, -            State::NumericCharacterReferenceEnd => { -                match self.character_reference_code { -                    0x00 => { -                        self.emitter.emit_error(Error::NullCharacterReference); -                        self.character_reference_code = 0xfffd; -                    } -                    0x110000.. => { -                        self.emitter -                            .emit_error(Error::CharacterReferenceOutsideUnicodeRange); -                        self.character_reference_code = 0xfffd; -                    } -                    surrogate_pat!() => { -                        self.emitter.emit_error(Error::SurrogateCharacterReference); -                        self.character_reference_code = 0xfffd; -                    } -                    // noncharacter -                    noncharacter_pat!() => { -                        self.emitter -                            .emit_error(Error::NoncharacterCharacterReference); -                    } -                    // 0x000d, or a control that is not whitespace -                    x @ 0x000d | x @ control_pat!() -                        if !matches!(x, 0x0009 | 0x000a | 0x000c | 0x0020) => -                    { -                        self.emitter.emit_error(Error::ControlCharacterReference); -                        self.character_reference_code = match x { -                            0x80 => 0x20AC, // EURO SIGN (€) -                            0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK (‚) -                            0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK (ƒ) -                            0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK („) -                            0x85 => 0x2026, // HORIZONTAL ELLIPSIS (…) -                            0x86 => 0x2020, // DAGGER (†) -                            0x87 => 0x2021, // DOUBLE DAGGER (‡) -                            0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ) -                            0x89 => 0x2030, // PER MILLE SIGN (‰) -                            0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON (Š) -                            0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹) -                            0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE (Œ) -                            0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON (Ž) -                            0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK (‘) -                            0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK (’) -                            0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK (“) -                            0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK (”) -                            0x95 => 0x2022, // BULLET (•) -                            0x96 => 0x2013, // EN DASH (–) -                            0x97 => 0x2014, // EM DASH (—) -                            0x98 => 0x02DC, // SMALL TILDE (˜) -                            0x99 => 0x2122, // TRADE MARK SIGN (™) -                            0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON (š) -                            0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›) -                            0x9C => 0x0153, // LATIN SMALL LIGATURE OE (œ) -                            0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON (ž) -                            0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ) -                            _ => self.character_reference_code, -                        }; -                    } -                    _ => (), -                } - -                self.temporary_buffer.clear(); -                self.temporary_buffer -                    .push(std::char::from_u32(self.character_reference_code).unwrap()); -                self.flush_code_points_consumed_as_character_reference(); -                self.state = self.return_state.take().unwrap(); -                Ok(ControlToken::Continue) -            } -        } -    } -} - -impl<R: Reader, E: Emitter> Iterator for Tokenizer<R, E> { -    type Item = Result<E::Token, R::Error>; - -    fn next(&mut self) -> Option<Self::Item> { -        loop { -            if let Some(token) = self.emitter.pop_token() { -                break Some(Ok(token)); -            } else if !self.eof { -                match self.consume() { -                    Ok(ControlToken::Continue) => (), -                    Ok(ControlToken::Eof) => { -                        self.eof = true; -                        self.emitter.emit_eof(); -                    } -                    Err(e) => break Some(Err(e)), -                } -            } else { -                break None; -            } -        } -    } -} - -/// A kind of tokenizer that directly yields tokens when used as an iterator, so `Token` instead of -/// `Result<Token, _>`. -/// -/// This is the return value of [`Tokenizer::infallible`]. -pub struct InfallibleTokenizer<R: Reader<Error = Never>, E: Emitter>(Tokenizer<R, E>); - -impl<R: Reader<Error = Never>, E: Emitter> Tokenizer<R, E> { -    /// Statically assert that this iterator is infallible. -    /// -    /// Call this to get rid of error handling when parsing HTML from strings. -    pub fn infallible(self) -> InfallibleTokenizer<R, E> { -        InfallibleTokenizer(self) -    } -} - -impl<R: Reader<Error = Never>, E: Emitter> Iterator for InfallibleTokenizer<R, E> { -    type Item = E::Token; - -    fn next(&mut self) -> Option<Self::Item> { -        match self.0.next()? { -            Ok(token) => Some(token), -            Err(e) => match e {}, -        } -    } -} +pub use tokenizer::{InfallibleTokenizer, Tokenizer}; diff --git a/src/machine.rs b/src/machine.rs index 67db1b9..5991912 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -1,164 +1,1926 @@ -macro_rules! surrogate_pat { -    () => { -        0xd800..=0xdfff +use crate::entities::try_read_character_reference; +use crate::utils::{ +    ascii_digit_pat, control_pat, noncharacter_pat, surrogate_pat, whitespace_pat, ControlToken, +    State, +}; +use crate::{Emitter, Error, Reader, Tokenizer}; + +macro_rules! ctostr { +    ($c:expr) => { +        &*$c.encode_utf8(&mut [0; 4])      };  } -pub(crate) use surrogate_pat; +// Note: This is not implemented as a method on Tokenizer because there's fields on Tokenizer that +// should not be available in this method, such as Tokenizer.to_reconsume or the Reader instance +#[inline] +pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<ControlToken, R::Error> { +    macro_rules! mutate_character_reference { +        (* $mul:literal + $x:ident - $sub:literal) => { +            match slf +                .character_reference_code +                .checked_mul($mul) +                .and_then(|cr| cr.checked_add($x as u32 - $sub)) +            { +                Some(cr) => slf.character_reference_code = cr, +                None => { +                    // provoke err +                    slf.character_reference_code = 0x110000; +                } +            }; +        }; +    } -macro_rules! control_pat { -    () => (0x0d | 0x0000..=0x001f | 0x007f..=0x009f) -} +    match slf.state { +        State::Data => match slf.read_char()? { +            Some('&') => { +                slf.return_state = Some(slf.state); +                slf.state = State::CharacterReference; +                Ok(ControlToken::Continue) +            } +            Some('<') => { +                slf.state = State::TagOpen; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.emit_string("\0"); +                Ok(ControlToken::Continue) +            } +            Some(x) => { +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +            None => Ok(ControlToken::Eof), +        }, +        State::RcData => match slf.read_char()? { +            Some('&') => { +                slf.return_state = Some(State::RcData); +                slf.state = State::CharacterReference; +                Ok(ControlToken::Continue) +            } +            Some('<') => { +                slf.state = State::RcDataLessThanSign; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.emit_string("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some(x) => { +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +            None => Ok(ControlToken::Eof), +        }, +        State::RawText => match slf.read_char()? { +            Some('<') => { +                slf.state = State::RawTextLessThanSign; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.emit_string("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some(x) => { +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +            None => Ok(ControlToken::Eof), +        }, +        State::ScriptData => match slf.read_char()? { +            Some('<') => { +                slf.state = State::ScriptDataLessThanSign; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.emit_string("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some(x) => { +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +            None => Ok(ControlToken::Eof), +        }, +        State::PlainText => match slf.read_char()? { +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.emit_string("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some(x) => { +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +            None => Ok(ControlToken::Eof), +        }, +        State::TagOpen => match slf.read_char()? { +            Some('!') => { +                slf.state = State::MarkupDeclarationOpen; +                Ok(ControlToken::Continue) +            } +            Some('/') => { +                slf.state = State::EndTagOpen; +                Ok(ControlToken::Continue) +            } +            Some(x) if x.is_ascii_alphabetic() => { +                slf.emitter.init_start_tag(); +                slf.state = State::TagName; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +            c @ Some('?') => { +                slf.emitter +                    .emit_error(Error::UnexpectedQuestionMarkInsteadOfTagName); +                slf.emitter.init_comment(); +                slf.state = State::BogusComment; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofBeforeTagName); +                slf.emitter.emit_string("<"); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emitter +                    .emit_error(Error::InvalidFirstCharacterOfTagName); +                slf.state = State::Data; +                slf.emitter.emit_string("<"); +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::EndTagOpen => match slf.read_char()? { +            Some(x) if x.is_ascii_alphabetic() => { +                slf.emitter.init_end_tag(); +                slf.state = State::TagName; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emitter.emit_error(Error::MissingEndTagName); +                slf.state = State::Data; +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofBeforeTagName); +                slf.emitter.emit_string("</"); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter +                    .emit_error(Error::InvalidFirstCharacterOfTagName); +                slf.emitter.init_comment(); +                slf.state = State::BogusComment; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::TagName => match slf.read_char()? { +            Some(whitespace_pat!()) => { +                slf.state = State::BeforeAttributeName; +                Ok(ControlToken::Continue) +            } +            Some('/') => { +                slf.state = State::SelfClosingStartTag; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.state = State::Data; +                slf.emitter.emit_current_tag(); +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_tag_name("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some(x) => { +                slf.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase())); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInTag); +                Ok(ControlToken::Eof) +            } +        }, +        State::RcDataLessThanSign => match slf.read_char()? { +            Some('/') => { +                slf.temporary_buffer.clear(); +                slf.state = State::RcDataEndTagOpen; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("<"); +                slf.state = State::RcData; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::RcDataEndTagOpen => match slf.read_char()? { +            Some(x) if x.is_ascii_alphabetic() => { +                slf.emitter.init_end_tag(); +                slf.state = State::RcDataEndTagName; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("</"); +                slf.state = State::RcData; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::RcDataEndTagName => match slf.read_char()? { +            Some(whitespace_pat!()) if slf.emitter.current_is_appropriate_end_tag_token() => { +                slf.state = State::BeforeAttributeName; +                Ok(ControlToken::Continue) +            } +            Some('/') if slf.emitter.current_is_appropriate_end_tag_token() => { +                slf.state = State::SelfClosingStartTag; +                Ok(ControlToken::Continue) +            } +            Some('>') if slf.emitter.current_is_appropriate_end_tag_token() => { +                slf.state = State::Data; +                slf.emitter.emit_current_tag(); +                Ok(ControlToken::Continue) +            } +            Some(x) if x.is_ascii_alphabetic() => { +                slf.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase())); +                slf.temporary_buffer.push(x); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("</"); +                slf.flush_buffer_characters(); -pub(crate) use control_pat; +                slf.state = State::RcData; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::RawTextLessThanSign => match slf.read_char()? { +            Some('/') => { +                slf.temporary_buffer.clear(); +                slf.state = State::RawTextEndTagOpen; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("<"); +                slf.state = State::RawText; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::RawTextEndTagOpen => match slf.read_char()? { +            Some(x) if x.is_ascii_alphabetic() => { +                slf.emitter.init_end_tag(); +                slf.state = State::RawTextEndTagName; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("</"); +                slf.state = State::RawText; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::RawTextEndTagName => match slf.read_char()? { +            Some(whitespace_pat!()) if slf.emitter.current_is_appropriate_end_tag_token() => { +                slf.state = State::BeforeAttributeName; +                Ok(ControlToken::Continue) +            } +            Some('/') if slf.emitter.current_is_appropriate_end_tag_token() => { +                slf.state = State::SelfClosingStartTag; +                Ok(ControlToken::Continue) +            } +            Some('>') if slf.emitter.current_is_appropriate_end_tag_token() => { +                slf.state = State::Data; +                slf.emitter.emit_current_tag(); +                Ok(ControlToken::Continue) +            } +            Some(x) if x.is_ascii_alphabetic() => { +                slf.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase())); +                slf.temporary_buffer.push(x); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("</"); +                slf.flush_buffer_characters(); -macro_rules! ascii_digit_pat { -    () => { -        '0'..='9' -    }; -} +                slf.state = State::RawText; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataLessThanSign => match slf.read_char()? { +            Some('/') => { +                slf.temporary_buffer.clear(); +                slf.state = State::ScriptDataEndTagOpen; +                Ok(ControlToken::Continue) +            } +            Some('!') => { +                slf.state = State::ScriptDataEscapeStart; +                slf.emitter.emit_string("<!"); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("<"); +                slf.state = State::Data; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataEndTagOpen => match slf.read_char()? { +            Some(x) if x.is_ascii_alphabetic() => { +                slf.emitter.init_end_tag(); +                slf.state = State::ScriptDataEndTagName; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("</"); +                slf.state = State::ScriptData; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataEndTagName => match slf.read_char()? { +            Some(whitespace_pat!()) if slf.emitter.current_is_appropriate_end_tag_token() => { +                slf.state = State::BeforeAttributeName; +                Ok(ControlToken::Continue) +            } +            Some('/') if slf.emitter.current_is_appropriate_end_tag_token() => { +                slf.state = State::SelfClosingStartTag; +                Ok(ControlToken::Continue) +            } +            Some('>') if slf.emitter.current_is_appropriate_end_tag_token() => { +                slf.state = State::Data; +                slf.emitter.emit_current_tag(); +                Ok(ControlToken::Continue) +            } +            Some(x) if x.is_ascii_alphabetic() => { +                slf.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase())); +                slf.temporary_buffer.push(x.to_ascii_lowercase()); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("</"); +                slf.flush_buffer_characters(); +                slf.state = State::Data; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataEscapeStart => match slf.read_char()? { +            Some('-') => { +                slf.state = State::ScriptDataEscapeStartDash; +                slf.emitter.emit_string("-"); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.state = State::ScriptData; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataEscapeStartDash => match slf.read_char()? { +            Some('-') => { +                slf.state = State::ScriptDataEscapedDashDash; +                slf.emitter.emit_string("-"); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.state = State::ScriptData; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataEscaped => match slf.read_char()? { +            Some('-') => { +                slf.state = State::ScriptDataEscapedDash; +                slf.emitter.emit_string("-"); +                Ok(ControlToken::Continue) +            } +            Some('<') => { +                slf.state = State::ScriptDataEscapedLessThanSign; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.emit_string("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter +                    .emit_error(Error::EofInScriptHtmlCommentLikeText); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataEscapedDash => match slf.read_char()? { +            Some('-') => { +                slf.state = State::ScriptDataEscapedDashDash; +                slf.emitter.emit_string("-"); +                Ok(ControlToken::Continue) +            } +            Some('<') => { +                slf.state = State::ScriptDataEscapedLessThanSign; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.state = State::ScriptDataEscaped; +                slf.emitter.emit_string("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter +                    .emit_error(Error::EofInScriptHtmlCommentLikeText); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.state = State::ScriptDataEscaped; +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataEscapedDashDash => match slf.read_char()? { +            Some('-') => { +                slf.emitter.emit_string("-"); +                Ok(ControlToken::Continue) +            } +            Some('<') => { +                slf.state = State::ScriptDataEscapedLessThanSign; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.state = State::ScriptData; +                slf.emitter.emit_string(">"); +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.state = State::ScriptDataEscaped; +                slf.emitter.emit_string("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter +                    .emit_error(Error::EofInScriptHtmlCommentLikeText); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.state = State::ScriptDataEscaped; +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataEscapedLessThanSign => match slf.read_char()? { +            Some('/') => { +                slf.temporary_buffer.clear(); +                slf.state = State::ScriptDataEscapedEndTagOpen; +                Ok(ControlToken::Continue) +            } +            Some(x) if x.is_ascii_alphabetic() => { +                slf.temporary_buffer.clear(); +                slf.emitter.emit_string("<"); +                slf.state = State::ScriptDataDoubleEscapeStart; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("<"); +                slf.state = State::ScriptDataEscaped; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataEscapedEndTagOpen => match slf.read_char()? { +            Some(x) if x.is_ascii_alphabetic() => { +                slf.emitter.init_end_tag(); +                slf.state = State::ScriptDataEscapedEndTagName; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("</"); +                slf.unread_char(c); +                slf.state = State::ScriptDataEscaped; +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataEscapedEndTagName => match slf.read_char()? { +            Some(whitespace_pat!()) if slf.emitter.current_is_appropriate_end_tag_token() => { +                slf.state = State::BeforeAttributeName; +                Ok(ControlToken::Continue) +            } +            Some('/') if slf.emitter.current_is_appropriate_end_tag_token() => { +                slf.state = State::SelfClosingStartTag; +                Ok(ControlToken::Continue) +            } +            Some('>') if slf.emitter.current_is_appropriate_end_tag_token() => { +                slf.state = State::Data; +                slf.emitter.emit_current_tag(); +                Ok(ControlToken::Continue) +            } +            Some(x) if x.is_ascii_alphabetic() => { +                slf.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase())); +                slf.temporary_buffer.push(x); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("</"); +                slf.flush_buffer_characters(); +                slf.state = State::ScriptDataEscaped; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataDoubleEscapeStart => match slf.read_char()? { +            Some(x @ whitespace_pat!() | x @ '/' | x @ '>') => { +                if slf.temporary_buffer == "script" { +                    slf.state = State::ScriptDataDoubleEscaped; +                } else { +                    slf.state = State::ScriptDataEscaped; +                } +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +            Some(x) if x.is_ascii_alphabetic() => { +                slf.temporary_buffer.push(x.to_ascii_lowercase()); +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.state = State::ScriptDataEscaped; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataDoubleEscaped => match slf.read_char()? { +            Some('-') => { +                slf.state = State::ScriptDataDoubleEscapedDash; +                slf.emitter.emit_string("-"); +                Ok(ControlToken::Continue) +            } +            Some('<') => { +                slf.state = State::ScriptDataDoubleEscapedLessThanSign; +                slf.emitter.emit_string("<"); +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.emit_string("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter +                    .emit_error(Error::EofInScriptHtmlCommentLikeText); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataDoubleEscapedDash => match slf.read_char()? { +            Some('-') => { +                slf.state = State::ScriptDataDoubleEscapedDashDash; +                slf.emitter.emit_string("-"); +                Ok(ControlToken::Continue) +            } +            Some('<') => { +                slf.state = State::ScriptDataDoubleEscapedLessThanSign; +                slf.emitter.emit_string("<"); +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.state = State::ScriptDataDoubleEscaped; +                slf.emitter.emit_string("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter +                    .emit_error(Error::EofInScriptHtmlCommentLikeText); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.state = State::ScriptDataDoubleEscaped; +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataDoubleEscapedDashDash => match slf.read_char()? { +            Some('-') => { +                slf.emitter.emit_string("-"); +                Ok(ControlToken::Continue) +            } +            Some('<') => { +                slf.emitter.emit_string("<"); +                slf.state = State::ScriptDataDoubleEscapedLessThanSign; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emitter.emit_string(">"); +                slf.state = State::ScriptData; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.state = State::ScriptDataDoubleEscaped; +                slf.emitter.emit_string("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter +                    .emit_error(Error::EofInScriptHtmlCommentLikeText); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.state = State::ScriptDataDoubleEscaped; +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataDoubleEscapedLessThanSign => match slf.read_char()? { +            Some('/') => { +                slf.temporary_buffer.clear(); +                slf.state = State::ScriptDataDoubleEscapeEnd; +                slf.emitter.emit_string("/"); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.state = State::ScriptDataDoubleEscaped; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::ScriptDataDoubleEscapeEnd => match slf.read_char()? { +            Some(x @ whitespace_pat!() | x @ '/' | x @ '>') => { +                if slf.temporary_buffer == "script" { +                    slf.state = State::ScriptDataEscaped; +                } else { +                    slf.state = State::ScriptDataDoubleEscaped; +                } -pub(crate) use ascii_digit_pat; +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +            Some(x) if x.is_ascii_alphabetic() => { +                slf.temporary_buffer.push(x.to_ascii_lowercase()); +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.state = State::ScriptDataDoubleEscaped; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::BeforeAttributeName => match slf.read_char()? { +            Some(whitespace_pat!()) => Ok(ControlToken::Continue), +            c @ Some('/' | '>') | c @ None => { +                slf.state = State::AfterAttributeName; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +            Some('=') => { +                slf.emitter +                    .emit_error(Error::UnexpectedEqualsSignBeforeAttributeName); +                slf.emitter.init_attribute(); +                slf.emitter.push_attribute_name("="); +                slf.state = State::AttributeName; +                Ok(ControlToken::Continue) +            } +            Some(x) => { +                slf.emitter.init_attribute(); +                slf.state = State::AttributeName; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::AttributeName => match slf.read_char()? { +            c @ Some(whitespace_pat!() | '/' | '>') | c @ None => { +                slf.state = State::AfterAttributeName; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +            Some('=') => { +                slf.state = State::BeforeAttributeValue; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_attribute_name("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some(x @ '"' | x @ '\'' | x @ '<') => { +                slf.emitter +                    .emit_error(Error::UnexpectedCharacterInAttributeName); +                slf.emitter +                    .push_attribute_name(ctostr!(x.to_ascii_lowercase())); +                Ok(ControlToken::Continue) +            } +            Some(x) => { +                slf.emitter +                    .push_attribute_name(ctostr!(x.to_ascii_lowercase())); +                Ok(ControlToken::Continue) +            } +        }, +        State::AfterAttributeName => match slf.read_char()? { +            Some(whitespace_pat!()) => Ok(ControlToken::Continue), +            Some('/') => { +                slf.state = State::SelfClosingStartTag; +                Ok(ControlToken::Continue) +            } +            Some('=') => { +                slf.state = State::BeforeAttributeValue; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.state = State::Data; +                slf.emitter.emit_current_tag(); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInTag); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.init_attribute(); +                slf.state = State::AttributeName; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::BeforeAttributeValue => match slf.read_char()? { +            Some(whitespace_pat!()) => Ok(ControlToken::Continue), +            Some('"') => { +                slf.state = State::AttributeValueDoubleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('\'') => { +                slf.state = State::AttributeValueSingleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emitter.emit_error(Error::MissingAttributeValue); +                slf.state = State::Data; +                slf.emitter.emit_current_tag(); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.state = State::AttributeValueUnquoted; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::AttributeValueDoubleQuoted => match slf.read_char()? { +            Some('"') => { +                slf.state = State::AfterAttributeValueQuoted; +                Ok(ControlToken::Continue) +            } +            Some('&') => { +                slf.return_state = Some(State::AttributeValueDoubleQuoted); +                slf.state = State::CharacterReference; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_attribute_value("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInTag); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.push_attribute_value(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::AttributeValueSingleQuoted => match slf.read_char()? { +            Some('\'') => { +                slf.state = State::AfterAttributeValueQuoted; +                Ok(ControlToken::Continue) +            } +            Some('&') => { +                slf.return_state = Some(State::AttributeValueSingleQuoted); +                slf.state = State::CharacterReference; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_attribute_value("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInTag); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.push_attribute_value(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::AttributeValueUnquoted => match slf.read_char()? { +            Some(whitespace_pat!()) => { +                slf.state = State::BeforeAttributeName; +                Ok(ControlToken::Continue) +            } +            Some('&') => { +                slf.return_state = Some(State::AttributeValueUnquoted); +                slf.state = State::CharacterReference; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.state = State::Data; +                slf.emitter.emit_current_tag(); +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_attribute_value("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some(x @ '"' | x @ '\'' | x @ '<' | x @ '=' | x @ '\u{60}') => { +                slf.emitter +                    .emit_error(Error::UnexpectedCharacterInUnquotedAttributeValue); +                slf.emitter.push_attribute_value(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInTag); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.push_attribute_value(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::AfterAttributeValueQuoted => match slf.read_char()? { +            Some(whitespace_pat!()) => { +                slf.state = State::BeforeAttributeName; +                Ok(ControlToken::Continue) +            } +            Some('/') => { +                slf.state = State::SelfClosingStartTag; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.state = State::Data; +                slf.emitter.emit_current_tag(); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInTag); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter +                    .emit_error(Error::MissingWhitespaceBetweenAttributes); +                slf.state = State::BeforeAttributeName; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::SelfClosingStartTag => match slf.read_char()? { +            Some('>') => { +                slf.emitter.set_self_closing(); +                slf.state = State::Data; +                slf.emitter.emit_current_tag(); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInTag); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.emit_error(Error::UnexpectedSolidusInTag); +                slf.state = State::BeforeAttributeName; +                slf.unread_char(Some(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::BogusComment => match slf.read_char()? { +            Some('>') => { +                slf.state = State::Data; +                slf.emitter.emit_current_comment(); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_current_comment(); +                Ok(ControlToken::Eof) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_comment("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some(x) => { +                slf.emitter.push_comment(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::MarkupDeclarationOpen => match slf.read_char()? { +            Some('-') if slf.try_read_string("-", true)? => { +                slf.emitter.init_comment(); +                slf.state = State::CommentStart; +                Ok(ControlToken::Continue) +            } +            Some('d' | 'D') if slf.try_read_string("octype", false)? => { +                slf.state = State::Doctype; +                Ok(ControlToken::Continue) +            } +            Some('[') if slf.try_read_string("CDATA[", true)? => { +                // missing: check for adjusted current element: we don't have an element stack +                // at all +                // +                // missing: cdata transition +                // +                // let's hope that bogus comment can just sort of skip over cdata +                slf.emitter.emit_error(Error::CdataInHtmlContent); -macro_rules! whitespace_pat { -    () => { -        '\t' | '\u{0A}' | '\u{0C}' | ' ' -    }; -} +                slf.emitter.init_comment(); +                slf.emitter.push_comment("[CDATA["); +                slf.state = State::BogusComment; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_error(Error::IncorrectlyOpenedComment); +                slf.emitter.init_comment(); +                slf.state = State::BogusComment; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::CommentStart => match slf.read_char()? { +            Some('-') => { +                slf.state = State::CommentStartDash; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emitter.emit_error(Error::AbruptClosingOfEmptyComment); +                slf.state = State::Data; +                slf.emitter.emit_current_comment(); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.unread_char(c); +                slf.state = State::Comment; +                Ok(ControlToken::Continue) +            } +        }, +        State::CommentStartDash => match slf.read_char()? { +            Some('-') => { +                slf.state = State::CommentEnd; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emitter.emit_error(Error::AbruptClosingOfEmptyComment); +                slf.state = State::Data; +                slf.emitter.emit_current_comment(); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInComment); +                slf.emitter.emit_current_comment(); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emitter.push_comment("-"); +                slf.unread_char(c); +                slf.state = State::Comment; +                Ok(ControlToken::Continue) +            } +        }, +        State::Comment => match slf.read_char()? { +            Some('<') => { +                slf.emitter.push_comment("<"); +                slf.state = State::CommentLessThanSign; +                Ok(ControlToken::Continue) +            } +            Some('-') => { +                slf.state = State::CommentEndDash; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_comment("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInComment); +                slf.emitter.emit_current_comment(); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.push_comment(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::CommentLessThanSign => match slf.read_char()? { +            Some('!') => { +                slf.emitter.push_comment("!"); +                slf.state = State::CommentLessThanSignBang; +                Ok(ControlToken::Continue) +            } +            Some('<') => { +                slf.emitter.push_comment("<"); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.unread_char(c); +                slf.state = State::Comment; +                Ok(ControlToken::Continue) +            } +        }, +        State::CommentLessThanSignBang => match slf.read_char()? { +            Some('-') => { +                slf.state = State::CommentLessThanSignBangDash; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.unread_char(c); +                slf.state = State::Comment; +                Ok(ControlToken::Continue) +            } +        }, +        State::CommentLessThanSignBangDash => match slf.read_char()? { +            Some('-') => { +                slf.state = State::CommentLessThanSignBangDashDash; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.unread_char(c); +                slf.state = State::CommentEndDash; +                Ok(ControlToken::Continue) +            } +        }, +        State::CommentLessThanSignBangDashDash => match slf.read_char()? { +            c @ Some('>') | c @ None => { +                slf.unread_char(c); +                slf.state = State::CommentEnd; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_error(Error::NestedComment); +                slf.unread_char(c); +                slf.state = State::CommentEnd; +                Ok(ControlToken::Continue) +            } +        }, +        State::CommentEndDash => match slf.read_char()? { +            Some('-') => { +                slf.state = State::CommentEnd; +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInComment); +                slf.emitter.emit_current_comment(); +                Ok(ControlToken::Eof) +            } +            c => { +                slf.emitter.push_comment("-"); +                slf.unread_char(c); +                slf.state = State::Comment; +                Ok(ControlToken::Continue) +            } +        }, +        State::CommentEnd => match slf.read_char()? { +            Some('>') => { +                slf.state = State::Data; +                slf.emitter.emit_current_comment(); +                Ok(ControlToken::Continue) +            } +            Some('!') => { +                slf.state = State::CommentEndBang; +                Ok(ControlToken::Continue) +            } +            Some('-') => { +                slf.emitter.push_comment("-"); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInComment); +                slf.emitter.emit_current_comment(); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emitter.push_comment("-"); +                slf.emitter.push_comment("-"); +                slf.unread_char(c); +                slf.state = State::Comment; +                Ok(ControlToken::Continue) +            } +        }, +        State::CommentEndBang => match slf.read_char()? { +            Some('-') => { +                slf.emitter.push_comment("-"); +                slf.emitter.push_comment("-"); +                slf.emitter.push_comment("!"); +                slf.state = State::CommentEndDash; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emitter.emit_error(Error::IncorrectlyClosedComment); +                slf.state = State::Data; +                slf.emitter.emit_current_comment(); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInComment); +                slf.emitter.emit_current_comment(); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emitter.push_comment("-"); +                slf.emitter.push_comment("-"); +                slf.emitter.push_comment("!"); +                slf.state = State::Comment; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::Doctype => match slf.read_char()? { +            Some(whitespace_pat!()) => { +                slf.state = State::BeforeDoctypeName; +                Ok(ControlToken::Continue) +            } +            c @ Some('>') => { +                slf.unread_char(c); +                slf.state = State::BeforeDoctypeName; +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInDoctype); +                slf.emitter.init_doctype(); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emitter +                    .emit_error(Error::MissingWhitespaceBeforeDoctypeName); +                slf.unread_char(c); +                slf.state = State::BeforeDoctypeName; +                Ok(ControlToken::Continue) +            } +        }, +        State::BeforeDoctypeName => match slf.read_char()? { +            Some(whitespace_pat!()) => Ok(ControlToken::Continue), +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.init_doctype(); +                slf.emitter.push_doctype_name("\u{fffd}"); +                slf.state = State::DoctypeName; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emitter.emit_error(Error::MissingDoctypeName); +                slf.emitter.init_doctype(); +                slf.emitter.set_force_quirks(); +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInDoctype); +                slf.emitter.init_doctype(); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.init_doctype(); +                slf.emitter +                    .push_doctype_name(ctostr!(x.to_ascii_lowercase())); +                slf.state = State::DoctypeName; +                Ok(ControlToken::Continue) +            } +        }, +        State::DoctypeName => match slf.read_char()? { +            Some(whitespace_pat!()) => { +                slf.state = State::AfterDoctypeName; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_doctype_name("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter +                    .push_doctype_name(ctostr!(x.to_ascii_lowercase())); +                Ok(ControlToken::Continue) +            } +        }, +        State::AfterDoctypeName => match slf.read_char()? { +            Some(whitespace_pat!()) => Ok(ControlToken::Continue), +            Some('>') => { +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Eof) +            } +            Some('p' | 'P') if slf.try_read_string("ublic", false)? => { +                slf.state = State::AfterDoctypePublicKeyword; +                Ok(ControlToken::Continue) +            } +            Some('s' | 'S') if slf.try_read_string("ystem", false)? => { +                slf.state = State::AfterDoctypeSystemKeyword; +                Ok(ControlToken::Continue) +            } +            c @ Some(_) => { +                slf.emitter +                    .emit_error(Error::InvalidCharacterSequenceAfterDoctypeName); +                slf.emitter.set_force_quirks(); +                slf.unread_char(c); +                slf.state = State::BogusDoctype; +                Ok(ControlToken::Continue) +            } +        }, +        State::AfterDoctypePublicKeyword => match slf.read_char()? { +            Some(whitespace_pat!()) => { +                slf.state = State::BeforeDoctypePublicIdentifier; +                Ok(ControlToken::Continue) +            } +            Some('"') => { +                slf.emitter +                    .emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword); +                slf.emitter.set_doctype_public_identifier(""); +                slf.state = State::DoctypePublicIdentifierDoubleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('\'') => { +                slf.emitter +                    .emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword); +                slf.emitter.set_doctype_public_identifier(""); +                slf.state = State::DoctypePublicIdentifierSingleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emitter +                    .emit_error(Error::MissingDoctypePublicIdentifier); +                slf.emitter.set_force_quirks(); +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emitter +                    .emit_error(Error::MissingQuoteBeforeDoctypePublicIdentifier); +                slf.emitter.set_force_quirks(); +                slf.unread_char(c); +                slf.state = State::BogusDoctype; +                Ok(ControlToken::Continue) +            } +        }, +        State::BeforeDoctypePublicIdentifier => match slf.read_char()? { +            Some(whitespace_pat!()) => Ok(ControlToken::Continue), +            Some('"') => { +                slf.emitter.set_doctype_public_identifier(""); +                slf.state = State::DoctypePublicIdentifierDoubleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('\'') => { +                slf.emitter.set_doctype_public_identifier(""); +                slf.state = State::DoctypePublicIdentifierSingleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emitter +                    .emit_error(Error::MissingDoctypePublicIdentifier); +                slf.emitter.set_force_quirks(); +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emitter +                    .emit_error(Error::MissingQuoteBeforeDoctypePublicIdentifier); +                slf.emitter.set_force_quirks(); +                slf.unread_char(c); +                slf.state = State::BogusDoctype; +                Ok(ControlToken::Continue) +            } +        }, +        State::DoctypePublicIdentifierDoubleQuoted => match slf.read_char()? { +            Some('"') => { +                slf.state = State::AfterDoctypePublicIdentifier; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_doctype_public_identifier("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emitter.emit_error(Error::AbruptDoctypePublicIdentifier); +                slf.emitter.set_force_quirks(); +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.push_doctype_public_identifier(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::DoctypePublicIdentifierSingleQuoted => match slf.read_char()? { +            Some('\'') => { +                slf.state = State::AfterDoctypePublicIdentifier; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_doctype_public_identifier("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emitter.emit_error(Error::AbruptDoctypePublicIdentifier); +                slf.emitter.set_force_quirks(); +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.push_doctype_public_identifier(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::AfterDoctypePublicIdentifier => match slf.read_char()? { +            Some(whitespace_pat!()) => { +                slf.state = State::BetweenDoctypePublicAndSystemIdentifiers; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Continue) +            } +            Some('"') => { +                slf.emitter +                    .emit_error(Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers); +                slf.emitter.set_doctype_system_identifier(""); +                slf.state = State::DoctypeSystemIdentifierDoubleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('\'') => { +                slf.emitter +                    .emit_error(Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers); +                slf.emitter.set_doctype_system_identifier(""); +                slf.state = State::DoctypeSystemIdentifierSingleQuoted; +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emitter +                    .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); +                slf.emitter.set_force_quirks(); +                slf.unread_char(c); +                slf.state = State::BogusDoctype; +                Ok(ControlToken::Continue) +            } +        }, +        State::BetweenDoctypePublicAndSystemIdentifiers => match slf.read_char()? { +            Some(whitespace_pat!()) => Ok(ControlToken::Continue), +            Some('>') => { +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Continue) +            } +            Some('"') => { +                slf.emitter.set_doctype_system_identifier(""); +                slf.state = State::DoctypeSystemIdentifierDoubleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('\'') => { +                slf.emitter.set_doctype_system_identifier(""); +                slf.state = State::DoctypeSystemIdentifierSingleQuoted; +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emitter +                    .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); +                slf.emitter.set_force_quirks(); +                slf.state = State::BogusDoctype; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::AfterDoctypeSystemKeyword => match slf.read_char()? { +            Some(whitespace_pat!()) => { +                slf.state = State::BeforeDoctypeSystemIdentifier; +                Ok(ControlToken::Continue) +            } +            Some('"') => { +                slf.emitter +                    .emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword); +                slf.emitter.set_doctype_system_identifier(""); +                slf.state = State::DoctypeSystemIdentifierDoubleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('\'') => { +                slf.emitter +                    .emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword); +                slf.emitter.set_doctype_system_identifier(""); +                slf.state = State::DoctypeSystemIdentifierSingleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emitter +                    .emit_error(Error::MissingDoctypeSystemIdentifier); +                slf.emitter.set_force_quirks(); +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emitter +                    .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); +                slf.emitter.set_force_quirks(); +                slf.state = State::BogusDoctype; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::BeforeDoctypeSystemIdentifier => match slf.read_char()? { +            Some(whitespace_pat!()) => Ok(ControlToken::Continue), +            Some('"') => { +                slf.emitter.set_doctype_system_identifier(""); +                slf.state = State::DoctypeSystemIdentifierDoubleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('\'') => { +                slf.emitter.set_doctype_system_identifier(""); +                slf.state = State::DoctypeSystemIdentifierSingleQuoted; +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emitter +                    .emit_error(Error::MissingDoctypeSystemIdentifier); +                slf.emitter.set_force_quirks(); +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emitter +                    .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); +                slf.emitter.set_force_quirks(); +                slf.state = State::BogusDoctype; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::DoctypeSystemIdentifierDoubleQuoted => match slf.read_char()? { +            Some('"') => { +                slf.state = State::AfterDoctypeSystemIdentifier; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_doctype_system_identifier("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emitter.emit_error(Error::AbruptDoctypeSystemIdentifier); +                slf.emitter.set_force_quirks(); +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.push_doctype_system_identifier(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::DoctypeSystemIdentifierSingleQuoted => match slf.read_char()? { +            Some('\'') => { +                slf.state = State::AfterDoctypeSystemIdentifier; +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                slf.emitter.push_doctype_system_identifier("\u{fffd}"); +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.emitter.emit_error(Error::AbruptDoctypeSystemIdentifier); +                slf.emitter.set_force_quirks(); +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.push_doctype_system_identifier(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::AfterDoctypeSystemIdentifier => match slf.read_char()? { +            Some(whitespace_pat!()) => Ok(ControlToken::Continue), +            Some('>') => { +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInDoctype); +                slf.emitter.set_force_quirks(); +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Eof) +            } +            c @ Some(_) => { +                slf.emitter +                    .emit_error(Error::UnexpectedCharacterAfterDoctypeSystemIdentifier); +                slf.unread_char(c); +                slf.state = State::BogusDoctype; +                Ok(ControlToken::Continue) +            } +        }, +        State::BogusDoctype => match slf.read_char()? { +            Some('>') => { +                slf.state = State::Data; +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Continue) +            } +            Some('\0') => { +                slf.emitter.emit_error(Error::UnexpectedNullCharacter); +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_current_doctype(); +                Ok(ControlToken::Eof) +            } +            Some(_) => Ok(ControlToken::Continue), +        }, +        State::CdataSection => match slf.read_char()? { +            Some(']') => { +                slf.state = State::CdataSectionBracket; +                Ok(ControlToken::Continue) +            } +            None => { +                slf.emitter.emit_error(Error::EofInCdata); +                Ok(ControlToken::Eof) +            } +            Some(x) => { +                slf.emitter.emit_string(ctostr!(x)); +                Ok(ControlToken::Continue) +            } +        }, +        State::CdataSectionBracket => match slf.read_char()? { +            Some(']') => { +                slf.state = State::CdataSectionEnd; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("]"); +                slf.state = State::CdataSection; +                slf.unread_char(c); +                Ok(ControlToken::Continue) +            } +        }, +        State::CdataSectionEnd => match slf.read_char()? { +            Some(']') => { +                slf.emitter.emit_string("]"); +                Ok(ControlToken::Continue) +            } +            Some('>') => { +                slf.state = State::Data; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter.emit_string("]]"); +                slf.unread_char(c); +                slf.state = State::CdataSection; +                Ok(ControlToken::Continue) +            } +        }, +        State::CharacterReference => { +            slf.temporary_buffer.clear(); +            slf.temporary_buffer.push('&'); +            match slf.read_char()? { +                Some(x) if x.is_ascii_alphanumeric() => { +                    slf.unread_char(Some(x)); +                    slf.state = State::NamedCharacterReference; +                    Ok(ControlToken::Continue) +                } +                Some('#') => { +                    slf.temporary_buffer.push('#'); +                    slf.state = State::NumericCharacterReference; +                    Ok(ControlToken::Continue) +                } +                c => { +                    slf.flush_code_points_consumed_as_character_reference(); +                    slf.state = slf.return_state.take().unwrap(); +                    slf.unread_char(c); +                    Ok(ControlToken::Continue) +                } +            } +        } +        State::NamedCharacterReference => { +            let c = slf.read_char()?; -pub(crate) use whitespace_pat; +            let char_ref = match c { +                Some(x) => try_read_character_reference(x, |x| slf.try_read_string(x, true))? +                    .map(|char_ref| (x, char_ref)), -macro_rules! noncharacter_pat { -    () => { -        0xfdd0 -            ..=0xfdef -                | 0xfffe -                | 0xffff -                | 0x1fffe -                | 0x1ffff -                | 0x2fffe -                | 0x2ffff -                | 0x3fffe -                | 0x3ffff -                | 0x4fffe -                | 0x4ffff -                | 0x5fffe -                | 0x5ffff -                | 0x6fffe -                | 0x6ffff -                | 0x7fffe -                | 0x7ffff -                | 0x8fffe -                | 0x8ffff -                | 0x9fffe -                | 0x9ffff -                | 0xafffe -                | 0xaffff -                | 0xbfffe -                | 0xbffff -                | 0xcfffe -                | 0xcffff -                | 0xdfffe -                | 0xdffff -                | 0xefffe -                | 0xeffff -                | 0xffffe -                | 0xfffff -                | 0x10fffe -                | 0x10ffff -    }; -} +                None => None, +            }; -pub(crate) use noncharacter_pat; +            if let Some((x, char_ref)) = char_ref { +                slf.temporary_buffer.push(x); +                slf.temporary_buffer.push_str(char_ref.name); +                let char_ref_name_last_character = char_ref.name.chars().last(); +                let next_character = slf.next_input_character()?; +                if slf.is_consumed_as_part_of_an_attribute() +                    && char_ref_name_last_character != Some(';') +                    && matches!(next_character, Some(x) if x == '=' || x.is_ascii_alphanumeric()) +                { +                    slf.flush_code_points_consumed_as_character_reference(); +                    slf.state = slf.return_state.take().unwrap(); +                    Ok(ControlToken::Continue) +                } else { +                    if char_ref_name_last_character != Some(';') { +                        slf.emitter +                            .emit_error(Error::MissingSemicolonAfterCharacterReference); +                    } -// When integration tests are running, this enum is public and we get warnings about missing docs. -// However, it's not actually part of public API. -#[allow(missing_docs)] -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub enum State { -    Data, -    RcData, -    RawText, -    ScriptData, -    PlainText, -    TagOpen, -    EndTagOpen, -    TagName, -    RcDataLessThanSign, -    RcDataEndTagOpen, -    RcDataEndTagName, -    RawTextLessThanSign, -    RawTextEndTagOpen, -    RawTextEndTagName, -    ScriptDataLessThanSign, -    ScriptDataEndTagOpen, -    ScriptDataEndTagName, -    ScriptDataEscapeStart, -    ScriptDataEscapeStartDash, -    ScriptDataEscaped, -    ScriptDataEscapedDash, -    ScriptDataEscapedDashDash, -    ScriptDataEscapedLessThanSign, -    ScriptDataEscapedEndTagOpen, -    ScriptDataEscapedEndTagName, -    ScriptDataDoubleEscapeStart, -    ScriptDataDoubleEscaped, -    ScriptDataDoubleEscapedDash, -    ScriptDataDoubleEscapedDashDash, -    ScriptDataDoubleEscapedLessThanSign, -    ScriptDataDoubleEscapeEnd, -    BeforeAttributeName, -    AttributeName, -    AfterAttributeName, -    BeforeAttributeValue, -    AttributeValueDoubleQuoted, -    AttributeValueSingleQuoted, -    AttributeValueUnquoted, -    AfterAttributeValueQuoted, -    SelfClosingStartTag, -    BogusComment, -    MarkupDeclarationOpen, -    CommentStart, -    CommentStartDash, -    Comment, -    CommentLessThanSign, -    CommentLessThanSignBang, -    CommentLessThanSignBangDash, -    CommentLessThanSignBangDashDash, -    CommentEndDash, -    CommentEnd, -    CommentEndBang, -    Doctype, -    BeforeDoctypeName, -    DoctypeName, -    AfterDoctypeName, -    AfterDoctypePublicKeyword, -    BeforeDoctypePublicIdentifier, -    DoctypePublicIdentifierDoubleQuoted, -    DoctypePublicIdentifierSingleQuoted, -    AfterDoctypePublicIdentifier, -    BetweenDoctypePublicAndSystemIdentifiers, -    AfterDoctypeSystemKeyword, -    BeforeDoctypeSystemIdentifier, -    DoctypeSystemIdentifierDoubleQuoted, -    DoctypeSystemIdentifierSingleQuoted, -    AfterDoctypeSystemIdentifier, -    BogusDoctype, -    CdataSection, -    CdataSectionBracket, -    CdataSectionEnd, -    CharacterReference, -    NamedCharacterReference, -    AmbiguousAmpersand, -    NumericCharacterReference, -    HexadecimalCharacterReferenceStart, -    DecimalCharacterReferenceStart, -    HexadecimalCharacterReference, -    DecimalCharacterReference, -    NumericCharacterReferenceEnd, -} +                    slf.temporary_buffer.clear(); +                    slf.temporary_buffer.push_str(char_ref.characters); +                    slf.flush_code_points_consumed_as_character_reference(); +                    slf.state = slf.return_state.take().unwrap(); +                    Ok(ControlToken::Continue) +                } +            } else { +                slf.unread_char(c); +                slf.flush_code_points_consumed_as_character_reference(); +                slf.state = State::AmbiguousAmpersand; +                Ok(ControlToken::Continue) +            } +        } +        State::AmbiguousAmpersand => match slf.read_char()? { +            Some(x) if x.is_ascii_alphanumeric() => { +                if slf.is_consumed_as_part_of_an_attribute() { +                    slf.emitter.push_attribute_value(ctostr!(x)); +                } else { +                    slf.emitter.emit_string(ctostr!(x)); +                } + +                Ok(ControlToken::Continue) +            } +            c @ Some(';') => { +                slf.emitter +                    .emit_error(Error::UnknownNamedCharacterReference); +                slf.unread_char(c); +                slf.state = slf.return_state.take().unwrap(); +                Ok(ControlToken::Continue) +            } +            c => { +                slf.unread_char(c); +                slf.state = slf.return_state.take().unwrap(); +                Ok(ControlToken::Continue) +            } +        }, +        State::NumericCharacterReference => { +            slf.character_reference_code = 0; +            match slf.read_char()? { +                Some(x @ 'x' | x @ 'X') => { +                    slf.temporary_buffer.push(x); +                    slf.state = State::HexadecimalCharacterReferenceStart; +                    Ok(ControlToken::Continue) +                } +                c => { +                    slf.unread_char(c); +                    slf.state = State::DecimalCharacterReferenceStart; +                    Ok(ControlToken::Continue) +                } +            } +        } +        State::HexadecimalCharacterReferenceStart => match slf.read_char()? { +            c @ Some('0'..='9' | 'A'..='F' | 'a'..='f') => { +                slf.unread_char(c); +                slf.state = State::HexadecimalCharacterReference; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter +                    .emit_error(Error::AbsenceOfDigitsInNumericCharacterReference); +                slf.flush_code_points_consumed_as_character_reference(); +                slf.unread_char(c); +                slf.state = slf.return_state.take().unwrap(); +                Ok(ControlToken::Continue) +            } +        }, +        State::DecimalCharacterReferenceStart => match slf.read_char()? { +            Some(x @ ascii_digit_pat!()) => { +                slf.unread_char(Some(x)); +                slf.state = State::DecimalCharacterReference; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter +                    .emit_error(Error::AbsenceOfDigitsInNumericCharacterReference); +                slf.flush_code_points_consumed_as_character_reference(); +                slf.unread_char(c); +                slf.state = slf.return_state.take().unwrap(); +                Ok(ControlToken::Continue) +            } +        }, +        State::HexadecimalCharacterReference => match slf.read_char()? { +            Some(x @ ascii_digit_pat!()) => { +                mutate_character_reference!(*16 + x - 0x0030); +                Ok(ControlToken::Continue) +            } +            Some(x @ 'A'..='F') => { +                mutate_character_reference!(*16 + x - 0x0037); +                Ok(ControlToken::Continue) +            } +            Some(x @ 'a'..='f') => { +                mutate_character_reference!(*16 + x - 0x0057); +                Ok(ControlToken::Continue) +            } +            Some(';') => { +                slf.state = State::NumericCharacterReferenceEnd; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter +                    .emit_error(Error::MissingSemicolonAfterCharacterReference); +                slf.unread_char(c); +                slf.state = State::NumericCharacterReferenceEnd; +                Ok(ControlToken::Continue) +            } +        }, +        State::DecimalCharacterReference => match slf.read_char()? { +            Some(x @ ascii_digit_pat!()) => { +                mutate_character_reference!(*10 + x - 0x0030); +                Ok(ControlToken::Continue) +            } +            Some(';') => { +                slf.state = State::NumericCharacterReferenceEnd; +                Ok(ControlToken::Continue) +            } +            c => { +                slf.emitter +                    .emit_error(Error::MissingSemicolonAfterCharacterReference); +                slf.unread_char(c); +                slf.state = State::NumericCharacterReferenceEnd; +                Ok(ControlToken::Continue) +            } +        }, +        State::NumericCharacterReferenceEnd => { +            match slf.character_reference_code { +                0x00 => { +                    slf.emitter.emit_error(Error::NullCharacterReference); +                    slf.character_reference_code = 0xfffd; +                } +                0x110000.. => { +                    slf.emitter +                        .emit_error(Error::CharacterReferenceOutsideUnicodeRange); +                    slf.character_reference_code = 0xfffd; +                } +                surrogate_pat!() => { +                    slf.emitter.emit_error(Error::SurrogateCharacterReference); +                    slf.character_reference_code = 0xfffd; +                } +                // noncharacter +                noncharacter_pat!() => { +                    slf.emitter +                        .emit_error(Error::NoncharacterCharacterReference); +                } +                // 0x000d, or a control that is not whitespace +                x @ 0x000d | x @ control_pat!() +                    if !matches!(x, 0x0009 | 0x000a | 0x000c | 0x0020) => +                { +                    slf.emitter.emit_error(Error::ControlCharacterReference); +                    slf.character_reference_code = match x { +                        0x80 => 0x20AC, // EURO SIGN (€) +                        0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK (‚) +                        0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK (ƒ) +                        0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK („) +                        0x85 => 0x2026, // HORIZONTAL ELLIPSIS (…) +                        0x86 => 0x2020, // DAGGER (†) +                        0x87 => 0x2021, // DOUBLE DAGGER (‡) +                        0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ) +                        0x89 => 0x2030, // PER MILLE SIGN (‰) +                        0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON (Š) +                        0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹) +                        0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE (Œ) +                        0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON (Ž) +                        0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK (‘) +                        0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK (’) +                        0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK (“) +                        0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK (”) +                        0x95 => 0x2022, // BULLET (•) +                        0x96 => 0x2013, // EN DASH (–) +                        0x97 => 0x2014, // EM DASH (—) +                        0x98 => 0x02DC, // SMALL TILDE (˜) +                        0x99 => 0x2122, // TRADE MARK SIGN (™) +                        0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON (š) +                        0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›) +                        0x9C => 0x0153, // LATIN SMALL LIGATURE OE (œ) +                        0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON (ž) +                        0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ) +                        _ => slf.character_reference_code, +                    }; +                } +                _ => (), +            } -pub enum ControlToken { -    Eof, -    Continue, +            slf.temporary_buffer.clear(); +            slf.temporary_buffer +                .push(std::char::from_u32(slf.character_reference_code).unwrap()); +            slf.flush_code_points_consumed_as_character_reference(); +            slf.state = slf.return_state.take().unwrap(); +            Ok(ControlToken::Continue) +        } +    }  } diff --git a/src/tokenizer.rs b/src/tokenizer.rs new file mode 100644 index 0000000..d7e60ac --- /dev/null +++ b/src/tokenizer.rs @@ -0,0 +1,244 @@ +use crate::machine; +use crate::utils::{control_pat, noncharacter_pat, surrogate_pat, ControlToken, State}; +use crate::{DefaultEmitter, Emitter, Error, Never, Readable, Reader}; + +// this is a stack that can hold 0 to 2 Ts +#[derive(Debug, Default)] +struct Stack2<T: Copy>(Option<(T, Option<T>)>); + +impl<T: Copy> Stack2<T> { +    #[inline] +    fn push(&mut self, c: T) { +        self.0 = match self.0 { +            None => Some((c, None)), +            Some((c1, None)) => Some((c1, Some(c))), +            Some((_c1, Some(_c2))) => panic!("stack full!"), +        } +    } + +    #[inline] +    fn pop(&mut self) -> Option<T> { +        let (new_self, rv) = match self.0 { +            Some((c1, Some(c2))) => (Some((c1, None)), Some(c2)), +            Some((c1, None)) => (None, Some(c1)), +            None => (None, None), +        }; +        self.0 = new_self; +        rv +    } + +    #[inline] +    fn is_empty(&self) -> bool { +        matches!(self.0, None) +    } +} + +/// A HTML tokenizer. See crate-level docs for basic usage. +pub struct Tokenizer<R: Reader, E: Emitter = DefaultEmitter> { +    eof: bool, +    pub(crate) state: State, +    pub(crate) emitter: E, +    pub(crate) temporary_buffer: String, +    reader: R, +    to_reconsume: Stack2<Option<char>>, +    pub(crate) character_reference_code: u32, +    pub(crate) return_state: Option<State>, +} + +impl<R: Reader> Tokenizer<R> { +    /// Create a new tokenizer from some input. +    /// +    /// `input` can be `&String` or `&str` at the moment, as those are the types for which +    /// [`crate::Readable`] is implemented, but you can implement that trait on your own types. +    /// +    /// Patches are welcome for providing an efficient implementation over async streams, +    /// iterators, files, etc, as long as any dependencies come behind featureflags. +    pub fn new<'a, S: Readable<'a, Reader = R>>(input: S) -> Self { +        Tokenizer::<S::Reader>::new_with_emitter(input, DefaultEmitter::default()) +    } +} + +impl<R: Reader, E: Emitter> Tokenizer<R, E> { +    /// Construct a new tokenizer from some input and a custom emitter. +    /// +    /// Use this method over [`Tokenizer::new`] when you want to have more control over string allocation for +    /// tokens. +    pub fn new_with_emitter<'a, S: Readable<'a, Reader = R>>(input: S, emitter: E) -> Self { +        Tokenizer { +            eof: false, +            state: State::Data, +            emitter, +            temporary_buffer: String::new(), +            to_reconsume: Stack2::default(), +            reader: input.to_reader(), +            character_reference_code: 0, +            return_state: None, +        } +    } + +    /// Test-internal function to override internal state. +    /// +    /// Only available with the `integration-tests` feature which is not public API. +    #[cfg(feature = "integration-tests")] +    pub fn set_state(&mut self, state: State) { +        self.state = state; +    } + +    /// Set the statemachine to start/continue in [plaintext +    /// state](https://html.spec.whatwg.org/#plaintext-state). +    /// +    /// This tokenizer never gets into that state naturally. +    pub fn set_plaintext_state(&mut self) { +        self.state = State::PlainText; +    } + +    /// Test-internal function to override internal state. +    /// +    /// Only available with the `integration-tests` feature which is not public API. +    #[cfg(feature = "integration-tests")] +    pub fn set_last_start_tag(&mut self, last_start_tag: Option<&str>) { +        self.emitter.set_last_start_tag(last_start_tag); +    } + +    #[inline] +    pub(crate) fn unread_char(&mut self, c: Option<char>) { +        self.to_reconsume.push(c); +    } + +    #[inline] +    fn validate_char(&mut self, c: char) { +        match c as u32 { +            surrogate_pat!() => { +                self.emitter.emit_error(Error::SurrogateInInputStream); +            } +            noncharacter_pat!() => { +                self.emitter.emit_error(Error::NoncharacterInInputStream); +            } +            // control without whitespace or nul +            x @ control_pat!() +                if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) => +            { +                self.emitter +                    .emit_error(Error::ControlCharacterInInputStream); +            } +            _ => (), +        } +    } + +    pub(crate) fn read_char(&mut self) -> Result<Option<char>, R::Error> { +        let (c_res, reconsumed) = match self.to_reconsume.pop() { +            Some(c) => (Ok(c), true), +            None => (self.reader.read_char(), false), +        }; + +        let mut c = match c_res { +            Ok(Some(c)) => c, +            res => return res, +        }; + +        if c == '\r' { +            c = '\n'; +            let c2 = self.reader.read_char()?; +            if c2 != Some('\n') { +                self.unread_char(c2); +            } +        } + +        if !reconsumed { +            self.validate_char(c); +        } + +        Ok(Some(c)) +    } + +    #[inline] +    pub(crate) fn try_read_string( +        &mut self, +        s: &str, +        case_sensitive: bool, +    ) -> Result<bool, R::Error> { +        debug_assert!(!s.is_empty()); +        debug_assert!(self.to_reconsume.is_empty()); +        self.reader.try_read_string(s, case_sensitive) +    } + +    pub(crate) fn is_consumed_as_part_of_an_attribute(&self) -> bool { +        matches!( +            self.return_state, +            Some( +                State::AttributeValueDoubleQuoted +                    | State::AttributeValueSingleQuoted +                    | State::AttributeValueUnquoted +            ) +        ) +    } + +    pub(crate) fn flush_code_points_consumed_as_character_reference(&mut self) { +        if self.is_consumed_as_part_of_an_attribute() { +            self.emitter.push_attribute_value(&self.temporary_buffer); +            self.temporary_buffer.clear(); +        } else { +            self.flush_buffer_characters(); +        } +    } + +    pub(crate) fn next_input_character(&mut self) -> Result<Option<char>, R::Error> { +        let rv = self.read_char()?; +        self.unread_char(rv); +        Ok(rv) +    } + +    pub(crate) fn flush_buffer_characters(&mut self) { +        self.emitter.emit_string(&self.temporary_buffer); +        self.temporary_buffer.clear(); +    } +} + +impl<R: Reader, E: Emitter> Iterator for Tokenizer<R, E> { +    type Item = Result<E::Token, R::Error>; + +    fn next(&mut self) -> Option<Self::Item> { +        loop { +            if let Some(token) = self.emitter.pop_token() { +                break Some(Ok(token)); +            } else if !self.eof { +                match machine::consume(self) { +                    Ok(ControlToken::Continue) => (), +                    Ok(ControlToken::Eof) => { +                        self.eof = true; +                        self.emitter.emit_eof(); +                    } +                    Err(e) => break Some(Err(e)), +                } +            } else { +                break None; +            } +        } +    } +} + +/// A kind of tokenizer that directly yields tokens when used as an iterator, so `Token` instead of +/// `Result<Token, _>`. +/// +/// This is the return value of [`Tokenizer::infallible`]. +pub struct InfallibleTokenizer<R: Reader<Error = Never>, E: Emitter>(Tokenizer<R, E>); + +impl<R: Reader<Error = Never>, E: Emitter> Tokenizer<R, E> { +    /// Statically assert that this iterator is infallible. +    /// +    /// Call this to get rid of error handling when parsing HTML from strings. +    pub fn infallible(self) -> InfallibleTokenizer<R, E> { +        InfallibleTokenizer(self) +    } +} + +impl<R: Reader<Error = Never>, E: Emitter> Iterator for InfallibleTokenizer<R, E> { +    type Item = E::Token; + +    fn next(&mut self) -> Option<Self::Item> { +        match self.0.next()? { +            Ok(token) => Some(token), +            Err(e) => match e {}, +        } +    } +} diff --git a/src/utils.rs b/src/utils.rs new file mode 100644 index 0000000..67db1b9 --- /dev/null +++ b/src/utils.rs @@ -0,0 +1,164 @@ +macro_rules! surrogate_pat { +    () => { +        0xd800..=0xdfff +    }; +} + +pub(crate) use surrogate_pat; + +macro_rules! control_pat { +    () => (0x0d | 0x0000..=0x001f | 0x007f..=0x009f) +} + +pub(crate) use control_pat; + +macro_rules! ascii_digit_pat { +    () => { +        '0'..='9' +    }; +} + +pub(crate) use ascii_digit_pat; + +macro_rules! whitespace_pat { +    () => { +        '\t' | '\u{0A}' | '\u{0C}' | ' ' +    }; +} + +pub(crate) use whitespace_pat; + +macro_rules! noncharacter_pat { +    () => { +        0xfdd0 +            ..=0xfdef +                | 0xfffe +                | 0xffff +                | 0x1fffe +                | 0x1ffff +                | 0x2fffe +                | 0x2ffff +                | 0x3fffe +                | 0x3ffff +                | 0x4fffe +                | 0x4ffff +                | 0x5fffe +                | 0x5ffff +                | 0x6fffe +                | 0x6ffff +                | 0x7fffe +                | 0x7ffff +                | 0x8fffe +                | 0x8ffff +                | 0x9fffe +                | 0x9ffff +                | 0xafffe +                | 0xaffff +                | 0xbfffe +                | 0xbffff +                | 0xcfffe +                | 0xcffff +                | 0xdfffe +                | 0xdffff +                | 0xefffe +                | 0xeffff +                | 0xffffe +                | 0xfffff +                | 0x10fffe +                | 0x10ffff +    }; +} + +pub(crate) use noncharacter_pat; + +// When integration tests are running, this enum is public and we get warnings about missing docs. +// However, it's not actually part of public API. +#[allow(missing_docs)] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum State { +    Data, +    RcData, +    RawText, +    ScriptData, +    PlainText, +    TagOpen, +    EndTagOpen, +    TagName, +    RcDataLessThanSign, +    RcDataEndTagOpen, +    RcDataEndTagName, +    RawTextLessThanSign, +    RawTextEndTagOpen, +    RawTextEndTagName, +    ScriptDataLessThanSign, +    ScriptDataEndTagOpen, +    ScriptDataEndTagName, +    ScriptDataEscapeStart, +    ScriptDataEscapeStartDash, +    ScriptDataEscaped, +    ScriptDataEscapedDash, +    ScriptDataEscapedDashDash, +    ScriptDataEscapedLessThanSign, +    ScriptDataEscapedEndTagOpen, +    ScriptDataEscapedEndTagName, +    ScriptDataDoubleEscapeStart, +    ScriptDataDoubleEscaped, +    ScriptDataDoubleEscapedDash, +    ScriptDataDoubleEscapedDashDash, +    ScriptDataDoubleEscapedLessThanSign, +    ScriptDataDoubleEscapeEnd, +    BeforeAttributeName, +    AttributeName, +    AfterAttributeName, +    BeforeAttributeValue, +    AttributeValueDoubleQuoted, +    AttributeValueSingleQuoted, +    AttributeValueUnquoted, +    AfterAttributeValueQuoted, +    SelfClosingStartTag, +    BogusComment, +    MarkupDeclarationOpen, +    CommentStart, +    CommentStartDash, +    Comment, +    CommentLessThanSign, +    CommentLessThanSignBang, +    CommentLessThanSignBangDash, +    CommentLessThanSignBangDashDash, +    CommentEndDash, +    CommentEnd, +    CommentEndBang, +    Doctype, +    BeforeDoctypeName, +    DoctypeName, +    AfterDoctypeName, +    AfterDoctypePublicKeyword, +    BeforeDoctypePublicIdentifier, +    DoctypePublicIdentifierDoubleQuoted, +    DoctypePublicIdentifierSingleQuoted, +    AfterDoctypePublicIdentifier, +    BetweenDoctypePublicAndSystemIdentifiers, +    AfterDoctypeSystemKeyword, +    BeforeDoctypeSystemIdentifier, +    DoctypeSystemIdentifierDoubleQuoted, +    DoctypeSystemIdentifierSingleQuoted, +    AfterDoctypeSystemIdentifier, +    BogusDoctype, +    CdataSection, +    CdataSectionBracket, +    CdataSectionEnd, +    CharacterReference, +    NamedCharacterReference, +    AmbiguousAmpersand, +    NumericCharacterReference, +    HexadecimalCharacterReferenceStart, +    DecimalCharacterReferenceStart, +    HexadecimalCharacterReference, +    DecimalCharacterReference, +    NumericCharacterReferenceEnd, +} + +pub enum ControlToken { +    Eof, +    Continue, +} | 
