diff options
Diffstat (limited to 'src/tokenizer')
| -rw-r--r-- | src/tokenizer/char_ref/mod.rs | 449 | ||||
| -rw-r--r-- | src/tokenizer/interface.rs | 110 | ||||
| -rw-r--r-- | src/tokenizer/mod.rs | 1713 | ||||
| -rw-r--r-- | src/tokenizer/states.rs | 93 | 
4 files changed, 2365 insertions, 0 deletions
| diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs new file mode 100644 index 0000000..a52485d --- /dev/null +++ b/src/tokenizer/char_ref/mod.rs @@ -0,0 +1,449 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use super::{TokenSink, Tokenizer}; +use crate::buffer_queue::BufferQueue; +use crate::data; +use crate::tendril::StrTendril; +use crate::util::str::is_ascii_alnum; + +use log::debug; +use mac::format_if; +use std::borrow::Cow::Borrowed; +use std::char::from_u32; + +use self::State::*; +pub use self::Status::*; + +//§ tokenizing-character-references +pub struct CharRef { +    /// The resulting character(s) +    pub chars: [char; 2], + +    /// How many slots in `chars` are valid? +    pub num_chars: u8, +} + +pub enum Status { +    Stuck, +    Progress, +    Done, +} + +#[derive(Debug)] +enum State { +    Begin, +    Octothorpe, +    Numeric(u32), // base +    NumericSemicolon, +    Named, +    BogusName, +} + +pub struct CharRefTokenizer { +    state: State, +    addnl_allowed: Option<char>, +    result: Option<CharRef>, + +    num: u32, +    num_too_big: bool, +    seen_digit: bool, +    hex_marker: Option<char>, + +    name_buf_opt: Option<StrTendril>, +    name_match: Option<(u32, u32)>, +    name_len: usize, +} + +impl CharRefTokenizer { +    // NB: We assume that we have an additional allowed character iff we're +    // tokenizing in an attribute value. 
+    pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer { +        CharRefTokenizer { +            state: Begin, +            addnl_allowed, +            result: None, +            num: 0, +            num_too_big: false, +            seen_digit: false, +            hex_marker: None, +            name_buf_opt: None, +            name_match: None, +            name_len: 0, +        } +    } + +    // A CharRefTokenizer can only tokenize one character reference, +    // so this method consumes the tokenizer. +    pub fn get_result(self) -> CharRef { +        self.result.expect("get_result called before done") +    } + +    fn name_buf(&self) -> &StrTendril { +        self.name_buf_opt +            .as_ref() +            .expect("name_buf missing in named character reference") +    } + +    fn name_buf_mut(&mut self) -> &mut StrTendril { +        self.name_buf_opt +            .as_mut() +            .expect("name_buf missing in named character reference") +    } + +    fn finish_none(&mut self) -> Status { +        self.result = Some(CharRef { +            chars: ['\0', '\0'], +            num_chars: 0, +        }); +        Done +    } + +    fn finish_one(&mut self, c: char) -> Status { +        self.result = Some(CharRef { +            chars: [c, '\0'], +            num_chars: 1, +        }); +        Done +    } +} + +impl CharRefTokenizer { +    pub fn step<Sink: TokenSink>( +        &mut self, +        tokenizer: &mut Tokenizer<Sink>, +        input: &mut BufferQueue, +    ) -> Status { +        if self.result.is_some() { +            return Done; +        } + +        debug!("char ref tokenizer stepping in state {:?}", self.state); +        match self.state { +            Begin => self.do_begin(tokenizer, input), +            Octothorpe => self.do_octothorpe(tokenizer, input), +            Numeric(base) => self.do_numeric(tokenizer, input, base), +            NumericSemicolon => self.do_numeric_semicolon(tokenizer, input), +            Named => 
self.do_named(tokenizer, input), +            BogusName => self.do_bogus_name(tokenizer, input), +        } +    } + +    fn do_begin<Sink: TokenSink>( +        &mut self, +        tokenizer: &mut Tokenizer<Sink>, +        input: &mut BufferQueue, +    ) -> Status { +        match unwrap_or_return!(tokenizer.peek(input), Stuck) { +            '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(), +            c if Some(c) == self.addnl_allowed => self.finish_none(), + +            '#' => { +                tokenizer.discard_char(input); +                self.state = Octothorpe; +                Progress +            }, + +            _ => { +                self.state = Named; +                self.name_buf_opt = Some(StrTendril::new()); +                Progress +            }, +        } +    } + +    fn do_octothorpe<Sink: TokenSink>( +        &mut self, +        tokenizer: &mut Tokenizer<Sink>, +        input: &mut BufferQueue, +    ) -> Status { +        let c = unwrap_or_return!(tokenizer.peek(input), Stuck); +        match c { +            'x' | 'X' => { +                tokenizer.discard_char(input); +                self.hex_marker = Some(c); +                self.state = Numeric(16); +            }, + +            _ => { +                self.hex_marker = None; +                self.state = Numeric(10); +            }, +        } +        Progress +    } + +    fn do_numeric<Sink: TokenSink>( +        &mut self, +        tokenizer: &mut Tokenizer<Sink>, +        input: &mut BufferQueue, +        base: u32, +    ) -> Status { +        let c = unwrap_or_return!(tokenizer.peek(input), Stuck); +        match c.to_digit(base) { +            Some(n) => { +                tokenizer.discard_char(input); +                self.num = self.num.wrapping_mul(base); +                if self.num > 0x10FFFF { +                    // We might overflow, and the character is definitely invalid. 
+                    // We still parse digits and semicolon, but don't use the result. +                    self.num_too_big = true; +                } +                self.num = self.num.wrapping_add(n); +                self.seen_digit = true; +                Progress +            }, + +            None if !self.seen_digit => self.unconsume_numeric(tokenizer, input), + +            None => { +                self.state = NumericSemicolon; +                Progress +            }, +        } +    } + +    fn do_numeric_semicolon<Sink: TokenSink>( +        &mut self, +        tokenizer: &mut Tokenizer<Sink>, +        input: &mut BufferQueue, +    ) -> Status { +        match unwrap_or_return!(tokenizer.peek(input), Stuck) { +            ';' => tokenizer.discard_char(input), +            _ => tokenizer.emit_error(Borrowed( +                "Semicolon missing after numeric character reference", +            )), +        }; +        self.finish_numeric(tokenizer) +    } + +    fn unconsume_numeric<Sink: TokenSink>( +        &mut self, +        tokenizer: &mut Tokenizer<Sink>, +        input: &mut BufferQueue, +    ) -> Status { +        let mut unconsume = StrTendril::from_char('#'); +        match self.hex_marker { +            Some(c) => unconsume.push_char(c), +            None => (), +        } + +        input.push_front(unconsume); +        tokenizer.emit_error(Borrowed("Numeric character reference without digits")); +        self.finish_none() +    } + +    fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status { +        fn conv(n: u32) -> char { +            from_u32(n).expect("invalid char missed by error handling cases") +        } + +        let (c, error) = match self.num { +            n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true), +            0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true), + +            0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] { +                Some(c) => 
(c, true), +                None => (conv(self.num), true), +            }, + +            0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true), + +            n if (n & 0xFFFE) == 0xFFFE => (conv(n), true), + +            n => (conv(n), false), +        }; + +        if error { +            let msg = format_if!( +                tokenizer.opts.exact_errors, +                "Invalid numeric character reference", +                "Invalid numeric character reference value 0x{:06X}", +                self.num +            ); +            tokenizer.emit_error(msg); +        } + +        self.finish_one(c) +    } + +    fn do_named<Sink: TokenSink>( +        &mut self, +        tokenizer: &mut Tokenizer<Sink>, +        input: &mut BufferQueue, +    ) -> Status { +        let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); +        self.name_buf_mut().push_char(c); +        match data::NAMED_ENTITIES.get(&self.name_buf()[..]) { +            // We have either a full match or a prefix of one. +            Some(&m) => { +                if m.0 != 0 { +                    // We have a full match, but there might be a longer one to come. +                    self.name_match = Some(m); +                    self.name_len = self.name_buf().len(); +                } +                // Otherwise we just have a prefix match. +                Progress +            }, + +            // Can't continue the match. 
+            None => self.finish_named(tokenizer, input, Some(c)), +        } +    } + +    fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) { +        let msg = format_if!( +            tokenizer.opts.exact_errors, +            "Invalid character reference", +            "Invalid character reference &{}", +            self.name_buf() +        ); +        tokenizer.emit_error(msg); +    } + +    fn unconsume_name(&mut self, input: &mut BufferQueue) { +        input.push_front(self.name_buf_opt.take().unwrap()); +    } + +    fn finish_named<Sink: TokenSink>( +        &mut self, +        tokenizer: &mut Tokenizer<Sink>, +        input: &mut BufferQueue, +        end_char: Option<char>, +    ) -> Status { +        match self.name_match { +            None => { +                match end_char { +                    Some(c) if is_ascii_alnum(c) => { +                        // Keep looking for a semicolon, to determine whether +                        // we emit a parse error. +                        self.state = BogusName; +                        return Progress; +                    }, + +                    // Check length because &; is not a parse error. +                    Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer), + +                    _ => (), +                } +                self.unconsume_name(input); +                self.finish_none() +            }, + +            Some((c1, c2)) => { +                // We have a complete match, but we may have consumed +                // additional characters into self.name_buf.  
Usually +                // at least one, but several in cases like +                // +                //     &not    => match for U+00AC +                //     &noti   => valid prefix for &notin +                //     &notit  => can't continue match + +                let name_len = self.name_len; +                assert!(name_len > 0); +                let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap(); + +                // There might not be a next character after the match, if +                // we had a full match and then hit EOF. +                let next_after = if name_len == self.name_buf().len() { +                    None +                } else { +                    Some(self.name_buf()[name_len..].chars().next().unwrap()) +                }; + +                // "If the character reference is being consumed as part of an +                // attribute, and the last character matched is not a U+003B +                // SEMICOLON character (;), and the next character is either a +                // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII +                // character, then, for historical reasons, all the characters +                // that were matched after the U+0026 AMPERSAND character (&) +                // must be unconsumed, and nothing is returned. 
However, if +                // this next character is in fact a U+003D EQUALS SIGN +                // character (=), then this is a parse error" + +                let unconsume_all = match (self.addnl_allowed, last_matched, next_after) { +                    (_, ';', _) => false, +                    (Some(_), _, Some('=')) => { +                        tokenizer.emit_error(Borrowed( +                            "Equals sign after character reference in attribute", +                        )); +                        true +                    }, +                    (Some(_), _, Some(c)) if is_ascii_alnum(c) => true, +                    _ => { +                        tokenizer.emit_error(Borrowed( +                            "Character reference does not end with semicolon", +                        )); +                        false +                    }, +                }; + +                if unconsume_all { +                    self.unconsume_name(input); +                    self.finish_none() +                } else { +                    input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..])); +                    self.result = Some(CharRef { +                        chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()], +                        num_chars: if c2 == 0 { 1 } else { 2 }, +                    }); +                    Done +                } +            }, +        } +    } + +    fn do_bogus_name<Sink: TokenSink>( +        &mut self, +        tokenizer: &mut Tokenizer<Sink>, +        input: &mut BufferQueue, +    ) -> Status { +        let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); +        self.name_buf_mut().push_char(c); +        match c { +            _ if is_ascii_alnum(c) => return Progress, +            ';' => self.emit_name_error(tokenizer), +            _ => (), +        } +        self.unconsume_name(input); +        self.finish_none() +    } + +    pub fn end_of_file<Sink: TokenSink>( +        
&mut self, +        tokenizer: &mut Tokenizer<Sink>, +        input: &mut BufferQueue, +    ) { +        while self.result.is_none() { +            match self.state { +                Begin => drop(self.finish_none()), + +                Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)), + +                Numeric(_) | NumericSemicolon => { +                    tokenizer.emit_error(Borrowed("EOF in numeric character reference")); +                    self.finish_numeric(tokenizer); +                }, + +                Named => drop(self.finish_named(tokenizer, input, None)), + +                BogusName => { +                    self.unconsume_name(input); +                    self.finish_none(); +                }, + +                Octothorpe => { +                    input.push_front(StrTendril::from_slice("#")); +                    tokenizer.emit_error(Borrowed("EOF after '#' in character reference")); +                    self.finish_none(); +                }, +            } +        } +    } +} diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs new file mode 100644 index 0000000..22d11be --- /dev/null +++ b/src/tokenizer/interface.rs @@ -0,0 +1,110 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +use crate::interface::Attribute; +use crate::tendril::StrTendril; +use crate::tokenizer::states; +use crate::LocalName; +use std::borrow::Cow; + +pub use self::TagKind::{EndTag, StartTag}; +pub use self::Token::{CharacterTokens, CommentToken, DoctypeToken, TagToken}; +pub use self::Token::{EOFToken, NullCharacterToken, ParseError}; + +/// A `DOCTYPE` token. +// FIXME: already exists in Servo DOM +#[derive(PartialEq, Eq, Clone, Debug)] +pub struct Doctype { +    pub name: Option<StrTendril>, +    pub public_id: Option<StrTendril>, +    pub system_id: Option<StrTendril>, +    pub force_quirks: bool, +} + +impl Doctype { +    pub fn new() -> Doctype { +        Doctype { +            name: None, +            public_id: None, +            system_id: None, +            force_quirks: false, +        } +    } +} + +#[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)] +pub enum TagKind { +    StartTag, +    EndTag, +} + +/// A tag token. +#[derive(PartialEq, Eq, Clone, Debug)] +pub struct Tag { +    pub kind: TagKind, +    pub name: LocalName, +    pub self_closing: bool, +    pub attrs: Vec<Attribute>, +} + +impl Tag { +    /// Are the tags equivalent when we don't care about attribute order? +    /// Also ignores the self-closing flag. 
+    pub fn equiv_modulo_attr_order(&self, other: &Tag) -> bool { +        if (self.kind != other.kind) || (self.name != other.name) { +            return false; +        } + +        let mut self_attrs = self.attrs.clone(); +        let mut other_attrs = other.attrs.clone(); +        self_attrs.sort(); +        other_attrs.sort(); + +        self_attrs == other_attrs +    } +} + +#[derive(PartialEq, Eq, Debug)] +pub enum Token { +    DoctypeToken(Doctype), +    TagToken(Tag), +    CommentToken(StrTendril), +    CharacterTokens(StrTendril), +    NullCharacterToken, +    EOFToken, +    ParseError(Cow<'static, str>), +} + +#[derive(Debug, PartialEq)] +#[must_use] +pub enum TokenSinkResult<Handle> { +    Continue, +    Script(Handle), +    Plaintext, +    RawData(states::RawKind), +} + +/// Types which can receive tokens from the tokenizer. +pub trait TokenSink { +    type Handle; + +    /// Process a token. +    fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle>; + +    // Signal sink that tokenization reached the end. +    fn end(&mut self) {} + +    /// Used in the markup declaration open state. By default, this always +    /// returns false and thus all CDATA sections are tokenized as bogus +    /// comments. +    /// https://html.spec.whatwg.org/multipage/#markup-declaration-open-state +    fn adjusted_current_node_present_but_not_in_html_namespace(&self) -> bool { +        false +    } +} diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs new file mode 100644 index 0000000..267fdf3 --- /dev/null +++ b/src/tokenizer/mod.rs @@ -0,0 +1,1713 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. 
This file may not be copied, modified, or distributed +// except according to those terms. + +//! The HTML5 tokenizer. + +pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError}; +pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token}; +pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind}; +pub use self::interface::{TokenSink, TokenSinkResult}; + +use self::states::{DoctypeIdKind, Public, System}; +use self::states::{DoubleEscaped, Escaped}; +use self::states::{DoubleQuoted, SingleQuoted, Unquoted}; +use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped}; + +use self::char_ref::{CharRef, CharRefTokenizer}; + +use crate::util::str::lower_ascii_letter; + +use log::debug; +use mac::{_tt_as_expr_hack, format_if, matches}; +use markup5ever::{namespace_url, ns, small_char_set}; +use std::borrow::Cow::{self, Borrowed}; +use std::collections::BTreeMap; +use std::default::Default; +use std::mem::replace; + +pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult}; +use crate::tendril::StrTendril; +use crate::{Attribute, LocalName, QualName, SmallCharSet}; + +mod char_ref; +mod interface; +pub mod states; + +pub enum ProcessResult<Handle> { +    Continue, +    Suspend, +    Script(Handle), +} + +#[must_use] +pub enum TokenizerResult<Handle> { +    Done, +    Script(Handle), +} + +fn option_push(opt_str: &mut Option<StrTendril>, c: char) { +    match *opt_str { +        Some(ref mut s) => s.push_char(c), +        None => *opt_str = Some(StrTendril::from_char(c)), +    } +} + +/// Tokenizer options, with an impl for `Default`. +#[derive(Clone)] +pub struct TokenizerOpts { +    /// Report all parse errors described in the spec, at some +    /// performance penalty?  Default: false +    pub exact_errors: bool, + +    /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning +    /// of the stream?  
Default: true +    pub discard_bom: bool, + +    /// Keep a record of how long we spent in each state?  Printed +    /// when `end()` is called.  Default: false +    pub profile: bool, + +    /// Initial state override.  Only the test runner should use +    /// a non-`None` value! +    pub initial_state: Option<states::State>, + +    /// Last start tag.  Only the test runner should use a +    /// non-`None` value! +    /// +    /// FIXME: Can't use Tendril because we want TokenizerOpts +    /// to be Send. +    pub last_start_tag_name: Option<String>, +} + +impl Default for TokenizerOpts { +    fn default() -> TokenizerOpts { +        TokenizerOpts { +            exact_errors: false, +            discard_bom: true, +            profile: false, +            initial_state: None, +            last_start_tag_name: None, +        } +    } +} + +/// The HTML tokenizer. +pub struct Tokenizer<Sink> { +    /// Options controlling the behavior of the tokenizer. +    opts: TokenizerOpts, + +    /// Destination for tokens we emit. +    pub sink: Sink, + +    /// The abstract machine state as described in the spec. +    state: states::State, + +    /// Are we at the end of the file, once buffers have been processed +    /// completely? This affects whether we will wait for lookahead or not. +    at_eof: bool, + +    /// Tokenizer for character references, if we're tokenizing +    /// one at the moment. +    char_ref_tokenizer: Option<Box<CharRefTokenizer>>, + +    /// Current input character.  Just consumed, may reconsume. +    current_char: char, + +    /// Should we reconsume the current input character? +    reconsume: bool, + +    /// Did we just consume \r, translating it to \n?  In that case we need +    /// to ignore the next character if it's \n. +    ignore_lf: bool, + +    /// Discard a U+FEFF BYTE ORDER MARK if we see one?  Only done at the +    /// beginning of the stream. +    discard_bom: bool, + +    /// Current tag kind. 
+    current_tag_kind: TagKind, + +    /// Current tag name. +    current_tag_name: StrTendril, + +    /// Current tag is self-closing? +    current_tag_self_closing: bool, + +    /// Current tag attributes. +    current_tag_attrs: Vec<Attribute>, + +    /// Current attribute name. +    current_attr_name: StrTendril, + +    /// Current attribute value. +    current_attr_value: StrTendril, + +    /// Current comment. +    current_comment: StrTendril, + +    /// Current doctype token. +    current_doctype: Doctype, + +    /// Last start tag name, for use in checking "appropriate end tag". +    last_start_tag_name: Option<LocalName>, + +    /// The "temporary buffer" mentioned in the spec. +    temp_buf: StrTendril, + +    /// Record of how many ns we spent in each state, if profiling is enabled. +    state_profile: BTreeMap<states::State, u64>, + +    /// Record of how many ns we spent in the token sink. +    time_in_sink: u64, + +    /// Track current line +    current_line: u64, +} + +impl<Sink: TokenSink> Tokenizer<Sink> { +    /// Create a new tokenizer which feeds tokens to a particular `TokenSink`. 
+    pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> { +        let start_tag_name = opts +            .last_start_tag_name +            .take() +            .map(|s| LocalName::from(&*s)); +        let state = opts.initial_state.unwrap_or(states::Data); +        let discard_bom = opts.discard_bom; +        Tokenizer { +            opts, +            sink, +            state, +            char_ref_tokenizer: None, +            at_eof: false, +            current_char: '\0', +            reconsume: false, +            ignore_lf: false, +            discard_bom, +            current_tag_kind: StartTag, +            current_tag_name: StrTendril::new(), +            current_tag_self_closing: false, +            current_tag_attrs: vec![], +            current_attr_name: StrTendril::new(), +            current_attr_value: StrTendril::new(), +            current_comment: StrTendril::new(), +            current_doctype: Doctype::new(), +            last_start_tag_name: start_tag_name, +            temp_buf: StrTendril::new(), +            state_profile: BTreeMap::new(), +            time_in_sink: 0, +            current_line: 1, +        } +    } + +    /// Feed an input string into the tokenizer. 
+    pub fn feed(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> { +        if input.is_empty() { +            return TokenizerResult::Done; +        } + +        if self.discard_bom { +            if let Some(c) = input.peek() { +                if c == '\u{feff}' { +                    input.next(); +                } +            } else { +                return TokenizerResult::Done; +            } +        }; + +        self.run(input) +    } + +    pub fn set_plaintext_state(&mut self) { +        self.state = states::Plaintext; +    } + +    fn process_token(&mut self, token: Token) -> TokenSinkResult<Sink::Handle> { +        if self.opts.profile { +            let (ret, dt) = time!(self.sink.process_token(token, self.current_line)); +            self.time_in_sink += dt; +            ret +        } else { +            self.sink.process_token(token, self.current_line) +        } +    } + +    fn process_token_and_continue(&mut self, token: Token) { +        assert!(matches!( +            self.process_token(token), +            TokenSinkResult::Continue +        )); +    } + +    //§ preprocessing-the-input-stream +    // Get the next input character, which might be the character +    // 'c' that we already consumed from the buffers. 
+    fn get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option<char> { +        if self.ignore_lf { +            self.ignore_lf = false; +            if c == '\n' { +                c = unwrap_or_return!(input.next(), None); +            } +        } + +        if c == '\r' { +            self.ignore_lf = true; +            c = '\n'; +        } + +        if c == '\n' { +            self.current_line += 1; +        } + +        if self.opts.exact_errors && +            match c as u32 { +                0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true, +                n if (n & 0xFFFE) == 0xFFFE => true, +                _ => false, +            } +        { +            let msg = format!("Bad character {}", c); +            self.emit_error(Cow::Owned(msg)); +        } + +        debug!("got character {}", c); +        self.current_char = c; +        Some(c) +    } + +    //§ tokenization +    // Get the next input character, if one is available. +    fn get_char(&mut self, input: &mut BufferQueue) -> Option<char> { +        if self.reconsume { +            self.reconsume = false; +            Some(self.current_char) +        } else { +            input +                .next() +                .and_then(|c| self.get_preprocessed_char(c, input)) +        } +    } + +    fn pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult> { +        // Bail to the slow path for various corner cases. +        // This means that `FromSet` can contain characters not in the set! +        // It shouldn't matter because the fallback `FromSet` case should +        // always do the same thing as the `NotFromSet` case. 
+        if self.opts.exact_errors || self.reconsume || self.ignore_lf { +            return self.get_char(input).map(FromSet); +        } + +        let d = input.pop_except_from(set); +        debug!("got characters {:?}", d); +        match d { +            Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet), + +            // NB: We don't set self.current_char for a run of characters not +            // in the set.  It shouldn't matter for the codepaths that use +            // this. +            _ => d, +        } +    } + +    // Check if the next characters are an ASCII case-insensitive match.  See +    // BufferQueue::eat. +    // +    // NB: this doesn't do input stream preprocessing or set the current input +    // character. +    fn eat( +        &mut self, +        input: &mut BufferQueue, +        pat: &str, +        eq: fn(&u8, &u8) -> bool, +    ) -> Option<bool> { +        input.push_front(replace(&mut self.temp_buf, StrTendril::new())); +        match input.eat(pat, eq) { +            None if self.at_eof => Some(false), +            None => { +                while let Some(c) = input.next() { +                    self.temp_buf.push_char(c); +                } +                None +            }, +            Some(matched) => Some(matched), +        } +    } + +    /// Run the state machine for as long as we can. 
+    fn run(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> { +        if self.opts.profile { +            loop { +                let state = self.state; +                let old_sink = self.time_in_sink; +                let (run, mut dt) = time!(self.step(input)); +                dt -= (self.time_in_sink - old_sink); +                let new = match self.state_profile.get_mut(&state) { +                    Some(x) => { +                        *x += dt; +                        false +                    }, +                    None => true, +                }; +                if new { +                    // do this here because of borrow shenanigans +                    self.state_profile.insert(state, dt); +                } +                match run { +                    ProcessResult::Continue => (), +                    ProcessResult::Suspend => break, +                    ProcessResult::Script(node) => return TokenizerResult::Script(node), +                } +            } +        } else { +            loop { +                match self.step(input) { +                    ProcessResult::Continue => (), +                    ProcessResult::Suspend => break, +                    ProcessResult::Script(node) => return TokenizerResult::Script(node), +                } +            } +        } +        TokenizerResult::Done +    } + +    fn bad_char_error(&mut self) { +        let msg = format_if!( +            self.opts.exact_errors, +            "Bad character", +            "Saw {} in state {:?}", +            self.current_char, +            self.state +        ); +        self.emit_error(msg); +    } + +    fn bad_eof_error(&mut self) { +        let msg = format_if!( +            self.opts.exact_errors, +            "Unexpected EOF", +            "Saw EOF in state {:?}", +            self.state +        ); +        self.emit_error(msg); +    } + +    fn emit_char(&mut self, c: char) { +        self.process_token_and_continue(match c 
{ +            '\0' => NullCharacterToken, +            _ => CharacterTokens(StrTendril::from_char(c)), +        }); +    } + +    // The string must not contain '\0'! +    fn emit_chars(&mut self, b: StrTendril) { +        self.process_token_and_continue(CharacterTokens(b)); +    } + +    fn emit_current_tag(&mut self) -> ProcessResult<Sink::Handle> { +        self.finish_attribute(); + +        let name = LocalName::from(&*self.current_tag_name); +        self.current_tag_name.clear(); + +        match self.current_tag_kind { +            StartTag => { +                self.last_start_tag_name = Some(name.clone()); +            }, +            EndTag => { +                if !self.current_tag_attrs.is_empty() { +                    self.emit_error(Borrowed("Attributes on an end tag")); +                } +                if self.current_tag_self_closing { +                    self.emit_error(Borrowed("Self-closing end tag")); +                } +            }, +        } + +        let token = TagToken(Tag { +            kind: self.current_tag_kind, +            name, +            self_closing: self.current_tag_self_closing, +            attrs: replace(&mut self.current_tag_attrs, vec![]), +        }); + +        match self.process_token(token) { +            TokenSinkResult::Continue => ProcessResult::Continue, +            TokenSinkResult::Plaintext => { +                self.state = states::Plaintext; +                ProcessResult::Continue +            }, +            TokenSinkResult::Script(node) => { +                self.state = states::Data; +                ProcessResult::Script(node) +            }, +            TokenSinkResult::RawData(kind) => { +                self.state = states::RawData(kind); +                ProcessResult::Continue +            }, +        } +    } + +    fn emit_temp_buf(&mut self) { +        // FIXME: Make sure that clearing on emit is spec-compatible. 
+        let buf = replace(&mut self.temp_buf, StrTendril::new()); +        self.emit_chars(buf); +    } + +    fn clear_temp_buf(&mut self) { +        // Do this without a new allocation. +        self.temp_buf.clear(); +    } + +    fn emit_current_comment(&mut self) { +        let comment = replace(&mut self.current_comment, StrTendril::new()); +        self.process_token_and_continue(CommentToken(comment)); +    } + +    fn discard_tag(&mut self) { +        self.current_tag_name.clear(); +        self.current_tag_self_closing = false; +        self.current_tag_attrs = vec![]; +    } + +    fn create_tag(&mut self, kind: TagKind, c: char) { +        self.discard_tag(); +        self.current_tag_name.push_char(c); +        self.current_tag_kind = kind; +    } + +    fn have_appropriate_end_tag(&self) -> bool { +        match self.last_start_tag_name.as_ref() { +            Some(last) => (self.current_tag_kind == EndTag) && (*self.current_tag_name == **last), +            None => false, +        } +    } + +    fn create_attribute(&mut self, c: char) { +        self.finish_attribute(); + +        self.current_attr_name.push_char(c); +    } + +    fn finish_attribute(&mut self) { +        if self.current_attr_name.is_empty() { +            return; +        } + +        // Check for a duplicate attribute. +        // FIXME: the spec says we should error as soon as the name is finished. +        // FIXME: linear time search, do we care? 
+        let dup = { +            let name = &*self.current_attr_name; +            self.current_tag_attrs +                .iter() +                .any(|a| &*a.name.local == name) +        }; + +        if dup { +            self.emit_error(Borrowed("Duplicate attribute")); +            self.current_attr_name.clear(); +            self.current_attr_value.clear(); +        } else { +            let name = LocalName::from(&*self.current_attr_name); +            self.current_attr_name.clear(); +            self.current_tag_attrs.push(Attribute { +                // The tree builder will adjust the namespace if necessary. +                // This only happens in foreign elements. +                name: QualName::new(None, ns!(), name), +                value: replace(&mut self.current_attr_value, StrTendril::new()), +            }); +        } +    } + +    fn emit_current_doctype(&mut self) { +        let doctype = replace(&mut self.current_doctype, Doctype::new()); +        self.process_token_and_continue(DoctypeToken(doctype)); +    } + +    fn doctype_id(&mut self, kind: DoctypeIdKind) -> &mut Option<StrTendril> { +        match kind { +            Public => &mut self.current_doctype.public_id, +            System => &mut self.current_doctype.system_id, +        } +    } + +    fn clear_doctype_id(&mut self, kind: DoctypeIdKind) { +        let id = self.doctype_id(kind); +        match *id { +            Some(ref mut s) => s.clear(), +            None => *id = Some(StrTendril::new()), +        } +    } + +    fn consume_char_ref(&mut self, addnl_allowed: Option<char>) { +        // NB: The char ref tokenizer assumes we have an additional allowed +        // character iff we're tokenizing in an attribute value. 
+        self.char_ref_tokenizer = Some(Box::new(CharRefTokenizer::new(addnl_allowed))); +    } + +    fn emit_eof(&mut self) { +        self.process_token_and_continue(EOFToken); +    } + +    fn peek(&mut self, input: &BufferQueue) -> Option<char> { +        if self.reconsume { +            Some(self.current_char) +        } else { +            input.peek() +        } +    } + +    fn discard_char(&mut self, input: &mut BufferQueue) { +        self.get_char(input); +    } + +    fn emit_error(&mut self, error: Cow<'static, str>) { +        self.process_token_and_continue(ParseError(error)); +    } +} +//§ END + +// Shorthand for common state machine behaviors. +macro_rules! shorthand ( +    ( $me:ident : emit $c:expr                     ) => ( $me.emit_char($c);                                   ); +    ( $me:ident : create_tag $kind:ident $c:expr   ) => ( $me.create_tag($kind, $c);                           ); +    ( $me:ident : push_tag $c:expr                 ) => ( $me.current_tag_name.push_char($c);                  ); +    ( $me:ident : discard_tag                      ) => ( $me.discard_tag();                                   ); +    ( $me:ident : discard_char $input:expr         ) => ( $me.discard_char($input);                            ); +    ( $me:ident : push_temp $c:expr                ) => ( $me.temp_buf.push_char($c);                          ); +    ( $me:ident : emit_temp                        ) => ( $me.emit_temp_buf();                                 ); +    ( $me:ident : clear_temp                       ) => ( $me.clear_temp_buf();                                ); +    ( $me:ident : create_attr $c:expr              ) => ( $me.create_attribute($c);                            ); +    ( $me:ident : push_name $c:expr                ) => ( $me.current_attr_name.push_char($c);                 ); +    ( $me:ident : push_value $c:expr               ) => ( $me.current_attr_value.push_char($c);                ); +    ( $me:ident : append_value 
$c:expr             ) => ( $me.current_attr_value.push_tendril($c);             ); +    ( $me:ident : push_comment $c:expr             ) => ( $me.current_comment.push_char($c);                   ); +    ( $me:ident : append_comment $c:expr           ) => ( $me.current_comment.push_slice($c);                  ); +    ( $me:ident : emit_comment                     ) => ( $me.emit_current_comment();                          ); +    ( $me:ident : clear_comment                    ) => ( $me.current_comment.clear();                         ); +    ( $me:ident : create_doctype                   ) => ( $me.current_doctype = Doctype::new();                ); +    ( $me:ident : push_doctype_name $c:expr        ) => ( option_push(&mut $me.current_doctype.name, $c);      ); +    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push($me.doctype_id($k), $c);                 ); +    ( $me:ident : clear_doctype_id $k:ident        ) => ( $me.clear_doctype_id($k);                            ); +    ( $me:ident : force_quirks                     ) => ( $me.current_doctype.force_quirks = true;             ); +    ( $me:ident : emit_doctype                     ) => ( $me.emit_current_doctype();                          ); +    ( $me:ident : error                            ) => ( $me.bad_char_error();                                ); +    ( $me:ident : error_eof                        ) => ( $me.bad_eof_error();                                 ); +); + +// Tracing of tokenizer actions.  This adds significant bloat and compile time, +// so it's behind a cfg flag. +#[cfg(trace_tokenizer)] +macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({ +    debug!("  {:s}", stringify!($($cmds)*)); +    shorthand!($me:expr : $($cmds)*); +})); + +#[cfg(not(trace_tokenizer))] +macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) ); + +// A little DSL for sequencing shorthand actions. +macro_rules! 
go ( +    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity. +    // We have to tell the parser how much lookahead we need. + +    ( $me:ident : $a:tt                   ; $($rest:tt)* ) => ({ sh_trace!($me: $a);          go!($me: $($rest)*); }); +    ( $me:ident : $a:tt $b:tt             ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b);       go!($me: $($rest)*); }); +    ( $me:ident : $a:tt $b:tt $c:tt       ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c);    go!($me: $($rest)*); }); +    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); }); + +    // These can only come at the end. + +    ( $me:ident : to $s:ident                    ) => ({ $me.state = states::$s; return ProcessResult::Continue;           }); +    ( $me:ident : to $s:ident $k1:expr           ) => ({ $me.state = states::$s($k1); return ProcessResult::Continue;      }); +    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state = states::$s($k1($k2)); return ProcessResult::Continue; }); + +    ( $me:ident : reconsume $s:ident                    ) => ({ $me.reconsume = true; go!($me: to $s);         }); +    ( $me:ident : reconsume $s:ident $k1:expr           ) => ({ $me.reconsume = true; go!($me: to $s $k1);     }); +    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1 $k2); }); + +    ( $me:ident : consume_char_ref             ) => ({ $me.consume_char_ref(None); return ProcessResult::Continue;         }); +    ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return ProcessResult::Continue; }); + +    // We have a default next state after emitting a tag, but the sink can override. 
+    ( $me:ident : emit_tag $s:ident ) => ({ +        $me.state = states::$s; +        return $me.emit_current_tag(); +    }); + +    ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; }); + +    // If nothing else matched, it's a single command +    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+); ); + +    // or nothing. +    ( $me:ident : ) => (()); +); + +macro_rules! go_match ( ( $me:ident : $x:expr, $($pats:pat),+ => $($cmds:tt)* ) => ( +    match $x { +        $($pats)|+ => go!($me: $($cmds)*), +        _ => (), +    } +)); + +// This is a macro because it can cause early return +// from the function where it is used. +macro_rules! get_char ( ($me:expr, $input:expr) => ( +    unwrap_or_return!($me.get_char($input), ProcessResult::Suspend) +)); + +macro_rules! peek ( ($me:expr, $input:expr) => ( +    unwrap_or_return!($me.peek($input), ProcessResult::Suspend) +)); + +macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => ( +    unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend) +)); + +macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => ( +    unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend) +)); + +macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => ( +    unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend) +)); + +impl<Sink: TokenSink> Tokenizer<Sink> { +    // Run the state machine for a while. +    // Return true if we should be immediately re-invoked +    // (this just simplifies control flow vs. break / continue). 
+    #[allow(clippy::never_loop)] +    fn step(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> { +        if self.char_ref_tokenizer.is_some() { +            return self.step_char_ref_tokenizer(input); +        } + +        debug!("processing in state {:?}", self.state); +        match self.state { +            //§ data-state +            states::Data => loop { +                match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) { +                    FromSet('\0') => go!(self: error; emit '\0'), +                    FromSet('&') => go!(self: consume_char_ref), +                    FromSet('<') => go!(self: to TagOpen), +                    FromSet(c) => go!(self: emit c), +                    NotFromSet(b) => self.emit_chars(b), +                } +            }, + +            //§ rcdata-state +            states::RawData(Rcdata) => loop { +                match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) { +                    FromSet('\0') => go!(self: error; emit '\u{fffd}'), +                    FromSet('&') => go!(self: consume_char_ref), +                    FromSet('<') => go!(self: to RawLessThanSign Rcdata), +                    FromSet(c) => go!(self: emit c), +                    NotFromSet(b) => self.emit_chars(b), +                } +            }, + +            //§ rawtext-state +            states::RawData(Rawtext) => loop { +                match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) { +                    FromSet('\0') => go!(self: error; emit '\u{fffd}'), +                    FromSet('<') => go!(self: to RawLessThanSign Rawtext), +                    FromSet(c) => go!(self: emit c), +                    NotFromSet(b) => self.emit_chars(b), +                } +            }, + +            //§ script-data-state +            states::RawData(ScriptData) => loop { +                match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' 
'\n')) { +                    FromSet('\0') => go!(self: error; emit '\u{fffd}'), +                    FromSet('<') => go!(self: to RawLessThanSign ScriptData), +                    FromSet(c) => go!(self: emit c), +                    NotFromSet(b) => self.emit_chars(b), +                } +            }, + +            //§ script-data-escaped-state +            states::RawData(ScriptDataEscaped(Escaped)) => loop { +                match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) { +                    FromSet('\0') => go!(self: error; emit '\u{fffd}'), +                    FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash Escaped), +                    FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped), +                    FromSet(c) => go!(self: emit c), +                    NotFromSet(b) => self.emit_chars(b), +                } +            }, + +            //§ script-data-double-escaped-state +            states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop { +                match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) { +                    FromSet('\0') => go!(self: error; emit '\u{fffd}'), +                    FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped), +                    FromSet('<') => { +                        go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped) +                    }, +                    FromSet(c) => go!(self: emit c), +                    NotFromSet(b) => self.emit_chars(b), +                } +            }, + +            //§ plaintext-state +            states::Plaintext => loop { +                match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) { +                    FromSet('\0') => go!(self: error; emit '\u{fffd}'), +                    FromSet(c) => go!(self: emit c), +                    NotFromSet(b) => self.emit_chars(b), +                } +          
  }, + +            //§ tag-open-state +            states::TagOpen => loop { +                match get_char!(self, input) { +                    '!' => go!(self: clear_temp; to MarkupDeclarationOpen), +                    '/' => go!(self: to EndTagOpen), +                    '?' => go!(self: error; clear_comment; push_comment '?'; to BogusComment), +                    c => match lower_ascii_letter(c) { +                        Some(cl) => go!(self: create_tag StartTag cl; to TagName), +                        None => go!(self: error; emit '<'; reconsume Data), +                    }, +                } +            }, + +            //§ end-tag-open-state +            states::EndTagOpen => loop { +                match get_char!(self, input) { +                    '>' => go!(self: error; to Data), +                    '\0' => { +                        go!(self: error; clear_comment; push_comment '\u{fffd}'; to BogusComment) +                    }, +                    c => match lower_ascii_letter(c) { +                        Some(cl) => go!(self: create_tag EndTag cl; to TagName), +                        None => go!(self: error; clear_comment; push_comment c; to BogusComment), +                    }, +                } +            }, + +            //§ tag-name-state +            states::TagName => loop { +                match get_char!(self, input) { +                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName), +                    '/' => go!(self: to SelfClosingStartTag), +                    '>' => go!(self: emit_tag Data), +                    '\0' => go!(self: error; push_tag '\u{fffd}'), +                    c => go!(self: push_tag (c.to_ascii_lowercase())), +                } +            }, + +            //§ script-data-escaped-less-than-sign-state +            states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop { +                match get_char!(self, input) { +                    '/' => go!(self: clear_temp; to 
RawEndTagOpen ScriptDataEscaped Escaped), +                    c => match lower_ascii_letter(c) { +                        Some(cl) => go!(self: clear_temp; push_temp cl; emit '<'; emit c; +                                    to ScriptDataEscapeStart DoubleEscaped), +                        None => go!(self: emit '<'; reconsume RawData ScriptDataEscaped Escaped), +                    }, +                } +            }, + +            //§ script-data-double-escaped-less-than-sign-state +            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop { +                match get_char!(self, input) { +                    '/' => go!(self: clear_temp; emit '/'; to ScriptDataDoubleEscapeEnd), +                    _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped), +                } +            }, + +            //§ rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state +            // otherwise +            states::RawLessThanSign(kind) => loop { +                match get_char!(self, input) { +                    '/' => go!(self: clear_temp; to RawEndTagOpen kind), +                    '!' 
if kind == ScriptData => { +                        go!(self: emit '<'; emit '!'; to ScriptDataEscapeStart Escaped) +                    }, +                    _ => go!(self: emit '<'; reconsume RawData kind), +                } +            }, + +            //§ rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state +            states::RawEndTagOpen(kind) => loop { +                let c = get_char!(self, input); +                match lower_ascii_letter(c) { +                    Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind), +                    None => go!(self: emit '<'; emit '/'; reconsume RawData kind), +                } +            }, + +            //§ rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state +            states::RawEndTagName(kind) => loop { +                let c = get_char!(self, input); +                if self.have_appropriate_end_tag() { +                    match c { +                        '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName), +                        '/' => go!(self: to SelfClosingStartTag), +                        '>' => go!(self: emit_tag Data), +                        _ => (), +                    } +                } + +                match lower_ascii_letter(c) { +                    Some(cl) => go!(self: push_tag cl; push_temp c), +                    None => { +                        go!(self: discard_tag; emit '<'; emit '/'; emit_temp; reconsume RawData kind) +                    }, +                } +            }, + +            //§ script-data-double-escape-start-state +            states::ScriptDataEscapeStart(DoubleEscaped) => loop { +                let c = get_char!(self, input); +                match c { +                    '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => { +                        let esc = if 
&*self.temp_buf == "script" { +                            DoubleEscaped +                        } else { +                            Escaped +                        }; +                        go!(self: emit c; to RawData ScriptDataEscaped esc); +                    }, +                    _ => match lower_ascii_letter(c) { +                        Some(cl) => go!(self: push_temp cl; emit c), +                        None => go!(self: reconsume RawData ScriptDataEscaped Escaped), +                    }, +                } +            }, + +            //§ script-data-escape-start-state +            states::ScriptDataEscapeStart(Escaped) => loop { +                match get_char!(self, input) { +                    '-' => go!(self: emit '-'; to ScriptDataEscapeStartDash), +                    _ => go!(self: reconsume RawData ScriptData), +                } +            }, + +            //§ script-data-escape-start-dash-state +            states::ScriptDataEscapeStartDash => loop { +                match get_char!(self, input) { +                    '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash Escaped), +                    _ => go!(self: reconsume RawData ScriptData), +                } +            }, + +            //§ script-data-escaped-dash-state script-data-double-escaped-dash-state +            states::ScriptDataEscapedDash(kind) => loop { +                match get_char!(self, input) { +                    '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash kind), +                    '<' => { +                        if kind == DoubleEscaped { +                            go!(self: emit '<'); +                        } +                        go!(self: to RawLessThanSign ScriptDataEscaped kind); +                    }, +                    '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind), +                    c => go!(self: emit c; to RawData ScriptDataEscaped kind), +                } +            }, + +    
        //§ script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state +            states::ScriptDataEscapedDashDash(kind) => loop { +                match get_char!(self, input) { +                    '-' => go!(self: emit '-'), +                    '<' => { +                        if kind == DoubleEscaped { +                            go!(self: emit '<'); +                        } +                        go!(self: to RawLessThanSign ScriptDataEscaped kind); +                    }, +                    '>' => go!(self: emit '>'; to RawData ScriptData), +                    '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind), +                    c => go!(self: emit c; to RawData ScriptDataEscaped kind), +                } +            }, + +            //§ script-data-double-escape-end-state +            states::ScriptDataDoubleEscapeEnd => loop { +                let c = get_char!(self, input); +                match c { +                    '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => { +                        let esc = if &*self.temp_buf == "script" { +                            Escaped +                        } else { +                            DoubleEscaped +                        }; +                        go!(self: emit c; to RawData ScriptDataEscaped esc); +                    }, +                    _ => match lower_ascii_letter(c) { +                        Some(cl) => go!(self: push_temp cl; emit c), +                        None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped), +                    }, +                } +            }, + +            //§ before-attribute-name-state +            states::BeforeAttributeName => loop { +                match get_char!(self, input) { +                    '\t' | '\n' | '\x0C' | ' ' => (), +                    '/' => go!(self: to SelfClosingStartTag), +                    '>' => go!(self: emit_tag Data), +                    '\0' => 
go!(self: error; create_attr '\u{fffd}'; to AttributeName), +                    c => match lower_ascii_letter(c) { +                        Some(cl) => go!(self: create_attr cl; to AttributeName), +                        None => { +                            go_match!(self: c, +                            '"' , '\'' , '<' , '=' => error); +                            go!(self: create_attr c; to AttributeName); +                        }, +                    }, +                } +            }, + +            //§ attribute-name-state +            states::AttributeName => loop { +                match get_char!(self, input) { +                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName), +                    '/' => go!(self: to SelfClosingStartTag), +                    '=' => go!(self: to BeforeAttributeValue), +                    '>' => go!(self: emit_tag Data), +                    '\0' => go!(self: error; push_name '\u{fffd}'), +                    c => match lower_ascii_letter(c) { +                        Some(cl) => go!(self: push_name cl), +                        None => { +                            go_match!(self: c, +                            '"' , '\'' , '<' => error); +                            go!(self: push_name c); +                        }, +                    }, +                } +            }, + +            //§ after-attribute-name-state +            states::AfterAttributeName => loop { +                match get_char!(self, input) { +                    '\t' | '\n' | '\x0C' | ' ' => (), +                    '/' => go!(self: to SelfClosingStartTag), +                    '=' => go!(self: to BeforeAttributeValue), +                    '>' => go!(self: emit_tag Data), +                    '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName), +                    c => match lower_ascii_letter(c) { +                        Some(cl) => go!(self: create_attr cl; to AttributeName), +                      
  None => { +                            go_match!(self: c, +                            '"' , '\'' , '<' => error); +                            go!(self: create_attr c; to AttributeName); +                        }, +                    }, +                } +            }, + +            //§ before-attribute-value-state +            // Use peek so we can handle the first attr character along with the rest, +            // hopefully in the same zero-copy buffer. +            states::BeforeAttributeValue => loop { +                match peek!(self, input) { +                    '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input), +                    '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted), +                    '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted), +                    '\0' => { +                        go!(self: discard_char input; error; push_value '\u{fffd}'; to AttributeValue Unquoted) +                    }, +                    '>' => go!(self: discard_char input; error; emit_tag Data), +                    _ => go!(self: to AttributeValue Unquoted), +                } +            }, + +            //§ attribute-value-(double-quoted)-state +            states::AttributeValue(DoubleQuoted) => loop { +                match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) { +                    FromSet('"') => go!(self: to AfterAttributeValueQuoted), +                    FromSet('&') => go!(self: consume_char_ref '"'), +                    FromSet('\0') => go!(self: error; push_value '\u{fffd}'), +                    FromSet(c) => go!(self: push_value c), +                    NotFromSet(ref b) => go!(self: append_value b), +                } +            }, + +            //§ attribute-value-(single-quoted)-state +            states::AttributeValue(SingleQuoted) => loop { +                match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' 
'\n')) { +                    FromSet('\'') => go!(self: to AfterAttributeValueQuoted), +                    FromSet('&') => go!(self: consume_char_ref '\''), +                    FromSet('\0') => go!(self: error; push_value '\u{fffd}'), +                    FromSet(c) => go!(self: push_value c), +                    NotFromSet(ref b) => go!(self: append_value b), +                } +            }, + +            //§ attribute-value-(unquoted)-state +            states::AttributeValue(Unquoted) => loop { +                match pop_except_from!( +                    self, +                    input, +                    small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0') +                ) { +                    FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => { +                        go!(self: to BeforeAttributeName) +                    }, +                    FromSet('&') => go!(self: consume_char_ref '>'), +                    FromSet('>') => go!(self: emit_tag Data), +                    FromSet('\0') => go!(self: error; push_value '\u{fffd}'), +                    FromSet(c) => { +                        go_match!(self: c, +                            '"' , '\'' , '<' , '=' , '`' => error); +                        go!(self: push_value c); +                    }, +                    NotFromSet(ref b) => go!(self: append_value b), +                } +            }, + +            //§ after-attribute-value-(quoted)-state +            states::AfterAttributeValueQuoted => loop { +                match get_char!(self, input) { +                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName), +                    '/' => go!(self: to SelfClosingStartTag), +                    '>' => go!(self: emit_tag Data), +                    _ => go!(self: error; reconsume BeforeAttributeName), +                } +            }, + +            //§ self-closing-start-tag-state +            states::SelfClosingStartTag => loop { +          
      match get_char!(self, input) { +                    '>' => { +                        self.current_tag_self_closing = true; +                        go!(self: emit_tag Data); +                    }, +                    _ => go!(self: error; reconsume BeforeAttributeName), +                } +            }, + +            //§ comment-start-state +            states::CommentStart => loop { +                match get_char!(self, input) { +                    '-' => go!(self: to CommentStartDash), +                    '\0' => go!(self: error; push_comment '\u{fffd}'; to Comment), +                    '>' => go!(self: error; emit_comment; to Data), +                    c => go!(self: push_comment c; to Comment), +                } +            }, + +            //§ comment-start-dash-state +            states::CommentStartDash => loop { +                match get_char!(self, input) { +                    '-' => go!(self: to CommentEnd), +                    '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment), +                    '>' => go!(self: error; emit_comment; to Data), +                    c => go!(self: push_comment '-'; push_comment c; to Comment), +                } +            }, + +            //§ comment-state +            states::Comment => loop { +                match get_char!(self, input) { +                    '-' => go!(self: to CommentEndDash), +                    '\0' => go!(self: error; push_comment '\u{fffd}'), +                    c => go!(self: push_comment c), +                } +            }, + +            //§ comment-end-dash-state +            states::CommentEndDash => loop { +                match get_char!(self, input) { +                    '-' => go!(self: to CommentEnd), +                    '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment), +                    c => go!(self: push_comment '-'; push_comment c; to Comment), +                } +            }, + +            //§ comment-end-state +  
          states::CommentEnd => loop { +                match get_char!(self, input) { +                    '>' => go!(self: emit_comment; to Data), +                    '\0' => go!(self: error; append_comment "--\u{fffd}"; to Comment), +                    '!' => go!(self: error; to CommentEndBang), +                    '-' => go!(self: error; push_comment '-'), +                    c => go!(self: error; append_comment "--"; push_comment c; to Comment), +                } +            }, + +            //§ comment-end-bang-state +            states::CommentEndBang => loop { +                match get_char!(self, input) { +                    '-' => go!(self: append_comment "--!"; to CommentEndDash), +                    '>' => go!(self: emit_comment; to Data), +                    '\0' => go!(self: error; append_comment "--!\u{fffd}"; to Comment), +                    c => go!(self: append_comment "--!"; push_comment c; to Comment), +                } +            }, + +            //§ doctype-state +            states::Doctype => loop { +                match get_char!(self, input) { +                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName), +                    _ => go!(self: error; reconsume BeforeDoctypeName), +                } +            }, + +            //§ before-doctype-name-state +            states::BeforeDoctypeName => loop { +                match get_char!(self, input) { +                    '\t' | '\n' | '\x0C' | ' ' => (), +                    '\0' => { +                        go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName) +                    }, +                    '>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data), +                    c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase()); +                                  to DoctypeName), +                } +            }, + +            //§ doctype-name-state +            
states::DoctypeName => loop { +                match get_char!(self, input) { +                    '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName), +                    '>' => go!(self: emit_doctype; to Data), +                    '\0' => go!(self: error; push_doctype_name '\u{fffd}'), +                    c => go!(self: push_doctype_name (c.to_ascii_lowercase())), +                } +            }, + +            //§ after-doctype-name-state +            states::AfterDoctypeName => loop { +                if eat!(self, input, "public") { +                    go!(self: to AfterDoctypeKeyword Public); +                } else if eat!(self, input, "system") { +                    go!(self: to AfterDoctypeKeyword System); +                } else { +                    match get_char!(self, input) { +                        '\t' | '\n' | '\x0C' | ' ' => (), +                        '>' => go!(self: emit_doctype; to Data), +                        _ => go!(self: error; force_quirks; to BogusDoctype), +                    } +                } +            }, + +            //§ after-doctype-public-keyword-state after-doctype-system-keyword-state +            states::AfterDoctypeKeyword(kind) => loop { +                match get_char!(self, input) { +                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind), +                    '"' => { +                        go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind) +                    }, +                    '\'' => { +                        go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind) +                    }, +                    '>' => go!(self: error; force_quirks; emit_doctype; to Data), +                    _ => go!(self: error; force_quirks; to BogusDoctype), +                } +            }, + +            //§ before-doctype-public-identifier-state before-doctype-system-identifier-state +          
  states::BeforeDoctypeIdentifier(kind) => loop { +                match get_char!(self, input) { +                    '\t' | '\n' | '\x0C' | ' ' => (), +                    '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind), +                    '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind), +                    '>' => go!(self: error; force_quirks; emit_doctype; to Data), +                    _ => go!(self: error; force_quirks; to BogusDoctype), +                } +            }, + +            //§ doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state +            states::DoctypeIdentifierDoubleQuoted(kind) => loop { +                match get_char!(self, input) { +                    '"' => go!(self: to AfterDoctypeIdentifier kind), +                    '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'), +                    '>' => go!(self: error; force_quirks; emit_doctype; to Data), +                    c => go!(self: push_doctype_id kind c), +                } +            }, + +            //§ doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state +            states::DoctypeIdentifierSingleQuoted(kind) => loop { +                match get_char!(self, input) { +                    '\'' => go!(self: to AfterDoctypeIdentifier kind), +                    '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'), +                    '>' => go!(self: error; force_quirks; emit_doctype; to Data), +                    c => go!(self: push_doctype_id kind c), +                } +            }, + +            //§ after-doctype-public-identifier-state +            states::AfterDoctypeIdentifier(Public) => loop { +                match get_char!(self, input) { +                    '\t' | '\n' | '\x0C' | ' ' => { +                        go!(self: to BetweenDoctypePublicAndSystemIdentifiers) +                    }, +     
               '>' => go!(self: emit_doctype; to Data), +                    '"' => { +                        go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System) +                    }, +                    '\'' => { +                        go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System) +                    }, +                    _ => go!(self: error; force_quirks; to BogusDoctype), +                } +            }, + +            //§ after-doctype-system-identifier-state +            states::AfterDoctypeIdentifier(System) => loop { +                match get_char!(self, input) { +                    '\t' | '\n' | '\x0C' | ' ' => (), +                    '>' => go!(self: emit_doctype; to Data), +                    _ => go!(self: error; to BogusDoctype), +                } +            }, + +            //§ between-doctype-public-and-system-identifiers-state +            states::BetweenDoctypePublicAndSystemIdentifiers => loop { +                match get_char!(self, input) { +                    '\t' | '\n' | '\x0C' | ' ' => (), +                    '>' => go!(self: emit_doctype; to Data), +                    '"' => { +                        go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System) +                    }, +                    '\'' => { +                        go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System) +                    }, +                    _ => go!(self: error; force_quirks; to BogusDoctype), +                } +            }, + +            //§ bogus-doctype-state +            states::BogusDoctype => loop { +                match get_char!(self, input) { +                    '>' => go!(self: emit_doctype; to Data), +                    _ => (), +                } +            }, + +            //§ bogus-comment-state +            states::BogusComment => loop { +                match get_char!(self, input) { +          
          '>' => go!(self: emit_comment; to Data), +                    '\0' => go!(self: push_comment '\u{fffd}'), +                    c => go!(self: push_comment c), +                } +            }, + +            //§ markup-declaration-open-state +            states::MarkupDeclarationOpen => loop { +                if eat_exact!(self, input, "--") { +                    go!(self: clear_comment; to CommentStart); +                } else if eat!(self, input, "doctype") { +                    go!(self: to Doctype); +                } else { +                    if self +                        .sink +                        .adjusted_current_node_present_but_not_in_html_namespace() +                    { +                        if eat_exact!(self, input, "[CDATA[") { +                            go!(self: clear_temp; to CdataSection); +                        } +                    } +                    go!(self: error; to BogusComment); +                } +            }, + +            //§ cdata-section-state +            states::CdataSection => loop { +                match get_char!(self, input) { +                    ']' => go!(self: to CdataSectionBracket), +                    '\0' => go!(self: emit_temp; emit '\0'), +                    c => go!(self: push_temp c), +                } +            }, + +            //§ cdata-section-bracket +            states::CdataSectionBracket => match get_char!(self, input) { +                ']' => go!(self: to CdataSectionEnd), +                _ => go!(self: push_temp ']'; reconsume CdataSection), +            }, + +            //§ cdata-section-end +            states::CdataSectionEnd => loop { +                match get_char!(self, input) { +                    ']' => go!(self: push_temp ']'), +                    '>' => go!(self: emit_temp; to Data), +                    _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection), +                } +            }, +            //§ END +        } +    
}

    /// Advance the character-reference sub-tokenizer by one step.
    ///
    /// The sub-tokenizer is taken out of `self.char_ref_tokenizer` for the
    /// duration of the call and only put back when it has not finished:
    ///
    /// * `char_ref::Done`: the result is handed to `process_char_ref`, the
    ///   sub-tokenizer is dropped, and `ProcessResult::Continue` is returned.
    /// * `char_ref::Stuck`: the sub-tokenizer is restored and
    ///   `ProcessResult::Suspend` is returned.
    /// * `char_ref::Progress`: the sub-tokenizer is restored and
    ///   `ProcessResult::Continue` is returned.
    ///
    /// # Panics
    ///
    /// Panics (via `unwrap`) if `self.char_ref_tokenizer` is `None`.
    fn step_char_ref_tokenizer(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> {
        // FIXME HACK: Take and replace the tokenizer so we don't
        // double-mut-borrow self.  This is why it's boxed.
        let mut tok = self.char_ref_tokenizer.take().unwrap();
        let outcome = tok.step(self, input);

        let progress = match outcome {
            char_ref::Done => {
                // Finished: consume the result and return early so the
                // sub-tokenizer is NOT stored back into `self`.
                self.process_char_ref(tok.get_result());
                return ProcessResult::Continue;
            },

            char_ref::Stuck => ProcessResult::Suspend,
            char_ref::Progress => ProcessResult::Continue,
        };

        // Not finished yet: keep the sub-tokenizer for the next step.
        self.char_ref_tokenizer = Some(tok);
        progress
    }

    /// Feed the characters of a finished character reference back into the
    /// main tokenizer.
    ///
    /// An empty result (`num_chars == 0`) is replaced by a single literal
    /// `&`. Each character is then emitted as a character token when the
    /// tokenizer is in `Data` or `RawData(Rcdata)`, or appended to the
    /// current attribute value when it is in `AttributeValue(_)`.
    ///
    /// # Panics
    ///
    /// Panics if the tokenizer is in any other state.
    fn process_char_ref(&mut self, char_ref: CharRef) {
        let CharRef {
            mut chars,
            mut num_chars,
        } = char_ref;

        // Nothing was produced: fall back to the literal ampersand.
        if num_chars == 0 {
            chars[0] = '&';
            num_chars = 1;
        }

        for i in 0..num_chars {
            let c = chars[i as usize];
            match self.state {
                states::Data | states::RawData(states::Rcdata) => go!(self: emit c),

                states::AttributeValue(_) => go!(self: push_value c),

                _ => panic!(
                    "state {:?} should not be reachable in process_char_ref",
                    self.state
                ),
            }
        }
    }

    /// Indicate that we have reached the end of the input.
    ///
    /// Finishes any in-flight character reference, runs the tokenizer over
    /// an empty `BufferQueue` with `at_eof` set, then calls `eof_step`
    /// repeatedly until it returns `ProcessResult::Suspend`. Finally the
    /// sink's `end` is called and, if `opts.profile` is set, the profile is
    /// printed.
    ///
    /// # Panics
    ///
    /// Panics if the final `run` does not return `TokenizerResult::Done`, or
    /// if the local input queue is not empty afterwards.
    pub fn end(&mut self) {
        // Handle EOF in the char ref sub-tokenizer, if there is one.
        // Do this first because it might un-consume stuff.
        let mut input = BufferQueue::new();
        match self.char_ref_tokenizer.take() {
            None => (),
            Some(mut tok) => {
                tok.end_of_file(self, &mut input);
                self.process_char_ref(tok.get_result());
            },
        }

        // Process all remaining buffered input.
        // If we're waiting for lookahead, we're not gonna get it.
        self.at_eof = true;
        assert!(matches!(self.run(&mut input), TokenizerResult::Done));
        assert!(input.is_empty());

        // Apply EOF transitions one at a time until the state machine stops.
        loop {
            match self.eof_step() {
                ProcessResult::Continue => (),
                ProcessResult::Suspend => break,
                ProcessResult::Script(_) => unreachable!(),
            }
        }

        self.sink.end();

        if self.opts.profile {
            self.dump_profile();
        }
    }

    /// Print the per-state timing profile to stdout.
    ///
    /// Entries of `self.state_profile` are listed in descending order of
    /// time, each with its share of the summed tokenizer time. The time
    /// recorded in `self.time_in_sink` is printed separately.
    fn dump_profile(&self) {
        let mut results: Vec<(states::State, u64)> =
            self.state_profile.iter().map(|(s, t)| (*s, *t)).collect();
        // `y.cmp(&x)` (reversed operands) sorts the largest time first.
        results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));

        let total: u64 = results
            .iter()
            .map(|&(_, t)| t)
            .fold(0, ::std::ops::Add::add);
        println!("\nTokenizer profile, in nanoseconds");
        println!("\n{:12}         total in token sink", self.time_in_sink);
        println!("\n{:12}         total in tokenizer", total);

        for (k, v) in results.into_iter() {
            let pct = 100.0 * (v as f64) / (total as f64);
            println!("{:12}  {:4.1}%  {:?}", v, pct, k);
        }
    }

    /// Perform a single end-of-file transition from the current state.
    ///
    /// `end` calls this in a loop until it returns
    /// `ProcessResult::Suspend`. Each arm runs a sequence of `go!` actions;
    /// the result of the call comes from the `go!` macro, which is defined
    /// outside this block.
    // NOTE(review): presumably `go!(self: eof)` is what finally yields
    // `Suspend` and `to <State>` yields `Continue` — confirm against the
    // macro definition.
    fn eof_step(&mut self) -> ProcessResult<Sink::Handle> {
        debug!("processing EOF in state {:?}", self.state);
        match self.state {
            // Text-like states: run the `eof` action directly.
            states::Data |
            states::RawData(Rcdata) |
            states::RawData(Rawtext) |
            states::RawData(ScriptData) |
            states::Plaintext => go!(self: eof),

            // Inside a tag or escaped script data: report the EOF error and
            // go back to `Data`.
            states::TagName |
            states::RawData(ScriptDataEscaped(_)) |
            states::BeforeAttributeName |
            states::AttributeName |
            states::AfterAttributeName |
            states::BeforeAttributeValue |
            states::AttributeValue(_) |
            states::AfterAttributeValueQuoted |
            states::SelfClosingStartTag |
            states::ScriptDataEscapedDash(_) |
            states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data),

            // A pending `<` or `</` is emitted as character data.
            states::TagOpen => go!(self: error_eof; emit '<'; to Data),

            states::EndTagOpen => go!(self: error_eof; emit '<'; emit '/'; to Data),

            // This specific arm must precede the general `RawLessThanSign`
            // arm below; here no `<` is emitted.
            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            states::RawLessThanSign(kind) => go!(self: emit '<'; to RawData kind),

            states::RawEndTagOpen(kind) => go!(self: emit '<'; emit '/'; to RawData kind),

            states::RawEndTagName(kind) => {
                go!(self: emit '<'; emit '/'; emit_temp; to RawData kind)
            },

            states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),

            states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData),

            states::ScriptDataDoubleEscapeEnd => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            // Unterminated comment: report the error and emit what we have.
            states::CommentStart |
            states::CommentStartDash |
            states::Comment |
            states::CommentEndDash |
            states::CommentEnd |
            states::CommentEndBang => go!(self: error_eof; emit_comment; to Data),

            // No doctype token exists yet in these two states, so one is
            // created before it is emitted.
            states::Doctype | states::BeforeDoctypeName => {
                go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data)
            },

            states::DoctypeName |
            states::AfterDoctypeName |
            states::AfterDoctypeKeyword(_) |
            states::BeforeDoctypeIdentifier(_) |
            states::DoctypeIdentifierDoubleQuoted(_) |
            states::DoctypeIdentifierSingleQuoted(_) |
            states::AfterDoctypeIdentifier(_) |
            states::BetweenDoctypePublicAndSystemIdentifiers => {
                go!(self: error_eof; force_quirks; emit_doctype; to Data)
            },

            // Bogus constructs are emitted without an additional EOF error.
            states::BogusDoctype => go!(self: emit_doctype; to Data),

            states::BogusComment => go!(self: emit_comment; to Data),

            states::MarkupDeclarationOpen => go!(self: error; to BogusComment),

            states::CdataSection => go!(self: emit_temp; error_eof; to Data),

            // Pending `]` characters are pushed back into the temp buffer
            // before returning to `CdataSection`.
            states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection),

            states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
        }
    }
}

#[cfg(test)]
#[allow(non_snake_case)]
mod test {
    use super::option_push; // private items
    use crate::tendril::{SliceExt, StrTendril};

    use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};

    use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
    use super::interface::{EndTag, StartTag, Tag, TagKind};
    use super::interface::{TagToken, Token};

    use markup5ever::buffer_queue::BufferQueue;
    use std::mem::replace;

    use crate::LocalName;

    // LinesMatch implements the TokenSink trait. It is used for testing to see
    // if current_line is being updated when process_token is called. The lines
    // vector is a collection of the line numbers that each token is on.
+    struct LinesMatch { +        tokens: Vec<Token>, +        current_str: StrTendril, +        lines: Vec<(Token, u64)>, +    } + +    impl LinesMatch { +        fn new() -> LinesMatch { +            LinesMatch { +                tokens: vec![], +                current_str: StrTendril::new(), +                lines: vec![], +            } +        } + +        fn push(&mut self, token: Token, line_number: u64) { +            self.finish_str(); +            self.lines.push((token, line_number)); +        } + +        fn finish_str(&mut self) { +            if self.current_str.len() > 0 { +                let s = replace(&mut self.current_str, StrTendril::new()); +                self.tokens.push(CharacterTokens(s)); +            } +        } +    } + +    impl TokenSink for LinesMatch { +        type Handle = (); + +        fn process_token( +            &mut self, +            token: Token, +            line_number: u64, +        ) -> TokenSinkResult<Self::Handle> { +            match token { +                CharacterTokens(b) => { +                    self.current_str.push_slice(&b); +                }, + +                NullCharacterToken => { +                    self.current_str.push_char('\0'); +                }, + +                ParseError(_) => { +                    panic!("unexpected parse error"); +                }, + +                TagToken(mut t) => { +                    // The spec seems to indicate that one can emit +                    // erroneous end tags with attrs, but the test +                    // cases don't contain them. 
+                    match t.kind { +                        EndTag => { +                            t.self_closing = false; +                            t.attrs = vec![]; +                        }, +                        _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)), +                    } +                    self.push(TagToken(t), line_number); +                }, + +                EOFToken => (), + +                _ => self.push(token, line_number), +            } +            TokenSinkResult::Continue +        } +    } + +    // Take in tokens, process them, and return vector with line +    // numbers that each token is on +    fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> { +        let sink = LinesMatch::new(); +        let mut tok = Tokenizer::new(sink, opts); +        let mut buffer = BufferQueue::new(); +        for chunk in input.into_iter() { +            buffer.push_back(chunk); +            let _ = tok.feed(&mut buffer); +        } +        tok.end(); +        tok.sink.lines +    } + +    // Create a tag token +    fn create_tag(token: StrTendril, tagkind: TagKind) -> Token { +        let name = LocalName::from(&*token); +        let token = TagToken(Tag { +            kind: tagkind, +            name, +            self_closing: false, +            attrs: vec![], +        }); +        token +    } + +    #[test] +    fn push_to_None_gives_singleton() { +        let mut s: Option<StrTendril> = None; +        option_push(&mut s, 'x'); +        assert_eq!(s, Some("x".to_tendril())); +    } + +    #[test] +    fn push_to_empty_appends() { +        let mut s: Option<StrTendril> = Some(StrTendril::new()); +        option_push(&mut s, 'x'); +        assert_eq!(s, Some("x".to_tendril())); +    } + +    #[test] +    fn push_to_nonempty_appends() { +        let mut s: Option<StrTendril> = Some(StrTendril::from_slice("y")); +        option_push(&mut s, 'x'); +        assert_eq!(s, Some("yx".to_tendril())); +    } + +  
  #[test]
    fn check_lines() {
        // One tag per chunk, each terminated by a bare line feed.
        let chunks: Vec<StrTendril> = ["<a>\n", "<b>\n", "</b>\n", "</a>\n"]
            .iter()
            .map(|&text| StrTendril::from(text))
            .collect();
        // Each tag is expected on its own, consecutive line.
        let expected: Vec<(Token, u64)> = vec![
            ("a", StartTag, 1),
            ("b", StartTag, 2),
            ("b", EndTag, 3),
            ("a", EndTag, 4),
        ]
        .into_iter()
        .map(|(name, kind, line)| (create_tag(StrTendril::from(name), kind), line))
        .collect();
        let opts = TokenizerOpts {
            exact_errors: false,
            discard_bom: true,
            profile: false,
            initial_state: None,
            last_start_tag_name: None,
        };
        assert_eq!(tokenize(chunks, opts), expected);
    }

    #[test]
    fn check_lines_with_new_line() {
        // Same document, but every line ends in CR LF; the expected line
        // numbers are the same as with bare LF.
        let chunks: Vec<StrTendril> = ["<a>\r\n", "<b>\r\n", "</b>\r\n", "</a>\r\n"]
            .iter()
            .map(|&text| StrTendril::from(text))
            .collect();
        let expected: Vec<(Token, u64)> = vec![
            ("a", StartTag, 1),
            ("b", StartTag, 2),
            ("b", EndTag, 3),
            ("a", EndTag, 4),
        ]
        .into_iter()
        .map(|(name, kind, line)| (create_tag(StrTendril::from(name), kind), line))
        .collect();
        let opts = TokenizerOpts {
            exact_errors: false,
            discard_bom: true,
            profile: false,
            initial_state: None,
            last_start_tag_name: None,
        };
        assert_eq!(tokenize(chunks, opts), expected);
    }
}
diff --git a/src/tokenizer/states.rs b/src/tokenizer/states.rs new file mode 100644 index 0000000..d455e9a --- /dev/null +++ b/src/tokenizer/states.rs @@ -0,0 +1,93 @@
// Copyright 2014-2017 The html5ever Project Developers.
// See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Tokenizer states.
//!
//! This is public for use by the tokenizer tests.  Other library
//! users should not have to care about this.

// Re-export every variant so callers can name them without the enum prefix.
pub use self::AttrValueKind::*;
pub use self::DoctypeIdKind::*;
pub use self::RawKind::*;
pub use self::ScriptEscapeKind::*;
pub use self::State::*;

/// Distinguishes the escaped and double-escaped flavours of script data.
///
/// Carried by `RawKind::ScriptDataEscaped` and by the
/// `ScriptDataEscapeStart` / `ScriptDataEscapedDash` /
/// `ScriptDataEscapedDashDash` states.
#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)]
pub enum ScriptEscapeKind {
    Escaped,
    DoubleEscaped,
}

/// Selects which DOCTYPE identifier (public or system) a doctype state is
/// working on, so that one `State` variant can cover both forms.
#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)]
pub enum DoctypeIdKind {
    Public,
    System,
}

/// The kind of raw (non-markup) text handled by the `Raw*` states.
#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)]
pub enum RawKind {
    Rcdata,
    Rawtext,
    ScriptData,
    /// Script data inside an escape; the payload says whether it is
    /// singly or doubly escaped.
    ScriptDataEscaped(ScriptEscapeKind),
}

/// Payload of `State::AttributeValue`.
// NOTE(review): presumably the quoting style of the attribute value being
// tokenized — confirm against the tokenizer's attribute-value handling.
#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)]
pub enum AttrValueKind {
    Unquoted,
    SingleQuoted,
    DoubleQuoted,
}

/// The states of the tokenizer's state machine.
///
/// Variants that carry a kind (`RawKind`, `ScriptEscapeKind`,
/// `AttrValueKind`, `DoctypeIdKind`) stand for a family of closely related
/// states that differ only in that kind.
///
/// `PartialOrd`/`Ord` are derived, so the ordering of values follows the
/// declaration order of the variants below; reordering them changes it.
#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)]
pub enum State {
    // Character data and tag names.
    Data,
    Plaintext,
    TagOpen,
    EndTagOpen,
    TagName,
    // Raw text; the payload selects which kind.
    RawData(RawKind),
    RawLessThanSign(RawKind),
    RawEndTagOpen(RawKind),
    RawEndTagName(RawKind),
    // Script-data escape handling.
    ScriptDataEscapeStart(ScriptEscapeKind),
    ScriptDataEscapeStartDash,
    ScriptDataEscapedDash(ScriptEscapeKind),
    ScriptDataEscapedDashDash(ScriptEscapeKind),
    ScriptDataDoubleEscapeEnd,
    // Attributes.
    BeforeAttributeName,
    AttributeName,
    AfterAttributeName,
    BeforeAttributeValue,
    AttributeValue(AttrValueKind),
    AfterAttributeValueQuoted,
    SelfClosingStartTag,
    // Comments and markup declarations.
    BogusComment,
    MarkupDeclarationOpen,
    CommentStart,
    CommentStartDash,
    Comment,
    CommentEndDash,
    CommentEnd,
    CommentEndBang,
    // DOCTYPE; the payload selects the public or system form.
    Doctype,
    BeforeDoctypeName,
    DoctypeName,
    AfterDoctypeName,
    AfterDoctypeKeyword(DoctypeIdKind),
    BeforeDoctypeIdentifier(DoctypeIdKind),
    DoctypeIdentifierDoubleQuoted(DoctypeIdKind),
    DoctypeIdentifierSingleQuoted(DoctypeIdKind),
    AfterDoctypeIdentifier(DoctypeIdKind),
    BetweenDoctypePublicAndSystemIdentifiers,
    BogusDoctype,
    // CDATA sections.
    CdataSection,
    CdataSectionBracket,
    CdataSectionEnd,
}
