author    Martin Fischer <martin@push-f.com>  2021-04-08 08:42:01 +0200
committer Martin Fischer <martin@push-f.com>  2021-04-08 15:40:37 +0200
commit    57e7eefcbe6fb8c3dc4b01c707be9de4c34963a7 (patch)
tree      6a9d296389bf3023396592c8514ed6712e011c7f /src/tokenizer
import https://github.com/servo/html5ever
commit d1206daa740305f55a5fa159e43eb33afc359cb4
Diffstat (limited to 'src/tokenizer')
-rw-r--r--  src/tokenizer/char_ref/mod.rs    449
-rw-r--r--  src/tokenizer/interface.rs       110
-rw-r--r--  src/tokenizer/mod.rs            1713
-rw-r--r--  src/tokenizer/states.rs           93
4 files changed, 2365 insertions, 0 deletions
diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs
new file mode 100644
index 0000000..a52485d
--- /dev/null
+++ b/src/tokenizer/char_ref/mod.rs
@@ -0,0 +1,449 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use super::{TokenSink, Tokenizer};
+use crate::buffer_queue::BufferQueue;
+use crate::data;
+use crate::tendril::StrTendril;
+use crate::util::str::is_ascii_alnum;
+
+use log::debug;
+use mac::format_if;
+use std::borrow::Cow::Borrowed;
+use std::char::from_u32;
+
+use self::State::*;
+pub use self::Status::*;
+
+//§ tokenizing-character-references
+pub struct CharRef {
+ /// The resulting character(s)
+ pub chars: [char; 2],
+
+ /// How many slots in `chars` are valid?
+ pub num_chars: u8,
+}
+
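+/// The result of one `step` of the char ref tokenizer: `Stuck` means no
+/// progress can be made until more input arrives, `Progress` means the step
+/// did some work and should be repeated, and `Done` means the reference has
+/// been fully tokenized and the result can be fetched with `get_result`.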
+pub enum Status {
+ Stuck,
+ Progress,
+ Done,
+}
+
+#[derive(Debug)]
+enum State {
+ Begin,
+ Octothorpe,
+ Numeric(u32), // base
+ NumericSemicolon,
+ Named,
+ BogusName,
+}
+
+pub struct CharRefTokenizer {
+ state: State,
+ addnl_allowed: Option<char>,
+ result: Option<CharRef>,
+
+ num: u32,
+ num_too_big: bool,
+ seen_digit: bool,
+ hex_marker: Option<char>,
+
+ name_buf_opt: Option<StrTendril>,
+ name_match: Option<(u32, u32)>,
+ name_len: usize,
+}
+
+impl CharRefTokenizer {
+ // NB: We assume that we have an additional allowed character iff we're
+ // tokenizing in an attribute value.
+ pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
+ CharRefTokenizer {
+ state: Begin,
+ addnl_allowed,
+ result: None,
+ num: 0,
+ num_too_big: false,
+ seen_digit: false,
+ hex_marker: None,
+ name_buf_opt: None,
+ name_match: None,
+ name_len: 0,
+ }
+ }
+
+ // A CharRefTokenizer can only tokenize one character reference,
+ // so this method consumes the tokenizer.
+ pub fn get_result(self) -> CharRef {
+ self.result.expect("get_result called before done")
+ }
+
+ fn name_buf(&self) -> &StrTendril {
+ self.name_buf_opt
+ .as_ref()
+ .expect("name_buf missing in named character reference")
+ }
+
+ fn name_buf_mut(&mut self) -> &mut StrTendril {
+ self.name_buf_opt
+ .as_mut()
+ .expect("name_buf missing in named character reference")
+ }
+
+ fn finish_none(&mut self) -> Status {
+ self.result = Some(CharRef {
+ chars: ['\0', '\0'],
+ num_chars: 0,
+ });
+ Done
+ }
+
+ fn finish_one(&mut self, c: char) -> Status {
+ self.result = Some(CharRef {
+ chars: [c, '\0'],
+ num_chars: 1,
+ });
+ Done
+ }
+}
+
+impl CharRefTokenizer {
+ pub fn step<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ ) -> Status {
+ if self.result.is_some() {
+ return Done;
+ }
+
+ debug!("char ref tokenizer stepping in state {:?}", self.state);
+ match self.state {
+ Begin => self.do_begin(tokenizer, input),
+ Octothorpe => self.do_octothorpe(tokenizer, input),
+ Numeric(base) => self.do_numeric(tokenizer, input, base),
+ NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
+ Named => self.do_named(tokenizer, input),
+ BogusName => self.do_bogus_name(tokenizer, input),
+ }
+ }
+
+ fn do_begin<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ ) -> Status {
+ match unwrap_or_return!(tokenizer.peek(input), Stuck) {
+ '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(),
+ c if Some(c) == self.addnl_allowed => self.finish_none(),
+
+ '#' => {
+ tokenizer.discard_char(input);
+ self.state = Octothorpe;
+ Progress
+ },
+
+ _ => {
+ self.state = Named;
+ self.name_buf_opt = Some(StrTendril::new());
+ Progress
+ },
+ }
+ }
+
+ fn do_octothorpe<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ ) -> Status {
+ let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
+ match c {
+ 'x' | 'X' => {
+ tokenizer.discard_char(input);
+ self.hex_marker = Some(c);
+ self.state = Numeric(16);
+ },
+
+ _ => {
+ self.hex_marker = None;
+ self.state = Numeric(10);
+ },
+ }
+ Progress
+ }
+
+ fn do_numeric<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ base: u32,
+ ) -> Status {
+ let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
+ match c.to_digit(base) {
+ Some(n) => {
+ tokenizer.discard_char(input);
+ self.num = self.num.wrapping_mul(base);
+ if self.num > 0x10FFFF {
+ // We might overflow, and the character is definitely invalid.
+ // We still parse digits and semicolon, but don't use the result.
+ self.num_too_big = true;
+ }
+ self.num = self.num.wrapping_add(n);
+ self.seen_digit = true;
+ Progress
+ },
+
+ None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
+
+ None => {
+ self.state = NumericSemicolon;
+ Progress
+ },
+ }
+ }
+
+ fn do_numeric_semicolon<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ ) -> Status {
+ match unwrap_or_return!(tokenizer.peek(input), Stuck) {
+ ';' => tokenizer.discard_char(input),
+ _ => tokenizer.emit_error(Borrowed(
+ "Semicolon missing after numeric character reference",
+ )),
+ };
+ self.finish_numeric(tokenizer)
+ }
+
+ fn unconsume_numeric<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ ) -> Status {
+ let mut unconsume = StrTendril::from_char('#');
+ match self.hex_marker {
+ Some(c) => unconsume.push_char(c),
+ None => (),
+ }
+
+ input.push_front(unconsume);
+ tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
+ self.finish_none()
+ }
+
+ fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status {
+ fn conv(n: u32) -> char {
+ from_u32(n).expect("invalid char missed by error handling cases")
+ }
+
+ let (c, error) = match self.num {
+ n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
+ 0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),
+
+ 0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
+ Some(c) => (c, true),
+ None => (conv(self.num), true),
+ },
+
+ 0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),
+
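+            // Every code point of the form U+xxFFFE or U+xxFFFF is a
+            // noncharacter; this mask test catches them all at once.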
+ n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),
+
+ n => (conv(n), false),
+ };
+
+ if error {
+ let msg = format_if!(
+ tokenizer.opts.exact_errors,
+ "Invalid numeric character reference",
+ "Invalid numeric character reference value 0x{:06X}",
+ self.num
+ );
+ tokenizer.emit_error(msg);
+ }
+
+ self.finish_one(c)
+ }
+
+ fn do_named<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ ) -> Status {
+ let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
+ self.name_buf_mut().push_char(c);
+ match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
+ // We have either a full match or a prefix of one.
+ Some(&m) => {
+ if m.0 != 0 {
+ // We have a full match, but there might be a longer one to come.
+ self.name_match = Some(m);
+ self.name_len = self.name_buf().len();
+ }
+ // Otherwise we just have a prefix match.
+ Progress
+ },
+
+ // Can't continue the match.
+ None => self.finish_named(tokenizer, input, Some(c)),
+ }
+ }
+
+ fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) {
+ let msg = format_if!(
+ tokenizer.opts.exact_errors,
+ "Invalid character reference",
+ "Invalid character reference &{}",
+ self.name_buf()
+ );
+ tokenizer.emit_error(msg);
+ }
+
+ fn unconsume_name(&mut self, input: &mut BufferQueue) {
+ input.push_front(self.name_buf_opt.take().unwrap());
+ }
+
+ fn finish_named<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ end_char: Option<char>,
+ ) -> Status {
+ match self.name_match {
+ None => {
+ match end_char {
+ Some(c) if is_ascii_alnum(c) => {
+ // Keep looking for a semicolon, to determine whether
+ // we emit a parse error.
+ self.state = BogusName;
+ return Progress;
+ },
+
+ // Check length because &; is not a parse error.
+ Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),
+
+ _ => (),
+ }
+ self.unconsume_name(input);
+ self.finish_none()
+ },
+
+ Some((c1, c2)) => {
+ // We have a complete match, but we may have consumed
+ // additional characters into self.name_buf. Usually
+ // at least one, but several in cases like
+ //
+ // &not => match for U+00AC
+ // &noti => valid prefix for &notin
+ // &notit => can't continue match
+
+ let name_len = self.name_len;
+ assert!(name_len > 0);
+ let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();
+
+ // There might not be a next character after the match, if
+ // we had a full match and then hit EOF.
+ let next_after = if name_len == self.name_buf().len() {
+ None
+ } else {
+ Some(self.name_buf()[name_len..].chars().next().unwrap())
+ };
+
+ // "If the character reference is being consumed as part of an
+ // attribute, and the last character matched is not a U+003B
+ // SEMICOLON character (;), and the next character is either a
+ // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
+ // character, then, for historical reasons, all the characters
+ // that were matched after the U+0026 AMPERSAND character (&)
+ // must be unconsumed, and nothing is returned. However, if
+ // this next character is in fact a U+003D EQUALS SIGN
+ // character (=), then this is a parse error"
+
+ let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
+ (_, ';', _) => false,
+ (Some(_), _, Some('=')) => {
+ tokenizer.emit_error(Borrowed(
+ "Equals sign after character reference in attribute",
+ ));
+ true
+ },
+ (Some(_), _, Some(c)) if is_ascii_alnum(c) => true,
+ _ => {
+ tokenizer.emit_error(Borrowed(
+ "Character reference does not end with semicolon",
+ ));
+ false
+ },
+ };
+
+ if unconsume_all {
+ self.unconsume_name(input);
+ self.finish_none()
+ } else {
+ input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
+ self.result = Some(CharRef {
+ chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
+ num_chars: if c2 == 0 { 1 } else { 2 },
+ });
+ Done
+ }
+ },
+ }
+ }
+
+ fn do_bogus_name<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ ) -> Status {
+ let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
+ self.name_buf_mut().push_char(c);
+ match c {
+ _ if is_ascii_alnum(c) => return Progress,
+ ';' => self.emit_name_error(tokenizer),
+ _ => (),
+ }
+ self.unconsume_name(input);
+ self.finish_none()
+ }
+
+ pub fn end_of_file<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ ) {
+ while self.result.is_none() {
+ match self.state {
+ Begin => drop(self.finish_none()),
+
+ Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),
+
+ Numeric(_) | NumericSemicolon => {
+ tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
+ self.finish_numeric(tokenizer);
+ },
+
+ Named => drop(self.finish_named(tokenizer, input, None)),
+
+ BogusName => {
+ self.unconsume_name(input);
+ self.finish_none();
+ },
+
+ Octothorpe => {
+ input.push_front(StrTendril::from_slice("#"));
+ tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
+ self.finish_none();
+ },
+ }
+ }
+ }
+}
diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs
new file mode 100644
index 0000000..22d11be
--- /dev/null
+++ b/src/tokenizer/interface.rs
@@ -0,0 +1,110 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use crate::interface::Attribute;
+use crate::tendril::StrTendril;
+use crate::tokenizer::states;
+use crate::LocalName;
+use std::borrow::Cow;
+
+pub use self::TagKind::{EndTag, StartTag};
+pub use self::Token::{CharacterTokens, CommentToken, DoctypeToken, TagToken};
+pub use self::Token::{EOFToken, NullCharacterToken, ParseError};
+
+/// A `DOCTYPE` token.
+// FIXME: already exists in Servo DOM
+#[derive(PartialEq, Eq, Clone, Debug)]
+pub struct Doctype {
+ pub name: Option<StrTendril>,
+ pub public_id: Option<StrTendril>,
+ pub system_id: Option<StrTendril>,
+ pub force_quirks: bool,
+}
+
+impl Doctype {
+ pub fn new() -> Doctype {
+ Doctype {
+ name: None,
+ public_id: None,
+ system_id: None,
+ force_quirks: false,
+ }
+ }
+}
+
+#[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)]
+pub enum TagKind {
+ StartTag,
+ EndTag,
+}
+
+/// A tag token.
+#[derive(PartialEq, Eq, Clone, Debug)]
+pub struct Tag {
+ pub kind: TagKind,
+ pub name: LocalName,
+ pub self_closing: bool,
+ pub attrs: Vec<Attribute>,
+}
+
+impl Tag {
+ /// Are the tags equivalent when we don't care about attribute order?
+ /// Also ignores the self-closing flag.
+ pub fn equiv_modulo_attr_order(&self, other: &Tag) -> bool {
+ if (self.kind != other.kind) || (self.name != other.name) {
+ return false;
+ }
+
+ let mut self_attrs = self.attrs.clone();
+ let mut other_attrs = other.attrs.clone();
+ self_attrs.sort();
+ other_attrs.sort();
+
+ self_attrs == other_attrs
+ }
+}
+
+#[derive(PartialEq, Eq, Debug)]
+pub enum Token {
+ DoctypeToken(Doctype),
+ TagToken(Tag),
+ CommentToken(StrTendril),
+ CharacterTokens(StrTendril),
+ NullCharacterToken,
+ EOFToken,
+ ParseError(Cow<'static, str>),
+}
+
+#[derive(Debug, PartialEq)]
+#[must_use]
+pub enum TokenSinkResult<Handle> {
+ Continue,
+ Script(Handle),
+ Plaintext,
+ RawData(states::RawKind),
+}
+
+/// Types which can receive tokens from the tokenizer.
+pub trait TokenSink {
+ type Handle;
+
+ /// Process a token.
+ fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle>;
+
+    /// Signal to the sink that tokenization has reached the end.
+ fn end(&mut self) {}
+
+ /// Used in the markup declaration open state. By default, this always
+ /// returns false and thus all CDATA sections are tokenized as bogus
+ /// comments.
+ /// https://html.spec.whatwg.org/multipage/#markup-declaration-open-state
+ fn adjusted_current_node_present_but_not_in_html_namespace(&self) -> bool {
+ false
+ }
+}
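
For orientation, a minimal `TokenSink` implementation might look like the
sketch below (not part of this commit; the `use` path is assumed and depends
on how the crate re-exports the module):

    use crate::tokenizer::interface::{Token, TokenSink, TokenSinkResult};

    /// A sink that just prints every token it is handed.
    struct PrintSink;

    impl TokenSink for PrintSink {
        // No script handling, so the handle type is unit.
        type Handle = ();

        fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult<()> {
            println!("line {}: {:?}", line_number, token);
            TokenSinkResult::Continue
        }
    }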
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
new file mode 100644
index 0000000..267fdf3
--- /dev/null
+++ b/src/tokenizer/mod.rs
@@ -0,0 +1,1713 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! The HTML5 tokenizer.
+
+pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
+pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token};
+pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind};
+pub use self::interface::{TokenSink, TokenSinkResult};
+
+use self::states::{DoctypeIdKind, Public, System};
+use self::states::{DoubleEscaped, Escaped};
+use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
+use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
+
+use self::char_ref::{CharRef, CharRefTokenizer};
+
+use crate::util::str::lower_ascii_letter;
+
+use log::debug;
+use mac::{_tt_as_expr_hack, format_if, matches};
+use markup5ever::{namespace_url, ns, small_char_set};
+use std::borrow::Cow::{self, Borrowed};
+use std::collections::BTreeMap;
+use std::default::Default;
+use std::mem::replace;
+
+pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
+use crate::tendril::StrTendril;
+use crate::{Attribute, LocalName, QualName, SmallCharSet};
+
+mod char_ref;
+mod interface;
+pub mod states;
+
+pub enum ProcessResult<Handle> {
+ Continue,
+ Suspend,
+ Script(Handle),
+}
+
+#[must_use]
+pub enum TokenizerResult<Handle> {
+ Done,
+ Script(Handle),
+}
+
+fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
+ match *opt_str {
+ Some(ref mut s) => s.push_char(c),
+ None => *opt_str = Some(StrTendril::from_char(c)),
+ }
+}
+
+/// Tokenizer options, with an impl for `Default`.
+#[derive(Clone)]
+pub struct TokenizerOpts {
+ /// Report all parse errors described in the spec, at some
+ /// performance penalty? Default: false
+ pub exact_errors: bool,
+
+ /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
+ /// of the stream? Default: true
+ pub discard_bom: bool,
+
+ /// Keep a record of how long we spent in each state? Printed
+ /// when `end()` is called. Default: false
+ pub profile: bool,
+
+ /// Initial state override. Only the test runner should use
+ /// a non-`None` value!
+ pub initial_state: Option<states::State>,
+
+ /// Last start tag. Only the test runner should use a
+ /// non-`None` value!
+ ///
+ /// FIXME: Can't use Tendril because we want TokenizerOpts
+ /// to be Send.
+ pub last_start_tag_name: Option<String>,
+}
+
+impl Default for TokenizerOpts {
+ fn default() -> TokenizerOpts {
+ TokenizerOpts {
+ exact_errors: false,
+ discard_bom: true,
+ profile: false,
+ initial_state: None,
+ last_start_tag_name: None,
+ }
+ }
+}
+
+/// The HTML tokenizer.
+pub struct Tokenizer<Sink> {
+ /// Options controlling the behavior of the tokenizer.
+ opts: TokenizerOpts,
+
+ /// Destination for tokens we emit.
+ pub sink: Sink,
+
+ /// The abstract machine state as described in the spec.
+ state: states::State,
+
+ /// Are we at the end of the file, once buffers have been processed
+ /// completely? This affects whether we will wait for lookahead or not.
+ at_eof: bool,
+
+ /// Tokenizer for character references, if we're tokenizing
+ /// one at the moment.
+ char_ref_tokenizer: Option<Box<CharRefTokenizer>>,
+
+ /// Current input character. Just consumed, may reconsume.
+ current_char: char,
+
+ /// Should we reconsume the current input character?
+ reconsume: bool,
+
+ /// Did we just consume \r, translating it to \n? In that case we need
+ /// to ignore the next character if it's \n.
+ ignore_lf: bool,
+
+ /// Discard a U+FEFF BYTE ORDER MARK if we see one? Only done at the
+ /// beginning of the stream.
+ discard_bom: bool,
+
+ /// Current tag kind.
+ current_tag_kind: TagKind,
+
+ /// Current tag name.
+ current_tag_name: StrTendril,
+
+ /// Current tag is self-closing?
+ current_tag_self_closing: bool,
+
+ /// Current tag attributes.
+ current_tag_attrs: Vec<Attribute>,
+
+ /// Current attribute name.
+ current_attr_name: StrTendril,
+
+ /// Current attribute value.
+ current_attr_value: StrTendril,
+
+ /// Current comment.
+ current_comment: StrTendril,
+
+ /// Current doctype token.
+ current_doctype: Doctype,
+
+ /// Last start tag name, for use in checking "appropriate end tag".
+ last_start_tag_name: Option<LocalName>,
+
+ /// The "temporary buffer" mentioned in the spec.
+ temp_buf: StrTendril,
+
+ /// Record of how many ns we spent in each state, if profiling is enabled.
+ state_profile: BTreeMap<states::State, u64>,
+
+ /// Record of how many ns we spent in the token sink.
+ time_in_sink: u64,
+
+ /// Track current line
+ current_line: u64,
+}
+
+impl<Sink: TokenSink> Tokenizer<Sink> {
+ /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
+ pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
+ let start_tag_name = opts
+ .last_start_tag_name
+ .take()
+ .map(|s| LocalName::from(&*s));
+ let state = opts.initial_state.unwrap_or(states::Data);
+ let discard_bom = opts.discard_bom;
+ Tokenizer {
+ opts,
+ sink,
+ state,
+ char_ref_tokenizer: None,
+ at_eof: false,
+ current_char: '\0',
+ reconsume: false,
+ ignore_lf: false,
+ discard_bom,
+ current_tag_kind: StartTag,
+ current_tag_name: StrTendril::new(),
+ current_tag_self_closing: false,
+ current_tag_attrs: vec![],
+ current_attr_name: StrTendril::new(),
+ current_attr_value: StrTendril::new(),
+ current_comment: StrTendril::new(),
+ current_doctype: Doctype::new(),
+ last_start_tag_name: start_tag_name,
+ temp_buf: StrTendril::new(),
+ state_profile: BTreeMap::new(),
+ time_in_sink: 0,
+ current_line: 1,
+ }
+ }
+
+ /// Feed an input string into the tokenizer.
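+    ///
+    /// A typical driving loop looks roughly like this (a sketch, not part of
+    /// this commit; `sink` is any `TokenSink` implementation):
+    ///
+    /// ```ignore
+    /// let mut tok = Tokenizer::new(sink, TokenizerOpts::default());
+    /// let mut input = BufferQueue::new();
+    /// input.push_back(StrTendril::from_slice("<p class=x>Hi!</p>"));
+    /// while let TokenizerResult::Script(_handle) = tok.feed(&mut input) {
+    ///     // Run the script here; it may push more input. Then feed again.
+    /// }
+    /// tok.end();
+    /// ```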
+ pub fn feed(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> {
+ if input.is_empty() {
+ return TokenizerResult::Done;
+ }
+
+ if self.discard_bom {
+ if let Some(c) = input.peek() {
+ if c == '\u{feff}' {
+ input.next();
+ }
+ } else {
+ return TokenizerResult::Done;
+ }
+ };
+
+ self.run(input)
+ }
+
+ pub fn set_plaintext_state(&mut self) {
+ self.state = states::Plaintext;
+ }
+
+ fn process_token(&mut self, token: Token) -> TokenSinkResult<Sink::Handle> {
+ if self.opts.profile {
+ let (ret, dt) = time!(self.sink.process_token(token, self.current_line));
+ self.time_in_sink += dt;
+ ret
+ } else {
+ self.sink.process_token(token, self.current_line)
+ }
+ }
+
+ fn process_token_and_continue(&mut self, token: Token) {
+ assert!(matches!(
+ self.process_token(token),
+ TokenSinkResult::Continue
+ ));
+ }
+
+ //§ preprocessing-the-input-stream
+ // Get the next input character, which might be the character
+ // 'c' that we already consumed from the buffers.
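+    // E.g. "\r\n" is collapsed to a single '\n', and a lone '\r' also becomes
+    // '\n'; the ignore_lf flag carries the CR state across buffer boundaries.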
+ fn get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option<char> {
+ if self.ignore_lf {
+ self.ignore_lf = false;
+ if c == '\n' {
+ c = unwrap_or_return!(input.next(), None);
+ }
+ }
+
+ if c == '\r' {
+ self.ignore_lf = true;
+ c = '\n';
+ }
+
+ if c == '\n' {
+ self.current_line += 1;
+ }
+
+ if self.opts.exact_errors &&
+ match c as u32 {
+ 0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
+ n if (n & 0xFFFE) == 0xFFFE => true,
+ _ => false,
+ }
+ {
+ let msg = format!("Bad character {}", c);
+ self.emit_error(Cow::Owned(msg));
+ }
+
+ debug!("got character {}", c);
+ self.current_char = c;
+ Some(c)
+ }
+
+ //§ tokenization
+ // Get the next input character, if one is available.
+ fn get_char(&mut self, input: &mut BufferQueue) -> Option<char> {
+ if self.reconsume {
+ self.reconsume = false;
+ Some(self.current_char)
+ } else {
+ input
+ .next()
+ .and_then(|c| self.get_preprocessed_char(c, input))
+ }
+ }
+
+ fn pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult> {
+ // Bail to the slow path for various corner cases.
+ // This means that `FromSet` can contain characters not in the set!
+ // It shouldn't matter because the fallback `FromSet` case should
+ // always do the same thing as the `NotFromSet` case.
+ if self.opts.exact_errors || self.reconsume || self.ignore_lf {
+ return self.get_char(input).map(FromSet);
+ }
+
+ let d = input.pop_except_from(set);
+ debug!("got characters {:?}", d);
+ match d {
+ Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet),
+
+ // NB: We don't set self.current_char for a run of characters not
+ // in the set. It shouldn't matter for the codepaths that use
+ // this.
+ _ => d,
+ }
+ }
+
+ // Check if the next characters are an ASCII case-insensitive match. See
+ // BufferQueue::eat.
+ //
+ // NB: this doesn't do input stream preprocessing or set the current input
+ // character.
+ fn eat(
+ &mut self,
+ input: &mut BufferQueue,
+ pat: &str,
+ eq: fn(&u8, &u8) -> bool,
+ ) -> Option<bool> {
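+        // A previous call may have run out of input mid-match and stashed the
+        // characters it consumed in temp_buf; put them back so the match can
+        // restart from the beginning.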
+ input.push_front(replace(&mut self.temp_buf, StrTendril::new()));
+ match input.eat(pat, eq) {
+ None if self.at_eof => Some(false),
+ None => {
+ while let Some(c) = input.next() {
+ self.temp_buf.push_char(c);
+ }
+ None
+ },
+ Some(matched) => Some(matched),
+ }
+ }
+
+ /// Run the state machine for as long as we can.
+ fn run(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> {
+ if self.opts.profile {
+ loop {
+ let state = self.state;
+ let old_sink = self.time_in_sink;
+ let (run, mut dt) = time!(self.step(input));
+                dt -= self.time_in_sink - old_sink;
+ let new = match self.state_profile.get_mut(&state) {
+ Some(x) => {
+ *x += dt;
+ false
+ },
+ None => true,
+ };
+ if new {
+ // do this here because of borrow shenanigans
+ self.state_profile.insert(state, dt);
+ }
+ match run {
+ ProcessResult::Continue => (),
+ ProcessResult::Suspend => break,
+ ProcessResult::Script(node) => return TokenizerResult::Script(node),
+ }
+ }
+ } else {
+ loop {
+ match self.step(input) {
+ ProcessResult::Continue => (),
+ ProcessResult::Suspend => break,
+ ProcessResult::Script(node) => return TokenizerResult::Script(node),
+ }
+ }
+ }
+ TokenizerResult::Done
+ }
+
+ fn bad_char_error(&mut self) {
+ let msg = format_if!(
+ self.opts.exact_errors,
+ "Bad character",
+ "Saw {} in state {:?}",
+ self.current_char,
+ self.state
+ );
+ self.emit_error(msg);
+ }
+
+ fn bad_eof_error(&mut self) {
+ let msg = format_if!(
+ self.opts.exact_errors,
+ "Unexpected EOF",
+ "Saw EOF in state {:?}",
+ self.state
+ );
+ self.emit_error(msg);
+ }
+
+ fn emit_char(&mut self, c: char) {
+ self.process_token_and_continue(match c {
+ '\0' => NullCharacterToken,
+ _ => CharacterTokens(StrTendril::from_char(c)),
+ });
+ }
+
+ // The string must not contain '\0'!
+ fn emit_chars(&mut self, b: StrTendril) {
+ self.process_token_and_continue(CharacterTokens(b));
+ }
+
+ fn emit_current_tag(&mut self) -> ProcessResult<Sink::Handle> {
+ self.finish_attribute();
+
+ let name = LocalName::from(&*self.current_tag_name);
+ self.current_tag_name.clear();
+
+ match self.current_tag_kind {
+ StartTag => {
+ self.last_start_tag_name = Some(name.clone());
+ },
+ EndTag => {
+ if !self.current_tag_attrs.is_empty() {
+ self.emit_error(Borrowed("Attributes on an end tag"));
+ }
+ if self.current_tag_self_closing {
+ self.emit_error(Borrowed("Self-closing end tag"));
+ }
+ },
+ }
+
+ let token = TagToken(Tag {
+ kind: self.current_tag_kind,
+ name,
+ self_closing: self.current_tag_self_closing,
+ attrs: replace(&mut self.current_tag_attrs, vec![]),
+ });
+
+ match self.process_token(token) {
+ TokenSinkResult::Continue => ProcessResult::Continue,
+ TokenSinkResult::Plaintext => {
+ self.state = states::Plaintext;
+ ProcessResult::Continue
+ },
+ TokenSinkResult::Script(node) => {
+ self.state = states::Data;
+ ProcessResult::Script(node)
+ },
+ TokenSinkResult::RawData(kind) => {
+ self.state = states::RawData(kind);
+ ProcessResult::Continue
+ },
+ }
+ }
+
+ fn emit_temp_buf(&mut self) {
+ // FIXME: Make sure that clearing on emit is spec-compatible.
+ let buf = replace(&mut self.temp_buf, StrTendril::new());
+ self.emit_chars(buf);
+ }
+
+ fn clear_temp_buf(&mut self) {
+ // Do this without a new allocation.
+ self.temp_buf.clear();
+ }
+
+ fn emit_current_comment(&mut self) {
+ let comment = replace(&mut self.current_comment, StrTendril::new());
+ self.process_token_and_continue(CommentToken(comment));
+ }
+
+ fn discard_tag(&mut self) {
+ self.current_tag_name.clear();
+ self.current_tag_self_closing = false;
+ self.current_tag_attrs = vec![];
+ }
+
+ fn create_tag(&mut self, kind: TagKind, c: char) {
+ self.discard_tag();
+ self.current_tag_name.push_char(c);
+ self.current_tag_kind = kind;
+ }
+
+ fn have_appropriate_end_tag(&self) -> bool {
+ match self.last_start_tag_name.as_ref() {
+ Some(last) => (self.current_tag_kind == EndTag) && (*self.current_tag_name == **last),
+ None => false,
+ }
+ }
+
+ fn create_attribute(&mut self, c: char) {
+ self.finish_attribute();
+
+ self.current_attr_name.push_char(c);
+ }
+
+ fn finish_attribute(&mut self) {
+ if self.current_attr_name.is_empty() {
+ return;
+ }
+
+ // Check for a duplicate attribute.
+ // FIXME: the spec says we should error as soon as the name is finished.
+ // FIXME: linear time search, do we care?
+ let dup = {
+ let name = &*self.current_attr_name;
+ self.current_tag_attrs
+ .iter()
+ .any(|a| &*a.name.local == name)
+ };
+
+ if dup {
+ self.emit_error(Borrowed("Duplicate attribute"));
+ self.current_attr_name.clear();
+ self.current_attr_value.clear();
+ } else {
+ let name = LocalName::from(&*self.current_attr_name);
+ self.current_attr_name.clear();
+ self.current_tag_attrs.push(Attribute {
+ // The tree builder will adjust the namespace if necessary.
+ // This only happens in foreign elements.
+ name: QualName::new(None, ns!(), name),
+ value: replace(&mut self.current_attr_value, StrTendril::new()),
+ });
+ }
+ }
+
+ fn emit_current_doctype(&mut self) {
+ let doctype = replace(&mut self.current_doctype, Doctype::new());
+ self.process_token_and_continue(DoctypeToken(doctype));
+ }
+
+ fn doctype_id(&mut self, kind: DoctypeIdKind) -> &mut Option<StrTendril> {
+ match kind {
+ Public => &mut self.current_doctype.public_id,
+ System => &mut self.current_doctype.system_id,
+ }
+ }
+
+ fn clear_doctype_id(&mut self, kind: DoctypeIdKind) {
+ let id = self.doctype_id(kind);
+ match *id {
+ Some(ref mut s) => s.clear(),
+ None => *id = Some(StrTendril::new()),
+ }
+ }
+
+ fn consume_char_ref(&mut self, addnl_allowed: Option<char>) {
+ // NB: The char ref tokenizer assumes we have an additional allowed
+ // character iff we're tokenizing in an attribute value.
+ self.char_ref_tokenizer = Some(Box::new(CharRefTokenizer::new(addnl_allowed)));
+ }
+
+ fn emit_eof(&mut self) {
+ self.process_token_and_continue(EOFToken);
+ }
+
+ fn peek(&mut self, input: &BufferQueue) -> Option<char> {
+ if self.reconsume {
+ Some(self.current_char)
+ } else {
+ input.peek()
+ }
+ }
+
+ fn discard_char(&mut self, input: &mut BufferQueue) {
+ self.get_char(input);
+ }
+
+ fn emit_error(&mut self, error: Cow<'static, str>) {
+ self.process_token_and_continue(ParseError(error));
+ }
+}
+//§ END
+
+// Shorthand for common state machine behaviors.
+macro_rules! shorthand (
+ ( $me:ident : emit $c:expr ) => ( $me.emit_char($c); );
+ ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c); );
+ ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.push_char($c); );
+ ( $me:ident : discard_tag ) => ( $me.discard_tag(); );
+ ( $me:ident : discard_char $input:expr ) => ( $me.discard_char($input); );
+ ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.push_char($c); );
+ ( $me:ident : emit_temp ) => ( $me.emit_temp_buf(); );
+ ( $me:ident : clear_temp ) => ( $me.clear_temp_buf(); );
+ ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c); );
+ ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.push_char($c); );
+ ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.push_char($c); );
+ ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.push_tendril($c); );
+ ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.push_char($c); );
+ ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.push_slice($c); );
+ ( $me:ident : emit_comment ) => ( $me.emit_current_comment(); );
+ ( $me:ident : clear_comment ) => ( $me.current_comment.clear(); );
+ ( $me:ident : create_doctype ) => ( $me.current_doctype = Doctype::new(); );
+ ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.name, $c); );
+ ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push($me.doctype_id($k), $c); );
+ ( $me:ident : clear_doctype_id $k:ident ) => ( $me.clear_doctype_id($k); );
+ ( $me:ident : force_quirks ) => ( $me.current_doctype.force_quirks = true; );
+ ( $me:ident : emit_doctype ) => ( $me.emit_current_doctype(); );
+ ( $me:ident : error ) => ( $me.bad_char_error(); );
+ ( $me:ident : error_eof ) => ( $me.bad_eof_error(); );
+);
+
+// Tracing of tokenizer actions. This adds significant bloat and compile time,
+// so it's behind a cfg flag.
+#[cfg(trace_tokenizer)]
+macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
+    debug!("  {}", stringify!($($cmds)*));
+    shorthand!($me: $($cmds)*);
+}));
+
+#[cfg(not(trace_tokenizer))]
+macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
+
+// A little DSL for sequencing shorthand actions.
+macro_rules! go (
+ // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
+ // We have to tell the parser how much lookahead we need.
+
+ ( $me:ident : $a:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a); go!($me: $($rest)*); });
+ ( $me:ident : $a:tt $b:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b); go!($me: $($rest)*); });
+ ( $me:ident : $a:tt $b:tt $c:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c); go!($me: $($rest)*); });
+ ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });
+
+ // These can only come at the end.
+
+ ( $me:ident : to $s:ident ) => ({ $me.state = states::$s; return ProcessResult::Continue; });
+ ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state = states::$s($k1); return ProcessResult::Continue; });
+ ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state = states::$s($k1($k2)); return ProcessResult::Continue; });
+
+ ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume = true; go!($me: to $s); });
+ ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1); });
+ ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1 $k2); });
+
+ ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(None); return ProcessResult::Continue; });
+ ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return ProcessResult::Continue; });
+
+ // We have a default next state after emitting a tag, but the sink can override.
+ ( $me:ident : emit_tag $s:ident ) => ({
+ $me.state = states::$s;
+ return $me.emit_current_tag();
+ });
+
+ ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; });
+
+ // If nothing else matched, it's a single command
+ ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+); );
+
+ // or nothing.
+ ( $me:ident : ) => (());
+);
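+
+// For example, `go!(self: emit '-'; to CommentEndDash)` expands to roughly
+// `{ self.emit_char('-'); self.state = states::CommentEndDash;
+//    return ProcessResult::Continue; }`.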
+
+macro_rules! go_match ( ( $me:ident : $x:expr, $($pats:pat),+ => $($cmds:tt)* ) => (
+ match $x {
+ $($pats)|+ => go!($me: $($cmds)*),
+ _ => (),
+ }
+));
+
+// This is a macro because it can cause early return
+// from the function where it is used.
+macro_rules! get_char ( ($me:expr, $input:expr) => (
+ unwrap_or_return!($me.get_char($input), ProcessResult::Suspend)
+));
+
+macro_rules! peek ( ($me:expr, $input:expr) => (
+ unwrap_or_return!($me.peek($input), ProcessResult::Suspend)
+));
+
+macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
+ unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend)
+));
+
+macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
+ unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend)
+));
+
+macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => (
+ unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend)
+));
+
+impl<Sink: TokenSink> Tokenizer<Sink> {
+    // Run the state machine for a while. The returned ProcessResult tells
+    // the caller whether to re-invoke us immediately (Continue), wait for
+    // more input (Suspend), or execute a script first (Script).
+ #[allow(clippy::never_loop)]
+ fn step(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> {
+ if self.char_ref_tokenizer.is_some() {
+ return self.step_char_ref_tokenizer(input);
+ }
+
+ debug!("processing in state {:?}", self.state);
+ match self.state {
+ //§ data-state
+ states::Data => loop {
+ match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
+ FromSet('\0') => go!(self: error; emit '\0'),
+ FromSet('&') => go!(self: consume_char_ref),
+ FromSet('<') => go!(self: to TagOpen),
+ FromSet(c) => go!(self: emit c),
+ NotFromSet(b) => self.emit_chars(b),
+ }
+ },
+
+ //§ rcdata-state
+ states::RawData(Rcdata) => loop {
+ match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
+ FromSet('\0') => go!(self: error; emit '\u{fffd}'),
+ FromSet('&') => go!(self: consume_char_ref),
+ FromSet('<') => go!(self: to RawLessThanSign Rcdata),
+ FromSet(c) => go!(self: emit c),
+ NotFromSet(b) => self.emit_chars(b),
+ }
+ },
+
+ //§ rawtext-state
+ states::RawData(Rawtext) => loop {
+ match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
+ FromSet('\0') => go!(self: error; emit '\u{fffd}'),
+ FromSet('<') => go!(self: to RawLessThanSign Rawtext),
+ FromSet(c) => go!(self: emit c),
+ NotFromSet(b) => self.emit_chars(b),
+ }
+ },
+
+ //§ script-data-state
+ states::RawData(ScriptData) => loop {
+ match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
+ FromSet('\0') => go!(self: error; emit '\u{fffd}'),
+ FromSet('<') => go!(self: to RawLessThanSign ScriptData),
+ FromSet(c) => go!(self: emit c),
+ NotFromSet(b) => self.emit_chars(b),
+ }
+ },
+
+ //§ script-data-escaped-state
+ states::RawData(ScriptDataEscaped(Escaped)) => loop {
+ match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
+ FromSet('\0') => go!(self: error; emit '\u{fffd}'),
+ FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash Escaped),
+ FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped),
+ FromSet(c) => go!(self: emit c),
+ NotFromSet(b) => self.emit_chars(b),
+ }
+ },
+
+ //§ script-data-double-escaped-state
+ states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop {
+ match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
+ FromSet('\0') => go!(self: error; emit '\u{fffd}'),
+ FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped),
+ FromSet('<') => {
+ go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped)
+ },
+ FromSet(c) => go!(self: emit c),
+ NotFromSet(b) => self.emit_chars(b),
+ }
+ },
+
+ //§ plaintext-state
+ states::Plaintext => loop {
+ match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) {
+ FromSet('\0') => go!(self: error; emit '\u{fffd}'),
+ FromSet(c) => go!(self: emit c),
+ NotFromSet(b) => self.emit_chars(b),
+ }
+ },
+
+ //§ tag-open-state
+ states::TagOpen => loop {
+ match get_char!(self, input) {
+ '!' => go!(self: clear_temp; to MarkupDeclarationOpen),
+ '/' => go!(self: to EndTagOpen),
+ '?' => go!(self: error; clear_comment; push_comment '?'; to BogusComment),
+ c => match lower_ascii_letter(c) {
+ Some(cl) => go!(self: create_tag StartTag cl; to TagName),
+ None => go!(self: error; emit '<'; reconsume Data),
+ },
+ }
+ },
+
+ //§ end-tag-open-state
+ states::EndTagOpen => loop {
+ match get_char!(self, input) {
+ '>' => go!(self: error; to Data),
+ '\0' => {
+ go!(self: error; clear_comment; push_comment '\u{fffd}'; to BogusComment)
+ },
+ c => match lower_ascii_letter(c) {
+ Some(cl) => go!(self: create_tag EndTag cl; to TagName),
+ None => go!(self: error; clear_comment; push_comment c; to BogusComment),
+ },
+ }
+ },
+
+ //§ tag-name-state
+ states::TagName => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
+ '/' => go!(self: to SelfClosingStartTag),
+ '>' => go!(self: emit_tag Data),
+ '\0' => go!(self: error; push_tag '\u{fffd}'),
+ c => go!(self: push_tag (c.to_ascii_lowercase())),
+ }
+ },
+
+ //§ script-data-escaped-less-than-sign-state
+ states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop {
+ match get_char!(self, input) {
+ '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped),
+ c => match lower_ascii_letter(c) {
+ Some(cl) => go!(self: clear_temp; push_temp cl; emit '<'; emit c;
+ to ScriptDataEscapeStart DoubleEscaped),
+ None => go!(self: emit '<'; reconsume RawData ScriptDataEscaped Escaped),
+ },
+ }
+ },
+
+ //§ script-data-double-escaped-less-than-sign-state
+ states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop {
+ match get_char!(self, input) {
+ '/' => go!(self: clear_temp; emit '/'; to ScriptDataDoubleEscapeEnd),
+ _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
+ }
+ },
+
+ //§ rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state
+ // otherwise
+ states::RawLessThanSign(kind) => loop {
+ match get_char!(self, input) {
+ '/' => go!(self: clear_temp; to RawEndTagOpen kind),
+ '!' if kind == ScriptData => {
+ go!(self: emit '<'; emit '!'; to ScriptDataEscapeStart Escaped)
+ },
+ _ => go!(self: emit '<'; reconsume RawData kind),
+ }
+ },
+
+ //§ rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state
+ states::RawEndTagOpen(kind) => loop {
+ let c = get_char!(self, input);
+ match lower_ascii_letter(c) {
+ Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind),
+ None => go!(self: emit '<'; emit '/'; reconsume RawData kind),
+ }
+ },
+
+ //§ rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state
+ states::RawEndTagName(kind) => loop {
+ let c = get_char!(self, input);
+ if self.have_appropriate_end_tag() {
+ match c {
+ '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
+ '/' => go!(self: to SelfClosingStartTag),
+ '>' => go!(self: emit_tag Data),
+ _ => (),
+ }
+ }
+
+ match lower_ascii_letter(c) {
+ Some(cl) => go!(self: push_tag cl; push_temp c),
+ None => {
+ go!(self: discard_tag; emit '<'; emit '/'; emit_temp; reconsume RawData kind)
+ },
+ }
+ },
+
+ //§ script-data-double-escape-start-state
+ states::ScriptDataEscapeStart(DoubleEscaped) => loop {
+ let c = get_char!(self, input);
+ match c {
+ '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
+ let esc = if &*self.temp_buf == "script" {
+ DoubleEscaped
+ } else {
+ Escaped
+ };
+ go!(self: emit c; to RawData ScriptDataEscaped esc);
+ },
+ _ => match lower_ascii_letter(c) {
+ Some(cl) => go!(self: push_temp cl; emit c),
+ None => go!(self: reconsume RawData ScriptDataEscaped Escaped),
+ },
+ }
+ },
+
+ //§ script-data-escape-start-state
+ states::ScriptDataEscapeStart(Escaped) => loop {
+ match get_char!(self, input) {
+ '-' => go!(self: emit '-'; to ScriptDataEscapeStartDash),
+ _ => go!(self: reconsume RawData ScriptData),
+ }
+ },
+
+ //§ script-data-escape-start-dash-state
+ states::ScriptDataEscapeStartDash => loop {
+ match get_char!(self, input) {
+ '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash Escaped),
+ _ => go!(self: reconsume RawData ScriptData),
+ }
+ },
+
+ //§ script-data-escaped-dash-state script-data-double-escaped-dash-state
+ states::ScriptDataEscapedDash(kind) => loop {
+ match get_char!(self, input) {
+ '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash kind),
+ '<' => {
+ if kind == DoubleEscaped {
+ go!(self: emit '<');
+ }
+ go!(self: to RawLessThanSign ScriptDataEscaped kind);
+ },
+ '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
+ c => go!(self: emit c; to RawData ScriptDataEscaped kind),
+ }
+ },
+
+ //§ script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state
+ states::ScriptDataEscapedDashDash(kind) => loop {
+ match get_char!(self, input) {
+ '-' => go!(self: emit '-'),
+ '<' => {
+ if kind == DoubleEscaped {
+ go!(self: emit '<');
+ }
+ go!(self: to RawLessThanSign ScriptDataEscaped kind);
+ },
+ '>' => go!(self: emit '>'; to RawData ScriptData),
+ '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
+ c => go!(self: emit c; to RawData ScriptDataEscaped kind),
+ }
+ },
+
+ //§ script-data-double-escape-end-state
+ states::ScriptDataDoubleEscapeEnd => loop {
+ let c = get_char!(self, input);
+ match c {
+ '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
+ let esc = if &*self.temp_buf == "script" {
+ Escaped
+ } else {
+ DoubleEscaped
+ };
+ go!(self: emit c; to RawData ScriptDataEscaped esc);
+ },
+ _ => match lower_ascii_letter(c) {
+ Some(cl) => go!(self: push_temp cl; emit c),
+ None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
+ },
+ }
+ },
+
+ //§ before-attribute-name-state
+ states::BeforeAttributeName => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => (),
+ '/' => go!(self: to SelfClosingStartTag),
+ '>' => go!(self: emit_tag Data),
+ '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName),
+ c => match lower_ascii_letter(c) {
+ Some(cl) => go!(self: create_attr cl; to AttributeName),
+ None => {
+ go_match!(self: c,
+ '"' , '\'' , '<' , '=' => error);
+ go!(self: create_attr c; to AttributeName);
+ },
+ },
+ }
+ },
+
+ //§ attribute-name-state
+ states::AttributeName => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName),
+ '/' => go!(self: to SelfClosingStartTag),
+ '=' => go!(self: to BeforeAttributeValue),
+ '>' => go!(self: emit_tag Data),
+ '\0' => go!(self: error; push_name '\u{fffd}'),
+ c => match lower_ascii_letter(c) {
+ Some(cl) => go!(self: push_name cl),
+ None => {
+ go_match!(self: c,
+ '"' , '\'' , '<' => error);
+ go!(self: push_name c);
+ },
+ },
+ }
+ },
+
+ //§ after-attribute-name-state
+ states::AfterAttributeName => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => (),
+ '/' => go!(self: to SelfClosingStartTag),
+ '=' => go!(self: to BeforeAttributeValue),
+ '>' => go!(self: emit_tag Data),
+ '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName),
+ c => match lower_ascii_letter(c) {
+ Some(cl) => go!(self: create_attr cl; to AttributeName),
+ None => {
+ go_match!(self: c,
+ '"' , '\'' , '<' => error);
+ go!(self: create_attr c; to AttributeName);
+ },
+ },
+ }
+ },
+
+ //§ before-attribute-value-state
+ // Use peek so we can handle the first attr character along with the rest,
+ // hopefully in the same zero-copy buffer.
+ states::BeforeAttributeValue => loop {
+ match peek!(self, input) {
+ '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input),
+ '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted),
+ '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
+ '\0' => {
+ go!(self: discard_char input; error; push_value '\u{fffd}'; to AttributeValue Unquoted)
+ },
+ '>' => go!(self: discard_char input; error; emit_tag Data),
+ _ => go!(self: to AttributeValue Unquoted),
+ }
+ },
+
+ //§ attribute-value-(double-quoted)-state
+ states::AttributeValue(DoubleQuoted) => loop {
+ match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) {
+ FromSet('"') => go!(self: to AfterAttributeValueQuoted),
+ FromSet('&') => go!(self: consume_char_ref '"'),
+ FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
+ FromSet(c) => go!(self: push_value c),
+ NotFromSet(ref b) => go!(self: append_value b),
+ }
+ },
+
+ //§ attribute-value-(single-quoted)-state
+ states::AttributeValue(SingleQuoted) => loop {
+ match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) {
+ FromSet('\'') => go!(self: to AfterAttributeValueQuoted),
+ FromSet('&') => go!(self: consume_char_ref '\''),
+ FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
+ FromSet(c) => go!(self: push_value c),
+ NotFromSet(ref b) => go!(self: append_value b),
+ }
+ },
+
+ //§ attribute-value-(unquoted)-state
+ states::AttributeValue(Unquoted) => loop {
+ match pop_except_from!(
+ self,
+ input,
+ small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
+ ) {
+ FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
+ go!(self: to BeforeAttributeName)
+ },
+ FromSet('&') => go!(self: consume_char_ref '>'),
+ FromSet('>') => go!(self: emit_tag Data),
+ FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
+ FromSet(c) => {
+ go_match!(self: c,
+ '"' , '\'' , '<' , '=' , '`' => error);
+ go!(self: push_value c);
+ },
+ NotFromSet(ref b) => go!(self: append_value b),
+ }
+ },
+
+ //§ after-attribute-value-(quoted)-state
+ states::AfterAttributeValueQuoted => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
+ '/' => go!(self: to SelfClosingStartTag),
+ '>' => go!(self: emit_tag Data),
+ _ => go!(self: error; reconsume BeforeAttributeName),
+ }
+ },
+
+ //§ self-closing-start-tag-state
+ states::SelfClosingStartTag => loop {
+ match get_char!(self, input) {
+ '>' => {
+ self.current_tag_self_closing = true;
+ go!(self: emit_tag Data);
+ },
+ _ => go!(self: error; reconsume BeforeAttributeName),
+ }
+ },
+
+ //§ comment-start-state
+ states::CommentStart => loop {
+ match get_char!(self, input) {
+ '-' => go!(self: to CommentStartDash),
+ '\0' => go!(self: error; push_comment '\u{fffd}'; to Comment),
+ '>' => go!(self: error; emit_comment; to Data),
+ c => go!(self: push_comment c; to Comment),
+ }
+ },
+
+ //§ comment-start-dash-state
+ states::CommentStartDash => loop {
+ match get_char!(self, input) {
+ '-' => go!(self: to CommentEnd),
+ '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment),
+ '>' => go!(self: error; emit_comment; to Data),
+ c => go!(self: push_comment '-'; push_comment c; to Comment),
+ }
+ },
+
+ //§ comment-state
+ states::Comment => loop {
+ match get_char!(self, input) {
+ '-' => go!(self: to CommentEndDash),
+ '\0' => go!(self: error; push_comment '\u{fffd}'),
+ c => go!(self: push_comment c),
+ }
+ },
+
+ //§ comment-end-dash-state
+ states::CommentEndDash => loop {
+ match get_char!(self, input) {
+ '-' => go!(self: to CommentEnd),
+ '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment),
+ c => go!(self: push_comment '-'; push_comment c; to Comment),
+ }
+ },
+
+ //§ comment-end-state
+ states::CommentEnd => loop {
+ match get_char!(self, input) {
+ '>' => go!(self: emit_comment; to Data),
+ '\0' => go!(self: error; append_comment "--\u{fffd}"; to Comment),
+ '!' => go!(self: error; to CommentEndBang),
+ '-' => go!(self: error; push_comment '-'),
+ c => go!(self: error; append_comment "--"; push_comment c; to Comment),
+ }
+ },
+
+ //§ comment-end-bang-state
+ states::CommentEndBang => loop {
+ match get_char!(self, input) {
+ '-' => go!(self: append_comment "--!"; to CommentEndDash),
+ '>' => go!(self: emit_comment; to Data),
+ '\0' => go!(self: error; append_comment "--!\u{fffd}"; to Comment),
+ c => go!(self: append_comment "--!"; push_comment c; to Comment),
+ }
+ },
+
+ //§ doctype-state
+ states::Doctype => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
+ _ => go!(self: error; reconsume BeforeDoctypeName),
+ }
+ },
+
+ //§ before-doctype-name-state
+ states::BeforeDoctypeName => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => (),
+ '\0' => {
+ go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName)
+ },
+ '>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data),
+ c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
+ to DoctypeName),
+ }
+ },
+
+ //§ doctype-name-state
+ states::DoctypeName => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName),
+ '>' => go!(self: emit_doctype; to Data),
+ '\0' => go!(self: error; push_doctype_name '\u{fffd}'),
+ c => go!(self: push_doctype_name (c.to_ascii_lowercase())),
+ }
+ },
+
+ //§ after-doctype-name-state
+ states::AfterDoctypeName => loop {
+ if eat!(self, input, "public") {
+ go!(self: to AfterDoctypeKeyword Public);
+ } else if eat!(self, input, "system") {
+ go!(self: to AfterDoctypeKeyword System);
+ } else {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => (),
+ '>' => go!(self: emit_doctype; to Data),
+ _ => go!(self: error; force_quirks; to BogusDoctype),
+ }
+ }
+ },
+
+ //§ after-doctype-public-keyword-state after-doctype-system-keyword-state
+ states::AfterDoctypeKeyword(kind) => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind),
+ '"' => {
+ go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind)
+ },
+ '\'' => {
+ go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind)
+ },
+ '>' => go!(self: error; force_quirks; emit_doctype; to Data),
+ _ => go!(self: error; force_quirks; to BogusDoctype),
+ }
+ },
+
+ //§ before-doctype-public-identifier-state before-doctype-system-identifier-state
+ states::BeforeDoctypeIdentifier(kind) => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => (),
+ '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
+ '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
+ '>' => go!(self: error; force_quirks; emit_doctype; to Data),
+ _ => go!(self: error; force_quirks; to BogusDoctype),
+ }
+ },
+
+ //§ doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state
+ states::DoctypeIdentifierDoubleQuoted(kind) => loop {
+ match get_char!(self, input) {
+ '"' => go!(self: to AfterDoctypeIdentifier kind),
+ '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'),
+ '>' => go!(self: error; force_quirks; emit_doctype; to Data),
+ c => go!(self: push_doctype_id kind c),
+ }
+ },
+
+ //§ doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state
+ states::DoctypeIdentifierSingleQuoted(kind) => loop {
+ match get_char!(self, input) {
+ '\'' => go!(self: to AfterDoctypeIdentifier kind),
+ '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'),
+ '>' => go!(self: error; force_quirks; emit_doctype; to Data),
+ c => go!(self: push_doctype_id kind c),
+ }
+ },
+
+ //§ after-doctype-public-identifier-state
+ states::AfterDoctypeIdentifier(Public) => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => {
+ go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
+ },
+ '>' => go!(self: emit_doctype; to Data),
+ '"' => {
+ go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
+ },
+ '\'' => {
+ go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
+ },
+ _ => go!(self: error; force_quirks; to BogusDoctype),
+ }
+ },
+
+ //§ after-doctype-system-identifier-state
+ states::AfterDoctypeIdentifier(System) => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => (),
+ '>' => go!(self: emit_doctype; to Data),
+ _ => go!(self: error; to BogusDoctype),
+ }
+ },
+
+ //§ between-doctype-public-and-system-identifiers-state
+ states::BetweenDoctypePublicAndSystemIdentifiers => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => (),
+ '>' => go!(self: emit_doctype; to Data),
+ '"' => {
+ go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
+ },
+ '\'' => {
+ go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
+ },
+ _ => go!(self: error; force_quirks; to BogusDoctype),
+ }
+ },
+
+ //§ bogus-doctype-state
+ states::BogusDoctype => loop {
+ match get_char!(self, input) {
+ '>' => go!(self: emit_doctype; to Data),
+ _ => (),
+ }
+ },
+
+ //§ bogus-comment-state
+ states::BogusComment => loop {
+ match get_char!(self, input) {
+ '>' => go!(self: emit_comment; to Data),
+ '\0' => go!(self: push_comment '\u{fffd}'),
+ c => go!(self: push_comment c),
+ }
+ },
+
+ //§ markup-declaration-open-state
+ states::MarkupDeclarationOpen => loop {
+ if eat_exact!(self, input, "--") {
+ go!(self: clear_comment; to CommentStart);
+ } else if eat!(self, input, "doctype") {
+ go!(self: to Doctype);
+ } else {
+ if self
+ .sink
+ .adjusted_current_node_present_but_not_in_html_namespace()
+ {
+ if eat_exact!(self, input, "[CDATA[") {
+ go!(self: clear_temp; to CdataSection);
+ }
+ }
+ go!(self: error; to BogusComment);
+ }
+ },
+
+ //§ cdata-section-state
+ states::CdataSection => loop {
+ match get_char!(self, input) {
+ ']' => go!(self: to CdataSectionBracket),
+ '\0' => go!(self: emit_temp; emit '\0'),
+ c => go!(self: push_temp c),
+ }
+ },
+
+ //§ cdata-section-bracket
+ states::CdataSectionBracket => match get_char!(self, input) {
+ ']' => go!(self: to CdataSectionEnd),
+ _ => go!(self: push_temp ']'; reconsume CdataSection),
+ },
+
+ //§ cdata-section-end
+ states::CdataSectionEnd => loop {
+ match get_char!(self, input) {
+ ']' => go!(self: push_temp ']'),
+ '>' => go!(self: emit_temp; to Data),
+ _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection),
+ }
+ },
+ //§ END
+ }
+ }
+
+ fn step_char_ref_tokenizer(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> {
+ // FIXME HACK: Take the char ref tokenizer out of `self` and put it
+ // back afterwards, so that we don't mutably borrow `self` twice.
+ // This is also why it's boxed.
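+ // The pattern in miniature (hypothetical `helper` field, purely
+ // for illustration):
+ //
+ //     let mut helper = self.helper.take().unwrap(); // move out of self
+ //     let outcome = helper.step(self, input);       // self is borrowable again
+ //     self.helper = Some(helper);                   // put it back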
+ let mut tok = self.char_ref_tokenizer.take().unwrap();
+ let outcome = tok.step(self, input);
+
+ let progress = match outcome {
+ char_ref::Done => {
+ self.process_char_ref(tok.get_result());
+ return ProcessResult::Continue;
+ },
+
+ char_ref::Stuck => ProcessResult::Suspend,
+ char_ref::Progress => ProcessResult::Continue,
+ };
+
+ self.char_ref_tokenizer = Some(tok);
+ progress
+ }
+
+ fn process_char_ref(&mut self, char_ref: CharRef) {
+ let CharRef {
+ mut chars,
+ mut num_chars,
+ } = char_ref;
+
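+ // A reference that matched nothing comes back with num_chars == 0;
+ // per the spec, the '&' that introduced it is then emitted as a
+ // literal character. E.g. a bare "&" in text yields num_chars == 0
+ // and is re-emitted as '&', while "&amp;" yields ['&', '\0'] with
+ // num_chars == 1.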
+ if num_chars == 0 {
+ chars[0] = '&';
+ num_chars = 1;
+ }
+
+ for i in 0..num_chars {
+ let c = chars[i as usize];
+ match self.state {
+ states::Data | states::RawData(states::Rcdata) => go!(self: emit c),
+
+ states::AttributeValue(_) => go!(self: push_value c),
+
+ _ => panic!(
+ "state {:?} should not be reachable in process_char_ref",
+ self.state
+ ),
+ }
+ }
+ }
+
+ /// Indicate that we have reached the end of the input.
+ pub fn end(&mut self) {
+ // Handle EOF in the char ref sub-tokenizer, if there is one.
+ // Do this first because it might un-consume characters back onto
+ // the input queue.
+ let mut input = BufferQueue::new();
+ match self.char_ref_tokenizer.take() {
+ None => (),
+ Some(mut tok) => {
+ tok.end_of_file(self, &mut input);
+ self.process_char_ref(tok.get_result());
+ },
+ }
+
+ // Process all remaining buffered input.
+ // If we're waiting for lookahead, we're not going to get it.
+ self.at_eof = true;
+ assert!(matches!(self.run(&mut input), TokenizerResult::Done));
+ assert!(input.is_empty());
+
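+ // Drive the EOF state machine to completion: each eof_step flushes
+ // whatever token the current state has pending, and the machine
+ // suspends once the Data-state `eof` action has run.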
+ loop {
+ match self.eof_step() {
+ ProcessResult::Continue => (),
+ ProcessResult::Suspend => break,
+ ProcessResult::Script(_) => unreachable!(),
+ }
+ }
+
+ self.sink.end();
+
+ if self.opts.profile {
+ self.dump_profile();
+ }
+ }
+
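+ // Prints a report along these lines (illustrative numbers only):
+ //
+ //     Tokenizer profile, in nanoseconds
+ //
+ //        123456789 total in token sink
+ //        987654321 total in tokenizer
+ //        414043210 41.9% Data
+ //         96543210  9.8% TagOpen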
+ fn dump_profile(&self) {
+ let mut results: Vec<(states::State, u64)> =
+ self.state_profile.iter().map(|(s, t)| (*s, *t)).collect();
+ results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
+
+ let total: u64 = results.iter().map(|&(_, t)| t).sum();
+ println!("\nTokenizer profile, in nanoseconds");
+ println!("\n{:12} total in token sink", self.time_in_sink);
+ println!("\n{:12} total in tokenizer", total);
+
+ for (k, v) in results.into_iter() {
+ let pct = 100.0 * (v as f64) / (total as f64);
+ println!("{:12} {:4.1}% {:?}", v, pct, k);
+ }
+ }
+
+ fn eof_step(&mut self) -> ProcessResult<Sink::Handle> {
+ debug!("processing EOF in state {:?}", self.state);
+ match self.state {
+ states::Data |
+ states::RawData(Rcdata) |
+ states::RawData(Rawtext) |
+ states::RawData(ScriptData) |
+ states::Plaintext => go!(self: eof),
+
+ states::TagName |
+ states::RawData(ScriptDataEscaped(_)) |
+ states::BeforeAttributeName |
+ states::AttributeName |
+ states::AfterAttributeName |
+ states::BeforeAttributeValue |
+ states::AttributeValue(_) |
+ states::AfterAttributeValueQuoted |
+ states::SelfClosingStartTag |
+ states::ScriptDataEscapedDash(_) |
+ states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data),
+
+ states::TagOpen => go!(self: error_eof; emit '<'; to Data),
+
+ states::EndTagOpen => go!(self: error_eof; emit '<'; emit '/'; to Data),
+
+ states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
+ go!(self: to RawData ScriptDataEscaped DoubleEscaped)
+ },
+
+ states::RawLessThanSign(kind) => go!(self: emit '<'; to RawData kind),
+
+ states::RawEndTagOpen(kind) => go!(self: emit '<'; emit '/'; to RawData kind),
+
+ states::RawEndTagName(kind) => {
+ go!(self: emit '<'; emit '/'; emit_temp; to RawData kind)
+ },
+
+ states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),
+
+ states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData),
+
+ states::ScriptDataDoubleEscapeEnd => {
+ go!(self: to RawData ScriptDataEscaped DoubleEscaped)
+ },
+
+ states::CommentStart |
+ states::CommentStartDash |
+ states::Comment |
+ states::CommentEndDash |
+ states::CommentEnd |
+ states::CommentEndBang => go!(self: error_eof; emit_comment; to Data),
+
+ states::Doctype | states::BeforeDoctypeName => {
+ go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data)
+ },
+
+ states::DoctypeName |
+ states::AfterDoctypeName |
+ states::AfterDoctypeKeyword(_) |
+ states::BeforeDoctypeIdentifier(_) |
+ states::DoctypeIdentifierDoubleQuoted(_) |
+ states::DoctypeIdentifierSingleQuoted(_) |
+ states::AfterDoctypeIdentifier(_) |
+ states::BetweenDoctypePublicAndSystemIdentifiers => {
+ go!(self: error_eof; force_quirks; emit_doctype; to Data)
+ },
+
+ states::BogusDoctype => go!(self: emit_doctype; to Data),
+
+ states::BogusComment => go!(self: emit_comment; to Data),
+
+ states::MarkupDeclarationOpen => go!(self: error; to BogusComment),
+
+ states::CdataSection => go!(self: emit_temp; error_eof; to Data),
+
+ states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection),
+
+ states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
+ }
+ }
+}
+
+#[cfg(test)]
+#[allow(non_snake_case)]
+mod test {
+ use super::option_push; // private items
+ use crate::tendril::{SliceExt, StrTendril};
+
+ use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
+
+ use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
+ use super::interface::{EndTag, StartTag, Tag, TagKind};
+ use super::interface::{TagToken, Token};
+
+ use markup5ever::buffer_queue::BufferQueue;
+ use std::mem::replace;
+
+ use crate::LocalName;
+
+ // LinesMatch implements the TokenSink trait. It is used in tests to check
+ // that current_line is updated as process_token is called. The `lines`
+ // vector records each completed token together with the line number it
+ // was seen on.
+ struct LinesMatch {
+ tokens: Vec<Token>,
+ current_str: StrTendril,
+ lines: Vec<(Token, u64)>,
+ }
+
+ impl LinesMatch {
+ fn new() -> LinesMatch {
+ LinesMatch {
+ tokens: vec![],
+ current_str: StrTendril::new(),
+ lines: vec![],
+ }
+ }
+
+ fn push(&mut self, token: Token, line_number: u64) {
+ self.finish_str();
+ self.lines.push((token, line_number));
+ }
+
+ fn finish_str(&mut self) {
+ if !self.current_str.is_empty() {
+ let s = replace(&mut self.current_str, StrTendril::new());
+ self.tokens.push(CharacterTokens(s));
+ }
+ }
+ }
+
+ impl TokenSink for LinesMatch {
+ type Handle = ();
+
+ fn process_token(
+ &mut self,
+ token: Token,
+ line_number: u64,
+ ) -> TokenSinkResult<Self::Handle> {
+ match token {
+ CharacterTokens(b) => {
+ self.current_str.push_slice(&b);
+ },
+
+ NullCharacterToken => {
+ self.current_str.push_char('\0');
+ },
+
+ ParseError(_) => {
+ panic!("unexpected parse error");
+ },
+
+ TagToken(mut t) => {
+ // The spec seems to indicate that one can emit
+ // erroneous end tags with attrs, but the test
+ // cases don't contain them.
+ match t.kind {
+ EndTag => {
+ t.self_closing = false;
+ t.attrs = vec![];
+ },
+ _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
+ }
+ self.push(TagToken(t), line_number);
+ },
+
+ EOFToken => (),
+
+ _ => self.push(token, line_number),
+ }
+ TokenSinkResult::Continue
+ }
+ }
+
+ // Feed the input chunks to a tokenizer and return the (token, line
+ // number) pairs it produces.
+ fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
+ let sink = LinesMatch::new();
+ let mut tok = Tokenizer::new(sink, opts);
+ let mut buffer = BufferQueue::new();
+ for chunk in input.into_iter() {
+ buffer.push_back(chunk);
+ let _ = tok.feed(&mut buffer);
+ }
+ tok.end();
+ tok.sink.lines
+ }
+
+ // Create a tag token with the given name and kind, and no attributes.
+ fn create_tag(name: StrTendril, kind: TagKind) -> Token {
+ TagToken(Tag {
+ kind,
+ name: LocalName::from(&*name),
+ self_closing: false,
+ attrs: vec![],
+ })
+ }
+
+ #[test]
+ fn push_to_None_gives_singleton() {
+ let mut s: Option<StrTendril> = None;
+ option_push(&mut s, 'x');
+ assert_eq!(s, Some("x".to_tendril()));
+ }
+
+ #[test]
+ fn push_to_empty_appends() {
+ let mut s: Option<StrTendril> = Some(StrTendril::new());
+ option_push(&mut s, 'x');
+ assert_eq!(s, Some("x".to_tendril()));
+ }
+
+ #[test]
+ fn push_to_nonempty_appends() {
+ let mut s: Option<StrTendril> = Some(StrTendril::from_slice("y"));
+ option_push(&mut s, 'x');
+ assert_eq!(s, Some("yx".to_tendril()));
+ }
+
+ #[test]
+ fn check_lines() {
+ let opts = TokenizerOpts {
+ exact_errors: false,
+ discard_bom: true,
+ profile: false,
+ initial_state: None,
+ last_start_tag_name: None,
+ };
+ let vector = vec![
+ StrTendril::from("<a>\n"),
+ StrTendril::from("<b>\n"),
+ StrTendril::from("</b>\n"),
+ StrTendril::from("</a>\n"),
+ ];
+ let expected = vec![
+ (create_tag(StrTendril::from("a"), StartTag), 1),
+ (create_tag(StrTendril::from("b"), StartTag), 2),
+ (create_tag(StrTendril::from("b"), EndTag), 3),
+ (create_tag(StrTendril::from("a"), EndTag), 4),
+ ];
+ let results = tokenize(vector, opts);
+ assert_eq!(results, expected);
+ }
+
+ #[test]
+ fn check_lines_with_new_line() {
+ let opts = TokenizerOpts {
+ exact_errors: false,
+ discard_bom: true,
+ profile: false,
+ initial_state: None,
+ last_start_tag_name: None,
+ };
+ let vector = vec![
+ StrTendril::from("<a>\r\n"),
+ StrTendril::from("<b>\r\n"),
+ StrTendril::from("</b>\r\n"),
+ StrTendril::from("</a>\r\n"),
+ ];
+ let expected = vec![
+ (create_tag(StrTendril::from("a"), StartTag), 1),
+ (create_tag(StrTendril::from("b"), StartTag), 2),
+ (create_tag(StrTendril::from("b"), EndTag), 3),
+ (create_tag(StrTendril::from("a"), EndTag), 4),
+ ];
+ let results = tokenize(vector, opts);
+ assert_eq!(results, expected);
+ }
+}
diff --git a/src/tokenizer/states.rs b/src/tokenizer/states.rs
new file mode 100644
index 0000000..d455e9a
--- /dev/null
+++ b/src/tokenizer/states.rs
@@ -0,0 +1,93 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! Tokenizer states.
+//!
+//! This is public for use by the tokenizer tests. Other library
+//! users should not have to care about this.
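+//!
+//! Where the spec defines several near-identical states, this enum collapses
+//! them into one parameterized variant: for example `RawData(Rcdata)` and
+//! `RawData(Rawtext)` cover the spec's RCDATA and RAWTEXT states, and
+//! `AfterDoctypeIdentifier(Public)` / `AfterDoctypeIdentifier(System)` cover
+//! the after-DOCTYPE-public/system-identifier states.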
+
+pub use self::AttrValueKind::*;
+pub use self::DoctypeIdKind::*;
+pub use self::RawKind::*;
+pub use self::ScriptEscapeKind::*;
+pub use self::State::*;
+
+#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)]
+pub enum ScriptEscapeKind {
+ Escaped,
+ DoubleEscaped,
+}
+
+#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)]
+pub enum DoctypeIdKind {
+ Public,
+ System,
+}
+
+#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)]
+pub enum RawKind {
+ Rcdata,
+ Rawtext,
+ ScriptData,
+ ScriptDataEscaped(ScriptEscapeKind),
+}
+
+#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)]
+pub enum AttrValueKind {
+ Unquoted,
+ SingleQuoted,
+ DoubleQuoted,
+}
+
+#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)]
+pub enum State {
+ Data,
+ Plaintext,
+ TagOpen,
+ EndTagOpen,
+ TagName,
+ RawData(RawKind),
+ RawLessThanSign(RawKind),
+ RawEndTagOpen(RawKind),
+ RawEndTagName(RawKind),
+ ScriptDataEscapeStart(ScriptEscapeKind),
+ ScriptDataEscapeStartDash,
+ ScriptDataEscapedDash(ScriptEscapeKind),
+ ScriptDataEscapedDashDash(ScriptEscapeKind),
+ ScriptDataDoubleEscapeEnd,
+ BeforeAttributeName,
+ AttributeName,
+ AfterAttributeName,
+ BeforeAttributeValue,
+ AttributeValue(AttrValueKind),
+ AfterAttributeValueQuoted,
+ SelfClosingStartTag,
+ BogusComment,
+ MarkupDeclarationOpen,
+ CommentStart,
+ CommentStartDash,
+ Comment,
+ CommentEndDash,
+ CommentEnd,
+ CommentEndBang,
+ Doctype,
+ BeforeDoctypeName,
+ DoctypeName,
+ AfterDoctypeName,
+ AfterDoctypeKeyword(DoctypeIdKind),
+ BeforeDoctypeIdentifier(DoctypeIdKind),
+ DoctypeIdentifierDoubleQuoted(DoctypeIdKind),
+ DoctypeIdentifierSingleQuoted(DoctypeIdKind),
+ AfterDoctypeIdentifier(DoctypeIdKind),
+ BetweenDoctypePublicAndSystemIdentifiers,
+ BogusDoctype,
+ CdataSection,
+ CdataSectionBracket,
+ CdataSectionEnd,
+}