// Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! The HTML5 tokenizer.
use self::error::InternalState;
pub use self::interface::{Attribute, Doctype, Tag, TagKind, Token};
use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
use self::interface::{CommentToken, DoctypeToken, EndTag, StartTag, TagToken};
pub use self::interface::{TokenSink, TokenSinkResult};
use self::states::{DoctypeIdKind, Public, System};
use self::states::{DoubleEscaped, Escaped};
use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
use self::char_ref::{CharRef, CharRefTokenizer};
use crate::error::Error;
use crate::util::{smallcharset::SmallCharSet, str::lower_ascii_letter};
use std::collections::BTreeMap;
use std::default::Default;
use std::mem::replace;
use std::ops::ControlFlow;
pub use crate::util::buffer_queue::BufferQueue;
use crate::util::buffer_queue::{FromSet, NotFromSet, SetResult};
pub use states::RawKind;
mod char_ref;
pub mod error;
mod interface;
mod states;
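/// The result of one step of the state machine: `Suspend` means we ran out
/// of input, `Break` means the sink asked the tokenizer to stop.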
pub enum ProcessResult {
Suspend,
Break,
}
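/// The result of feeding a chunk of input: `Done` means all buffered input
/// was processed, `Break` means the sink asked the tokenizer to stop early.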
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum TokenizerResult {
Done,
Break,
}
fn option_push(opt_str: &mut Option<String>, c: char) {
match *opt_str {
Some(ref mut s) => s.push(c),
None => *opt_str = Some(String::from(c)),
}
}
/// Tokenizer options, with an impl for `Default`.
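///
/// A minimal sketch of overriding a single option while keeping the
/// defaults for the rest:
///
/// ```ignore
/// let opts = TokenizerOpts {
///     exact_errors: true,
///     ..TokenizerOpts::default()
/// };
/// ```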
#[derive(Clone)]
pub struct TokenizerOpts {
/// Report all parse errors described in the spec, at some
/// performance penalty? Defaults to false, except when the
/// `spans` feature is enabled in which case it defaults to true.
pub exact_errors: bool,
/// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
/// of the stream? Default: true
pub discard_bom: bool,
/// Keep a record of how long we spent in each state? Printed
/// when `end()` is called. Default: false
pub profile: bool,
/// Initial state override. Only the test runner should use
/// a non-`None` value!
pub initial_state: Option<states::State>,
/// Last start tag. Only the test runner should use a
/// non-`None` value!
pub last_start_tag_name: Option<String>,
}
impl Default for TokenizerOpts {
fn default() -> TokenizerOpts {
TokenizerOpts {
exact_errors: cfg!(feature = "spans"),
discard_bom: true,
profile: false,
initial_state: None,
last_start_tag_name: None,
}
}
}
/// The HTML tokenizer.
pub struct Tokenizer<Sink> {
/// Options controlling the behavior of the tokenizer.
opts: TokenizerOpts,
/// Destination for tokens we emit.
pub sink: Sink,
/// The abstract machine state as described in the spec.
state: states::State,
/// Are we at the end of the file, once buffers have been processed
/// completely? This affects whether we will wait for lookahead or not.
at_eof: bool,
/// Tokenizer for character references, if we're tokenizing
/// one at the moment.
char_ref_tokenizer: Option<Box<CharRefTokenizer>>,
/// Current input character. Just consumed, may reconsume.
current_char: char,
/// Should we reconsume the current input character?
reconsume: bool,
/// Did we just consume \r, translating it to \n? In that case we need
/// to ignore the next character if it's \n.
ignore_lf: bool,
/// Discard a U+FEFF BYTE ORDER MARK if we see one? Only done at the
/// beginning of the stream.
discard_bom: bool,
/// Current tag kind.
current_tag_kind: TagKind,
/// Current tag name.
current_tag_name: String,
/// Current tag is self-closing?
current_tag_self_closing: bool,
/// Current tag attributes.
current_tag_attrs: Vec<Attribute>,
/// Current attribute name.
current_attr_name: String,
/// Current attribute value.
current_attr_value: String,
/// Current comment.
current_comment: String,
/// Current doctype token.
current_doctype: Doctype,
/// Last start tag name, for use in checking "appropriate end tag".
last_start_tag_name: Option<String>,
/// The "temporary buffer" mentioned in the spec.
temp_buf: String,
/// Record of how many ns we spent in each state, if profiling is enabled.
state_profile: BTreeMap<states::State, u64>,
/// Record of how many ns we spent in the token sink.
time_in_sink: u64,
/// Track current line
current_line: u64,
#[cfg(feature = "spans")]
spans: Spans,
}
#[cfg(feature = "spans")]
#[derive(Default)]
struct Spans {
/// Track current byte position
current_pos: usize,
/// Current tag name span.
current_tag_name: core::ops::Range<usize>,
/// Current attribute name span.
current_attr_name: core::ops::Range<usize>,
/// Current attribute value span.
current_attr_value: core::ops::Range<usize>,
}
#[cfg(feature = "spans")]
impl Spans {
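// `current_pos` has already advanced past the one-byte character that
// terminated the name, so the span ends one byte earlier.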
fn end_tag_name(&mut self) {
self.current_tag_name.end = self.current_pos - 1;
}
fn end_attr_name(&mut self) {
self.current_attr_name.end = self.current_pos - 1;
}
}
impl<Sink: TokenSink> Tokenizer<Sink> {
/// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
let start_tag_name = opts.last_start_tag_name.take();
let state = opts.initial_state.unwrap_or(states::Data);
let discard_bom = opts.discard_bom;
Tokenizer {
opts,
sink,
state,
char_ref_tokenizer: None,
at_eof: false,
current_char: '\0',
reconsume: false,
ignore_lf: false,
discard_bom,
current_tag_kind: StartTag,
current_tag_name: String::new(),
current_tag_self_closing: false,
current_tag_attrs: vec![],
current_attr_name: String::new(),
current_attr_value: String::new(),
current_comment: String::new(),
current_doctype: Doctype::new(),
last_start_tag_name: start_tag_name,
temp_buf: String::new(),
state_profile: BTreeMap::new(),
time_in_sink: 0,
current_line: 1,
#[cfg(feature = "spans")]
spans: Spans::default(),
}
}
/// Feed an input string into the tokenizer.
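///
/// Returns `TokenizerResult::Break` if the sink asked the tokenizer to
/// stop; calling `feed` again with the same queue resumes where it left
/// off. A minimal sketch, assuming some `TokenSink` implementation `sink`
/// is in scope:
///
/// ```ignore
/// let mut tok = Tokenizer::new(sink, TokenizerOpts::default());
/// let mut input = BufferQueue::new();
/// input.push_back(String::from("<p>hello</p>"));
/// while tok.feed(&mut input) == TokenizerResult::Break {}
/// tok.end();
/// ```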
pub fn feed(&mut self, input: &mut BufferQueue) -> TokenizerResult {
if input.is_empty() {
return TokenizerResult::Done;
}
if self.discard_bom {
if let Some(c) = input.peek() {
if c == '\u{feff}' {
input.next();
}
} else {
return TokenizerResult::Done;
}
};
self.run(input)
}
fn process_token(&mut self, token: Token) -> TokenSinkResult {
if self.opts.profile {
let (ret, dt) = time!(self.sink.process_token(token, self.current_line));
self.time_in_sink += dt;
ret
} else {
self.sink.process_token(token, self.current_line)
}
}
fn process_token_and_continue(&mut self, token: Token) {
assert!(matches!(
self.process_token(token),
TokenSinkResult::Continue
));
}
//§ preprocessing-the-input-stream
// Get the next input character, which might be the character
// 'c' that we already consumed from the buffers.
fn get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option<char> {
if self.ignore_lf {
self.ignore_lf = false;
if c == '\n' {
c = input.next()?;
}
}
if c == '\r' {
self.ignore_lf = true;
c = '\n';
}
if c == '\n' {
self.current_line += 1;
}
if self.opts.exact_errors
&& match c as u32 {
0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
n if (n & 0xFFFE) == 0xFFFE => true,
_ => false,
}
{
self.emit_error(Error::BadCharacter(c));
}
#[cfg(feature = "spans")]
{
self.spans.current_pos += c.len_utf8();
}
self.current_char = c;
Some(c)
}
//§ tokenization
// Get the next input character, if one is available.
fn get_char(&mut self, input: &mut BufferQueue) -> Option<char> {
if self.reconsume {
self.reconsume = false;
Some(self.current_char)
} else {
input
.next()
.and_then(|c| self.get_preprocessed_char(c, input))
}
}
fn pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult> {
// Bail to the slow path for various corner cases.
// This means that `FromSet` can contain characters not in the set!
// It shouldn't matter because the fallback `FromSet` case should
// always do the same thing as the `NotFromSet` case.
if self.opts.exact_errors || self.reconsume || self.ignore_lf {
return self.get_char(input).map(FromSet);
}
let d = input.pop_except_from(set);
match d {
Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet),
// NB: We don't set self.current_char for a run of characters not
// in the set. It shouldn't matter for the codepaths that use
// this.
_ => d,
}
}
// Check if the next characters are an ASCII case-insensitive match. See
// BufferQueue::eat.
//
// NB: this doesn't do input stream preprocessing or set the current input
// character.
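// With `eq = u8::eq_ignore_ascii_case` (as in the `eat!` macro below), the
// pattern "doctype" matches `DOCTYPE`, `doctype`, `DocType`, and so on.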
fn eat(
&mut self,
input: &mut BufferQueue,
pat: &str,
eq: fn(&u8, &u8) -> bool,
) -> Option<bool> {
input.push_front(replace(&mut self.temp_buf, String::new()));
match input.eat(pat, eq) {
None if self.at_eof => Some(false),
None => {
while let Some(c) = input.next() {
self.temp_buf.push(c);
}
None
}
Some(matched) => Some(matched),
}
}
/// Run the state machine for as long as we can.
fn run(&mut self, input: &mut BufferQueue) -> TokenizerResult {
if self.opts.profile {
loop {
let state = self.state;
let old_sink = self.time_in_sink;
let (run, mut dt) = time!(self.step(input));
dt -= self.time_in_sink - old_sink;
let new = match self.state_profile.get_mut(&state) {
Some(x) => {
*x += dt;
false
}
None => true,
};
if new {
// do this here because of borrow shenanigans
self.state_profile.insert(state, dt);
}
match run {
ControlFlow::Continue(()) => (),
ControlFlow::Break(ProcessResult::Suspend) => break,
ControlFlow::Break(ProcessResult::Break) => return TokenizerResult::Break,
}
}
} else {
loop {
match self.step(input) {
ControlFlow::Continue(()) => (),
ControlFlow::Break(ProcessResult::Suspend) => break,
ControlFlow::Break(ProcessResult::Break) => return TokenizerResult::Break,
}
}
}
TokenizerResult::Done
}
fn bad_char_error(&mut self) {
self.emit_error(Error::UnexpectedCharacter(
self.current_char,
InternalState(self.state),
));
}
fn bad_eof_error(&mut self) {
self.emit_error(Error::UnexpectedEOF(InternalState(self.state)));
}
fn emit_char(&mut self, c: char) {
self.process_token_and_continue(match c {
'\0' => NullCharacterToken,
_ => CharacterTokens(String::from(c)),
});
}
// The string must not contain '\0'!
fn emit_chars(&mut self, b: String) {
self.process_token_and_continue(CharacterTokens(b));
}
fn emit_current_tag(&mut self) -> ControlFlow<ProcessResult> {
self.finish_attribute();
let name = self.current_tag_name.clone();
self.current_tag_name.clear();
match self.current_tag_kind {
StartTag => {
self.last_start_tag_name = Some(name.clone());
}
EndTag => {
if !self.current_tag_attrs.is_empty() {
self.emit_error(Error::AttributesOnEndTag);
}
if self.current_tag_self_closing {
self.emit_error(Error::SelfClosingEndTag);
}
}
}
// https://html.spec.whatwg.org/multipage/#concept-frag-parse-context
let next_state = match name.as_str() {
"title" | "textarea" => states::RawData(RawKind::Rcdata),
"style" | "xmp" | "iframe" | "noembed" | "noframes" => {
states::RawData(RawKind::Rawtext)
}
"script" => states::RawData(RawKind::ScriptData),
"plaintext" => states::Plaintext,
_other => states::Data,
};
let token = TagToken(Tag {
kind: self.current_tag_kind,
name,
self_closing: self.current_tag_self_closing,
attrs: replace(&mut self.current_tag_attrs, vec![]),
#[cfg(feature = "spans")]
name_span: self.spans.current_tag_name.clone(),
});
match self.process_token(token) {
TokenSinkResult::Continue => {
self.state = next_state;
ControlFlow::Continue(())
}
TokenSinkResult::Plaintext => {
self.state = states::Plaintext;
ControlFlow::Continue(())
}
TokenSinkResult::Break => {
self.state = states::Data;
ControlFlow::Break(ProcessResult::Break)
}
TokenSinkResult::RawData(kind) => {
self.state = states::RawData(kind);
ControlFlow::Continue(())
}
}
}
fn emit_temp_buf(&mut self) {
// FIXME: Make sure that clearing on emit is spec-compatible.
let buf = replace(&mut self.temp_buf, String::new());
self.emit_chars(buf);
}
fn clear_temp_buf(&mut self) {
// Do this without a new allocation.
self.temp_buf.clear();
}
fn emit_current_comment(&mut self) {
let comment = replace(&mut self.current_comment, String::new());
self.process_token_and_continue(CommentToken(comment));
}
fn discard_tag(&mut self) {
self.current_tag_name.clear();
self.current_tag_self_closing = false;
self.current_tag_attrs = vec![];
}
fn create_tag(&mut self, kind: TagKind, c: char) {
self.discard_tag();
self.current_tag_name.push(c);
self.current_tag_kind = kind;
}
fn have_appropriate_end_tag(&self) -> bool {
match self.last_start_tag_name.as_ref() {
Some(last) => (self.current_tag_kind == EndTag) && (*self.current_tag_name == **last),
None => false,
}
}
fn create_attribute(&mut self, c: char) {
self.finish_attribute();
self.current_attr_name.push(c);
}
fn finish_attribute(&mut self) {
if self.current_attr_name.is_empty() {
return;
}
// Check for a duplicate attribute.
// FIXME: the spec says we should error as soon as the name is finished.
// FIXME: linear time search, do we care?
let dup = {
let name = &*self.current_attr_name;
self.current_tag_attrs.iter().any(|a| &*a.name == name)
};
if dup {
self.emit_error(Error::DuplicateAttribute {
#[cfg(feature = "spans")]
span: self.spans.current_attr_name.clone(),
});
self.current_attr_name.clear();
self.current_attr_value.clear();
} else {
let name = self.current_attr_name.clone();
self.current_attr_name.clear();
self.current_tag_attrs.push(Attribute {
name,
value: replace(&mut self.current_attr_value, String::new()),
#[cfg(feature = "spans")]
name_span: self.spans.current_attr_name.clone(),
#[cfg(feature = "spans")]
value_span: self.spans.current_attr_value.clone(),
});
}
}
fn emit_current_doctype(&mut self) {
let doctype = replace(&mut self.current_doctype, Doctype::new());
self.process_token_and_continue(DoctypeToken(doctype));
}
fn doctype_id(&mut self, kind: DoctypeIdKind) -> &mut Option<String> {
match kind {
Public => &mut self.current_doctype.public_id,
System => &mut self.current_doctype.system_id,
}
}
fn clear_doctype_id(&mut self, kind: DoctypeIdKind) {
let id = self.doctype_id(kind);
match *id {
Some(ref mut s) => s.clear(),
None => *id = Some(String::new()),
}
}
fn consume_char_ref(&mut self, addnl_allowed: Option<char>) {
// NB: The char ref tokenizer assumes we have an additional allowed
// character iff we're tokenizing in an attribute value.
self.char_ref_tokenizer = Some(Box::new(CharRefTokenizer::new(addnl_allowed)));
}
fn emit_eof(&mut self) {
self.process_token_and_continue(EOFToken);
}
fn peek(&mut self, input: &BufferQueue) -> Option<char> {
if self.reconsume {
Some(self.current_char)
} else {
input.peek()
}
}
fn discard_char(&mut self, input: &mut BufferQueue) {
self.get_char(input);
}
fn emit_error(&mut self, error: Error) {
self.process_token_and_continue(ParseError {
error,
#[cfg(feature = "spans")]
span: self.spans.current_pos - 1..self.spans.current_pos - 1,
});
}
}
//§ END
// Shorthand for common state machine behaviors.
macro_rules! shorthand (
( $me:ident : emit $c:expr ) => ( $me.emit_char($c) );
( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c) );
( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.push($c) );
( $me:ident : discard_tag ) => ( $me.discard_tag() );
( $me:ident : discard_char $input:expr ) => ( $me.discard_char($input) );
( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.push($c) );
( $me:ident : emit_temp ) => ( $me.emit_temp_buf() );
( $me:ident : clear_temp ) => ( $me.clear_temp_buf() );
( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c) );
( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.push($c) );
( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.push($c) );
( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.push_str($c) );
( $me:ident : push_comment $c:expr ) => ( $me.current_comment.push($c) );
( $me:ident : append_comment $c:expr ) => ( $me.current_comment.push_str($c) );
( $me:ident : emit_comment ) => ( $me.emit_current_comment() );
( $me:ident : clear_comment ) => ( $me.current_comment.clear() );
( $me:ident : create_doctype ) => ( $me.current_doctype = Doctype::new() );
( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.name, $c) );
( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push($me.doctype_id($k), $c) );
( $me:ident : clear_doctype_id $k:ident ) => ( $me.clear_doctype_id($k) );
( $me:ident : force_quirks ) => ( $me.current_doctype.force_quirks = true );
( $me:ident : emit_doctype ) => ( $me.emit_current_doctype() );
( $me:ident : error ) => ( $me.bad_char_error() );
( $me:ident : error_eof ) => ( $me.bad_eof_error() );
);
// Tracing of tokenizer actions. This adds significant bloat and compile time,
// so it's behind a cfg flag.
#[cfg(trace_tokenizer)]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
println!("  {}", stringify!($($cmds)*));
shorthand!($me: $($cmds)*);
}));
#[cfg(not(trace_tokenizer))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
// A little DSL for sequencing shorthand actions.
macro_rules! go (
( $me:ident : to $s:ident ) => ({ $me.state = states::$s; ControlFlow::Continue(()) });
( $me:ident : to $s:ident $k1:expr ) => ({ $me.state = states::$s($k1); ControlFlow::Continue(()) });
( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state = states::$s($k1($k2)); ControlFlow::Continue(()) });
( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(None); ControlFlow::Continue(()) });
( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); ControlFlow::Continue(()) });
// We have a default next state after emitting a tag, but the sink can override.
( $me:ident : emit_tag $s:ident ) => ({
$me.state = states::$s;
$me.emit_current_tag()
});
// If nothing else matched, it's a single command
( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+) );
);
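// For example, `go!(self: to RawData Rcdata)` expands to
// `{ self.state = states::RawData(Rcdata); ControlFlow::Continue(()) }`,
// while a bare command such as `go!(self: emit '<')` falls through to
// `sh_trace!` and ultimately `self.emit_char('<')`.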
macro_rules! go_match ( ( $me:ident : $x:expr, $($pats:pat),+ => $($cmds:tt)* ) => (
match $x {
$($pats)|+ => go!($me: $($cmds)*),
_ => (),
}
));
// This is a macro because it can cause early return
// from the function where it is used.
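// `Continue(c)` yields `c` through the `?` operator, while `Break(Suspend)`
// propagates out of the calling function via `ControlFlow`'s `Try` impl.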
macro_rules! get_char ( ($me:expr, $input:expr) => (
match $me.get_char($input) {
Some(char) => ControlFlow::Continue(char),
None => ControlFlow::Break(ProcessResult::Suspend)
}
));
macro_rules! peek ( ($me:expr, $input:expr) => (
match $me.peek($input) {
Some(char) => ControlFlow::Continue(char),
None => ControlFlow::Break(ProcessResult::Suspend)
}
));
macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
match $me.pop_except_from($input, $set) {
Some(char) => ControlFlow::Continue(char),
None => ControlFlow::Break(ProcessResult::Suspend)
}
));
macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
match $me.eat($input, $pat, u8::eq_ignore_ascii_case) {
Some(matched) => ControlFlow::Continue(matched),
None => ControlFlow::Break(ProcessResult::Suspend)
}
));
macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => (
match $me.eat($input, $pat, u8::eq) {
Some(matched) => ControlFlow::Continue(matched),
None => ControlFlow::Break(ProcessResult::Suspend)
}
));
impl<Sink: TokenSink> Tokenizer<Sink> {
// Run one step of the state machine.
// Returns `ControlFlow::Continue(())` if we should be immediately
// re-invoked (this simplifies control flow vs. break / continue).
#[allow(clippy::never_loop)]
fn step(&mut self, input: &mut BufferQueue) -> ControlFlow<ProcessResult> {
if self.char_ref_tokenizer.is_some() {
return self.step_char_ref_tokenizer(input);
}
match self.state {
//§ data-state
states::Data => loop {
match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n'))? {
FromSet('\0') => {
go!(self: error);
go!(self: emit '\0');
}
FromSet('&') => {
return go!(self: consume_char_ref);
}
FromSet('<') => {
return go!(self: to TagOpen);
}
FromSet(c) => {
go!(self: emit c);
}
NotFromSet(b) => self.emit_chars(b),
}
},
//§ rcdata-state
states::RawData(Rcdata) => loop {
match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n'))? {
FromSet('\0') => {
go!(self: error);
go!(self: emit '\u{fffd}');
}
FromSet('&') => {
return go!(self: consume_char_ref);
}
FromSet('<') => {
return go!(self: to RawLessThanSign Rcdata);
}
FromSet(c) => {
go!(self: emit c);
}
NotFromSet(b) => self.emit_chars(b),
}
},
//§ rawtext-state
states::RawData(Rawtext) => loop {
match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n'))? {
FromSet('\0') => {
go!(self: error);
go!(self: emit '\u{fffd}');
}
FromSet('<') => {
return go!(self: to RawLessThanSign Rawtext);
}
FromSet(c) => {
go!(self: emit c);
}
NotFromSet(b) => self.emit_chars(b),
}
},
//§ script-data-state
states::RawData(ScriptData) => loop {
match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n'))? {
FromSet('\0') => {
go!(self: error);
go!(self: emit '\u{fffd}');
}
FromSet('<') => {
return go!(self: to RawLessThanSign ScriptData);
}
FromSet(c) => {
go!(self: emit c);
}
NotFromSet(b) => self.emit_chars(b),
}
},
//§ script-data-escaped-state
states::RawData(ScriptDataEscaped(Escaped)) => loop {
match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n'))? {
FromSet('\0') => {
go!(self: error);
go!(self: emit '\u{fffd}');
}
FromSet('-') => {
go!(self: emit '-');
return go!(self: to ScriptDataEscapedDash Escaped);
}
FromSet('<') => {
return go!(self: to RawLessThanSign ScriptDataEscaped Escaped);
}
FromSet(c) => {
go!(self: emit c);
}
NotFromSet(b) => self.emit_chars(b),
}
},
//§ script-data-double-escaped-state
states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop {
match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n'))? {
FromSet('\0') => {
go!(self: error);
go!(self: emit '\u{fffd}');
}
FromSet('-') => {
go!(self: emit '-');
return go!(self: to ScriptDataEscapedDash DoubleEscaped);
}
FromSet('<') => {
go!(self: emit '<');
return go!(self: to RawLessThanSign ScriptDataEscaped DoubleEscaped);
}
FromSet(c) => {
go!(self: emit c);
}
NotFromSet(b) => self.emit_chars(b),
}
},
//§ plaintext-state
states::Plaintext => loop {
match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n'))? {
FromSet('\0') => {
go!(self: error);
go!(self: emit '\u{fffd}');
}
FromSet(c) => {
go!(self: emit c);
}
NotFromSet(b) => self.emit_chars(b),
}
},
//§ tag-open-state
states::TagOpen => loop {
match get_char!(self, input)? {
'!' => {
go!(self: clear_temp);
return go!(self: to MarkupDeclarationOpen);
}
'/' => {
return go!(self: to EndTagOpen);
}
'?' => {
go!(self: error);
go!(self: clear_comment);
go!(self: push_comment '?');
return go!(self: to BogusComment);
}
c => match lower_ascii_letter(c) {
Some(cl) => {
#[cfg(feature = "spans")]
{
self.spans.current_tag_name.start = self.spans.current_pos - 1;
}
go!(self: create_tag StartTag cl);
return go!(self: to TagName);
}
None => {
go!(self: error);
go!(self: emit '<');
self.reconsume = true;
return go!(self: to Data);
}
},
}
},
//§ end-tag-open-state
states::EndTagOpen => loop {
match get_char!(self, input)? {
'>' => {
go!(self: error);
return go!(self: to Data);
}
'\0' => {
go!(self: error);
go!(self: clear_comment);
go!(self: push_comment '\u{fffd}');
return go!(self: to BogusComment);
}
c => match lower_ascii_letter(c) {
Some(cl) => {
#[cfg(feature = "spans")]
{
self.spans.current_tag_name.start = self.spans.current_pos - 1;
}
go!(self: create_tag EndTag cl);
return go!(self: to TagName);
}
None => {
go!(self: error);
go!(self: clear_comment);
go!(self: push_comment c);
return go!(self: to BogusComment);
}
},
}
},
//§ tag-name-state
states::TagName => loop {
match get_char!(self, input)? {
'\t' | '\n' | '\x0C' | ' ' => {
#[cfg(feature = "spans")]
self.spans.end_tag_name();
return go!(self: to BeforeAttributeName);
}
'/' => {
#[cfg(feature = "spans")]
self.spans.end_tag_name();
return go!(self: to SelfClosingStartTag);
}
'>' => {
#[cfg(feature = "spans")]
self.spans.end_tag_name();
return go!(self: emit_tag Data);
}
'\0' => {
go!(self: error);
go!(self: push_tag '\u{fffd}');
}
c => {
go!(self: push_tag (c.to_ascii_lowercase()));
}
}
},
//§ script-data-escaped-less-than-sign-state
states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop {
match get_char!(self, input)? {
'/' => {
go!(self: clear_temp);
return go!(self: to RawEndTagOpen ScriptDataEscaped Escaped);
}
c => match lower_ascii_letter(c) {
Some(cl) => {
go!(self: clear_temp);
go!(self: push_temp cl);
go!(self: emit '<');
go!(self: emit c);
return go!(self: to ScriptDataEscapeStart DoubleEscaped);
}
None => {
go!(self: emit '<');
self.reconsume = true;
return go!(self: to RawData ScriptDataEscaped Escaped);
}
},
}
},
//§ script-data-double-escaped-less-than-sign-state
states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop {
match get_char!(self, input)? {
'/' => {
go!(self: clear_temp);
go!(self: emit '/');
return go!(self: to ScriptDataDoubleEscapeEnd);
}
_ => {
self.reconsume = true;
return go!(self: to RawData ScriptDataEscaped DoubleEscaped);
}
}
},
//§ rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state
// otherwise
states::RawLessThanSign(kind) => loop {
match get_char!(self, input)? {
'/' => {
go!(self: clear_temp);
return go!(self: to RawEndTagOpen kind);
}
'!' if kind == ScriptData => {
go!(self: emit '<');
go!(self: emit '!');
return go!(self: to ScriptDataEscapeStart Escaped);
}
_ => {
go!(self: emit '<');
self.reconsume = true;
return go!(self: to RawData kind);
}
}
},
//§ rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state
states::RawEndTagOpen(kind) => loop {
let c = get_char!(self, input)?;
match lower_ascii_letter(c) {
Some(cl) => {
go!(self: create_tag EndTag cl);
go!(self: push_temp c);
return go!(self: to RawEndTagName kind);
}
None => {
go!(self: emit '<');
go!(self: emit '/');
self.reconsume = true;
return go!(self: to RawData kind);
}
}
},
//§ rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state
states::RawEndTagName(kind) => loop {
let c = get_char!(self, input)?;
if self.have_appropriate_end_tag() {
match c {
'\t' | '\n' | '\x0C' | ' ' => {
return go!(self: to BeforeAttributeName);
}
'/' => {
return go!(self: to SelfClosingStartTag);
}
'>' => {
return go!(self: emit_tag Data);
}
_ => (),
}
}
match lower_ascii_letter(c) {
Some(cl) => {
go!(self: push_tag cl);
go!(self: push_temp c);
}
None => {
go!(self: discard_tag);
go!(self: emit '<');
go!(self: emit '/');
go!(self: emit_temp);
self.reconsume = true;
return go!(self: to RawData kind);
}
}
},
//§ script-data-double-escape-start-state
states::ScriptDataEscapeStart(DoubleEscaped) => loop {
let c = get_char!(self, input)?;
match c {
'\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
let esc = if &*self.temp_buf == "script" {
DoubleEscaped
} else {
Escaped
};
go!(self: emit c);
return go!(self: to RawData ScriptDataEscaped esc);
}
_ => match lower_ascii_letter(c) {
Some(cl) => {
go!(self: push_temp cl);
go!(self: emit c);
}
None => {
self.reconsume = true;
return go!(self: to RawData ScriptDataEscaped Escaped);
}
},
}
},
//§ script-data-escape-start-state
states::ScriptDataEscapeStart(Escaped) => loop {
match get_char!(self, input)? {
'-' => {
go!(self: emit '-');
return go!(self: to ScriptDataEscapeStartDash);
}
_ => {
self.reconsume = true;
return go!(self: to RawData ScriptData);
}
}
},
//§ script-data-escape-start-dash-state
states::ScriptDataEscapeStartDash => loop {
match get_char!(self, input)? {
'-' => {
go!(self: emit '-');
return go!(self: to ScriptDataEscapedDashDash Escaped);
}
_ => {
self.reconsume = true;
return go!(self: to RawData ScriptData);
}
}
},
//§ script-data-escaped-dash-state script-data-double-escaped-dash-state
states::ScriptDataEscapedDash(kind) => loop {
match get_char!(self, input)? {
'-' => {
go!(self: emit '-');
return go!(self: to ScriptDataEscapedDashDash kind);
}
'<' => {
if kind == DoubleEscaped {
go!(self: emit '<');
}
return go!(self: to RawLessThanSign ScriptDataEscaped kind);
}
'\0' => {
go!(self: error);
go!(self: emit '\u{fffd}');
return go!(self: to RawData ScriptDataEscaped kind);
}
c => {
go!(self: emit c);
return go!(self: to RawData ScriptDataEscaped kind);
}
}
},
//§ script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state
states::ScriptDataEscapedDashDash(kind) => loop {
match get_char!(self, input)? {
'-' => {
go!(self: emit '-');
}
'<' => {
if kind == DoubleEscaped {
go!(self: emit '<');
}
return go!(self: to RawLessThanSign ScriptDataEscaped kind);
}
'>' => {
go!(self: emit '>');
return go!(self: to RawData ScriptData);
}
'\0' => {
go!(self: error);
go!(self: emit '\u{fffd}');
return go!(self: to RawData ScriptDataEscaped kind);
}
c => {
go!(self: emit c);
return go!(self: to RawData ScriptDataEscaped kind);
}
}
},
//§ script-data-double-escape-end-state
states::ScriptDataDoubleEscapeEnd => loop {
let c = get_char!(self, input)?;
match c {
'\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
let esc = if &*self.temp_buf == "script" {
Escaped
} else {
DoubleEscaped
};
go!(self: emit c);
return go!(self: to RawData ScriptDataEscaped esc);
}
_ => match lower_ascii_letter(c) {
Some(cl) => {
go!(self: push_temp cl);
go!(self: emit c);
}
None => {
self.reconsume = true;
return go!(self: to RawData ScriptDataEscaped DoubleEscaped);
}
},
}
},
//§ before-attribute-name-state
states::BeforeAttributeName => loop {
match get_char!(self, input)? {
'\t' | '\n' | '\x0C' | ' ' => (),
'/' => {
return go!(self: to SelfClosingStartTag);
}
'>' => {
return go!(self: emit_tag Data);
}
'\0' => {
go!(self: error);
go!(self: create_attr '\u{fffd}');
return go!(self: to AttributeName);
}
c => match lower_ascii_letter(c) {
Some(cl) => {
go!(self: create_attr cl);
#[cfg(feature = "spans")]
{
self.spans.current_attr_name.start = self.spans.current_pos - 1;
}
return go!(self: to AttributeName);
}
None => {
go_match!(self: c,
'"' , '\'' , '<' , '=' => error);
go!(self: create_attr c);
return go!(self: to AttributeName);
}
},
}
},
//§ attribute-name-state
states::AttributeName => loop {
match get_char!(self, input)? {
'\t' | '\n' | '\x0C' | ' ' => {
#[cfg(feature = "spans")]
self.spans.end_attr_name();
return go!(self: to AfterAttributeName);
}
'/' => {
#[cfg(feature = "spans")]
self.spans.end_attr_name();
return go!(self: to SelfClosingStartTag);
}
'=' => {
#[cfg(feature = "spans")]
self.spans.end_attr_name();
return go!(self: to BeforeAttributeValue);
}
'>' => {
#[cfg(feature = "spans")]
self.spans.end_attr_name();
return go!(self: emit_tag Data);
}
'\0' => {
go!(self: error);
go!(self: push_name '\u{fffd}');
}
c => match lower_ascii_letter(c) {
Some(cl) => {
go!(self: push_name cl);
}
None => {
go_match!(self: c,
'"' , '\'' , '<' => error);
go!(self: push_name c);
}
},
}
},
//§ after-attribute-name-state
states::AfterAttributeName => loop {
match get_char!(self, input)? {
'\t' | '\n' | '\x0C' | ' ' => (),
'/' => {
return go!(self: to SelfClosingStartTag);
}
'=' => {
return go!(self: to BeforeAttributeValue);
}
'>' => {
return go!(self: emit_tag Data);
}
'\0' => {
go!(self: error);
go!(self: create_attr '\u{fffd}');
return go!(self: to AttributeName);
}
c => match lower_ascii_letter(c) {
Some(cl) => {
go!(self: create_attr cl);
return go!(self: to AttributeName);
}
None => {
go_match!(self: c,
'"' , '\'' , '<' => error);
go!(self: create_attr c);
return go!(self: to AttributeName);
}
},
}
},
//§ before-attribute-value-state
// Use peek so we can handle the first attr character along with the rest,
// hopefully in the same zero-copy buffer.
states::BeforeAttributeValue => loop {
match peek!(self, input)? {
'\t' | '\n' | '\r' | '\x0C' | ' ' => {
go!(self: discard_char input);
}
'"' => {
go!(self: discard_char input);
return go!(self: to AttributeValue DoubleQuoted);
}
'\'' => {
go!(self: discard_char input);
return go!(self: to AttributeValue SingleQuoted);
}
'\0' => {
go!(self: discard_char input);
go!(self: error);
go!(self: push_value '\u{fffd}');
return go!(self: to AttributeValue Unquoted);
}
'>' => {
go!(self: discard_char input);
go!(self: error);
return go!(self: emit_tag Data);
}
_ => {
return go!(self: to AttributeValue Unquoted);
}
}
},
//§ attribute-value-(double-quoted)-state
states::AttributeValue(DoubleQuoted) => {
#[cfg(feature = "spans")]
{
self.spans.current_attr_value.start = self.spans.current_pos;
}
loop {
match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n'))? {
FromSet('"') => {
return go!(self: to AfterAttributeValueQuoted);
}
FromSet('&') => {
return go!(self: consume_char_ref '"');
}
FromSet('\0') => {
go!(self: error);
go!(self: push_value '\u{fffd}');
}
FromSet(c) => {
go!(self: push_value c);
}
NotFromSet(ref b) => {
go!(self: append_value b);
}
}
}
}
//§ attribute-value-(single-quoted)-state
states::AttributeValue(SingleQuoted) => {
#[cfg(feature = "spans")]
{
self.spans.current_attr_value.start = self.spans.current_pos;
}
loop {
match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n'))? {
FromSet('\'') => {
return go!(self: to AfterAttributeValueQuoted);
}
FromSet('&') => {
return go!(self: consume_char_ref '\'');
}
FromSet('\0') => {
go!(self: error);
go!(self: push_value '\u{fffd}');
}
FromSet(c) => {
go!(self: push_value c);
}
NotFromSet(ref b) => {
go!(self: append_value b);
}
}
}
}
//§ attribute-value-(unquoted)-state
states::AttributeValue(Unquoted) => {
#[cfg(feature = "spans")]
{
self.spans.current_attr_value.start = self.spans.current_pos;
}
loop {
match pop_except_from!(
self,
input,
small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
)? {
FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
#[cfg(feature = "spans")]
{
self.spans.current_attr_value.end = self.spans.current_pos - 1;
}
return go!(self: to BeforeAttributeName);
}
FromSet('&') => {
return go!(self: consume_char_ref '>');
}
FromSet('>') => {
#[cfg(feature = "spans")]
{
self.spans.current_attr_value.end = self.spans.current_pos - 1;
}
return go!(self: emit_tag Data);
}
FromSet('\0') => {
go!(self: error);
go!(self: push_value '\u{fffd}');
}
FromSet(c) => {
go_match!(self: c,
'"' , '\'' , '<' , '=' , '`' => error);
go!(self: push_value c);
}
NotFromSet(ref b) => {
go!(self: append_value b);
}
}
}
}
//§ after-attribute-value-(quoted)-state
states::AfterAttributeValueQuoted => {
#[cfg(feature = "spans")]
{
self.spans.current_attr_value.end = self.spans.current_pos - 1;
}
loop {
match get_char!(self, input)? {
'\t' | '\n' | '\x0C' | ' ' => {
return go!(self: to BeforeAttributeName);
}
'/' => {
return go!(self: to SelfClosingStartTag);
}
'>' => {
return go!(self: emit_tag Data);
}
_ => {
go!(self: error);
self.reconsume = true;
return go!(self: to BeforeAttributeName);
}
}
}
}
//§ self-closing-start-tag-state
states::SelfClosingStartTag => loop {
match get_char!(self, input)? {
'>' => {
self.current_tag_self_closing = true;
return go!(self: emit_tag Data);
}
_ => {
go!(self: error);
self.reconsume = true;
return go!(self: to BeforeAttributeName);
}
}
},
//§ comment-start-state
states::CommentStart => loop {
match get_char!(self, input)? {
'-' => {
return go!(self: to CommentStartDash);
}
'\0' => {
go!(self: error);
go!(self: push_comment '\u{fffd}');
return go!(self: to Comment);
}
'>' => {
go!(self: error);
go!(self: emit_comment);
return go!(self: to Data);
}
c => {
go!(self: push_comment c);
return go!(self: to Comment);
}
}
},
//§ comment-start-dash-state
states::CommentStartDash => loop {
match get_char!(self, input)? {
'-' => {
return go!(self: to CommentEnd);
}
'\0' => {
go!(self: error);
go!(self: append_comment "-\u{fffd}");
return go!(self: to Comment);
}
'>' => {
go!(self: error);
go!(self: emit_comment);
return go!(self: to Data);
}
c => {
go!(self: push_comment '-');
go!(self: push_comment c);
return go!(self: to Comment);
}
}
},
//§ comment-state
states::Comment => loop {
match get_char!(self, input)? {
'-' => {
return go!(self: to CommentEndDash);
}
'\0' => {
go!(self: error);
go!(self: push_comment '\u{fffd}');
}
c => {
go!(self: push_comment c);
}
}
},
//§ comment-end-dash-state
states::CommentEndDash => loop {
match get_char!(self, input)? {
'-' => {
return go!(self: to CommentEnd);
}
'\0' => {
go!(self: error);
go!(self: append_comment "-\u{fffd}");
return go!(self: to Comment);
}
c => {
go!(self: push_comment '-');
go!(self: push_comment c);
return go!(self: to Comment);
}
}
},
//§ comment-end-state
states::CommentEnd => loop {
match get_char!(self, input)? {
'>' => {
go!(self: emit_comment);
return go!(self: to Data);
}
'\0' => {
go!(self: error);
go!(self: append_comment "--\u{fffd}");
return go!(self: to Comment);
}
'!' => {
go!(self: error);
return go!(self: to CommentEndBang);
}
'-' => {
go!(self: error);
go!(self: push_comment '-');
}
c => {
go!(self: error);
go!(self: append_comment "--");
go!(self: push_comment c);
return go!(self: to Comment);
}
}
},
//§ comment-end-bang-state
states::CommentEndBang => loop {
match get_char!(self, input)? {
'-' => {
go!(self: append_comment "--!");
return go!(self: to CommentEndDash);
}
'>' => {
go!(self: emit_comment);
return go!(self: to Data);
}
'\0' => {
go!(self: error);
go!(self: append_comment "--!\u{fffd}");
return go!(self: to Comment);
}
c => {
go!(self: append_comment "--!");
go!(self: push_comment c);
return go!(self: to Comment);
}
}
},
//§ doctype-state
states::Doctype => loop {
match get_char!(self, input)? {
'\t' | '\n' | '\x0C' | ' ' => {
return go!(self: to BeforeDoctypeName);
}
_ => {
go!(self: error);
self.reconsume = true;
return go!(self: to BeforeDoctypeName);
}
}
},
//§ before-doctype-name-state
states::BeforeDoctypeName => loop {
match get_char!(self, input)? {
'\t' | '\n' | '\x0C' | ' ' => (),
'\0' => {
go!(self: error);
go!(self: create_doctype);
go!(self: push_doctype_name '\u{fffd}');
return go!(self: to DoctypeName);
}
'>' => {
go!(self: error);
go!(self: create_doctype);
go!(self: force_quirks);
go!(self: emit_doctype);
return go!(self: to Data);
}
c => {
go!(self: create_doctype);
go!(self: push_doctype_name (c.to_ascii_lowercase()));
return go!(self: to DoctypeName);
}
}
},
//§ doctype-name-state
states::DoctypeName => loop {
match get_char!(self, input)? {
'\t' | '\n' | '\x0C' | ' ' => {
go!(self: clear_temp);
return go!(self: to AfterDoctypeName);
}
'>' => {
go!(self: emit_doctype);
return go!(self: to Data);
}
'\0' => {
go!(self: error);
go!(self: push_doctype_name '\u{fffd}');
}
c => {
go!(self: push_doctype_name (c.to_ascii_lowercase()));
}
}
},
//§ after-doctype-name-state
states::AfterDoctypeName => loop {
if eat!(self, input, "public")? {
return go!(self: to AfterDoctypeKeyword Public);
} else if eat!(self, input, "system")? {
return go!(self: to AfterDoctypeKeyword System);
} else {
match get_char!(self, input)? {
'\t' | '\n' | '\x0C' | ' ' => (),
'>' => {
go!(self: emit_doctype);
return go!(self: to Data);
}
_ => {
go!(self: error);
go!(self: force_quirks);
return go!(self: to BogusDoctype);
}
}
}
},
//§ after-doctype-public-keyword-state after-doctype-system-keyword-state
states::AfterDoctypeKeyword(kind) => loop {
match get_char!(self, input)? {
'\t' | '\n' | '\x0C' | ' ' => {
return go!(self: to BeforeDoctypeIdentifier kind);
}
'"' => {
go!(self: error);
go!(self: clear_doctype_id kind);
return go!(self: to DoctypeIdentifierDoubleQuoted kind);
}
'\'' => {
go!(self: error);
go!(self: clear_doctype_id kind);
return go!(self: to DoctypeIdentifierSingleQuoted kind);
}
'>' => {
go!(self: error);
go!(self: force_quirks);
go!(self: emit_doctype);
return go!(self: to Data);
}
_ => {
go!(self: error);
go!(self: force_quirks);
return go!(self: to BogusDoctype);
}
}
},
//§ before-doctype-public-identifier-state before-doctype-system-identifier-state
states::BeforeDoctypeIdentifier(kind) => loop {
match get_char!(self, input)? {
'\t' | '\n' | '\x0C' | ' ' => (),
'"' => {
go!(self: clear_doctype_id kind);
return go!(self: to DoctypeIdentifierDoubleQuoted kind);
}
'\'' => {
go!(self: clear_doctype_id kind);
return go!(self: to DoctypeIdentifierSingleQuoted kind);
}
'>' => {
go!(self: error);
go!(self: force_quirks);
go!(self: emit_doctype);
return go!(self: to Data);
}
_ => {
go!(self: error);
go!(self: force_quirks);
return go!(self: to BogusDoctype);
}
}
},
//§ doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state
states::DoctypeIdentifierDoubleQuoted(kind) => loop {
match get_char!(self, input)? {
'"' => {
return go!(self: to AfterDoctypeIdentifier kind);
}
'\0' => {
go!(self: error);
go!(self: push_doctype_id kind '\u{fffd}');
}
'>' => {
go!(self: error);
go!(self: force_quirks);
go!(self: emit_doctype);
return go!(self: to Data);
}
c => {
go!(self: push_doctype_id kind c);
}
}
},
//§ doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state
states::DoctypeIdentifierSingleQuoted(kind) => loop {
match get_char!(self, input)? {
'\'' => {
return go!(self: to AfterDoctypeIdentifier kind);
}
'\0' => {
go!(self: error);
go!(self: push_doctype_id kind '\u{fffd}');
}
'>' => {
go!(self: error);
go!(self: force_quirks);
go!(self: emit_doctype);
return go!(self: to Data);
}
c => {
go!(self: push_doctype_id kind c);
}
}
},
//§ after-doctype-public-identifier-state
states::AfterDoctypeIdentifier(Public) => loop {
match get_char!(self, input)? {
'\t' | '\n' | '\x0C' | ' ' => {
return go!(self: to BetweenDoctypePublicAndSystemIdentifiers);
}
'>' => {
go!(self: emit_doctype);
return go!(self: to Data);
}
'"' => {
go!(self: error);
go!(self: clear_doctype_id System);
return go!(self: to DoctypeIdentifierDoubleQuoted System);
}
'\'' => {
go!(self: error);
go!(self: clear_doctype_id System);
return go!(self: to DoctypeIdentifierSingleQuoted System);
}
_ => {
go!(self: error);
go!(self: force_quirks);
return go!(self: to BogusDoctype);
}
}
},
//§ after-doctype-system-identifier-state
states::AfterDoctypeIdentifier(System) => loop {
match get_char!(self, input)? {
'\t' | '\n' | '\x0C' | ' ' => (),
'>' => {
go!(self: emit_doctype);
return go!(self: to Data);
}
_ => {
go!(self: error);
return go!(self: to BogusDoctype);
}
}
},
//§ between-doctype-public-and-system-identifiers-state
states::BetweenDoctypePublicAndSystemIdentifiers => loop {
match get_char!(self, input)? {
'\t' | '\n' | '\x0C' | ' ' => (),
'>' => {
go!(self: emit_doctype);
return go!(self: to Data);
}
'"' => {
go!(self: clear_doctype_id System);
return go!(self: to DoctypeIdentifierDoubleQuoted System);
}
'\'' => {
go!(self: clear_doctype_id System);
return go!(self: to DoctypeIdentifierSingleQuoted System);
}
_ => {
go!(self: error);
go!(self: force_quirks);
return go!(self: to BogusDoctype);
}
}
},
//§ bogus-doctype-state
states::BogusDoctype => loop {
match get_char!(self, input)? {
'>' => {
go!(self: emit_doctype);
return go!(self: to Data);
}
_ => (),
}
},
//§ bogus-comment-state
states::BogusComment => loop {
match get_char!(self, input)? {
'>' => {
go!(self: emit_comment);
return go!(self: to Data);
}
'\0' => {
go!(self: push_comment '\u{fffd}');
}
c => {
go!(self: push_comment c);
}
}
},
//§ markup-declaration-open-state
states::MarkupDeclarationOpen => loop {
if eat_exact!(self, input, "--")? {
go!(self: clear_comment);
return go!(self: to CommentStart);
} else if eat!(self, input, "doctype")? {
return go!(self: to Doctype);
} else {
if self
.sink
.adjusted_current_node_present_but_not_in_html_namespace()
{
if eat_exact!(self, input, "[CDATA[")? {
go!(self: clear_temp);
return go!(self: to CdataSection);
}
}
go!(self: error);
return go!(self: to BogusComment);
}
},
//§ cdata-section-state
states::CdataSection => loop {
match get_char!(self, input)? {
']' => {
return go!(self: to CdataSectionBracket);
}
'\0' => {
go!(self: emit_temp);
go!(self: emit '\0');
}
c => {
go!(self: push_temp c);
}
}
},
//§ cdata-section-bracket
states::CdataSectionBracket => match get_char!(self, input)? {
']' => {
return go!(self: to CdataSectionEnd);
}
_ => {
go!(self: push_temp ']');
self.reconsume = true;
return go!(self: to CdataSection);
}
},
//§ cdata-section-end
states::CdataSectionEnd => loop {
match get_char!(self, input)? {
']' => {
go!(self: push_temp ']');
}
'>' => {
go!(self: emit_temp);
return go!(self: to Data);
}
_ => {
go!(self: push_temp ']');
go!(self: push_temp ']');
self.reconsume = true;
return go!(self: to CdataSection);
}
}
},
//§ END
}
}
fn step_char_ref_tokenizer(&mut self, input: &mut BufferQueue) -> ControlFlow<ProcessResult> {
// FIXME HACK: Take and replace the tokenizer so we don't
// double-mut-borrow self. This is why it's boxed.
let mut tok = self.char_ref_tokenizer.take().unwrap();
let outcome = tok.step(self, input);
let progress = match outcome {
char_ref::Done => {
self.process_char_ref(tok.get_result());
return ControlFlow::Continue(());
}
char_ref::Stuck => ControlFlow::Break(ProcessResult::Suspend),
char_ref::Progress => ControlFlow::Continue(()),
};
self.char_ref_tokenizer = Some(tok);
progress
}
fn process_char_ref(&mut self, char_ref: CharRef) {
let CharRef {
mut chars,
mut num_chars,
} = char_ref;
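// An empty result means the '&' did not begin a character reference;
// emit the ampersand itself.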
if num_chars == 0 {
chars[0] = '&';
num_chars = 1;
}
for i in 0..num_chars {
let c = chars[i as usize];
match self.state {
states::Data | states::RawData(states::Rcdata) => {
go!(self: emit c);
}
states::AttributeValue(_) => {
go!(self: push_value c);
}
_ => panic!(
"state {:?} should not be reachable in process_char_ref",
self.state
),
}
}
}
/// Indicate that we have reached the end of the input.
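/// This flushes the character-reference sub-tokenizer (if any), processes
/// any remaining buffered input, applies the EOF rules for the current
/// state, and finally calls `end()` on the sink.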
pub fn end(&mut self) {
// Handle EOF in the char ref sub-tokenizer, if there is one.
// Do this first because it might un-consume stuff.
let mut input = BufferQueue::new();
match self.char_ref_tokenizer.take() {
None => (),
Some(mut tok) => {
tok.end_of_file(self, &mut input);
self.process_char_ref(tok.get_result());
}
}
// Process all remaining buffered input.
// If we're waiting for lookahead, we're not gonna get it.
self.at_eof = true;
assert!(matches!(self.run(&mut input), TokenizerResult::Done));
assert!(input.is_empty());
loop {
match self.eof_step() {
ControlFlow::Continue(()) => (),
ControlFlow::Break(ProcessResult::Suspend) => break,
ControlFlow::Break(ProcessResult::Break) => unreachable!(),
}
}
self.sink.end();
if self.opts.profile {
self.dump_profile();
}
}
fn dump_profile(&self) {
let mut results: Vec<(states::State, u64)> =
self.state_profile.iter().map(|(s, t)| (*s, *t)).collect();
results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
let total: u64 = results
.iter()
.map(|&(_, t)| t)
.fold(0, ::std::ops::Add::add);
println!("\nTokenizer profile, in nanoseconds");
println!("\n{:12} total in token sink", self.time_in_sink);
println!("\n{:12} total in tokenizer", total);
for (k, v) in results.into_iter() {
let pct = 100.0 * (v as f64) / (total as f64);
println!("{:12} {:4.1}% {:?}", v, pct, k);
}
}
fn eof_step(&mut self) -> ControlFlow<ProcessResult> {
match self.state {
states::Data
| states::RawData(Rcdata)
| states::RawData(Rawtext)
| states::RawData(ScriptData)
| states::Plaintext => {
self.emit_eof();
ControlFlow::Break(ProcessResult::Suspend)
}
states::TagName
| states::RawData(ScriptDataEscaped(_))
| states::BeforeAttributeName
| states::AttributeName
| states::AfterAttributeName
| states::BeforeAttributeValue
| states::AttributeValue(_)
| states::AfterAttributeValueQuoted
| states::SelfClosingStartTag
| states::ScriptDataEscapedDash(_)
| states::ScriptDataEscapedDashDash(_) => {
go!(self: error_eof);
return go!(self: to Data);
}
states::TagOpen => {
go!(self: error_eof);
go!(self: emit '<');
return go!(self: to Data);
}
states::EndTagOpen => {
go!(self: error_eof);
go!(self: emit '<');
go!(self: emit '/');
return go!(self: to Data);
}
states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
return go!(self: to RawData ScriptDataEscaped DoubleEscaped);
}
states::RawLessThanSign(kind) => {
go!(self: emit '<');
return go!(self: to RawData kind);
}
states::RawEndTagOpen(kind) => {
go!(self: emit '<');
go!(self: emit '/');
return go!(self: to RawData kind);
}
states::RawEndTagName(kind) => {
go!(self: emit '<');
go!(self: emit '/');
go!(self: emit_temp);
return go!(self: to RawData kind);
}
states::ScriptDataEscapeStart(kind) => {
return go!(self: to RawData ScriptDataEscaped kind);
}
states::ScriptDataEscapeStartDash => {
return go!(self: to RawData ScriptData);
}
states::ScriptDataDoubleEscapeEnd => {
return go!(self: to RawData ScriptDataEscaped DoubleEscaped);
}
states::CommentStart
| states::CommentStartDash
| states::Comment
| states::CommentEndDash
| states::CommentEnd
| states::CommentEndBang => {
go!(self: error_eof);
go!(self: emit_comment);
return go!(self: to Data);
}
states::Doctype | states::BeforeDoctypeName => {
go!(self: error_eof);
go!(self: create_doctype);
go!(self: force_quirks);
go!(self: emit_doctype);
return go!(self: to Data);
}
states::DoctypeName
| states::AfterDoctypeName
| states::AfterDoctypeKeyword(_)
| states::BeforeDoctypeIdentifier(_)
| states::DoctypeIdentifierDoubleQuoted(_)
| states::DoctypeIdentifierSingleQuoted(_)
| states::AfterDoctypeIdentifier(_)
| states::BetweenDoctypePublicAndSystemIdentifiers => {
go!(self: error_eof);
go!(self: force_quirks);
go!(self: emit_doctype);
return go!(self: to Data);
}
states::BogusDoctype => {
go!(self: emit_doctype);
return go!(self: to Data);
}
states::BogusComment => {
go!(self: emit_comment);
return go!(self: to Data);
}
states::MarkupDeclarationOpen => {
go!(self: error);
return go!(self: to BogusComment);
}
states::CdataSection => {
go!(self: emit_temp);
go!(self: error_eof);
return go!(self: to Data);
}
states::CdataSectionBracket => {
go!(self: push_temp ']');
return go!(self: to CdataSection);
}
states::CdataSectionEnd => {
go!(self: push_temp ']');
go!(self: push_temp ']');
return go!(self: to CdataSection);
}
}
}
}
#[cfg(test)]
#[allow(non_snake_case)]
mod test {
use super::option_push; // private items
use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
use super::interface::{EndTag, StartTag, Tag};
use super::interface::{TagToken, Token};
use crate::util::buffer_queue::BufferQueue;
use std::mem::replace;
// LinesMatch implements the TokenSink trait. It is used for testing to see
// if current_line is being updated when process_token is called. The lines
// vector is a collection of the line numbers that each token is on.
struct LinesMatch {
current_str: String,
current_str_line: u64,
lines: Vec<(u64, Token)>,
}
impl LinesMatch {
fn new() -> LinesMatch {
LinesMatch {
current_str: String::new(),
current_str_line: 0,
lines: vec![],
}
}
fn push(&mut self, token: Token, line_number: u64) {
self.finish_str();
self.lines.push((line_number, token));
}
fn finish_str(&mut self) {
if !self.current_str.is_empty() {
let s = replace(&mut self.current_str, String::new());
self.push(CharacterTokens(s), self.current_str_line);
self.current_str_line = 0;
}
}
}
impl TokenSink for LinesMatch {
fn end(&mut self) {
self.finish_str();
}
fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult {
match token {
CharacterTokens(b) => {
self.current_str_line = line_number;
self.current_str.push_str(&b);
}
NullCharacterToken => {
self.current_str.push('\0');
}
token @ ParseError { .. } => {
self.push(token, line_number);
}
TagToken(mut t) => {
// The spec seems to indicate that one can emit
// erroneous end tags with attrs, but the test
// cases don't contain them.
match t.kind {
EndTag => {
t.self_closing = false;
t.attrs = vec![];
}
_ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
}
self.push(TagToken(t), line_number);
}
EOFToken => (),
_ => self.push(token, line_number),
}
TokenSinkResult::Continue
}
}
// Take in tokens, process them, and return vector with line
// numbers that each token is on
fn tokenize(input: Vec<String>, opts: TokenizerOpts) -> Vec<(u64, Token)> {
let sink = LinesMatch::new();
let mut tok = Tokenizer::new(sink, opts);
let mut buffer = BufferQueue::new();
for chunk in input.into_iter() {
buffer.push_back(chunk);
let _ = tok.feed(&mut buffer);
}
tok.end();
tok.sink.lines
}
// FUTURE: replace with std::assert_matches once stable
macro_rules! assert_matches {
($expr:expr, $($args:tt)+) => {
assert!(matches!($expr, $($args)*), "left matches right\n left: {:?}\nright: {}", &$expr, stringify!($($args)*))
};
}
#[test]
fn push_to_None_gives_singleton() {
let mut s: Option<String> = None;
option_push(&mut s, 'x');
assert_eq!(s, Some("x".into()));
}
#[test]
fn push_to_empty_appends() {
let mut s: Option<String> = Some(String::new());
option_push(&mut s, 'x');
assert_eq!(s, Some("x".into()));
}
#[test]
fn push_to_nonempty_appends() {
let mut s: Option<String> = Some(String::from("y"));
option_push(&mut s, 'x');
assert_eq!(s, Some("yx".into()));
}
fn opts() -> TokenizerOpts {
TokenizerOpts {
exact_errors: false,
discard_bom: true,
profile: false,
initial_state: None,
last_start_tag_name: None,
}
}
#[test]
fn check_lines() {
let opts = opts();
let vector = vec![
String::from("<a>\n"),
String::from("<b>\n"),
String::from("</b>\n"),
String::from("</a>\n"),
];
let results = tokenize(vector, opts);
assert_matches!(
&results[..],
[
(1, Token::TagToken(Tag{name: n1, kind: StartTag, ..})),
(2, CharacterTokens(c1)),
(2, Token::TagToken(Tag{name: n2, kind: StartTag, ..})),
(3, CharacterTokens(c2)),
(3, Token::TagToken(Tag{name: n3, kind: EndTag, ..})),
(4, CharacterTokens(c3)),
(4, Token::TagToken(Tag{name: n4, kind: EndTag, ..})),
(5, CharacterTokens(c4)),
] if
n1 == "a" && c1 == "\n" &&
n2 == "b" && c2 == "\n" &&
n3 == "b" && c3 == "\n" &&
n4 == "a" && c4 == "\n"
);
}
#[test]
fn check_lines_with_new_line() {
let opts = opts();
let vector = vec![
String::from("<a>\r\n"),
String::from("<b>\r\n"),
String::from("</b>\r\n"),
String::from("</a>\r\n"),
];
let results = tokenize(vector, opts);
assert_matches!(
&results[..],
[
(1, Token::TagToken(Tag{name: n1, kind: StartTag, ..})),
(2, CharacterTokens(c1)),
(2, Token::TagToken(Tag{name: n2, kind: StartTag, ..})),
(3, CharacterTokens(c2)),
(3, Token::TagToken(Tag{name: n3, kind: EndTag, ..})),
(4, CharacterTokens(c3)),
(4, Token::TagToken(Tag{name: n4, kind: EndTag, ..})),
(5, CharacterTokens(c4)),
] if
n1 == "a" && c1 == "\n" &&
n2 == "b" && c2 == "\n" &&
n3 == "b" && c3 == "\n" &&
n4 == "a" && c4 == "\n"
);
}
#[test]
#[cfg(not(feature = "named-entities"))]
fn named_entities() {
let opts = opts();
let vector = vec![String::from("&\r\n"), String::from("&aamp;\r\n")];
let expected = vec![(3, Token::CharacterTokens("&\n&aamp;\n".into()))];
let results = tokenize(vector, opts);
assert_eq!(results, expected);
}
#[test]
#[cfg(feature = "named-entities")]
fn named_entities() {
use crate::error::{CharRefError, Error};
let opts = opts();
let vector = vec![String::from("&\r\n"), String::from("&aamp;\r\n")];
let results = tokenize(vector, opts);
assert_matches!(
&results[..],
[
(3, CharacterTokens(c1)),
(
3,
ParseError{error: Error::CharRef(CharRefError::InvalidNamedCharRef), ..},
),
(4, CharacterTokens(c2)),
] if c1 == "&\n" && c2 == "&aamp;\n"
);
}
}