#![deny(missing_docs)]
// This is an HTML parser. HTML can be untrusted input from the internet.
#![forbid(unsafe_code)]
#![doc = include_str!("../README.md")]
mod emitter;
mod entities;
mod error;
mod machine;
mod reader;
#[cfg(feature = "integration-tests")]
pub use machine::State;
#[cfg(not(feature = "integration-tests"))]
use machine::State;
use machine::{
ascii_digit_pat, control_pat, noncharacter_pat, surrogate_pat, whitespace_pat, ControlToken,
};
pub use emitter::{DefaultEmitter, Doctype, Emitter, EndTag, StartTag, Token};
pub use reader::{Readable, Reader, StringReader};
pub use error::Error;
// Encodes one `char` as UTF-8 into a throw-away 4-byte array and yields the result as `&str`.
//
// The array is a temporary of the expression the macro is expanded into, so the string slice
// must be consumed within that same statement (e.g. passed straight to a function call).
macro_rules! ctostr {
    ($ch:expr) => {
        // `encode_utf8` hands back `&mut str`; the reborrow turns it into a shared `&str`.
        &*($ch).encode_utf8(&mut [0_u8; 4])
    };
}
/// A HTML tokenizer. See crate-level docs for basic usage.
pub struct Tokenizer {
eof: bool,
state: State,
emitter: E,
temporary_buffer: String,
reader: R,
to_reconsume: Option>,
character_reference_code: u32,
return_state: Option,
}
impl Tokenizer {
/// Create a new tokenizer from some input.
///
/// `input` can be `&String` or `&str` at the moment, as those are the types for which
/// [`crate::Readable`] is implemented, but you can implement that trait on your own types.
///
/// Patches are welcome for providing an efficient implementation over async streams,
/// iterators, files, etc, as long as any dependencies come behind featureflags.
pub fn new<'a, S: Readable<'a, Reader = R>>(input: S) -> Self {
Tokenizer::::new_with_emitter(input, DefaultEmitter::default())
}
}
impl Tokenizer {
/// Construct a new tokenizer from some input and a custom emitter.
///
/// Use this method over [`Tokenizer::new`] when you want to have more control over string allocation for
/// tokens.
pub fn new_with_emitter<'a, S: Readable<'a, Reader = R>>(input: S, emitter: E) -> Self {
Tokenizer {
eof: false,
state: State::Data,
emitter,
temporary_buffer: String::new(),
to_reconsume: None,
reader: input.to_reader(),
character_reference_code: 0,
return_state: None,
}
}
#[cfg(feature = "integration-tests")]
/// Test-internal function to override internal state.
///
/// Overwrites the current state of the state machine with `state`. Nothing else (buffers,
/// pushed-back character, emitter) is touched.
///
/// Only available with the `integration-tests` feature which is not public API.
pub fn set_state(&mut self, state: State) {
self.state = state;
}
/// Set the state machine to start/continue in [plaintext
/// state](https://html.spec.whatwg.org/#plaintext-state).
///
/// In that state every character is emitted as text (NUL is reported as an error and replaced
/// with U+FFFD) and the state machine does not switch to any other state.
///
/// This tokenizer never gets into that state naturally.
pub fn set_plaintext_state(&mut self) {
self.state = State::PlainText;
}
#[cfg(feature = "integration-tests")]
/// Test-internal function to override internal state.
///
/// Forwards `last_start_tag` unchanged to the emitter's `set_last_start_tag`.
// NOTE(review): presumably this is what `current_is_appropriate_end_tag_token` compares end
// tags against -- confirm in the emitter implementation.
///
/// Only available with the `integration-tests` feature which is not public API.
pub fn set_last_start_tag(&mut self, last_start_tag: Option<&str>) {
self.emitter.set_last_start_tag(last_start_tag);
}
#[inline]
fn unread_char(&mut self, c: Option) {
self.to_reconsume = Some(c);
}
#[inline]
fn validate_char(&mut self, c: char) {
match c as u32 {
surrogate_pat!() => {
self.emitter.emit_error(Error::SurrogateInInputStream);
}
noncharacter_pat!() => {
self.emitter.emit_error(Error::NoncharacterInInputStream);
}
// control without whitespace or nul
x @ control_pat!()
if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) =>
{
self.emitter
.emit_error(Error::ControlCharacterInInputStream);
}
_ => (),
}
}
fn read_char(&mut self) -> Option {
if let Some(c) = self.to_reconsume.take() {
return c;
}
let c = self.reader.read_char()?;
self.validate_char(c);
Some(c)
}
/// Ask the reader whether the upcoming input is `s` and return the reader's answer.
///
/// `case_sensitive` is forwarded to the reader unchanged.
// NOTE(review): presumably the reader consumes `s` on a match and leaves the input untouched
// otherwise -- confirm against `Reader::try_read_string`.
#[inline]
fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> bool {
debug_assert!(!s.is_empty());
// A pushed-back character is held by the tokenizer, not the reader, so the reader would not
// see it; callers must not have one pending.
debug_assert!(self.to_reconsume.is_none());
self.reader.try_read_string(s, case_sensitive)
}
/// Whether the recorded return state is one of the three attribute-value states
/// (double-quoted, single-quoted or unquoted).
///
/// Returns `false` when no return state is recorded.
fn is_consumed_as_part_of_an_attribute(&self) -> bool {
    if let Some(return_state) = self.return_state {
        matches!(
            return_state,
            State::AttributeValueDoubleQuoted
                | State::AttributeValueSingleQuoted
                | State::AttributeValueUnquoted
        )
    } else {
        false
    }
}
/// Hand the contents of the temporary buffer to the emitter and empty the buffer.
///
/// When the character reference is being consumed as part of an attribute the buffer is
/// appended to the current attribute value; otherwise it is emitted as character data.
fn flush_code_points_consumed_as_character_reference(&mut self) {
    if !self.is_consumed_as_part_of_an_attribute() {
        self.flush_buffer_characters();
        return;
    }

    self.emitter.push_attribute_value(&self.temporary_buffer);
    self.temporary_buffer.clear();
}
fn next_input_character(&mut self) -> Option {
let rv = self.read_char();
self.unread_char(rv);
rv
}
/// Emit the contents of the temporary buffer as character data, then empty the buffer.
fn flush_buffer_characters(&mut self) {
self.emitter.emit_string(&self.temporary_buffer);
// Clear only after emitting; the buffer's allocation is kept for reuse.
self.temporary_buffer.clear();
}
fn consume(&mut self) -> ControlToken {
// Folds one more digit into `character_reference_code`:
// `code = code * $mul + ($x as u32 - $sub)`, using checked arithmetic for the multiplication
// and the addition.
macro_rules! mutate_character_reference {
    (* $mul:literal + $x:ident - $sub:literal) => {
        self.character_reference_code = self
            .character_reference_code
            .checked_mul($mul)
            .and_then(|cr| cr.checked_add($x as u32 - $sub))
            // On overflow store 0x110000, a value past the last valid code point, to
            // provoke an error later on.
            .unwrap_or(0x110000);
    };
}
match self.state {
State::Data => match self.read_char() {
Some('&') => {
self.return_state = Some(self.state);
self.state = State::CharacterReference;
ControlToken::Continue
}
Some('<') => {
self.state = State::TagOpen;
ControlToken::Continue
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.emitter.emit_string("\0");
ControlToken::Continue
}
Some(x) => {
self.emitter.emit_string(ctostr!(x));
ControlToken::Continue
}
None => ControlToken::Eof,
},
State::RcData => match self.read_char() {
Some('&') => {
self.return_state = Some(State::RcData);
self.state = State::CharacterReference;
ControlToken::Continue
}
Some('<') => {
self.state = State::RcDataLessThanSign;
ControlToken::Continue
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.emitter.emit_string("\u{fffd}");
ControlToken::Continue
}
Some(x) => {
self.emitter.emit_string(ctostr!(x));
ControlToken::Continue
}
None => ControlToken::Eof,
},
State::RawText => match self.read_char() {
Some('<') => {
self.state = State::RawTextLessThanSign;
ControlToken::Continue
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.emitter.emit_string("\u{fffd}");
ControlToken::Continue
}
Some(x) => {
self.emitter.emit_string(ctostr!(x));
ControlToken::Continue
}
None => ControlToken::Eof,
},
State::ScriptData => match self.read_char() {
Some('<') => {
self.state = State::ScriptDataLessThanSign;
ControlToken::Continue
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.emitter.emit_string("\u{fffd}");
ControlToken::Continue
}
Some(x) => {
self.emitter.emit_string(ctostr!(x));
ControlToken::Continue
}
None => ControlToken::Eof,
},
State::PlainText => match self.read_char() {
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.emitter.emit_string("\u{fffd}");
ControlToken::Continue
}
Some(x) => {
self.emitter.emit_string(ctostr!(x));
ControlToken::Continue
}
None => ControlToken::Eof,
},
State::TagOpen => match self.read_char() {
Some('!') => {
self.state = State::MarkupDeclarationOpen;
ControlToken::Continue
}
Some('/') => {
self.state = State::EndTagOpen;
ControlToken::Continue
}
Some(x) if x.is_ascii_alphabetic() => {
self.emitter.init_start_tag();
self.state = State::TagName;
self.unread_char(Some(x));
ControlToken::Continue
}
c @ Some('?') => {
self.emitter
.emit_error(Error::UnexpectedQuestionMarkInsteadOfTagName);
self.emitter.init_comment();
self.state = State::BogusComment;
self.unread_char(c);
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofBeforeTagName);
self.emitter.emit_string("<");
ControlToken::Eof
}
c @ Some(_) => {
self.emitter
.emit_error(Error::InvalidFirstCharacterOfTagName);
self.state = State::Data;
self.emitter.emit_string("<");
self.unread_char(c);
ControlToken::Continue
}
},
State::EndTagOpen => match self.read_char() {
Some(x) if x.is_ascii_alphabetic() => {
self.emitter.init_end_tag();
self.state = State::TagName;
self.unread_char(Some(x));
ControlToken::Continue
}
Some('>') => {
self.emitter.emit_error(Error::MissingEndTagName);
self.state = State::Data;
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofBeforeTagName);
self.emitter.emit_string("");
ControlToken::Eof
}
Some(x) => {
self.emitter
.emit_error(Error::InvalidFirstCharacterOfTagName);
self.emitter.init_comment();
self.state = State::BogusComment;
self.unread_char(Some(x));
ControlToken::Continue
}
},
State::TagName => match self.read_char() {
Some(whitespace_pat!()) => {
self.state = State::BeforeAttributeName;
ControlToken::Continue
}
Some('/') => {
self.state = State::SelfClosingStartTag;
ControlToken::Continue
}
Some('>') => {
self.state = State::Data;
self.emitter.emit_current_tag();
ControlToken::Continue
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.emitter.push_tag_name("\u{fffd}");
ControlToken::Continue
}
Some(x) => {
self.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase()));
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInTag);
ControlToken::Eof
}
},
State::RcDataLessThanSign => match self.read_char() {
Some('/') => {
self.temporary_buffer.clear();
self.state = State::RcDataEndTagOpen;
ControlToken::Continue
}
c => {
self.emitter.emit_string("<");
self.state = State::RcData;
self.unread_char(c);
ControlToken::Continue
}
},
State::RcDataEndTagOpen => match self.read_char() {
Some(x) if x.is_ascii_alphabetic() => {
self.emitter.init_end_tag();
self.state = State::RcDataEndTagName;
self.unread_char(Some(x));
ControlToken::Continue
}
c => {
self.emitter.emit_string("");
self.state = State::RcData;
self.unread_char(c);
ControlToken::Continue
}
},
State::RcDataEndTagName => match self.read_char() {
Some(whitespace_pat!()) if self.emitter.current_is_appropriate_end_tag_token() => {
self.state = State::BeforeAttributeName;
ControlToken::Continue
}
Some('/') if self.emitter.current_is_appropriate_end_tag_token() => {
self.state = State::SelfClosingStartTag;
ControlToken::Continue
}
Some('>') if self.emitter.current_is_appropriate_end_tag_token() => {
self.state = State::Data;
self.emitter.emit_current_tag();
ControlToken::Continue
}
Some(x) if x.is_ascii_alphabetic() => {
self.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase()));
self.temporary_buffer.push(x);
ControlToken::Continue
}
c => {
self.emitter.emit_string("");
self.flush_buffer_characters();
self.state = State::RcData;
self.unread_char(c);
ControlToken::Continue
}
},
State::RawTextLessThanSign => match self.read_char() {
Some('/') => {
self.temporary_buffer.clear();
self.state = State::RawTextEndTagOpen;
ControlToken::Continue
}
c => {
self.emitter.emit_string("<");
self.state = State::RawText;
self.unread_char(c);
ControlToken::Continue
}
},
State::RawTextEndTagOpen => match self.read_char() {
Some(x) if x.is_ascii_alphabetic() => {
self.emitter.init_end_tag();
self.state = State::RawTextEndTagName;
self.unread_char(Some(x));
ControlToken::Continue
}
c => {
self.emitter.emit_string("");
self.state = State::RawText;
self.unread_char(c);
ControlToken::Continue
}
},
State::RawTextEndTagName => match self.read_char() {
Some(whitespace_pat!()) if self.emitter.current_is_appropriate_end_tag_token() => {
self.state = State::BeforeAttributeName;
ControlToken::Continue
}
Some('/') if self.emitter.current_is_appropriate_end_tag_token() => {
self.state = State::SelfClosingStartTag;
ControlToken::Continue
}
Some('>') if self.emitter.current_is_appropriate_end_tag_token() => {
self.state = State::Data;
self.emitter.emit_current_tag();
ControlToken::Continue
}
Some(x) if x.is_ascii_alphabetic() => {
self.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase()));
self.temporary_buffer.push(x);
ControlToken::Continue
}
c => {
self.emitter.emit_string("");
self.flush_buffer_characters();
self.state = State::RawText;
self.unread_char(c);
ControlToken::Continue
}
},
State::ScriptDataLessThanSign => match self.read_char() {
Some('/') => {
self.temporary_buffer.clear();
self.state = State::ScriptDataEndTagOpen;
ControlToken::Continue
}
Some('!') => {
self.state = State::ScriptDataEscapeStart;
self.emitter.emit_string(" {
self.emitter.emit_string("<");
self.state = State::Data;
self.unread_char(c);
ControlToken::Continue
}
},
State::ScriptDataEndTagOpen => match self.read_char() {
Some(x) if x.is_ascii_alphabetic() => {
self.emitter.init_end_tag();
self.state = State::ScriptDataEndTagName;
self.unread_char(Some(x));
ControlToken::Continue
}
c => {
self.emitter.emit_string("");
self.state = State::ScriptData;
self.unread_char(c);
ControlToken::Continue
}
},
State::ScriptDataEndTagName => match self.read_char() {
Some(whitespace_pat!()) if self.emitter.current_is_appropriate_end_tag_token() => {
self.state = State::BeforeAttributeName;
ControlToken::Continue
}
Some('/') if self.emitter.current_is_appropriate_end_tag_token() => {
self.state = State::SelfClosingStartTag;
ControlToken::Continue
}
Some('>') if self.emitter.current_is_appropriate_end_tag_token() => {
self.state = State::Data;
self.emitter.emit_current_tag();
ControlToken::Continue
}
Some(x) if x.is_ascii_alphabetic() => {
self.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase()));
self.temporary_buffer.push(x.to_ascii_lowercase());
ControlToken::Continue
}
c => {
self.emitter.emit_string("");
self.flush_buffer_characters();
self.state = State::Data;
self.unread_char(c);
ControlToken::Continue
}
},
State::ScriptDataEscapeStart => match self.read_char() {
Some('-') => {
self.state = State::ScriptDataEscapeStartDash;
self.emitter.emit_string("-");
ControlToken::Continue
}
c => {
self.state = State::ScriptData;
self.unread_char(c);
ControlToken::Continue
}
},
State::ScriptDataEscapeStartDash => match self.read_char() {
Some('-') => {
self.state = State::ScriptDataEscapedDashDash;
self.emitter.emit_string("-");
ControlToken::Continue
}
c => {
self.state = State::ScriptData;
self.unread_char(c);
ControlToken::Continue
}
},
State::ScriptDataEscaped => match self.read_char() {
Some('-') => {
self.state = State::ScriptDataEscapedDash;
self.emitter.emit_string("-");
ControlToken::Continue
}
Some('<') => {
self.state = State::ScriptDataEscapedLessThanSign;
ControlToken::Continue
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.emitter.emit_string("\u{fffd}");
ControlToken::Continue
}
None => {
self.emitter
.emit_error(Error::EofInScriptHtmlCommentLikeText);
ControlToken::Eof
}
Some(x) => {
self.emitter.emit_string(ctostr!(x));
ControlToken::Continue
}
},
State::ScriptDataEscapedDash => match self.read_char() {
Some('-') => {
self.state = State::ScriptDataEscapedDashDash;
self.emitter.emit_string("-");
ControlToken::Continue
}
Some('<') => {
self.state = State::ScriptDataEscapedLessThanSign;
ControlToken::Continue
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.state = State::ScriptDataEscaped;
self.emitter.emit_string("\u{fffd}");
ControlToken::Continue
}
None => {
self.emitter
.emit_error(Error::EofInScriptHtmlCommentLikeText);
ControlToken::Eof
}
Some(x) => {
self.state = State::ScriptDataEscaped;
self.emitter.emit_string(ctostr!(x));
ControlToken::Continue
}
},
State::ScriptDataEscapedDashDash => match self.read_char() {
Some('-') => {
self.emitter.emit_string("-");
ControlToken::Continue
}
Some('<') => {
self.state = State::ScriptDataEscapedLessThanSign;
ControlToken::Continue
}
Some('>') => {
self.state = State::ScriptData;
self.emitter.emit_string(">");
ControlToken::Continue
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.state = State::ScriptDataEscaped;
self.emitter.emit_string("\u{fffd}");
ControlToken::Continue
}
None => {
self.emitter
.emit_error(Error::EofInScriptHtmlCommentLikeText);
ControlToken::Eof
}
Some(x) => {
self.state = State::ScriptDataEscaped;
self.emitter.emit_string(ctostr!(x));
ControlToken::Continue
}
},
State::ScriptDataEscapedLessThanSign => match self.read_char() {
Some('/') => {
self.temporary_buffer.clear();
self.state = State::ScriptDataEscapedEndTagOpen;
ControlToken::Continue
}
Some(x) if x.is_ascii_alphabetic() => {
self.temporary_buffer.clear();
self.emitter.emit_string("<");
self.state = State::ScriptDataDoubleEscapeStart;
self.unread_char(Some(x));
ControlToken::Continue
}
c => {
self.emitter.emit_string("<");
self.state = State::ScriptDataEscaped;
self.unread_char(c);
ControlToken::Continue
}
},
State::ScriptDataEscapedEndTagOpen => match self.read_char() {
Some(x) if x.is_ascii_alphabetic() => {
self.emitter.init_end_tag();
self.state = State::ScriptDataEscapedEndTagName;
self.unread_char(Some(x));
ControlToken::Continue
}
c => {
self.emitter.emit_string("");
self.unread_char(c);
self.state = State::ScriptDataEscaped;
ControlToken::Continue
}
},
State::ScriptDataEscapedEndTagName => match self.read_char() {
Some(whitespace_pat!()) if self.emitter.current_is_appropriate_end_tag_token() => {
self.state = State::BeforeAttributeName;
ControlToken::Continue
}
Some('/') if self.emitter.current_is_appropriate_end_tag_token() => {
self.state = State::SelfClosingStartTag;
ControlToken::Continue
}
Some('>') if self.emitter.current_is_appropriate_end_tag_token() => {
self.state = State::Data;
self.emitter.emit_current_tag();
ControlToken::Continue
}
Some(x) if x.is_ascii_alphabetic() => {
self.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase()));
self.temporary_buffer.push(x);
ControlToken::Continue
}
c => {
self.emitter.emit_string("");
self.flush_buffer_characters();
self.state = State::ScriptDataEscaped;
self.unread_char(c);
ControlToken::Continue
}
},
State::ScriptDataDoubleEscapeStart => match self.read_char() {
Some(x @ whitespace_pat!() | x @ '/' | x @ '>') => {
if self.temporary_buffer == "script" {
self.state = State::ScriptDataDoubleEscaped;
} else {
self.state = State::ScriptDataEscaped;
}
self.emitter.emit_string(ctostr!(x));
ControlToken::Continue
}
Some(x) if x.is_ascii_alphabetic() => {
self.temporary_buffer.push(x.to_ascii_lowercase());
self.emitter.emit_string(ctostr!(x));
ControlToken::Continue
}
c => {
self.state = State::ScriptDataEscaped;
self.unread_char(c);
ControlToken::Continue
}
},
State::ScriptDataDoubleEscaped => match self.read_char() {
Some('-') => {
self.state = State::ScriptDataDoubleEscapedDash;
self.emitter.emit_string("-");
ControlToken::Continue
}
Some('<') => {
self.state = State::ScriptDataDoubleEscapedLessThanSign;
self.emitter.emit_string("<");
ControlToken::Continue
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.emitter.emit_string("\u{fffd}");
ControlToken::Continue
}
None => {
self.emitter
.emit_error(Error::EofInScriptHtmlCommentLikeText);
ControlToken::Eof
}
Some(x) => {
self.emitter.emit_string(ctostr!(x));
ControlToken::Continue
}
},
State::ScriptDataDoubleEscapedDash => match self.read_char() {
Some('-') => {
self.state = State::ScriptDataDoubleEscapedDashDash;
self.emitter.emit_string("-");
ControlToken::Continue
}
Some('<') => {
self.state = State::ScriptDataDoubleEscapedLessThanSign;
self.emitter.emit_string("<");
ControlToken::Continue
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.state = State::ScriptDataDoubleEscaped;
self.emitter.emit_string("\u{fffd}");
ControlToken::Continue
}
None => {
self.emitter
.emit_error(Error::EofInScriptHtmlCommentLikeText);
ControlToken::Eof
}
Some(x) => {
self.state = State::ScriptDataDoubleEscaped;
self.emitter.emit_string(ctostr!(x));
ControlToken::Continue
}
},
State::ScriptDataDoubleEscapedDashDash => match self.read_char() {
Some('-') => {
self.emitter.emit_string("-");
ControlToken::Continue
}
Some('<') => {
self.emitter.emit_string("<");
self.state = State::ScriptDataDoubleEscapedLessThanSign;
ControlToken::Continue
}
Some('>') => {
self.emitter.emit_string(">");
self.state = State::ScriptData;
ControlToken::Continue
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.state = State::ScriptDataDoubleEscaped;
self.emitter.emit_string("\u{fffd}");
ControlToken::Continue
}
None => {
self.emitter
.emit_error(Error::EofInScriptHtmlCommentLikeText);
ControlToken::Eof
}
Some(x) => {
self.state = State::ScriptDataDoubleEscaped;
self.emitter.emit_string(ctostr!(x));
ControlToken::Continue
}
},
State::ScriptDataDoubleEscapedLessThanSign => match self.read_char() {
Some('/') => {
self.temporary_buffer.clear();
self.state = State::ScriptDataDoubleEscapeEnd;
self.emitter.emit_string("/");
ControlToken::Continue
}
c => {
self.state = State::ScriptDataDoubleEscaped;
self.unread_char(c);
ControlToken::Continue
}
},
State::ScriptDataDoubleEscapeEnd => match self.read_char() {
Some(x @ whitespace_pat!() | x @ '/' | x @ '>') => {
if self.temporary_buffer == "script" {
self.state = State::ScriptDataEscaped;
} else {
self.state = State::ScriptDataDoubleEscaped;
}
self.emitter.emit_string(ctostr!(x));
ControlToken::Continue
}
Some(x) if x.is_ascii_alphabetic() => {
self.temporary_buffer.push(x.to_ascii_lowercase());
self.emitter.emit_string(ctostr!(x));
ControlToken::Continue
}
c => {
self.state = State::ScriptDataDoubleEscaped;
self.unread_char(c);
ControlToken::Continue
}
},
State::BeforeAttributeName => match self.read_char() {
Some(whitespace_pat!()) => ControlToken::Continue,
c @ Some('/' | '>') | c @ None => {
self.state = State::AfterAttributeName;
self.unread_char(c);
ControlToken::Continue
}
Some('=') => {
self.emitter
.emit_error(Error::UnexpectedEqualsSignBeforeAttributeName);
self.emitter.init_attribute();
self.emitter.push_attribute_name("=");
self.state = State::AttributeName;
ControlToken::Continue
}
Some(x) => {
self.emitter.init_attribute();
self.state = State::AttributeName;
self.unread_char(Some(x));
ControlToken::Continue
}
},
State::AttributeName => match self.read_char() {
c @ Some(whitespace_pat!() | '/' | '>') | c @ None => {
self.state = State::AfterAttributeName;
self.unread_char(c);
ControlToken::Continue
}
Some('=') => {
self.state = State::BeforeAttributeValue;
ControlToken::Continue
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.emitter.push_attribute_name("\u{fffd}");
ControlToken::Continue
}
Some(x @ '"' | x @ '\'' | x @ '<') => {
self.emitter
.emit_error(Error::UnexpectedCharacterInAttributeName);
self.emitter
.push_attribute_name(ctostr!(x.to_ascii_lowercase()));
ControlToken::Continue
}
Some(x) => {
self.emitter
.push_attribute_name(ctostr!(x.to_ascii_lowercase()));
ControlToken::Continue
}
},
State::AfterAttributeName => match self.read_char() {
Some(whitespace_pat!()) => ControlToken::Continue,
Some('/') => {
self.state = State::SelfClosingStartTag;
ControlToken::Continue
}
Some('=') => {
self.state = State::BeforeAttributeValue;
ControlToken::Continue
}
Some('>') => {
self.state = State::Data;
self.emitter.emit_current_tag();
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInTag);
ControlToken::Eof
}
Some(x) => {
self.emitter.init_attribute();
self.state = State::AttributeName;
self.unread_char(Some(x));
ControlToken::Continue
}
},
State::BeforeAttributeValue => match self.read_char() {
Some(whitespace_pat!()) => ControlToken::Continue,
Some('"') => {
self.state = State::AttributeValueDoubleQuoted;
ControlToken::Continue
}
Some('\'') => {
self.state = State::AttributeValueSingleQuoted;
ControlToken::Continue
}
Some('>') => {
self.emitter.emit_error(Error::MissingAttributeValue);
self.state = State::Data;
self.emitter.emit_current_tag();
ControlToken::Continue
}
c => {
self.state = State::AttributeValueUnquoted;
self.unread_char(c);
ControlToken::Continue
}
},
State::AttributeValueDoubleQuoted => match self.read_char() {
Some('"') => {
self.state = State::AfterAttributeValueQuoted;
ControlToken::Continue
}
Some('&') => {
self.return_state = Some(State::AttributeValueDoubleQuoted);
self.state = State::CharacterReference;
ControlToken::Continue
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.emitter.push_attribute_value("\u{fffd}");
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInTag);
ControlToken::Eof
}
Some(x) => {
self.emitter.push_attribute_value(ctostr!(x));
ControlToken::Continue
}
},
State::AttributeValueSingleQuoted => match self.read_char() {
Some('\'') => {
self.state = State::AfterAttributeValueQuoted;
ControlToken::Continue
}
Some('&') => {
self.return_state = Some(State::AttributeValueSingleQuoted);
self.state = State::CharacterReference;
ControlToken::Continue
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.emitter.push_attribute_value("\u{fffd}");
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInTag);
ControlToken::Eof
}
Some(x) => {
self.emitter.push_attribute_value(ctostr!(x));
ControlToken::Continue
}
},
State::AttributeValueUnquoted => match self.read_char() {
Some(whitespace_pat!()) => {
self.state = State::BeforeAttributeName;
ControlToken::Continue
}
Some('&') => {
self.return_state = Some(State::AttributeValueUnquoted);
self.state = State::CharacterReference;
ControlToken::Continue
}
Some('>') => {
self.state = State::Data;
self.emitter.emit_current_tag();
ControlToken::Continue
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.emitter.push_attribute_value("\u{fffd}");
ControlToken::Continue
}
Some(x @ '"' | x @ '\'' | x @ '<' | x @ '=' | x @ '\u{60}') => {
self.emitter
.emit_error(Error::UnexpectedCharacterInUnquotedAttributeValue);
self.emitter.push_attribute_value(ctostr!(x));
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInTag);
ControlToken::Eof
}
Some(x) => {
self.emitter.push_attribute_value(ctostr!(x));
ControlToken::Continue
}
},
State::AfterAttributeValueQuoted => match self.read_char() {
Some(whitespace_pat!()) => {
self.state = State::BeforeAttributeName;
ControlToken::Continue
}
Some('/') => {
self.state = State::SelfClosingStartTag;
ControlToken::Continue
}
Some('>') => {
self.state = State::Data;
self.emitter.emit_current_tag();
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInTag);
ControlToken::Eof
}
Some(x) => {
self.emitter
.emit_error(Error::MissingWhitespaceBetweenAttributes);
self.state = State::BeforeAttributeName;
self.unread_char(Some(x));
ControlToken::Continue
}
},
State::SelfClosingStartTag => match self.read_char() {
Some('>') => {
self.emitter.set_self_closing();
self.state = State::Data;
self.emitter.emit_current_tag();
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInTag);
ControlToken::Eof
}
Some(x) => {
self.emitter.emit_error(Error::UnexpectedSolidusInTag);
self.state = State::BeforeAttributeName;
self.unread_char(Some(x));
ControlToken::Continue
}
},
State::BogusComment => match self.read_char() {
Some('>') => {
self.state = State::Data;
self.emitter.emit_current_comment();
ControlToken::Continue
}
None => {
self.emitter.emit_current_comment();
ControlToken::Eof
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.emitter.push_comment("\u{fffd}");
ControlToken::Continue
}
Some(x) => {
self.emitter.push_comment(ctostr!(x));
ControlToken::Continue
}
},
State::MarkupDeclarationOpen => match self.read_char() {
Some('-') if self.try_read_string("-", true) => {
self.emitter.init_comment();
self.state = State::CommentStart;
ControlToken::Continue
}
Some('d' | 'D') if self.try_read_string("octype", false) => {
self.state = State::Doctype;
ControlToken::Continue
}
Some('[') if self.try_read_string("CDATA[", true) => {
// missing: check for adjusted current element: we don't have an element stack
// at all
//
// missing: cdata transition
//
// let's hope that bogus comment can just sort of skip over cdata
self.emitter.emit_error(Error::CdataInHtmlContent);
self.emitter.init_comment();
self.emitter.push_comment("[CDATA[");
self.state = State::BogusComment;
ControlToken::Continue
}
c => {
self.emitter.emit_error(Error::IncorrectlyOpenedComment);
self.emitter.init_comment();
self.state = State::BogusComment;
self.unread_char(c);
ControlToken::Continue
}
},
State::CommentStart => match self.read_char() {
Some('-') => {
self.state = State::CommentStartDash;
ControlToken::Continue
}
Some('>') => {
self.emitter.emit_error(Error::AbruptClosingOfEmptyComment);
self.state = State::Data;
self.emitter.emit_current_comment();
ControlToken::Continue
}
c => {
self.unread_char(c);
self.state = State::Comment;
ControlToken::Continue
}
},
State::CommentStartDash => match self.read_char() {
Some('-') => {
self.state = State::CommentEnd;
ControlToken::Continue
}
Some('>') => {
self.emitter.emit_error(Error::AbruptClosingOfEmptyComment);
self.state = State::Data;
self.emitter.emit_current_comment();
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInComment);
self.emitter.emit_current_comment();
ControlToken::Eof
}
c @ Some(_) => {
self.emitter.push_comment("-");
self.unread_char(c);
self.state = State::Comment;
ControlToken::Continue
}
},
State::Comment => match self.read_char() {
Some('<') => {
self.emitter.push_comment("<");
self.state = State::CommentLessThanSign;
ControlToken::Continue
}
Some('-') => {
self.state = State::CommentEndDash;
ControlToken::Continue
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.emitter.push_comment("\u{fffd}");
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInComment);
self.emitter.emit_current_comment();
ControlToken::Eof
}
Some(x) => {
self.emitter.push_comment(ctostr!(x));
ControlToken::Continue
}
},
State::CommentLessThanSign => match self.read_char() {
Some('!') => {
self.emitter.push_comment("!");
self.state = State::CommentLessThanSignBang;
ControlToken::Continue
}
Some('<') => {
self.emitter.push_comment("<");
ControlToken::Continue
}
c => {
self.unread_char(c);
self.state = State::Comment;
ControlToken::Continue
}
},
State::CommentLessThanSignBang => match self.read_char() {
Some('-') => {
self.state = State::CommentLessThanSignBangDash;
ControlToken::Continue
}
c => {
self.unread_char(c);
self.state = State::Comment;
ControlToken::Continue
}
},
State::CommentLessThanSignBangDash => match self.read_char() {
Some('-') => {
self.state = State::CommentLessThanSignBangDashDash;
ControlToken::Continue
}
c => {
self.unread_char(c);
self.state = State::CommentEndDash;
ControlToken::Continue
}
},
State::CommentLessThanSignBangDashDash => match self.read_char() {
c @ Some('>') | c @ None => {
self.unread_char(c);
self.state = State::CommentEnd;
ControlToken::Continue
}
c => {
self.emitter.emit_error(Error::NestedComment);
self.unread_char(c);
self.state = State::CommentEnd;
ControlToken::Continue
}
},
State::CommentEndDash => match self.read_char() {
Some('-') => {
self.state = State::CommentEnd;
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInComment);
self.emitter.emit_current_comment();
ControlToken::Eof
}
c => {
self.emitter.push_comment("-");
self.unread_char(c);
self.state = State::Comment;
ControlToken::Continue
}
},
State::CommentEnd => match self.read_char() {
Some('>') => {
self.state = State::Data;
self.emitter.emit_current_comment();
ControlToken::Continue
}
Some('!') => {
self.state = State::CommentEndBang;
ControlToken::Continue
}
Some('-') => {
self.emitter.push_comment("-");
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInComment);
self.emitter.emit_current_comment();
ControlToken::Eof
}
c @ Some(_) => {
self.emitter.push_comment("-");
self.emitter.push_comment("-");
self.unread_char(c);
self.state = State::Comment;
ControlToken::Continue
}
},
State::CommentEndBang => match self.read_char() {
Some('-') => {
self.emitter.push_comment("-");
self.emitter.push_comment("-");
self.emitter.push_comment("!");
self.state = State::CommentEndDash;
ControlToken::Continue
}
Some('>') => {
self.emitter.emit_error(Error::IncorrectlyClosedComment);
self.state = State::Data;
self.emitter.emit_current_comment();
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInComment);
self.emitter.emit_current_comment();
ControlToken::Eof
}
c @ Some(_) => {
self.emitter.push_comment("-");
self.emitter.push_comment("-");
self.emitter.push_comment("!");
self.state = State::Comment;
self.unread_char(c);
ControlToken::Continue
}
},
State::Doctype => match self.read_char() {
Some(whitespace_pat!()) => {
self.state = State::BeforeDoctypeName;
ControlToken::Continue
}
c @ Some('>') => {
self.unread_char(c);
self.state = State::BeforeDoctypeName;
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInDoctype);
self.emitter.init_doctype();
self.emitter.set_force_quirks();
self.emitter.emit_current_doctype();
ControlToken::Eof
}
c @ Some(_) => {
self.emitter
.emit_error(Error::MissingWhitespaceBeforeDoctypeName);
self.unread_char(c);
self.state = State::BeforeDoctypeName;
ControlToken::Continue
}
},
State::BeforeDoctypeName => match self.read_char() {
Some(whitespace_pat!()) => ControlToken::Continue,
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.emitter.init_doctype();
self.emitter.push_doctype_name("\u{fffd}");
self.state = State::DoctypeName;
ControlToken::Continue
}
Some('>') => {
self.emitter.emit_error(Error::MissingDoctypeName);
self.emitter.init_doctype();
self.emitter.set_force_quirks();
self.state = State::Data;
self.emitter.emit_current_doctype();
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInDoctype);
self.emitter.init_doctype();
self.emitter.set_force_quirks();
self.emitter.emit_current_doctype();
ControlToken::Eof
}
Some(x) => {
self.emitter.init_doctype();
self.emitter
.push_doctype_name(ctostr!(x.to_ascii_lowercase()));
self.state = State::DoctypeName;
ControlToken::Continue
}
},
State::DoctypeName => match self.read_char() {
Some(whitespace_pat!()) => {
self.state = State::AfterDoctypeName;
ControlToken::Continue
}
Some('>') => {
self.state = State::Data;
self.emitter.emit_current_doctype();
ControlToken::Continue
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.emitter.push_doctype_name("\u{fffd}");
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInDoctype);
self.emitter.set_force_quirks();
self.emitter.emit_current_doctype();
ControlToken::Eof
}
Some(x) => {
self.emitter
.push_doctype_name(ctostr!(x.to_ascii_lowercase()));
ControlToken::Continue
}
},
State::AfterDoctypeName => match self.read_char() {
Some(whitespace_pat!()) => ControlToken::Continue,
Some('>') => {
self.state = State::Data;
self.emitter.emit_current_doctype();
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInDoctype);
self.emitter.set_force_quirks();
self.emitter.emit_current_doctype();
ControlToken::Eof
}
Some('p' | 'P') if self.try_read_string("ublic", false) => {
self.state = State::AfterDoctypePublicKeyword;
ControlToken::Continue
}
Some('s' | 'S') if self.try_read_string("ystem", false) => {
self.state = State::AfterDoctypeSystemKeyword;
ControlToken::Continue
}
c @ Some(_) => {
self.emitter
.emit_error(Error::InvalidCharacterSequenceAfterDoctypeName);
self.emitter.set_force_quirks();
self.unread_char(c);
self.state = State::BogusDoctype;
ControlToken::Continue
}
},
State::AfterDoctypePublicKeyword => match self.read_char() {
Some(whitespace_pat!()) => {
self.state = State::BeforeDoctypePublicIdentifier;
ControlToken::Continue
}
Some('"') => {
self.emitter
.emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword);
self.emitter.set_doctype_public_identifier("");
self.state = State::DoctypePublicIdentifierDoubleQuoted;
ControlToken::Continue
}
Some('\'') => {
self.emitter
.emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword);
self.emitter.set_doctype_public_identifier("");
self.state = State::DoctypePublicIdentifierSingleQuoted;
ControlToken::Continue
}
Some('>') => {
self.emitter
.emit_error(Error::MissingDoctypePublicIdentifier);
self.emitter.set_force_quirks();
self.state = State::Data;
self.emitter.emit_current_doctype();
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInDoctype);
self.emitter.set_force_quirks();
self.emitter.emit_current_doctype();
ControlToken::Eof
}
c @ Some(_) => {
self.emitter
.emit_error(Error::MissingQuoteBeforeDoctypePublicIdentifier);
self.emitter.set_force_quirks();
self.unread_char(c);
self.state = State::BogusDoctype;
ControlToken::Continue
}
},
State::BeforeDoctypePublicIdentifier => match self.read_char() {
Some(whitespace_pat!()) => ControlToken::Continue,
Some('"') => {
self.emitter.set_doctype_public_identifier("");
self.state = State::DoctypePublicIdentifierDoubleQuoted;
ControlToken::Continue
}
Some('\'') => {
self.emitter.set_doctype_public_identifier("");
self.state = State::DoctypePublicIdentifierSingleQuoted;
ControlToken::Continue
}
Some('>') => {
self.emitter
.emit_error(Error::MissingDoctypePublicIdentifier);
self.emitter.set_force_quirks();
self.state = State::Data;
self.emitter.emit_current_doctype();
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInDoctype);
self.emitter.set_force_quirks();
self.emitter.emit_current_doctype();
ControlToken::Eof
}
c @ Some(_) => {
self.emitter
.emit_error(Error::MissingQuoteBeforeDoctypePublicIdentifier);
self.emitter.set_force_quirks();
self.unread_char(c);
self.state = State::BogusDoctype;
ControlToken::Continue
}
},
State::DoctypePublicIdentifierDoubleQuoted => match self.read_char() {
Some('"') => {
self.state = State::AfterDoctypePublicIdentifier;
ControlToken::Continue
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.emitter.push_doctype_public_identifier("\u{fffd}");
ControlToken::Continue
}
Some('>') => {
self.emitter
.emit_error(Error::AbruptDoctypePublicIdentifier);
self.emitter.set_force_quirks();
self.state = State::Data;
self.emitter.emit_current_doctype();
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInDoctype);
self.emitter.set_force_quirks();
self.emitter.emit_current_doctype();
ControlToken::Eof
}
Some(x) => {
self.emitter.push_doctype_public_identifier(ctostr!(x));
ControlToken::Continue
}
},
State::DoctypePublicIdentifierSingleQuoted => match self.read_char() {
Some('\'') => {
self.state = State::AfterDoctypePublicIdentifier;
ControlToken::Continue
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.emitter.push_doctype_public_identifier("\u{fffd}");
ControlToken::Continue
}
Some('>') => {
self.emitter
.emit_error(Error::AbruptDoctypePublicIdentifier);
self.emitter.set_force_quirks();
self.state = State::Data;
self.emitter.emit_current_doctype();
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInDoctype);
self.emitter.set_force_quirks();
self.emitter.emit_current_doctype();
ControlToken::Eof
}
Some(x) => {
self.emitter.push_doctype_public_identifier(ctostr!(x));
ControlToken::Continue
}
},
State::AfterDoctypePublicIdentifier => match self.read_char() {
Some(whitespace_pat!()) => {
self.state = State::BetweenDoctypePublicAndSystemIdentifiers;
ControlToken::Continue
}
Some('>') => {
self.state = State::Data;
self.emitter.emit_current_doctype();
ControlToken::Continue
}
Some('"') => {
self.emitter.emit_error(
Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers,
);
self.emitter.set_doctype_system_identifier("");
self.state = State::DoctypeSystemIdentifierDoubleQuoted;
ControlToken::Continue
}
Some('\'') => {
self.emitter.emit_error(
Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers,
);
self.emitter.set_doctype_system_identifier("");
self.state = State::DoctypeSystemIdentifierSingleQuoted;
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInDoctype);
self.emitter.set_force_quirks();
self.emitter.emit_current_doctype();
ControlToken::Eof
}
c @ Some(_) => {
self.emitter
.emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier);
self.emitter.set_force_quirks();
self.unread_char(c);
self.state = State::BogusDoctype;
ControlToken::Continue
}
},
State::BetweenDoctypePublicAndSystemIdentifiers => match self.read_char() {
Some(whitespace_pat!()) => ControlToken::Continue,
Some('>') => {
self.state = State::Data;
self.emitter.emit_current_doctype();
ControlToken::Continue
}
Some('"') => {
self.emitter.set_doctype_system_identifier("");
self.state = State::DoctypeSystemIdentifierDoubleQuoted;
ControlToken::Continue
}
Some('\'') => {
self.emitter.set_doctype_system_identifier("");
self.state = State::DoctypeSystemIdentifierSingleQuoted;
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInDoctype);
self.emitter.set_force_quirks();
self.emitter.emit_current_doctype();
ControlToken::Eof
}
c @ Some(_) => {
self.emitter
.emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier);
self.emitter.set_force_quirks();
self.state = State::BogusDoctype;
self.unread_char(c);
ControlToken::Continue
}
},
State::AfterDoctypeSystemKeyword => match self.read_char() {
Some(whitespace_pat!()) => {
self.state = State::BeforeDoctypeSystemIdentifier;
ControlToken::Continue
}
Some('"') => {
self.emitter
.emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword);
self.emitter.set_doctype_system_identifier("");
self.state = State::DoctypeSystemIdentifierDoubleQuoted;
ControlToken::Continue
}
Some('\'') => {
self.emitter
.emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword);
self.emitter.set_doctype_system_identifier("");
self.state = State::DoctypeSystemIdentifierSingleQuoted;
ControlToken::Continue
}
Some('>') => {
self.emitter
.emit_error(Error::MissingDoctypeSystemIdentifier);
self.emitter.set_force_quirks();
self.state = State::Data;
self.emitter.emit_current_doctype();
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInDoctype);
self.emitter.set_force_quirks();
self.emitter.emit_current_doctype();
ControlToken::Eof
}
c @ Some(_) => {
self.emitter
.emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier);
self.emitter.set_force_quirks();
self.state = State::BogusDoctype;
self.unread_char(c);
ControlToken::Continue
}
},
State::BeforeDoctypeSystemIdentifier => match self.read_char() {
Some(whitespace_pat!()) => ControlToken::Continue,
Some('"') => {
self.emitter.set_doctype_system_identifier("");
self.state = State::DoctypeSystemIdentifierDoubleQuoted;
ControlToken::Continue
}
Some('\'') => {
self.emitter.set_doctype_system_identifier("");
self.state = State::DoctypeSystemIdentifierSingleQuoted;
ControlToken::Continue
}
Some('>') => {
self.emitter
.emit_error(Error::MissingDoctypeSystemIdentifier);
self.emitter.set_force_quirks();
self.state = State::Data;
self.emitter.emit_current_doctype();
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInDoctype);
self.emitter.set_force_quirks();
self.emitter.emit_current_doctype();
ControlToken::Eof
}
c @ Some(_) => {
self.emitter
.emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier);
self.emitter.set_force_quirks();
self.state = State::BogusDoctype;
self.unread_char(c);
ControlToken::Continue
}
},
State::DoctypeSystemIdentifierDoubleQuoted => match self.read_char() {
Some('"') => {
self.state = State::AfterDoctypeSystemIdentifier;
ControlToken::Continue
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.emitter.push_doctype_system_identifier("\u{fffd}");
ControlToken::Continue
}
Some('>') => {
self.emitter
.emit_error(Error::AbruptDoctypeSystemIdentifier);
self.emitter.set_force_quirks();
self.state = State::Data;
self.emitter.emit_current_doctype();
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInDoctype);
self.emitter.set_force_quirks();
self.emitter.emit_current_doctype();
ControlToken::Eof
}
Some(x) => {
self.emitter.push_doctype_system_identifier(ctostr!(x));
ControlToken::Continue
}
},
State::DoctypeSystemIdentifierSingleQuoted => match self.read_char() {
Some('\'') => {
self.state = State::AfterDoctypeSystemIdentifier;
ControlToken::Continue
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
self.emitter.push_doctype_system_identifier("\u{fffd}");
ControlToken::Continue
}
Some('>') => {
self.emitter
.emit_error(Error::AbruptDoctypeSystemIdentifier);
self.emitter.set_force_quirks();
self.state = State::Data;
self.emitter.emit_current_doctype();
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInDoctype);
self.emitter.set_force_quirks();
self.emitter.emit_current_doctype();
ControlToken::Eof
}
Some(x) => {
self.emitter.push_doctype_system_identifier(ctostr!(x));
ControlToken::Continue
}
},
State::AfterDoctypeSystemIdentifier => match self.read_char() {
Some(whitespace_pat!()) => ControlToken::Continue,
Some('>') => {
self.state = State::Data;
self.emitter.emit_current_doctype();
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInDoctype);
self.emitter.set_force_quirks();
self.emitter.emit_current_doctype();
ControlToken::Eof
}
c @ Some(_) => {
self.emitter
.emit_error(Error::UnexpectedCharacterAfterDoctypeSystemIdentifier);
self.unread_char(c);
self.state = State::BogusDoctype;
ControlToken::Continue
}
},
State::BogusDoctype => match self.read_char() {
Some('>') => {
self.state = State::Data;
self.emitter.emit_current_doctype();
ControlToken::Continue
}
Some('\0') => {
self.emitter.emit_error(Error::UnexpectedNullCharacter);
ControlToken::Continue
}
None => {
self.emitter.emit_current_doctype();
ControlToken::Eof
}
Some(_) => ControlToken::Continue,
},
State::CdataSection => match self.read_char() {
Some(']') => {
self.state = State::CdataSectionBracket;
ControlToken::Continue
}
None => {
self.emitter.emit_error(Error::EofInCdata);
ControlToken::Eof
}
Some(x) => {
self.emitter.emit_string(ctostr!(x));
ControlToken::Continue
}
},
State::CdataSectionBracket => match self.read_char() {
Some(']') => {
self.state = State::CdataSectionEnd;
ControlToken::Continue
}
c => {
self.emitter.emit_string("]");
self.state = State::CdataSection;
self.unread_char(c);
ControlToken::Continue
}
},
State::CdataSectionEnd => match self.read_char() {
Some(']') => {
self.emitter.emit_string("]");
ControlToken::Continue
}
Some('>') => {
self.state = State::Data;
ControlToken::Continue
}
c => {
self.emitter.emit_string("]]");
self.unread_char(c);
self.state = State::CdataSection;
ControlToken::Continue
}
},
State::CharacterReference => {
self.temporary_buffer.clear();
self.temporary_buffer.push('&');
match self.read_char() {
Some(x) if x.is_ascii_alphanumeric() => {
self.unread_char(Some(x));
self.state = State::NamedCharacterReference;
ControlToken::Continue
}
Some('#') => {
self.temporary_buffer.push('#');
self.state = State::NumericCharacterReference;
ControlToken::Continue
}
c => {
self.flush_code_points_consumed_as_character_reference();
self.state = self.return_state.take().unwrap();
self.unread_char(c);
ControlToken::Continue
}
}
}
State::NamedCharacterReference => {
let c = self.read_char();
let char_ref = c.and_then(|x| {
Some((
x,
entities::try_read_character_reference(x, |x| {
self.try_read_string(x, true)
})?,
))
});
if let Some((x, char_ref)) = char_ref {
self.temporary_buffer.push(x);
self.temporary_buffer.push_str(char_ref.name);
let char_ref_name_last_character = char_ref.name.chars().last();
let next_character = self.next_input_character();
if self.is_consumed_as_part_of_an_attribute()
&& char_ref_name_last_character != Some(';')
&& matches!(next_character, Some(x) if x == '=' || x.is_ascii_alphanumeric())
{
self.flush_code_points_consumed_as_character_reference();
self.state = self.return_state.take().unwrap();
ControlToken::Continue
} else {
if char_ref_name_last_character != Some(';') {
self.emitter
.emit_error(Error::MissingSemicolonAfterCharacterReference);
}
self.temporary_buffer.clear();
self.temporary_buffer.push_str(char_ref.characters);
self.flush_code_points_consumed_as_character_reference();
self.state = self.return_state.take().unwrap();
ControlToken::Continue
}
} else {
self.unread_char(c);
self.flush_code_points_consumed_as_character_reference();
self.state = State::AmbiguousAmpersand;
ControlToken::Continue
}
}
State::AmbiguousAmpersand => match self.read_char() {
Some(x) if x.is_ascii_alphanumeric() => {
if self.is_consumed_as_part_of_an_attribute() {
self.emitter.push_attribute_value(ctostr!(x));
} else {
self.emitter.emit_string(ctostr!(x));
}
ControlToken::Continue
}
c @ Some(';') => {
self.emitter
.emit_error(Error::UnknownNamedCharacterReference);
self.unread_char(c);
self.state = self.return_state.take().unwrap();
ControlToken::Continue
}
c => {
self.unread_char(c);
self.state = self.return_state.take().unwrap();
ControlToken::Continue
}
},
State::NumericCharacterReference => {
self.character_reference_code = 0;
match self.read_char() {
Some(x @ 'x' | x @ 'X') => {
self.temporary_buffer.push(x);
self.state = State::HexadecimalCharacterReferenceStart;
ControlToken::Continue
}
c => {
self.unread_char(c);
self.state = State::DecimalCharacterReferenceStart;
ControlToken::Continue
}
}
}
State::HexadecimalCharacterReferenceStart => match self.read_char() {
c @ Some('0'..='9' | 'A'..='F' | 'a'..='f') => {
self.unread_char(c);
self.state = State::HexadecimalCharacterReference;
ControlToken::Continue
}
c => {
self.emitter
.emit_error(Error::AbsenceOfDigitsInNumericCharacterReference);
self.flush_code_points_consumed_as_character_reference();
self.unread_char(c);
self.state = self.return_state.take().unwrap();
ControlToken::Continue
}
},
State::DecimalCharacterReferenceStart => match self.read_char() {
Some(x @ ascii_digit_pat!()) => {
self.unread_char(Some(x));
self.state = State::DecimalCharacterReference;
ControlToken::Continue
}
c => {
self.emitter
.emit_error(Error::AbsenceOfDigitsInNumericCharacterReference);
self.flush_code_points_consumed_as_character_reference();
self.unread_char(c);
self.state = self.return_state.take().unwrap();
ControlToken::Continue
}
},
State::HexadecimalCharacterReference => match self.read_char() {
Some(x @ ascii_digit_pat!()) => {
mutate_character_reference!(*16 + x - 0x0030);
ControlToken::Continue
}
Some(x @ 'A'..='F') => {
mutate_character_reference!(*16 + x - 0x0037);
ControlToken::Continue
}
Some(x @ 'a'..='f') => {
mutate_character_reference!(*16 + x - 0x0057);
ControlToken::Continue
}
Some(';') => {
self.state = State::NumericCharacterReferenceEnd;
ControlToken::Continue
}
c => {
self.emitter
.emit_error(Error::MissingSemicolonAfterCharacterReference);
self.unread_char(c);
self.state = State::NumericCharacterReferenceEnd;
ControlToken::Continue
}
},
State::DecimalCharacterReference => match self.read_char() {
Some(x @ ascii_digit_pat!()) => {
mutate_character_reference!(*10 + x - 0x0030);
ControlToken::Continue
}
Some(';') => {
self.state = State::NumericCharacterReferenceEnd;
ControlToken::Continue
}
c => {
self.emitter
.emit_error(Error::MissingSemicolonAfterCharacterReference);
self.unread_char(c);
self.state = State::NumericCharacterReferenceEnd;
ControlToken::Continue
}
},
State::NumericCharacterReferenceEnd => {
match self.character_reference_code {
0x00 => {
self.emitter.emit_error(Error::NullCharacterReference);
self.character_reference_code = 0xfffd;
}
0x110000.. => {
self.emitter
.emit_error(Error::CharacterReferenceOutsideUnicodeRange);
self.character_reference_code = 0xfffd;
}
surrogate_pat!() => {
self.emitter.emit_error(Error::SurrogateCharacterReference);
self.character_reference_code = 0xfffd;
}
// noncharacter
noncharacter_pat!() => {
self.emitter
.emit_error(Error::NoncharacterCharacterReference);
}
// 0x000d, or a control that is not whitespace
x @ 0x000d | x @ control_pat!()
if !matches!(x, 0x0009 | 0x000a | 0x000c | 0x0020) =>
{
self.emitter.emit_error(Error::ControlCharacterReference);
self.character_reference_code = match x {
0x80 => 0x20AC, // EURO SIGN (€)
0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK (‚)
0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK (ƒ)
0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK („)
0x85 => 0x2026, // HORIZONTAL ELLIPSIS (…)
0x86 => 0x2020, // DAGGER (†)
0x87 => 0x2021, // DOUBLE DAGGER (‡)
0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
0x89 => 0x2030, // PER MILLE SIGN (‰)
0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON (Š)
0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE (Œ)
0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON (Ž)
0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK (‘)
0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK (’)
0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK (“)
0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK (”)
0x95 => 0x2022, // BULLET (•)
0x96 => 0x2013, // EN DASH (–)
0x97 => 0x2014, // EM DASH (—)
0x98 => 0x02DC, // SMALL TILDE (˜)
0x99 => 0x2122, // TRADE MARK SIGN (™)
0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON (š)
0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
0x9C => 0x0153, // LATIN SMALL LIGATURE OE (œ)
0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON (ž)
0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
_ => self.character_reference_code,
};
}
_ => (),
}
self.temporary_buffer.clear();
self.temporary_buffer
.push(std::char::from_u32(self.character_reference_code).unwrap());
self.flush_code_points_consumed_as_character_reference();
self.state = self.return_state.take().unwrap();
ControlToken::Continue
}
}
}
}
/// Pull-based token stream: each call to `next` advances the state machine only as far as
/// needed for the emitter to have a token ready.
impl Iterator for Tokenizer {
    type Item = E::Token;

    fn next(&mut self) -> Option {
        loop {
            // Tokens already queued in the emitter are always handed out first, even after the
            // input has been exhausted.
            if let Some(token) = self.emitter.pop_token() {
                return Some(token);
            }

            // Nothing queued and the end of input was already signalled: the stream is done.
            if self.eof {
                return None;
            }

            // Run one step of the state machine. On end of input, remember it so `consume` is
            // never called again, and let the emitter queue whatever it produces for EOF; the
            // next loop iteration picks that up via `pop_token`.
            match self.consume() {
                ControlToken::Eof => {
                    self.eof = true;
                    self.emitter.emit_eof();
                }
                ControlToken::Continue => {}
            }
        }
    }
}