aboutsummaryrefslogtreecommitdiff
path: root/src/lib.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib.rs')
-rw-r--r--src/lib.rs2112
1 files changed, 2112 insertions, 0 deletions
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..ef6c9a2
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,2112 @@
+#![deny(missing_docs)]
+// This is an HTML parser. HTML can be untrusted input from the internet.
+#![forbid(unsafe_code)]
+#![doc = include_str!("../README.md")]
+
+mod emitter;
+mod entities;
+mod error;
+mod machine;
+mod reader;
+
+#[cfg(feature = "integration-tests")]
+pub use machine::State;
+#[cfg(not(feature = "integration-tests"))]
+use machine::State;
+
+use machine::{
+ ascii_digit_pat, control_pat, noncharacter_pat, surrogate_pat, whitespace_pat, ControlToken,
+};
+
+pub use emitter::{DefaultEmitter, Doctype, Emitter, EndTag, StartTag, Token};
+pub use reader::{Readable, Reader, StringReader};
+
+pub use error::Error;
+
+macro_rules! ctostr {
+ ($c:expr) => {
+ &*$c.encode_utf8(&mut [0; 4])
+ };
+}
+
+/// A HTML tokenizer. See crate-level docs for basic usage.
+pub struct Tokenizer<R: Reader, E: Emitter = DefaultEmitter> {
+ eof: bool,
+ state: State,
+ emitter: E,
+ temporary_buffer: String,
+ reader: R,
+ to_reconsume: Option<Option<char>>,
+ character_reference_code: u32,
+ return_state: Option<State>,
+}
+
+impl<R: Reader> Tokenizer<R> {
+ /// Create a new tokenizer from some input.
+ ///
+ /// `input` can be `&String` or `&str` at the moment, as those are the types for which
+ /// [`crate::Readable`] is implemented, but you can implement that trait on your own types.
+ ///
+ /// Patches are welcome for providing an efficient implementation over async streams,
+ /// iterators, files, etc, as long as any dependencies come behind featureflags.
+ pub fn new<'a, S: Readable<'a, Reader = R>>(input: S) -> Self {
+ Tokenizer::<S::Reader>::new_with_emitter(input, DefaultEmitter::default())
+ }
+}
+
+impl<R: Reader, E: Emitter> Tokenizer<R, E> {
+ /// Construct a new tokenizer from some input and a custom emitter.
+ ///
+ /// Use this method over [`Tokenizer::new`] when you want to have more control over string allocation for
+ /// tokens.
+ pub fn new_with_emitter<'a, S: Readable<'a, Reader = R>>(input: S, emitter: E) -> Self {
+ Tokenizer {
+ eof: false,
+ state: State::Data,
+ emitter,
+ temporary_buffer: String::new(),
+ to_reconsume: None,
+ reader: input.to_reader(),
+ character_reference_code: 0,
+ return_state: None,
+ }
+ }
+
+ #[cfg(feature = "integration-tests")]
+ /// Test-internal function to override internal state.
+ ///
+ /// Only available with the `integration-tests` feature which is not public API.
+ pub fn set_state(&mut self, state: State) {
+ self.state = state;
+ }
+
+ /// Set the statemachine to start/continue in [plaintext
+ /// state](https://html.spec.whatwg.org/#plaintext-state).
+ ///
+ /// This tokenizer never gets into that state naturally.
+ pub fn set_plaintext_state(&mut self) {
+ self.state = State::PlainText;
+ }
+
+ #[cfg(feature = "integration-tests")]
+ /// Test-internal function to override internal state.
+ ///
+ /// Only available with the `integration-tests` feature which is not public API.
+ pub fn set_last_start_tag(&mut self, last_start_tag: Option<&str>) {
+ self.emitter.set_last_start_tag(last_start_tag);
+ }
+
+ #[inline]
+ fn unread_char(&mut self, c: Option<char>) {
+ self.to_reconsume = Some(c);
+ }
+
+ #[inline]
+ fn validate_char(&mut self, c: char) {
+ match c as u32 {
+ surrogate_pat!() => {
+ self.emitter.emit_error(Error::SurrogateInInputStream);
+ }
+ noncharacter_pat!() => {
+ self.emitter.emit_error(Error::NoncharacterInInputStream);
+ }
+ // control without whitespace or nul
+ x @ control_pat!()
+ if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) =>
+ {
+ self.emitter
+ .emit_error(Error::ControlCharacterInInputStream);
+ }
+ _ => (),
+ }
+ }
+
+ fn read_char(&mut self) -> Option<char> {
+ if let Some(c) = self.to_reconsume.take() {
+ return c;
+ }
+
+ let c = self.reader.read_char()?;
+ self.validate_char(c);
+ Some(c)
+ }
+
+ #[inline]
+ fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> bool {
+ debug_assert!(!s.is_empty());
+ debug_assert!(self.to_reconsume.is_none());
+ self.reader.try_read_string(s, case_sensitive)
+ }
+
+ fn is_consumed_as_part_of_an_attribute(&self) -> bool {
+ matches!(
+ self.return_state,
+ Some(
+ State::AttributeValueDoubleQuoted
+ | State::AttributeValueSingleQuoted
+ | State::AttributeValueUnquoted
+ )
+ )
+ }
+
+ fn flush_code_points_consumed_as_character_reference(&mut self) {
+ if self.is_consumed_as_part_of_an_attribute() {
+ self.emitter.push_attribute_value(&self.temporary_buffer);
+ self.temporary_buffer.clear();
+ } else {
+ self.flush_buffer_characters();
+ }
+ }
+
+ fn next_input_character(&mut self) -> Option<char> {
+ let rv = self.read_char();
+ self.unread_char(rv);
+ rv
+ }
+
+ fn flush_buffer_characters(&mut self) {
+ self.emitter.emit_string(&self.temporary_buffer);
+ self.temporary_buffer.clear();
+ }
+
+ fn consume(&mut self) -> ControlToken {
+ macro_rules! mutate_character_reference {
+ (* $mul:literal + $x:ident - $sub:literal) => {
+ match self
+ .character_reference_code
+ .checked_mul($mul)
+ .and_then(|cr| cr.checked_add($x as u32 - $sub))
+ {
+ Some(cr) => self.character_reference_code = cr,
+ None => {
+ // provoke err
+ self.character_reference_code = 0x110000;
+ }
+ };
+ };
+ }
+
+ match self.state {
+ State::Data => match self.read_char() {
+ Some('&') => {
+ self.return_state = Some(self.state);
+ self.state = State::CharacterReference;
+ ControlToken::Continue
+ }
+ Some('<') => {
+ self.state = State::TagOpen;
+ ControlToken::Continue
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.emitter.emit_string("\0");
+ ControlToken::Continue
+ }
+ Some(x) => {
+ self.emitter.emit_string(ctostr!(x));
+ ControlToken::Continue
+ }
+ None => ControlToken::Eof,
+ },
+ State::RcData => match self.read_char() {
+ Some('&') => {
+ self.return_state = Some(State::RcData);
+ self.state = State::CharacterReference;
+ ControlToken::Continue
+ }
+ Some('<') => {
+ self.state = State::RcDataLessThanSign;
+ ControlToken::Continue
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.emitter.emit_string("\u{fffd}");
+ ControlToken::Continue
+ }
+ Some(x) => {
+ self.emitter.emit_string(ctostr!(x));
+ ControlToken::Continue
+ }
+ None => ControlToken::Eof,
+ },
+ State::RawText => match self.read_char() {
+ Some('<') => {
+ self.state = State::RawTextLessThanSign;
+ ControlToken::Continue
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.emitter.emit_string("\u{fffd}");
+ ControlToken::Continue
+ }
+ Some(x) => {
+ self.emitter.emit_string(ctostr!(x));
+ ControlToken::Continue
+ }
+ None => ControlToken::Eof,
+ },
+ State::ScriptData => match self.read_char() {
+ Some('<') => {
+ self.state = State::ScriptDataLessThanSign;
+ ControlToken::Continue
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.emitter.emit_string("\u{fffd}");
+ ControlToken::Continue
+ }
+ Some(x) => {
+ self.emitter.emit_string(ctostr!(x));
+ ControlToken::Continue
+ }
+ None => ControlToken::Eof,
+ },
+ State::PlainText => match self.read_char() {
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.emitter.emit_string("\u{fffd}");
+ ControlToken::Continue
+ }
+ Some(x) => {
+ self.emitter.emit_string(ctostr!(x));
+ ControlToken::Continue
+ }
+ None => ControlToken::Eof,
+ },
+ State::TagOpen => match self.read_char() {
+ Some('!') => {
+ self.state = State::MarkupDeclarationOpen;
+ ControlToken::Continue
+ }
+ Some('/') => {
+ self.state = State::EndTagOpen;
+ ControlToken::Continue
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ self.emitter.init_start_tag();
+ self.state = State::TagName;
+ self.unread_char(Some(x));
+ ControlToken::Continue
+ }
+ c @ Some('?') => {
+ self.emitter
+ .emit_error(Error::UnexpectedQuestionMarkInsteadOfTagName);
+ self.emitter.init_comment();
+ self.state = State::BogusComment;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofBeforeTagName);
+ self.emitter.emit_string("<");
+ ControlToken::Eof
+ }
+ c @ Some(_) => {
+ self.emitter
+ .emit_error(Error::InvalidFirstCharacterOfTagName);
+ self.state = State::Data;
+ self.emitter.emit_string("<");
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::EndTagOpen => match self.read_char() {
+ Some(x) if x.is_ascii_alphabetic() => {
+ self.emitter.init_end_tag();
+ self.state = State::TagName;
+ self.unread_char(Some(x));
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.emitter.emit_error(Error::MissingEndTagName);
+ self.state = State::Data;
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofBeforeTagName);
+ self.emitter.emit_string("</");
+ ControlToken::Eof
+ }
+ Some(x) => {
+ self.emitter
+ .emit_error(Error::InvalidFirstCharacterOfTagName);
+ self.emitter.init_comment();
+ self.state = State::BogusComment;
+ self.unread_char(Some(x));
+ ControlToken::Continue
+ }
+ },
+ State::TagName => match self.read_char() {
+ Some(whitespace_pat!()) => {
+ self.state = State::BeforeAttributeName;
+ ControlToken::Continue
+ }
+ Some('/') => {
+ self.state = State::SelfClosingStartTag;
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.state = State::Data;
+ self.emitter.emit_current_tag();
+ ControlToken::Continue
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.emitter.push_tag_name("\u{fffd}");
+ ControlToken::Continue
+ }
+ Some(x) => {
+ self.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase()));
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInTag);
+ ControlToken::Eof
+ }
+ },
+ State::RcDataLessThanSign => match self.read_char() {
+ Some('/') => {
+ self.temporary_buffer.clear();
+ self.state = State::RcDataEndTagOpen;
+ ControlToken::Continue
+ }
+ c => {
+ self.emitter.emit_string("<");
+ self.state = State::RcData;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::RcDataEndTagOpen => match self.read_char() {
+ Some(x) if x.is_ascii_alphabetic() => {
+ self.emitter.init_end_tag();
+ self.state = State::RcDataEndTagName;
+ self.unread_char(Some(x));
+ ControlToken::Continue
+ }
+ c => {
+ self.emitter.emit_string("</");
+ self.state = State::RcData;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::RcDataEndTagName => match self.read_char() {
+ Some(whitespace_pat!()) if self.emitter.current_is_appropriate_end_tag_token() => {
+ self.state = State::BeforeAttributeName;
+ ControlToken::Continue
+ }
+ Some('/') if self.emitter.current_is_appropriate_end_tag_token() => {
+ self.state = State::SelfClosingStartTag;
+ ControlToken::Continue
+ }
+ Some('>') if self.emitter.current_is_appropriate_end_tag_token() => {
+ self.state = State::Data;
+ self.emitter.emit_current_tag();
+ ControlToken::Continue
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ self.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase()));
+ self.temporary_buffer.push(x);
+ ControlToken::Continue
+ }
+ c => {
+ self.emitter.emit_string("</");
+ self.flush_buffer_characters();
+
+ self.state = State::RcData;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::RawTextLessThanSign => match self.read_char() {
+ Some('/') => {
+ self.temporary_buffer.clear();
+ self.state = State::RawTextEndTagOpen;
+ ControlToken::Continue
+ }
+ c => {
+ self.emitter.emit_string("<");
+ self.state = State::RawText;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::RawTextEndTagOpen => match self.read_char() {
+ Some(x) if x.is_ascii_alphabetic() => {
+ self.emitter.init_end_tag();
+ self.state = State::RawTextEndTagName;
+ self.unread_char(Some(x));
+ ControlToken::Continue
+ }
+ c => {
+ self.emitter.emit_string("</");
+ self.state = State::RawText;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::RawTextEndTagName => match self.read_char() {
+ Some(whitespace_pat!()) if self.emitter.current_is_appropriate_end_tag_token() => {
+ self.state = State::BeforeAttributeName;
+ ControlToken::Continue
+ }
+ Some('/') if self.emitter.current_is_appropriate_end_tag_token() => {
+ self.state = State::SelfClosingStartTag;
+ ControlToken::Continue
+ }
+ Some('>') if self.emitter.current_is_appropriate_end_tag_token() => {
+ self.state = State::Data;
+ self.emitter.emit_current_tag();
+ ControlToken::Continue
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ self.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase()));
+ self.temporary_buffer.push(x);
+ ControlToken::Continue
+ }
+ c => {
+ self.emitter.emit_string("</");
+ self.flush_buffer_characters();
+
+ self.state = State::RawText;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::ScriptDataLessThanSign => match self.read_char() {
+ Some('/') => {
+ self.temporary_buffer.clear();
+ self.state = State::ScriptDataEndTagOpen;
+ ControlToken::Continue
+ }
+ Some('!') => {
+ self.state = State::ScriptDataEscapeStart;
+ self.emitter.emit_string("<!");
+ ControlToken::Continue
+ }
+ c => {
+ self.emitter.emit_string("<");
+ self.state = State::Data;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::ScriptDataEndTagOpen => match self.read_char() {
+ Some(x) if x.is_ascii_alphabetic() => {
+ self.emitter.init_end_tag();
+ self.state = State::ScriptDataEndTagName;
+ self.unread_char(Some(x));
+ ControlToken::Continue
+ }
+ c => {
+ self.emitter.emit_string("</");
+ self.state = State::ScriptData;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::ScriptDataEndTagName => match self.read_char() {
+ Some(whitespace_pat!()) if self.emitter.current_is_appropriate_end_tag_token() => {
+ self.state = State::BeforeAttributeName;
+ ControlToken::Continue
+ }
+ Some('/') if self.emitter.current_is_appropriate_end_tag_token() => {
+ self.state = State::SelfClosingStartTag;
+ ControlToken::Continue
+ }
+ Some('>') if self.emitter.current_is_appropriate_end_tag_token() => {
+ self.state = State::Data;
+ self.emitter.emit_current_tag();
+ ControlToken::Continue
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ self.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase()));
+ self.temporary_buffer.push(x.to_ascii_lowercase());
+ ControlToken::Continue
+ }
+ c => {
+ self.emitter.emit_string("</");
+ self.flush_buffer_characters();
+ self.state = State::Data;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::ScriptDataEscapeStart => match self.read_char() {
+ Some('-') => {
+ self.state = State::ScriptDataEscapeStartDash;
+ self.emitter.emit_string("-");
+ ControlToken::Continue
+ }
+ c => {
+ self.state = State::ScriptData;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::ScriptDataEscapeStartDash => match self.read_char() {
+ Some('-') => {
+ self.state = State::ScriptDataEscapedDashDash;
+ self.emitter.emit_string("-");
+ ControlToken::Continue
+ }
+ c => {
+ self.state = State::ScriptData;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::ScriptDataEscaped => match self.read_char() {
+ Some('-') => {
+ self.state = State::ScriptDataEscapedDash;
+ self.emitter.emit_string("-");
+ ControlToken::Continue
+ }
+ Some('<') => {
+ self.state = State::ScriptDataEscapedLessThanSign;
+ ControlToken::Continue
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.emitter.emit_string("\u{fffd}");
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter
+ .emit_error(Error::EofInScriptHtmlCommentLikeText);
+ ControlToken::Eof
+ }
+ Some(x) => {
+ self.emitter.emit_string(ctostr!(x));
+ ControlToken::Continue
+ }
+ },
+ State::ScriptDataEscapedDash => match self.read_char() {
+ Some('-') => {
+ self.state = State::ScriptDataEscapedDashDash;
+ self.emitter.emit_string("-");
+ ControlToken::Continue
+ }
+ Some('<') => {
+ self.state = State::ScriptDataEscapedLessThanSign;
+ ControlToken::Continue
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.state = State::ScriptDataEscaped;
+ self.emitter.emit_string("\u{fffd}");
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter
+ .emit_error(Error::EofInScriptHtmlCommentLikeText);
+ ControlToken::Eof
+ }
+ Some(x) => {
+ self.state = State::ScriptDataEscaped;
+ self.emitter.emit_string(ctostr!(x));
+ ControlToken::Continue
+ }
+ },
+ State::ScriptDataEscapedDashDash => match self.read_char() {
+ Some('-') => {
+ self.emitter.emit_string("-");
+ ControlToken::Continue
+ }
+ Some('<') => {
+ self.state = State::ScriptDataEscapedLessThanSign;
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.state = State::ScriptData;
+ self.emitter.emit_string(">");
+ ControlToken::Continue
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.state = State::ScriptDataEscaped;
+ self.emitter.emit_string("\u{fffd}");
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter
+ .emit_error(Error::EofInScriptHtmlCommentLikeText);
+ ControlToken::Eof
+ }
+ Some(x) => {
+ self.state = State::ScriptDataEscaped;
+ self.emitter.emit_string(ctostr!(x));
+ ControlToken::Continue
+ }
+ },
+ State::ScriptDataEscapedLessThanSign => match self.read_char() {
+ Some('/') => {
+ self.temporary_buffer.clear();
+ self.state = State::ScriptDataEscapedEndTagOpen;
+ ControlToken::Continue
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ self.temporary_buffer.clear();
+ self.emitter.emit_string("<");
+ self.state = State::ScriptDataDoubleEscapeStart;
+ self.unread_char(Some(x));
+ ControlToken::Continue
+ }
+ c => {
+ self.emitter.emit_string("<");
+ self.state = State::ScriptDataEscaped;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::ScriptDataEscapedEndTagOpen => match self.read_char() {
+ Some(x) if x.is_ascii_alphabetic() => {
+ self.emitter.init_end_tag();
+ self.state = State::ScriptDataEscapedEndTagName;
+ self.unread_char(Some(x));
+ ControlToken::Continue
+ }
+ c => {
+ self.emitter.emit_string("</");
+ self.unread_char(c);
+ self.state = State::ScriptDataEscaped;
+ ControlToken::Continue
+ }
+ },
+ State::ScriptDataEscapedEndTagName => match self.read_char() {
+ Some(whitespace_pat!()) if self.emitter.current_is_appropriate_end_tag_token() => {
+ self.state = State::BeforeAttributeName;
+ ControlToken::Continue
+ }
+ Some('/') if self.emitter.current_is_appropriate_end_tag_token() => {
+ self.state = State::SelfClosingStartTag;
+ ControlToken::Continue
+ }
+ Some('>') if self.emitter.current_is_appropriate_end_tag_token() => {
+ self.state = State::Data;
+ self.emitter.emit_current_tag();
+ ControlToken::Continue
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ self.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase()));
+ self.temporary_buffer.push(x);
+ ControlToken::Continue
+ }
+ c => {
+ self.emitter.emit_string("</");
+ self.flush_buffer_characters();
+ self.state = State::ScriptDataEscaped;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::ScriptDataDoubleEscapeStart => match self.read_char() {
+ Some(x @ whitespace_pat!() | x @ '/' | x @ '>') => {
+ if self.temporary_buffer == "script" {
+ self.state = State::ScriptDataDoubleEscaped;
+ } else {
+ self.state = State::ScriptDataEscaped;
+ }
+ self.emitter.emit_string(ctostr!(x));
+ ControlToken::Continue
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ self.temporary_buffer.push(x.to_ascii_lowercase());
+ self.emitter.emit_string(ctostr!(x));
+ ControlToken::Continue
+ }
+ c => {
+ self.state = State::ScriptDataEscaped;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::ScriptDataDoubleEscaped => match self.read_char() {
+ Some('-') => {
+ self.state = State::ScriptDataDoubleEscapedDash;
+ self.emitter.emit_string("-");
+ ControlToken::Continue
+ }
+ Some('<') => {
+ self.state = State::ScriptDataDoubleEscapedLessThanSign;
+ self.emitter.emit_string("<");
+ ControlToken::Continue
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.emitter.emit_string("\u{fffd}");
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter
+ .emit_error(Error::EofInScriptHtmlCommentLikeText);
+ ControlToken::Eof
+ }
+ Some(x) => {
+ self.emitter.emit_string(ctostr!(x));
+ ControlToken::Continue
+ }
+ },
+ State::ScriptDataDoubleEscapedDash => match self.read_char() {
+ Some('-') => {
+ self.state = State::ScriptDataDoubleEscapedDashDash;
+ self.emitter.emit_string("-");
+ ControlToken::Continue
+ }
+ Some('<') => {
+ self.state = State::ScriptDataDoubleEscapedLessThanSign;
+ self.emitter.emit_string("<");
+ ControlToken::Continue
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.state = State::ScriptDataDoubleEscaped;
+ self.emitter.emit_string("\u{fffd}");
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter
+ .emit_error(Error::EofInScriptHtmlCommentLikeText);
+ ControlToken::Eof
+ }
+ Some(x) => {
+ self.state = State::ScriptDataDoubleEscaped;
+ self.emitter.emit_string(ctostr!(x));
+ ControlToken::Continue
+ }
+ },
+ State::ScriptDataDoubleEscapedDashDash => match self.read_char() {
+ Some('-') => {
+ self.emitter.emit_string("-");
+ ControlToken::Continue
+ }
+ Some('<') => {
+ self.emitter.emit_string("<");
+ self.state = State::ScriptDataDoubleEscapedLessThanSign;
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.emitter.emit_string(">");
+ self.state = State::ScriptData;
+ ControlToken::Continue
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.state = State::ScriptDataDoubleEscaped;
+ self.emitter.emit_string("\u{fffd}");
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter
+ .emit_error(Error::EofInScriptHtmlCommentLikeText);
+ ControlToken::Eof
+ }
+ Some(x) => {
+ self.state = State::ScriptDataDoubleEscaped;
+ self.emitter.emit_string(ctostr!(x));
+ ControlToken::Continue
+ }
+ },
+ State::ScriptDataDoubleEscapedLessThanSign => match self.read_char() {
+ Some('/') => {
+ self.temporary_buffer.clear();
+ self.state = State::ScriptDataDoubleEscapeEnd;
+ self.emitter.emit_string("/");
+ ControlToken::Continue
+ }
+ c => {
+ self.state = State::ScriptDataDoubleEscaped;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::ScriptDataDoubleEscapeEnd => match self.read_char() {
+ Some(x @ whitespace_pat!() | x @ '/' | x @ '>') => {
+ if self.temporary_buffer == "script" {
+ self.state = State::ScriptDataEscaped;
+ } else {
+ self.state = State::ScriptDataDoubleEscaped;
+ }
+
+ self.emitter.emit_string(ctostr!(x));
+ ControlToken::Continue
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ self.temporary_buffer.push(x.to_ascii_lowercase());
+ self.emitter.emit_string(ctostr!(x));
+ ControlToken::Continue
+ }
+ c => {
+ self.state = State::ScriptDataDoubleEscaped;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::BeforeAttributeName => match self.read_char() {
+ Some(whitespace_pat!()) => ControlToken::Continue,
+ c @ Some('/' | '>') | c @ None => {
+ self.state = State::AfterAttributeName;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ Some('=') => {
+ self.emitter
+ .emit_error(Error::UnexpectedEqualsSignBeforeAttributeName);
+ self.emitter.init_attribute();
+ self.emitter.push_attribute_name("=");
+ self.state = State::AttributeName;
+ ControlToken::Continue
+ }
+ Some(x) => {
+ self.emitter.init_attribute();
+ self.state = State::AttributeName;
+ self.unread_char(Some(x));
+ ControlToken::Continue
+ }
+ },
+ State::AttributeName => match self.read_char() {
+ c @ Some(whitespace_pat!() | '/' | '>') | c @ None => {
+ self.state = State::AfterAttributeName;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ Some('=') => {
+ self.state = State::BeforeAttributeValue;
+ ControlToken::Continue
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.emitter.push_attribute_name("\u{fffd}");
+ ControlToken::Continue
+ }
+ Some(x @ '"' | x @ '\'' | x @ '<') => {
+ self.emitter
+ .emit_error(Error::UnexpectedCharacterInAttributeName);
+ self.emitter
+ .push_attribute_name(ctostr!(x.to_ascii_lowercase()));
+ ControlToken::Continue
+ }
+ Some(x) => {
+ self.emitter
+ .push_attribute_name(ctostr!(x.to_ascii_lowercase()));
+ ControlToken::Continue
+ }
+ },
+ State::AfterAttributeName => match self.read_char() {
+ Some(whitespace_pat!()) => ControlToken::Continue,
+ Some('/') => {
+ self.state = State::SelfClosingStartTag;
+ ControlToken::Continue
+ }
+ Some('=') => {
+ self.state = State::BeforeAttributeValue;
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.state = State::Data;
+ self.emitter.emit_current_tag();
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInTag);
+ ControlToken::Eof
+ }
+ Some(x) => {
+ self.emitter.init_attribute();
+ self.state = State::AttributeName;
+ self.unread_char(Some(x));
+ ControlToken::Continue
+ }
+ },
+ State::BeforeAttributeValue => match self.read_char() {
+ Some(whitespace_pat!()) => ControlToken::Continue,
+ Some('"') => {
+ self.state = State::AttributeValueDoubleQuoted;
+ ControlToken::Continue
+ }
+ Some('\'') => {
+ self.state = State::AttributeValueSingleQuoted;
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.emitter.emit_error(Error::MissingAttributeValue);
+ self.state = State::Data;
+ self.emitter.emit_current_tag();
+ ControlToken::Continue
+ }
+ c => {
+ self.state = State::AttributeValueUnquoted;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::AttributeValueDoubleQuoted => match self.read_char() {
+ Some('"') => {
+ self.state = State::AfterAttributeValueQuoted;
+ ControlToken::Continue
+ }
+ Some('&') => {
+ self.return_state = Some(State::AttributeValueDoubleQuoted);
+ self.state = State::CharacterReference;
+ ControlToken::Continue
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.emitter.push_attribute_value("\u{fffd}");
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInTag);
+ ControlToken::Eof
+ }
+ Some(x) => {
+ self.emitter.push_attribute_value(ctostr!(x));
+ ControlToken::Continue
+ }
+ },
+ State::AttributeValueSingleQuoted => match self.read_char() {
+ Some('\'') => {
+ self.state = State::AfterAttributeValueQuoted;
+ ControlToken::Continue
+ }
+ Some('&') => {
+ self.return_state = Some(State::AttributeValueSingleQuoted);
+ self.state = State::CharacterReference;
+ ControlToken::Continue
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.emitter.push_attribute_value("\u{fffd}");
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInTag);
+ ControlToken::Eof
+ }
+ Some(x) => {
+ self.emitter.push_attribute_value(ctostr!(x));
+ ControlToken::Continue
+ }
+ },
+ State::AttributeValueUnquoted => match self.read_char() {
+ Some(whitespace_pat!()) => {
+ self.state = State::BeforeAttributeName;
+ ControlToken::Continue
+ }
+ Some('&') => {
+ self.return_state = Some(State::AttributeValueUnquoted);
+ self.state = State::CharacterReference;
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.state = State::Data;
+ self.emitter.emit_current_tag();
+ ControlToken::Continue
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.emitter.push_attribute_value("\u{fffd}");
+ ControlToken::Continue
+ }
+ Some(x @ '"' | x @ '\'' | x @ '<' | x @ '=' | x @ '\u{60}') => {
+ self.emitter
+ .emit_error(Error::UnexpectedCharacterInUnquotedAttributeValue);
+ self.emitter.push_attribute_value(ctostr!(x));
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInTag);
+ ControlToken::Eof
+ }
+ Some(x) => {
+ self.emitter.push_attribute_value(ctostr!(x));
+ ControlToken::Continue
+ }
+ },
+ State::AfterAttributeValueQuoted => match self.read_char() {
+ Some(whitespace_pat!()) => {
+ self.state = State::BeforeAttributeName;
+ ControlToken::Continue
+ }
+ Some('/') => {
+ self.state = State::SelfClosingStartTag;
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.state = State::Data;
+ self.emitter.emit_current_tag();
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInTag);
+ ControlToken::Eof
+ }
+ Some(x) => {
+ self.emitter
+ .emit_error(Error::MissingWhitespaceBetweenAttributes);
+ self.state = State::BeforeAttributeName;
+ self.unread_char(Some(x));
+ ControlToken::Continue
+ }
+ },
+ State::SelfClosingStartTag => match self.read_char() {
+ Some('>') => {
+ self.emitter.set_self_closing();
+ self.state = State::Data;
+ self.emitter.emit_current_tag();
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInTag);
+ ControlToken::Eof
+ }
+ Some(x) => {
+ self.emitter.emit_error(Error::UnexpectedSolidusInTag);
+ self.state = State::BeforeAttributeName;
+ self.unread_char(Some(x));
+ ControlToken::Continue
+ }
+ },
+ State::BogusComment => match self.read_char() {
+ Some('>') => {
+ self.state = State::Data;
+ self.emitter.emit_current_comment();
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_current_comment();
+ ControlToken::Eof
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.emitter.push_comment("\u{fffd}");
+ ControlToken::Continue
+ }
+ Some(x) => {
+ self.emitter.push_comment(ctostr!(x));
+ ControlToken::Continue
+ }
+ },
+ State::MarkupDeclarationOpen => match self.read_char() {
+ Some('-') if self.try_read_string("-", true) => {
+ self.emitter.init_comment();
+ self.state = State::CommentStart;
+ ControlToken::Continue
+ }
+ Some('d' | 'D') if self.try_read_string("octype", false) => {
+ self.state = State::Doctype;
+ ControlToken::Continue
+ }
+ Some('[') if self.try_read_string("CDATA[", true) => {
+ // missing: check for adjusted current element: we don't have an element stack
+ // at all
+ //
+ // missing: cdata transition
+ //
+ // let's hope that bogus comment can just sort of skip over cdata
+ self.emitter.emit_error(Error::CdataInHtmlContent);
+
+ self.emitter.init_comment();
+ self.emitter.push_comment("[CDATA[");
+ self.state = State::BogusComment;
+ ControlToken::Continue
+ }
+ c => {
+ self.emitter.emit_error(Error::IncorrectlyOpenedComment);
+ self.emitter.init_comment();
+ self.state = State::BogusComment;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::CommentStart => match self.read_char() {
+ Some('-') => {
+ self.state = State::CommentStartDash;
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.emitter.emit_error(Error::AbruptClosingOfEmptyComment);
+ self.state = State::Data;
+ self.emitter.emit_current_comment();
+ ControlToken::Continue
+ }
+ c => {
+ self.unread_char(c);
+ self.state = State::Comment;
+ ControlToken::Continue
+ }
+ },
+ State::CommentStartDash => match self.read_char() {
+ Some('-') => {
+ self.state = State::CommentEnd;
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.emitter.emit_error(Error::AbruptClosingOfEmptyComment);
+ self.state = State::Data;
+ self.emitter.emit_current_comment();
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInComment);
+ self.emitter.emit_current_comment();
+ ControlToken::Eof
+ }
+ c @ Some(_) => {
+ self.emitter.push_comment("-");
+ self.unread_char(c);
+ self.state = State::Comment;
+ ControlToken::Continue
+ }
+ },
+ State::Comment => match self.read_char() {
+ Some('<') => {
+ self.emitter.push_comment("<");
+ self.state = State::CommentLessThanSign;
+ ControlToken::Continue
+ }
+ Some('-') => {
+ self.state = State::CommentEndDash;
+ ControlToken::Continue
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.emitter.push_comment("\u{fffd}");
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInComment);
+ self.emitter.emit_current_comment();
+ ControlToken::Eof
+ }
+ Some(x) => {
+ self.emitter.push_comment(ctostr!(x));
+ ControlToken::Continue
+ }
+ },
+ State::CommentLessThanSign => match self.read_char() {
+ Some('!') => {
+ self.emitter.push_comment("!");
+ self.state = State::CommentLessThanSignBang;
+ ControlToken::Continue
+ }
+ Some('<') => {
+ self.emitter.push_comment("<");
+ ControlToken::Continue
+ }
+ c => {
+ self.unread_char(c);
+ self.state = State::Comment;
+ ControlToken::Continue
+ }
+ },
+ State::CommentLessThanSignBang => match self.read_char() {
+ Some('-') => {
+ self.state = State::CommentLessThanSignBangDash;
+ ControlToken::Continue
+ }
+ c => {
+ self.unread_char(c);
+ self.state = State::Comment;
+ ControlToken::Continue
+ }
+ },
+ State::CommentLessThanSignBangDash => match self.read_char() {
+ Some('-') => {
+ self.state = State::CommentLessThanSignBangDashDash;
+ ControlToken::Continue
+ }
+ c => {
+ self.unread_char(c);
+ self.state = State::CommentEndDash;
+ ControlToken::Continue
+ }
+ },
+ State::CommentLessThanSignBangDashDash => match self.read_char() {
+ c @ Some('>') | c @ None => {
+ self.unread_char(c);
+ self.state = State::CommentEnd;
+ ControlToken::Continue
+ }
+ c => {
+ self.emitter.emit_error(Error::NestedComment);
+ self.unread_char(c);
+ self.state = State::CommentEnd;
+ ControlToken::Continue
+ }
+ },
+ State::CommentEndDash => match self.read_char() {
+ Some('-') => {
+ self.state = State::CommentEnd;
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInComment);
+ self.emitter.emit_current_comment();
+ ControlToken::Eof
+ }
+ c => {
+ self.emitter.push_comment("-");
+ self.unread_char(c);
+ self.state = State::Comment;
+ ControlToken::Continue
+ }
+ },
+ State::CommentEnd => match self.read_char() {
+ Some('>') => {
+ self.state = State::Data;
+ self.emitter.emit_current_comment();
+ ControlToken::Continue
+ }
+ Some('!') => {
+ self.state = State::CommentEndBang;
+ ControlToken::Continue
+ }
+ Some('-') => {
+ self.emitter.push_comment("-");
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInComment);
+ self.emitter.emit_current_comment();
+ ControlToken::Eof
+ }
+ c @ Some(_) => {
+ self.emitter.push_comment("-");
+ self.emitter.push_comment("-");
+ self.unread_char(c);
+ self.state = State::Comment;
+ ControlToken::Continue
+ }
+ },
+ State::CommentEndBang => match self.read_char() {
+ Some('-') => {
+ self.emitter.push_comment("-");
+ self.emitter.push_comment("-");
+ self.emitter.push_comment("!");
+ self.state = State::CommentEndDash;
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.emitter.emit_error(Error::IncorrectlyClosedComment);
+ self.state = State::Data;
+ self.emitter.emit_current_comment();
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInComment);
+ self.emitter.emit_current_comment();
+ ControlToken::Eof
+ }
+ c @ Some(_) => {
+ self.emitter.push_comment("-");
+ self.emitter.push_comment("-");
+ self.emitter.push_comment("!");
+ self.state = State::Comment;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::Doctype => match self.read_char() {
+ Some(whitespace_pat!()) => {
+ self.state = State::BeforeDoctypeName;
+ ControlToken::Continue
+ }
+ c @ Some('>') => {
+ self.unread_char(c);
+ self.state = State::BeforeDoctypeName;
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInDoctype);
+ self.emitter.init_doctype();
+ self.emitter.set_force_quirks();
+ self.emitter.emit_current_doctype();
+ ControlToken::Eof
+ }
+ c @ Some(_) => {
+ self.emitter
+ .emit_error(Error::MissingWhitespaceBeforeDoctypeName);
+ self.unread_char(c);
+ self.state = State::BeforeDoctypeName;
+ ControlToken::Continue
+ }
+ },
+ State::BeforeDoctypeName => match self.read_char() {
+ Some(whitespace_pat!()) => ControlToken::Continue,
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.emitter.init_doctype();
+ self.emitter.push_doctype_name("\u{fffd}");
+ self.state = State::DoctypeName;
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.emitter.emit_error(Error::MissingDoctypeName);
+ self.emitter.init_doctype();
+ self.emitter.set_force_quirks();
+ self.state = State::Data;
+ self.emitter.emit_current_doctype();
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInDoctype);
+ self.emitter.init_doctype();
+ self.emitter.set_force_quirks();
+ self.emitter.emit_current_doctype();
+ ControlToken::Eof
+ }
+ Some(x) => {
+ self.emitter.init_doctype();
+ self.emitter
+ .push_doctype_name(ctostr!(x.to_ascii_lowercase()));
+ self.state = State::DoctypeName;
+ ControlToken::Continue
+ }
+ },
+ State::DoctypeName => match self.read_char() {
+ Some(whitespace_pat!()) => {
+ self.state = State::AfterDoctypeName;
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.state = State::Data;
+ self.emitter.emit_current_doctype();
+ ControlToken::Continue
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.emitter.push_doctype_name("\u{fffd}");
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInDoctype);
+ self.emitter.set_force_quirks();
+ self.emitter.emit_current_doctype();
+ ControlToken::Eof
+ }
+ Some(x) => {
+ self.emitter
+ .push_doctype_name(ctostr!(x.to_ascii_lowercase()));
+ ControlToken::Continue
+ }
+ },
+ State::AfterDoctypeName => match self.read_char() {
+ Some(whitespace_pat!()) => ControlToken::Continue,
+ Some('>') => {
+ self.state = State::Data;
+ self.emitter.emit_current_doctype();
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInDoctype);
+ self.emitter.set_force_quirks();
+ self.emitter.emit_current_doctype();
+ ControlToken::Eof
+ }
+ Some('p' | 'P') if self.try_read_string("ublic", false) => {
+ self.state = State::AfterDoctypePublicKeyword;
+ ControlToken::Continue
+ }
+ Some('s' | 'S') if self.try_read_string("ystem", false) => {
+ self.state = State::AfterDoctypeSystemKeyword;
+ ControlToken::Continue
+ }
+ c @ Some(_) => {
+ self.emitter
+ .emit_error(Error::InvalidCharacterSequenceAfterDoctypeName);
+ self.emitter.set_force_quirks();
+ self.unread_char(c);
+ self.state = State::BogusDoctype;
+ ControlToken::Continue
+ }
+ },
+ State::AfterDoctypePublicKeyword => match self.read_char() {
+ Some(whitespace_pat!()) => {
+ self.state = State::BeforeDoctypePublicIdentifier;
+ ControlToken::Continue
+ }
+ Some('"') => {
+ self.emitter
+ .emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword);
+ self.emitter.set_doctype_public_identifier("");
+ self.state = State::DoctypePublicIdentifierDoubleQuoted;
+ ControlToken::Continue
+ }
+ Some('\'') => {
+ self.emitter
+ .emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword);
+ self.emitter.set_doctype_public_identifier("");
+ self.state = State::DoctypePublicIdentifierSingleQuoted;
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.emitter
+ .emit_error(Error::MissingDoctypePublicIdentifier);
+ self.emitter.set_force_quirks();
+ self.state = State::Data;
+ self.emitter.emit_current_doctype();
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInDoctype);
+ self.emitter.set_force_quirks();
+ self.emitter.emit_current_doctype();
+ ControlToken::Eof
+ }
+ c @ Some(_) => {
+ self.emitter
+ .emit_error(Error::MissingQuoteBeforeDoctypePublicIdentifier);
+ self.emitter.set_force_quirks();
+ self.unread_char(c);
+ self.state = State::BogusDoctype;
+ ControlToken::Continue
+ }
+ },
+ State::BeforeDoctypePublicIdentifier => match self.read_char() {
+ Some(whitespace_pat!()) => ControlToken::Continue,
+ Some('"') => {
+ self.emitter.set_doctype_public_identifier("");
+ self.state = State::DoctypePublicIdentifierDoubleQuoted;
+ ControlToken::Continue
+ }
+ Some('\'') => {
+ self.emitter.set_doctype_public_identifier("");
+ self.state = State::DoctypePublicIdentifierSingleQuoted;
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.emitter
+ .emit_error(Error::MissingDoctypePublicIdentifier);
+ self.emitter.set_force_quirks();
+ self.state = State::Data;
+ self.emitter.emit_current_doctype();
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInDoctype);
+ self.emitter.set_force_quirks();
+ self.emitter.emit_current_doctype();
+ ControlToken::Eof
+ }
+ c @ Some(_) => {
+ self.emitter
+ .emit_error(Error::MissingQuoteBeforeDoctypePublicIdentifier);
+ self.emitter.set_force_quirks();
+ self.unread_char(c);
+ self.state = State::BogusDoctype;
+ ControlToken::Continue
+ }
+ },
+ State::DoctypePublicIdentifierDoubleQuoted => match self.read_char() {
+ Some('"') => {
+ self.state = State::AfterDoctypePublicIdentifier;
+ ControlToken::Continue
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.emitter.push_doctype_public_identifier("\u{fffd}");
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.emitter
+ .emit_error(Error::AbruptDoctypePublicIdentifier);
+ self.emitter.set_force_quirks();
+ self.state = State::Data;
+ self.emitter.emit_current_doctype();
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInDoctype);
+ self.emitter.set_force_quirks();
+ self.emitter.emit_current_doctype();
+ ControlToken::Eof
+ }
+ Some(x) => {
+ self.emitter.push_doctype_public_identifier(ctostr!(x));
+ ControlToken::Continue
+ }
+ },
+ State::DoctypePublicIdentifierSingleQuoted => match self.read_char() {
+ Some('\'') => {
+ self.state = State::AfterDoctypePublicIdentifier;
+ ControlToken::Continue
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.emitter.push_doctype_public_identifier("\u{fffd}");
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.emitter
+ .emit_error(Error::AbruptDoctypePublicIdentifier);
+ self.emitter.set_force_quirks();
+ self.state = State::Data;
+ self.emitter.emit_current_doctype();
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInDoctype);
+ self.emitter.set_force_quirks();
+ self.emitter.emit_current_doctype();
+ ControlToken::Eof
+ }
+ Some(x) => {
+ self.emitter.push_doctype_public_identifier(ctostr!(x));
+ ControlToken::Continue
+ }
+ },
+ State::AfterDoctypePublicIdentifier => match self.read_char() {
+ Some(whitespace_pat!()) => {
+ self.state = State::BetweenDoctypePublicAndSystemIdentifiers;
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.state = State::Data;
+ self.emitter.emit_current_doctype();
+ ControlToken::Continue
+ }
+ Some('"') => {
+ self.emitter.emit_error(
+ Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers,
+ );
+ self.emitter.set_doctype_system_identifier("");
+ self.state = State::DoctypeSystemIdentifierDoubleQuoted;
+ ControlToken::Continue
+ }
+ Some('\'') => {
+ self.emitter.emit_error(
+ Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers,
+ );
+ self.emitter.set_doctype_system_identifier("");
+ self.state = State::DoctypeSystemIdentifierSingleQuoted;
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInDoctype);
+ self.emitter.set_force_quirks();
+ self.emitter.emit_current_doctype();
+ ControlToken::Eof
+ }
+ c @ Some(_) => {
+ self.emitter
+ .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier);
+ self.emitter.set_force_quirks();
+ self.unread_char(c);
+ self.state = State::BogusDoctype;
+ ControlToken::Continue
+ }
+ },
+ State::BetweenDoctypePublicAndSystemIdentifiers => match self.read_char() {
+ Some(whitespace_pat!()) => ControlToken::Continue,
+ Some('>') => {
+ self.state = State::Data;
+ self.emitter.emit_current_doctype();
+ ControlToken::Continue
+ }
+ Some('"') => {
+ self.emitter.set_doctype_system_identifier("");
+ self.state = State::DoctypeSystemIdentifierDoubleQuoted;
+ ControlToken::Continue
+ }
+ Some('\'') => {
+ self.emitter.set_doctype_system_identifier("");
+ self.state = State::DoctypeSystemIdentifierSingleQuoted;
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInDoctype);
+ self.emitter.set_force_quirks();
+ self.emitter.emit_current_doctype();
+ ControlToken::Eof
+ }
+ c @ Some(_) => {
+ self.emitter
+ .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier);
+ self.emitter.set_force_quirks();
+ self.state = State::BogusDoctype;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::AfterDoctypeSystemKeyword => match self.read_char() {
+ Some(whitespace_pat!()) => {
+ self.state = State::BeforeDoctypeSystemIdentifier;
+ ControlToken::Continue
+ }
+ Some('"') => {
+ self.emitter
+ .emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword);
+ self.emitter.set_doctype_system_identifier("");
+ self.state = State::DoctypeSystemIdentifierDoubleQuoted;
+ ControlToken::Continue
+ }
+ Some('\'') => {
+ self.emitter
+ .emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword);
+ self.emitter.set_doctype_system_identifier("");
+ self.state = State::DoctypeSystemIdentifierSingleQuoted;
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.emitter
+ .emit_error(Error::MissingDoctypeSystemIdentifier);
+ self.emitter.set_force_quirks();
+ self.state = State::Data;
+ self.emitter.emit_current_doctype();
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInDoctype);
+ self.emitter.set_force_quirks();
+ self.emitter.emit_current_doctype();
+ ControlToken::Eof
+ }
+ c @ Some(_) => {
+ self.emitter
+ .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier);
+ self.emitter.set_force_quirks();
+ self.state = State::BogusDoctype;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::BeforeDoctypeSystemIdentifier => match self.read_char() {
+ Some(whitespace_pat!()) => ControlToken::Continue,
+ Some('"') => {
+ self.emitter.set_doctype_system_identifier("");
+ self.state = State::DoctypeSystemIdentifierDoubleQuoted;
+ ControlToken::Continue
+ }
+ Some('\'') => {
+ self.emitter.set_doctype_system_identifier("");
+ self.state = State::DoctypeSystemIdentifierSingleQuoted;
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.emitter
+ .emit_error(Error::MissingDoctypeSystemIdentifier);
+ self.emitter.set_force_quirks();
+ self.state = State::Data;
+ self.emitter.emit_current_doctype();
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInDoctype);
+ self.emitter.set_force_quirks();
+ self.emitter.emit_current_doctype();
+ ControlToken::Eof
+ }
+ c @ Some(_) => {
+ self.emitter
+ .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier);
+ self.emitter.set_force_quirks();
+ self.state = State::BogusDoctype;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::DoctypeSystemIdentifierDoubleQuoted => match self.read_char() {
+ Some('"') => {
+ self.state = State::AfterDoctypeSystemIdentifier;
+ ControlToken::Continue
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.emitter.push_doctype_system_identifier("\u{fffd}");
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.emitter
+ .emit_error(Error::AbruptDoctypeSystemIdentifier);
+ self.emitter.set_force_quirks();
+ self.state = State::Data;
+ self.emitter.emit_current_doctype();
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInDoctype);
+ self.emitter.set_force_quirks();
+ self.emitter.emit_current_doctype();
+ ControlToken::Eof
+ }
+ Some(x) => {
+ self.emitter.push_doctype_system_identifier(ctostr!(x));
+ ControlToken::Continue
+ }
+ },
+ State::DoctypeSystemIdentifierSingleQuoted => match self.read_char() {
+ Some('\'') => {
+ self.state = State::AfterDoctypeSystemIdentifier;
+ ControlToken::Continue
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ self.emitter.push_doctype_system_identifier("\u{fffd}");
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.emitter
+ .emit_error(Error::AbruptDoctypeSystemIdentifier);
+ self.emitter.set_force_quirks();
+ self.state = State::Data;
+ self.emitter.emit_current_doctype();
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInDoctype);
+ self.emitter.set_force_quirks();
+ self.emitter.emit_current_doctype();
+ ControlToken::Eof
+ }
+ Some(x) => {
+ self.emitter.push_doctype_system_identifier(ctostr!(x));
+ ControlToken::Continue
+ }
+ },
+ State::AfterDoctypeSystemIdentifier => match self.read_char() {
+ Some(whitespace_pat!()) => ControlToken::Continue,
+ Some('>') => {
+ self.state = State::Data;
+ self.emitter.emit_current_doctype();
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInDoctype);
+ self.emitter.set_force_quirks();
+ self.emitter.emit_current_doctype();
+ ControlToken::Eof
+ }
+ c @ Some(_) => {
+ self.emitter
+ .emit_error(Error::UnexpectedCharacterAfterDoctypeSystemIdentifier);
+ self.unread_char(c);
+ self.state = State::BogusDoctype;
+ ControlToken::Continue
+ }
+ },
+ State::BogusDoctype => match self.read_char() {
+ Some('>') => {
+ self.state = State::Data;
+ self.emitter.emit_current_doctype();
+ ControlToken::Continue
+ }
+ Some('\0') => {
+ self.emitter.emit_error(Error::UnexpectedNullCharacter);
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_current_doctype();
+ ControlToken::Eof
+ }
+ Some(_) => ControlToken::Continue,
+ },
+ State::CdataSection => match self.read_char() {
+ Some(']') => {
+ self.state = State::CdataSectionBracket;
+ ControlToken::Continue
+ }
+ None => {
+ self.emitter.emit_error(Error::EofInCdata);
+ ControlToken::Eof
+ }
+ Some(x) => {
+ self.emitter.emit_string(ctostr!(x));
+ ControlToken::Continue
+ }
+ },
+ State::CdataSectionBracket => match self.read_char() {
+ Some(']') => {
+ self.state = State::CdataSectionEnd;
+ ControlToken::Continue
+ }
+ c => {
+ self.emitter.emit_string("]");
+ self.state = State::CdataSection;
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ },
+ State::CdataSectionEnd => match self.read_char() {
+ Some(']') => {
+ self.emitter.emit_string("]");
+ ControlToken::Continue
+ }
+ Some('>') => {
+ self.state = State::Data;
+ ControlToken::Continue
+ }
+ c => {
+ self.emitter.emit_string("]]");
+ self.unread_char(c);
+ self.state = State::CdataSection;
+ ControlToken::Continue
+ }
+ },
+ State::CharacterReference => {
+ self.temporary_buffer.clear();
+ self.temporary_buffer.push('&');
+ match self.read_char() {
+ Some(x) if x.is_ascii_alphanumeric() => {
+ self.unread_char(Some(x));
+ self.state = State::NamedCharacterReference;
+ ControlToken::Continue
+ }
+ Some('#') => {
+ self.temporary_buffer.push('#');
+ self.state = State::NumericCharacterReference;
+ ControlToken::Continue
+ }
+ c => {
+ self.flush_code_points_consumed_as_character_reference();
+ self.state = self.return_state.take().unwrap();
+ self.unread_char(c);
+ ControlToken::Continue
+ }
+ }
+ }
+ State::NamedCharacterReference => {
+ let c = self.read_char();
+
+ let char_ref = c.and_then(|x| {
+ Some((
+ x,
+ entities::try_read_character_reference(x, |x| {
+ self.try_read_string(x, true)
+ })?,
+ ))
+ });
+
+ if let Some((x, char_ref)) = char_ref {
+ self.temporary_buffer.push(x);
+ self.temporary_buffer.push_str(char_ref.name);
+ let char_ref_name_last_character = char_ref.name.chars().last();
+ let next_character = self.next_input_character();
+ if self.is_consumed_as_part_of_an_attribute()
+ && char_ref_name_last_character != Some(';')
+ && matches!(next_character, Some(x) if x == '=' || x.is_ascii_alphanumeric())
+ {
+ self.flush_code_points_consumed_as_character_reference();
+ self.state = self.return_state.take().unwrap();
+ ControlToken::Continue
+ } else {
+ if char_ref_name_last_character != Some(';') {
+ self.emitter
+ .emit_error(Error::MissingSemicolonAfterCharacterReference);
+ }
+
+ self.temporary_buffer.clear();
+ self.temporary_buffer.push_str(char_ref.characters);
+ self.flush_code_points_consumed_as_character_reference();
+ self.state = self.return_state.take().unwrap();
+ ControlToken::Continue
+ }
+ } else {
+ self.unread_char(c);
+ self.flush_code_points_consumed_as_character_reference();
+ self.state = State::AmbiguousAmpersand;
+ ControlToken::Continue
+ }
+ }
+ State::AmbiguousAmpersand => match self.read_char() {
+ Some(x) if x.is_ascii_alphanumeric() => {
+ if self.is_consumed_as_part_of_an_attribute() {
+ self.emitter.push_attribute_value(ctostr!(x));
+ } else {
+ self.emitter.emit_string(ctostr!(x));
+ }
+
+ ControlToken::Continue
+ }
+ c @ Some(';') => {
+ self.emitter
+ .emit_error(Error::UnknownNamedCharacterReference);
+ self.unread_char(c);
+ self.state = self.return_state.take().unwrap();
+ ControlToken::Continue
+ }
+ c => {
+ self.unread_char(c);
+ self.state = self.return_state.take().unwrap();
+ ControlToken::Continue
+ }
+ },
+ State::NumericCharacterReference => {
+ self.character_reference_code = 0;
+ match self.read_char() {
+ Some(x @ 'x' | x @ 'X') => {
+ self.temporary_buffer.push(x);
+ self.state = State::HexadecimalCharacterReferenceStart;
+ ControlToken::Continue
+ }
+ c => {
+ self.unread_char(c);
+ self.state = State::DecimalCharacterReferenceStart;
+ ControlToken::Continue
+ }
+ }
+ }
+ State::HexadecimalCharacterReferenceStart => match self.read_char() {
+ c @ Some('0'..='9' | 'A'..='F' | 'a'..='f') => {
+ self.unread_char(c);
+ self.state = State::HexadecimalCharacterReference;
+ ControlToken::Continue
+ }
+ c => {
+ self.emitter
+ .emit_error(Error::AbsenceOfDigitsInNumericCharacterReference);
+ self.flush_code_points_consumed_as_character_reference();
+ self.unread_char(c);
+ self.state = self.return_state.take().unwrap();
+ ControlToken::Continue
+ }
+ },
+ State::DecimalCharacterReferenceStart => match self.read_char() {
+ Some(x @ ascii_digit_pat!()) => {
+ self.unread_char(Some(x));
+ self.state = State::DecimalCharacterReference;
+ ControlToken::Continue
+ }
+ c => {
+ self.emitter
+ .emit_error(Error::AbsenceOfDigitsInNumericCharacterReference);
+ self.flush_code_points_consumed_as_character_reference();
+ self.unread_char(c);
+ self.state = self.return_state.take().unwrap();
+ ControlToken::Continue
+ }
+ },
+ State::HexadecimalCharacterReference => match self.read_char() {
+ Some(x @ ascii_digit_pat!()) => {
+ mutate_character_reference!(*16 + x - 0x0030);
+ ControlToken::Continue
+ }
+ Some(x @ 'A'..='F') => {
+ mutate_character_reference!(*16 + x - 0x0037);
+ ControlToken::Continue
+ }
+ Some(x @ 'a'..='f') => {
+ mutate_character_reference!(*16 + x - 0x0057);
+ ControlToken::Continue
+ }
+ Some(';') => {
+ self.state = State::NumericCharacterReferenceEnd;
+ ControlToken::Continue
+ }
+ c => {
+ self.emitter
+ .emit_error(Error::MissingSemicolonAfterCharacterReference);
+ self.unread_char(c);
+ self.state = State::NumericCharacterReferenceEnd;
+ ControlToken::Continue
+ }
+ },
+ State::DecimalCharacterReference => match self.read_char() {
+ Some(x @ ascii_digit_pat!()) => {
+ mutate_character_reference!(*10 + x - 0x0030);
+ ControlToken::Continue
+ }
+ Some(';') => {
+ self.state = State::NumericCharacterReferenceEnd;
+ ControlToken::Continue
+ }
+ c => {
+ self.emitter
+ .emit_error(Error::MissingSemicolonAfterCharacterReference);
+ self.unread_char(c);
+ self.state = State::NumericCharacterReferenceEnd;
+ ControlToken::Continue
+ }
+ },
+ State::NumericCharacterReferenceEnd => {
+ match self.character_reference_code {
+ 0x00 => {
+ self.emitter.emit_error(Error::NullCharacterReference);
+ self.character_reference_code = 0xfffd;
+ }
+ 0x110000.. => {
+ self.emitter
+ .emit_error(Error::CharacterReferenceOutsideUnicodeRange);
+ self.character_reference_code = 0xfffd;
+ }
+ surrogate_pat!() => {
+ self.emitter.emit_error(Error::SurrogateCharacterReference);
+ self.character_reference_code = 0xfffd;
+ }
+ // noncharacter
+ noncharacter_pat!() => {
+ self.emitter
+ .emit_error(Error::NoncharacterCharacterReference);
+ }
+ // 0x000d, or a control that is not whitespace
+ x @ 0x000d | x @ control_pat!()
+ if !matches!(x, 0x0009 | 0x000a | 0x000c | 0x0020) =>
+ {
+ self.emitter.emit_error(Error::ControlCharacterReference);
+ self.character_reference_code = match x {
+ 0x80 => 0x20AC, // EURO SIGN (€)
+ 0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK (‚)
+ 0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK (ƒ)
+ 0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK („)
+ 0x85 => 0x2026, // HORIZONTAL ELLIPSIS (…)
+ 0x86 => 0x2020, // DAGGER (†)
+ 0x87 => 0x2021, // DOUBLE DAGGER (‡)
+ 0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
+ 0x89 => 0x2030, // PER MILLE SIGN (‰)
+ 0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON (Š)
+ 0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
+ 0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE (Œ)
+ 0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON (Ž)
+ 0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK (‘)
+ 0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK (’)
+ 0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK (“)
+ 0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK (”)
+ 0x95 => 0x2022, // BULLET (•)
+ 0x96 => 0x2013, // EN DASH (–)
+ 0x97 => 0x2014, // EM DASH (—)
+ 0x98 => 0x02DC, // SMALL TILDE (˜)
+ 0x99 => 0x2122, // TRADE MARK SIGN (™)
+ 0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON (š)
+ 0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
+ 0x9C => 0x0153, // LATIN SMALL LIGATURE OE (œ)
+ 0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON (ž)
+ 0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
+ _ => self.character_reference_code,
+ };
+ }
+ _ => (),
+ }
+
+ self.temporary_buffer.clear();
+ self.temporary_buffer
+ .push(std::char::from_u32(self.character_reference_code).unwrap());
+ self.flush_code_points_consumed_as_character_reference();
+ self.state = self.return_state.take().unwrap();
+ ControlToken::Continue
+ }
+ }
+ }
+}
+
+impl<R: Reader, E: Emitter> Iterator for Tokenizer<R, E> {
+ type Item = E::Token;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ loop {
+ if let Some(token) = self.emitter.pop_token() {
+ break Some(token);
+ } else if !self.eof {
+ match self.consume() {
+ ControlToken::Continue => (),
+ ControlToken::Eof => {
+ self.eof = true;
+ self.emitter.emit_eof();
+ }
+ }
+ } else {
+ break None;
+ }
+ }
+ }
+}