aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarkus Unterwaditzer <markus-honeypot@unterwaditzer.net>2021-11-27 01:08:48 +0100
committerMarkus Unterwaditzer <markus-honeypot@unterwaditzer.net>2021-11-27 01:10:09 +0100
commit96808d0d940e1580cf86e433d0c844e943157e0d (patch)
tree4e4cabd0431dc2b465fad1b68ae54c27358dcd4e
parent8c6e4ccbb3fc6830239429c994468bc1ccd6832a (diff)
split up match-arms and tokenizer to isolate some tokenizer-internal state
purpose: don't want to expose self.to_reconsume to the consume() method
-rw-r--r--src/lib.rs2173
-rw-r--r--src/machine.rs2062
-rw-r--r--src/tokenizer.rs244
-rw-r--r--src/utils.rs164
4 files changed, 2324 insertions, 2319 deletions
diff --git a/src/lib.rs b/src/lib.rs
index 7722197..f3e0431 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,2179 +9,14 @@ mod error;
mod machine;
mod never;
mod reader;
+mod tokenizer;
+mod utils;
#[cfg(feature = "integration-tests")]
-pub use machine::State;
-#[cfg(not(feature = "integration-tests"))]
-use machine::State;
-
-use machine::{
- ascii_digit_pat, control_pat, noncharacter_pat, surrogate_pat, whitespace_pat, ControlToken,
-};
+pub use utils::State;
pub use emitter::{DefaultEmitter, Doctype, Emitter, EndTag, StartTag, Token};
pub use error::Error;
pub use never::Never;
pub use reader::{BufReadReader, Readable, Reader, StringReader};
-
-macro_rules! ctostr {
- ($c:expr) => {
- &*$c.encode_utf8(&mut [0; 4])
- };
-}
-
-// this is a stack that can hold 0 to 2 Ts
-#[derive(Debug, Default)]
-struct Stack2<T: Copy>(Option<(T, Option<T>)>);
-
-impl<T: Copy> Stack2<T> {
- #[inline]
- fn push(&mut self, c: T) {
- self.0 = match self.0 {
- None => Some((c, None)),
- Some((c1, None)) => Some((c1, Some(c))),
- Some((_c1, Some(_c2))) => panic!("stack full!"),
- }
- }
-
- #[inline]
- fn pop(&mut self) -> Option<T> {
- let (new_self, rv) = match self.0 {
- Some((c1, Some(c2))) => (Some((c1, None)), Some(c2)),
- Some((c1, None)) => (None, Some(c1)),
- None => (None, None),
- };
- self.0 = new_self;
- rv
- }
-
- #[inline]
- fn is_empty(&self) -> bool {
- matches!(self.0, None)
- }
-}
-
-/// A HTML tokenizer. See crate-level docs for basic usage.
-pub struct Tokenizer<R: Reader, E: Emitter = DefaultEmitter> {
- eof: bool,
- state: State,
- emitter: E,
- temporary_buffer: String,
- reader: R,
- to_reconsume: Stack2<Option<char>>,
- character_reference_code: u32,
- return_state: Option<State>,
-}
-
-impl<R: Reader> Tokenizer<R> {
- /// Create a new tokenizer from some input.
- ///
- /// `input` can be `&String` or `&str` at the moment, as those are the types for which
- /// [`crate::Readable`] is implemented, but you can implement that trait on your own types.
- ///
- /// Patches are welcome for providing an efficient implementation over async streams,
- /// iterators, files, etc, as long as any dependencies come behind featureflags.
- pub fn new<'a, S: Readable<'a, Reader = R>>(input: S) -> Self {
- Tokenizer::<S::Reader>::new_with_emitter(input, DefaultEmitter::default())
- }
-}
-
-impl<R: Reader, E: Emitter> Tokenizer<R, E> {
- /// Construct a new tokenizer from some input and a custom emitter.
- ///
- /// Use this method over [`Tokenizer::new`] when you want to have more control over string allocation for
- /// tokens.
- pub fn new_with_emitter<'a, S: Readable<'a, Reader = R>>(input: S, emitter: E) -> Self {
- Tokenizer {
- eof: false,
- state: State::Data,
- emitter,
- temporary_buffer: String::new(),
- to_reconsume: Stack2::default(),
- reader: input.to_reader(),
- character_reference_code: 0,
- return_state: None,
- }
- }
-
- #[cfg(feature = "integration-tests")]
- /// Test-internal function to override internal state.
- ///
- /// Only available with the `integration-tests` feature which is not public API.
- pub fn set_state(&mut self, state: State) {
- self.state = state;
- }
-
- /// Set the statemachine to start/continue in [plaintext
- /// state](https://html.spec.whatwg.org/#plaintext-state).
- ///
- /// This tokenizer never gets into that state naturally.
- pub fn set_plaintext_state(&mut self) {
- self.state = State::PlainText;
- }
-
- #[cfg(feature = "integration-tests")]
- /// Test-internal function to override internal state.
- ///
- /// Only available with the `integration-tests` feature which is not public API.
- pub fn set_last_start_tag(&mut self, last_start_tag: Option<&str>) {
- self.emitter.set_last_start_tag(last_start_tag);
- }
-
- #[inline]
- fn unread_char(&mut self, c: Option<char>) {
- self.to_reconsume.push(c);
- }
-
- #[inline]
- fn validate_char(&mut self, c: char) {
- match c as u32 {
- surrogate_pat!() => {
- self.emitter.emit_error(Error::SurrogateInInputStream);
- }
- noncharacter_pat!() => {
- self.emitter.emit_error(Error::NoncharacterInInputStream);
- }
- // control without whitespace or nul
- x @ control_pat!()
- if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) =>
- {
- self.emitter
- .emit_error(Error::ControlCharacterInInputStream);
- }
- _ => (),
- }
- }
-
- fn read_char(&mut self) -> Result<Option<char>, R::Error> {
- let (c_res, reconsumed) = match self.to_reconsume.pop() {
- Some(c) => (Ok(c), true),
- None => (self.reader.read_char(), false),
- };
-
- let mut c = match c_res {
- Ok(Some(c)) => c,
- res => return res,
- };
-
- if c == '\r' {
- c = '\n';
- let c2 = self.reader.read_char()?;
- if c2 != Some('\n') {
- self.unread_char(c2);
- }
- }
-
- if !reconsumed {
- self.validate_char(c);
- }
-
- Ok(Some(c))
- }
-
- #[inline]
- fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result<bool, R::Error> {
- debug_assert!(!s.is_empty());
- debug_assert!(self.to_reconsume.is_empty());
- self.reader.try_read_string(s, case_sensitive)
- }
-
- fn is_consumed_as_part_of_an_attribute(&self) -> bool {
- matches!(
- self.return_state,
- Some(
- State::AttributeValueDoubleQuoted
- | State::AttributeValueSingleQuoted
- | State::AttributeValueUnquoted
- )
- )
- }
-
- fn flush_code_points_consumed_as_character_reference(&mut self) {
- if self.is_consumed_as_part_of_an_attribute() {
- self.emitter.push_attribute_value(&self.temporary_buffer);
- self.temporary_buffer.clear();
- } else {
- self.flush_buffer_characters();
- }
- }
-
- fn next_input_character(&mut self) -> Result<Option<char>, R::Error> {
- let rv = self.read_char()?;
- self.unread_char(rv);
- Ok(rv)
- }
-
- fn flush_buffer_characters(&mut self) {
- self.emitter.emit_string(&self.temporary_buffer);
- self.temporary_buffer.clear();
- }
-
- fn consume(&mut self) -> Result<ControlToken, R::Error> {
- macro_rules! mutate_character_reference {
- (* $mul:literal + $x:ident - $sub:literal) => {
- match self
- .character_reference_code
- .checked_mul($mul)
- .and_then(|cr| cr.checked_add($x as u32 - $sub))
- {
- Some(cr) => self.character_reference_code = cr,
- None => {
- // provoke err
- self.character_reference_code = 0x110000;
- }
- };
- };
- }
-
- match self.state {
- State::Data => match self.read_char()? {
- Some('&') => {
- self.return_state = Some(self.state);
- self.state = State::CharacterReference;
- Ok(ControlToken::Continue)
- }
- Some('<') => {
- self.state = State::TagOpen;
- Ok(ControlToken::Continue)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.emitter.emit_string("\0");
- Ok(ControlToken::Continue)
- }
- Some(x) => {
- self.emitter.emit_string(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- None => Ok(ControlToken::Eof),
- },
- State::RcData => match self.read_char()? {
- Some('&') => {
- self.return_state = Some(State::RcData);
- self.state = State::CharacterReference;
- Ok(ControlToken::Continue)
- }
- Some('<') => {
- self.state = State::RcDataLessThanSign;
- Ok(ControlToken::Continue)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.emitter.emit_string("\u{fffd}");
- Ok(ControlToken::Continue)
- }
- Some(x) => {
- self.emitter.emit_string(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- None => Ok(ControlToken::Eof),
- },
- State::RawText => match self.read_char()? {
- Some('<') => {
- self.state = State::RawTextLessThanSign;
- Ok(ControlToken::Continue)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.emitter.emit_string("\u{fffd}");
- Ok(ControlToken::Continue)
- }
- Some(x) => {
- self.emitter.emit_string(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- None => Ok(ControlToken::Eof),
- },
- State::ScriptData => match self.read_char()? {
- Some('<') => {
- self.state = State::ScriptDataLessThanSign;
- Ok(ControlToken::Continue)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.emitter.emit_string("\u{fffd}");
- Ok(ControlToken::Continue)
- }
- Some(x) => {
- self.emitter.emit_string(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- None => Ok(ControlToken::Eof),
- },
- State::PlainText => match self.read_char()? {
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.emitter.emit_string("\u{fffd}");
- Ok(ControlToken::Continue)
- }
- Some(x) => {
- self.emitter.emit_string(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- None => Ok(ControlToken::Eof),
- },
- State::TagOpen => match self.read_char()? {
- Some('!') => {
- self.state = State::MarkupDeclarationOpen;
- Ok(ControlToken::Continue)
- }
- Some('/') => {
- self.state = State::EndTagOpen;
- Ok(ControlToken::Continue)
- }
- Some(x) if x.is_ascii_alphabetic() => {
- self.emitter.init_start_tag();
- self.state = State::TagName;
- self.unread_char(Some(x));
- Ok(ControlToken::Continue)
- }
- c @ Some('?') => {
- self.emitter
- .emit_error(Error::UnexpectedQuestionMarkInsteadOfTagName);
- self.emitter.init_comment();
- self.state = State::BogusComment;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofBeforeTagName);
- self.emitter.emit_string("<");
- Ok(ControlToken::Eof)
- }
- c @ Some(_) => {
- self.emitter
- .emit_error(Error::InvalidFirstCharacterOfTagName);
- self.state = State::Data;
- self.emitter.emit_string("<");
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::EndTagOpen => match self.read_char()? {
- Some(x) if x.is_ascii_alphabetic() => {
- self.emitter.init_end_tag();
- self.state = State::TagName;
- self.unread_char(Some(x));
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.emitter.emit_error(Error::MissingEndTagName);
- self.state = State::Data;
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofBeforeTagName);
- self.emitter.emit_string("</");
- Ok(ControlToken::Eof)
- }
- Some(x) => {
- self.emitter
- .emit_error(Error::InvalidFirstCharacterOfTagName);
- self.emitter.init_comment();
- self.state = State::BogusComment;
- self.unread_char(Some(x));
- Ok(ControlToken::Continue)
- }
- },
- State::TagName => match self.read_char()? {
- Some(whitespace_pat!()) => {
- self.state = State::BeforeAttributeName;
- Ok(ControlToken::Continue)
- }
- Some('/') => {
- self.state = State::SelfClosingStartTag;
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.state = State::Data;
- self.emitter.emit_current_tag();
- Ok(ControlToken::Continue)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.emitter.push_tag_name("\u{fffd}");
- Ok(ControlToken::Continue)
- }
- Some(x) => {
- self.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase()));
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInTag);
- Ok(ControlToken::Eof)
- }
- },
- State::RcDataLessThanSign => match self.read_char()? {
- Some('/') => {
- self.temporary_buffer.clear();
- self.state = State::RcDataEndTagOpen;
- Ok(ControlToken::Continue)
- }
- c => {
- self.emitter.emit_string("<");
- self.state = State::RcData;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::RcDataEndTagOpen => match self.read_char()? {
- Some(x) if x.is_ascii_alphabetic() => {
- self.emitter.init_end_tag();
- self.state = State::RcDataEndTagName;
- self.unread_char(Some(x));
- Ok(ControlToken::Continue)
- }
- c => {
- self.emitter.emit_string("</");
- self.state = State::RcData;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::RcDataEndTagName => match self.read_char()? {
- Some(whitespace_pat!()) if self.emitter.current_is_appropriate_end_tag_token() => {
- self.state = State::BeforeAttributeName;
- Ok(ControlToken::Continue)
- }
- Some('/') if self.emitter.current_is_appropriate_end_tag_token() => {
- self.state = State::SelfClosingStartTag;
- Ok(ControlToken::Continue)
- }
- Some('>') if self.emitter.current_is_appropriate_end_tag_token() => {
- self.state = State::Data;
- self.emitter.emit_current_tag();
- Ok(ControlToken::Continue)
- }
- Some(x) if x.is_ascii_alphabetic() => {
- self.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase()));
- self.temporary_buffer.push(x);
- Ok(ControlToken::Continue)
- }
- c => {
- self.emitter.emit_string("</");
- self.flush_buffer_characters();
-
- self.state = State::RcData;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::RawTextLessThanSign => match self.read_char()? {
- Some('/') => {
- self.temporary_buffer.clear();
- self.state = State::RawTextEndTagOpen;
- Ok(ControlToken::Continue)
- }
- c => {
- self.emitter.emit_string("<");
- self.state = State::RawText;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::RawTextEndTagOpen => match self.read_char()? {
- Some(x) if x.is_ascii_alphabetic() => {
- self.emitter.init_end_tag();
- self.state = State::RawTextEndTagName;
- self.unread_char(Some(x));
- Ok(ControlToken::Continue)
- }
- c => {
- self.emitter.emit_string("</");
- self.state = State::RawText;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::RawTextEndTagName => match self.read_char()? {
- Some(whitespace_pat!()) if self.emitter.current_is_appropriate_end_tag_token() => {
- self.state = State::BeforeAttributeName;
- Ok(ControlToken::Continue)
- }
- Some('/') if self.emitter.current_is_appropriate_end_tag_token() => {
- self.state = State::SelfClosingStartTag;
- Ok(ControlToken::Continue)
- }
- Some('>') if self.emitter.current_is_appropriate_end_tag_token() => {
- self.state = State::Data;
- self.emitter.emit_current_tag();
- Ok(ControlToken::Continue)
- }
- Some(x) if x.is_ascii_alphabetic() => {
- self.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase()));
- self.temporary_buffer.push(x);
- Ok(ControlToken::Continue)
- }
- c => {
- self.emitter.emit_string("</");
- self.flush_buffer_characters();
-
- self.state = State::RawText;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::ScriptDataLessThanSign => match self.read_char()? {
- Some('/') => {
- self.temporary_buffer.clear();
- self.state = State::ScriptDataEndTagOpen;
- Ok(ControlToken::Continue)
- }
- Some('!') => {
- self.state = State::ScriptDataEscapeStart;
- self.emitter.emit_string("<!");
- Ok(ControlToken::Continue)
- }
- c => {
- self.emitter.emit_string("<");
- self.state = State::Data;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::ScriptDataEndTagOpen => match self.read_char()? {
- Some(x) if x.is_ascii_alphabetic() => {
- self.emitter.init_end_tag();
- self.state = State::ScriptDataEndTagName;
- self.unread_char(Some(x));
- Ok(ControlToken::Continue)
- }
- c => {
- self.emitter.emit_string("</");
- self.state = State::ScriptData;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::ScriptDataEndTagName => match self.read_char()? {
- Some(whitespace_pat!()) if self.emitter.current_is_appropriate_end_tag_token() => {
- self.state = State::BeforeAttributeName;
- Ok(ControlToken::Continue)
- }
- Some('/') if self.emitter.current_is_appropriate_end_tag_token() => {
- self.state = State::SelfClosingStartTag;
- Ok(ControlToken::Continue)
- }
- Some('>') if self.emitter.current_is_appropriate_end_tag_token() => {
- self.state = State::Data;
- self.emitter.emit_current_tag();
- Ok(ControlToken::Continue)
- }
- Some(x) if x.is_ascii_alphabetic() => {
- self.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase()));
- self.temporary_buffer.push(x.to_ascii_lowercase());
- Ok(ControlToken::Continue)
- }
- c => {
- self.emitter.emit_string("</");
- self.flush_buffer_characters();
- self.state = State::Data;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::ScriptDataEscapeStart => match self.read_char()? {
- Some('-') => {
- self.state = State::ScriptDataEscapeStartDash;
- self.emitter.emit_string("-");
- Ok(ControlToken::Continue)
- }
- c => {
- self.state = State::ScriptData;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::ScriptDataEscapeStartDash => match self.read_char()? {
- Some('-') => {
- self.state = State::ScriptDataEscapedDashDash;
- self.emitter.emit_string("-");
- Ok(ControlToken::Continue)
- }
- c => {
- self.state = State::ScriptData;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::ScriptDataEscaped => match self.read_char()? {
- Some('-') => {
- self.state = State::ScriptDataEscapedDash;
- self.emitter.emit_string("-");
- Ok(ControlToken::Continue)
- }
- Some('<') => {
- self.state = State::ScriptDataEscapedLessThanSign;
- Ok(ControlToken::Continue)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.emitter.emit_string("\u{fffd}");
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter
- .emit_error(Error::EofInScriptHtmlCommentLikeText);
- Ok(ControlToken::Eof)
- }
- Some(x) => {
- self.emitter.emit_string(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- },
- State::ScriptDataEscapedDash => match self.read_char()? {
- Some('-') => {
- self.state = State::ScriptDataEscapedDashDash;
- self.emitter.emit_string("-");
- Ok(ControlToken::Continue)
- }
- Some('<') => {
- self.state = State::ScriptDataEscapedLessThanSign;
- Ok(ControlToken::Continue)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.state = State::ScriptDataEscaped;
- self.emitter.emit_string("\u{fffd}");
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter
- .emit_error(Error::EofInScriptHtmlCommentLikeText);
- Ok(ControlToken::Eof)
- }
- Some(x) => {
- self.state = State::ScriptDataEscaped;
- self.emitter.emit_string(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- },
- State::ScriptDataEscapedDashDash => match self.read_char()? {
- Some('-') => {
- self.emitter.emit_string("-");
- Ok(ControlToken::Continue)
- }
- Some('<') => {
- self.state = State::ScriptDataEscapedLessThanSign;
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.state = State::ScriptData;
- self.emitter.emit_string(">");
- Ok(ControlToken::Continue)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.state = State::ScriptDataEscaped;
- self.emitter.emit_string("\u{fffd}");
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter
- .emit_error(Error::EofInScriptHtmlCommentLikeText);
- Ok(ControlToken::Eof)
- }
- Some(x) => {
- self.state = State::ScriptDataEscaped;
- self.emitter.emit_string(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- },
- State::ScriptDataEscapedLessThanSign => match self.read_char()? {
- Some('/') => {
- self.temporary_buffer.clear();
- self.state = State::ScriptDataEscapedEndTagOpen;
- Ok(ControlToken::Continue)
- }
- Some(x) if x.is_ascii_alphabetic() => {
- self.temporary_buffer.clear();
- self.emitter.emit_string("<");
- self.state = State::ScriptDataDoubleEscapeStart;
- self.unread_char(Some(x));
- Ok(ControlToken::Continue)
- }
- c => {
- self.emitter.emit_string("<");
- self.state = State::ScriptDataEscaped;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::ScriptDataEscapedEndTagOpen => match self.read_char()? {
- Some(x) if x.is_ascii_alphabetic() => {
- self.emitter.init_end_tag();
- self.state = State::ScriptDataEscapedEndTagName;
- self.unread_char(Some(x));
- Ok(ControlToken::Continue)
- }
- c => {
- self.emitter.emit_string("</");
- self.unread_char(c);
- self.state = State::ScriptDataEscaped;
- Ok(ControlToken::Continue)
- }
- },
- State::ScriptDataEscapedEndTagName => match self.read_char()? {
- Some(whitespace_pat!()) if self.emitter.current_is_appropriate_end_tag_token() => {
- self.state = State::BeforeAttributeName;
- Ok(ControlToken::Continue)
- }
- Some('/') if self.emitter.current_is_appropriate_end_tag_token() => {
- self.state = State::SelfClosingStartTag;
- Ok(ControlToken::Continue)
- }
- Some('>') if self.emitter.current_is_appropriate_end_tag_token() => {
- self.state = State::Data;
- self.emitter.emit_current_tag();
- Ok(ControlToken::Continue)
- }
- Some(x) if x.is_ascii_alphabetic() => {
- self.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase()));
- self.temporary_buffer.push(x);
- Ok(ControlToken::Continue)
- }
- c => {
- self.emitter.emit_string("</");
- self.flush_buffer_characters();
- self.state = State::ScriptDataEscaped;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::ScriptDataDoubleEscapeStart => match self.read_char()? {
- Some(x @ whitespace_pat!() | x @ '/' | x @ '>') => {
- if self.temporary_buffer == "script" {
- self.state = State::ScriptDataDoubleEscaped;
- } else {
- self.state = State::ScriptDataEscaped;
- }
- self.emitter.emit_string(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- Some(x) if x.is_ascii_alphabetic() => {
- self.temporary_buffer.push(x.to_ascii_lowercase());
- self.emitter.emit_string(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- c => {
- self.state = State::ScriptDataEscaped;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::ScriptDataDoubleEscaped => match self.read_char()? {
- Some('-') => {
- self.state = State::ScriptDataDoubleEscapedDash;
- self.emitter.emit_string("-");
- Ok(ControlToken::Continue)
- }
- Some('<') => {
- self.state = State::ScriptDataDoubleEscapedLessThanSign;
- self.emitter.emit_string("<");
- Ok(ControlToken::Continue)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.emitter.emit_string("\u{fffd}");
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter
- .emit_error(Error::EofInScriptHtmlCommentLikeText);
- Ok(ControlToken::Eof)
- }
- Some(x) => {
- self.emitter.emit_string(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- },
- State::ScriptDataDoubleEscapedDash => match self.read_char()? {
- Some('-') => {
- self.state = State::ScriptDataDoubleEscapedDashDash;
- self.emitter.emit_string("-");
- Ok(ControlToken::Continue)
- }
- Some('<') => {
- self.state = State::ScriptDataDoubleEscapedLessThanSign;
- self.emitter.emit_string("<");
- Ok(ControlToken::Continue)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.state = State::ScriptDataDoubleEscaped;
- self.emitter.emit_string("\u{fffd}");
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter
- .emit_error(Error::EofInScriptHtmlCommentLikeText);
- Ok(ControlToken::Eof)
- }
- Some(x) => {
- self.state = State::ScriptDataDoubleEscaped;
- self.emitter.emit_string(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- },
- State::ScriptDataDoubleEscapedDashDash => match self.read_char()? {
- Some('-') => {
- self.emitter.emit_string("-");
- Ok(ControlToken::Continue)
- }
- Some('<') => {
- self.emitter.emit_string("<");
- self.state = State::ScriptDataDoubleEscapedLessThanSign;
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.emitter.emit_string(">");
- self.state = State::ScriptData;
- Ok(ControlToken::Continue)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.state = State::ScriptDataDoubleEscaped;
- self.emitter.emit_string("\u{fffd}");
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter
- .emit_error(Error::EofInScriptHtmlCommentLikeText);
- Ok(ControlToken::Eof)
- }
- Some(x) => {
- self.state = State::ScriptDataDoubleEscaped;
- self.emitter.emit_string(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- },
- State::ScriptDataDoubleEscapedLessThanSign => match self.read_char()? {
- Some('/') => {
- self.temporary_buffer.clear();
- self.state = State::ScriptDataDoubleEscapeEnd;
- self.emitter.emit_string("/");
- Ok(ControlToken::Continue)
- }
- c => {
- self.state = State::ScriptDataDoubleEscaped;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::ScriptDataDoubleEscapeEnd => match self.read_char()? {
- Some(x @ whitespace_pat!() | x @ '/' | x @ '>') => {
- if self.temporary_buffer == "script" {
- self.state = State::ScriptDataEscaped;
- } else {
- self.state = State::ScriptDataDoubleEscaped;
- }
-
- self.emitter.emit_string(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- Some(x) if x.is_ascii_alphabetic() => {
- self.temporary_buffer.push(x.to_ascii_lowercase());
- self.emitter.emit_string(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- c => {
- self.state = State::ScriptDataDoubleEscaped;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::BeforeAttributeName => match self.read_char()? {
- Some(whitespace_pat!()) => Ok(ControlToken::Continue),
- c @ Some('/' | '>') | c @ None => {
- self.state = State::AfterAttributeName;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- Some('=') => {
- self.emitter
- .emit_error(Error::UnexpectedEqualsSignBeforeAttributeName);
- self.emitter.init_attribute();
- self.emitter.push_attribute_name("=");
- self.state = State::AttributeName;
- Ok(ControlToken::Continue)
- }
- Some(x) => {
- self.emitter.init_attribute();
- self.state = State::AttributeName;
- self.unread_char(Some(x));
- Ok(ControlToken::Continue)
- }
- },
- State::AttributeName => match self.read_char()? {
- c @ Some(whitespace_pat!() | '/' | '>') | c @ None => {
- self.state = State::AfterAttributeName;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- Some('=') => {
- self.state = State::BeforeAttributeValue;
- Ok(ControlToken::Continue)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.emitter.push_attribute_name("\u{fffd}");
- Ok(ControlToken::Continue)
- }
- Some(x @ '"' | x @ '\'' | x @ '<') => {
- self.emitter
- .emit_error(Error::UnexpectedCharacterInAttributeName);
- self.emitter
- .push_attribute_name(ctostr!(x.to_ascii_lowercase()));
- Ok(ControlToken::Continue)
- }
- Some(x) => {
- self.emitter
- .push_attribute_name(ctostr!(x.to_ascii_lowercase()));
- Ok(ControlToken::Continue)
- }
- },
- State::AfterAttributeName => match self.read_char()? {
- Some(whitespace_pat!()) => Ok(ControlToken::Continue),
- Some('/') => {
- self.state = State::SelfClosingStartTag;
- Ok(ControlToken::Continue)
- }
- Some('=') => {
- self.state = State::BeforeAttributeValue;
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.state = State::Data;
- self.emitter.emit_current_tag();
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInTag);
- Ok(ControlToken::Eof)
- }
- Some(x) => {
- self.emitter.init_attribute();
- self.state = State::AttributeName;
- self.unread_char(Some(x));
- Ok(ControlToken::Continue)
- }
- },
- State::BeforeAttributeValue => match self.read_char()? {
- Some(whitespace_pat!()) => Ok(ControlToken::Continue),
- Some('"') => {
- self.state = State::AttributeValueDoubleQuoted;
- Ok(ControlToken::Continue)
- }
- Some('\'') => {
- self.state = State::AttributeValueSingleQuoted;
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.emitter.emit_error(Error::MissingAttributeValue);
- self.state = State::Data;
- self.emitter.emit_current_tag();
- Ok(ControlToken::Continue)
- }
- c => {
- self.state = State::AttributeValueUnquoted;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::AttributeValueDoubleQuoted => match self.read_char()? {
- Some('"') => {
- self.state = State::AfterAttributeValueQuoted;
- Ok(ControlToken::Continue)
- }
- Some('&') => {
- self.return_state = Some(State::AttributeValueDoubleQuoted);
- self.state = State::CharacterReference;
- Ok(ControlToken::Continue)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.emitter.push_attribute_value("\u{fffd}");
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInTag);
- Ok(ControlToken::Eof)
- }
- Some(x) => {
- self.emitter.push_attribute_value(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- },
- State::AttributeValueSingleQuoted => match self.read_char()? {
- Some('\'') => {
- self.state = State::AfterAttributeValueQuoted;
- Ok(ControlToken::Continue)
- }
- Some('&') => {
- self.return_state = Some(State::AttributeValueSingleQuoted);
- self.state = State::CharacterReference;
- Ok(ControlToken::Continue)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.emitter.push_attribute_value("\u{fffd}");
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInTag);
- Ok(ControlToken::Eof)
- }
- Some(x) => {
- self.emitter.push_attribute_value(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- },
- State::AttributeValueUnquoted => match self.read_char()? {
- Some(whitespace_pat!()) => {
- self.state = State::BeforeAttributeName;
- Ok(ControlToken::Continue)
- }
- Some('&') => {
- self.return_state = Some(State::AttributeValueUnquoted);
- self.state = State::CharacterReference;
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.state = State::Data;
- self.emitter.emit_current_tag();
- Ok(ControlToken::Continue)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.emitter.push_attribute_value("\u{fffd}");
- Ok(ControlToken::Continue)
- }
- Some(x @ '"' | x @ '\'' | x @ '<' | x @ '=' | x @ '\u{60}') => {
- self.emitter
- .emit_error(Error::UnexpectedCharacterInUnquotedAttributeValue);
- self.emitter.push_attribute_value(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInTag);
- Ok(ControlToken::Eof)
- }
- Some(x) => {
- self.emitter.push_attribute_value(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- },
- State::AfterAttributeValueQuoted => match self.read_char()? {
- Some(whitespace_pat!()) => {
- self.state = State::BeforeAttributeName;
- Ok(ControlToken::Continue)
- }
- Some('/') => {
- self.state = State::SelfClosingStartTag;
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.state = State::Data;
- self.emitter.emit_current_tag();
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInTag);
- Ok(ControlToken::Eof)
- }
- Some(x) => {
- self.emitter
- .emit_error(Error::MissingWhitespaceBetweenAttributes);
- self.state = State::BeforeAttributeName;
- self.unread_char(Some(x));
- Ok(ControlToken::Continue)
- }
- },
- State::SelfClosingStartTag => match self.read_char()? {
- Some('>') => {
- self.emitter.set_self_closing();
- self.state = State::Data;
- self.emitter.emit_current_tag();
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInTag);
- Ok(ControlToken::Eof)
- }
- Some(x) => {
- self.emitter.emit_error(Error::UnexpectedSolidusInTag);
- self.state = State::BeforeAttributeName;
- self.unread_char(Some(x));
- Ok(ControlToken::Continue)
- }
- },
- State::BogusComment => match self.read_char()? {
- Some('>') => {
- self.state = State::Data;
- self.emitter.emit_current_comment();
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_current_comment();
- Ok(ControlToken::Eof)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.emitter.push_comment("\u{fffd}");
- Ok(ControlToken::Continue)
- }
- Some(x) => {
- self.emitter.push_comment(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- },
- State::MarkupDeclarationOpen => match self.read_char()? {
- Some('-') if self.try_read_string("-", true)? => {
- self.emitter.init_comment();
- self.state = State::CommentStart;
- Ok(ControlToken::Continue)
- }
- Some('d' | 'D') if self.try_read_string("octype", false)? => {
- self.state = State::Doctype;
- Ok(ControlToken::Continue)
- }
- Some('[') if self.try_read_string("CDATA[", true)? => {
- // missing: check for adjusted current element: we don't have an element stack
- // at all
- //
- // missing: cdata transition
- //
- // let's hope that bogus comment can just sort of skip over cdata
- self.emitter.emit_error(Error::CdataInHtmlContent);
-
- self.emitter.init_comment();
- self.emitter.push_comment("[CDATA[");
- self.state = State::BogusComment;
- Ok(ControlToken::Continue)
- }
- c => {
- self.emitter.emit_error(Error::IncorrectlyOpenedComment);
- self.emitter.init_comment();
- self.state = State::BogusComment;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::CommentStart => match self.read_char()? {
- Some('-') => {
- self.state = State::CommentStartDash;
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.emitter.emit_error(Error::AbruptClosingOfEmptyComment);
- self.state = State::Data;
- self.emitter.emit_current_comment();
- Ok(ControlToken::Continue)
- }
- c => {
- self.unread_char(c);
- self.state = State::Comment;
- Ok(ControlToken::Continue)
- }
- },
- State::CommentStartDash => match self.read_char()? {
- Some('-') => {
- self.state = State::CommentEnd;
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.emitter.emit_error(Error::AbruptClosingOfEmptyComment);
- self.state = State::Data;
- self.emitter.emit_current_comment();
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInComment);
- self.emitter.emit_current_comment();
- Ok(ControlToken::Eof)
- }
- c @ Some(_) => {
- self.emitter.push_comment("-");
- self.unread_char(c);
- self.state = State::Comment;
- Ok(ControlToken::Continue)
- }
- },
- State::Comment => match self.read_char()? {
- Some('<') => {
- self.emitter.push_comment("<");
- self.state = State::CommentLessThanSign;
- Ok(ControlToken::Continue)
- }
- Some('-') => {
- self.state = State::CommentEndDash;
- Ok(ControlToken::Continue)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.emitter.push_comment("\u{fffd}");
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInComment);
- self.emitter.emit_current_comment();
- Ok(ControlToken::Eof)
- }
- Some(x) => {
- self.emitter.push_comment(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- },
- State::CommentLessThanSign => match self.read_char()? {
- Some('!') => {
- self.emitter.push_comment("!");
- self.state = State::CommentLessThanSignBang;
- Ok(ControlToken::Continue)
- }
- Some('<') => {
- self.emitter.push_comment("<");
- Ok(ControlToken::Continue)
- }
- c => {
- self.unread_char(c);
- self.state = State::Comment;
- Ok(ControlToken::Continue)
- }
- },
- State::CommentLessThanSignBang => match self.read_char()? {
- Some('-') => {
- self.state = State::CommentLessThanSignBangDash;
- Ok(ControlToken::Continue)
- }
- c => {
- self.unread_char(c);
- self.state = State::Comment;
- Ok(ControlToken::Continue)
- }
- },
- State::CommentLessThanSignBangDash => match self.read_char()? {
- Some('-') => {
- self.state = State::CommentLessThanSignBangDashDash;
- Ok(ControlToken::Continue)
- }
- c => {
- self.unread_char(c);
- self.state = State::CommentEndDash;
- Ok(ControlToken::Continue)
- }
- },
- State::CommentLessThanSignBangDashDash => match self.read_char()? {
- c @ Some('>') | c @ None => {
- self.unread_char(c);
- self.state = State::CommentEnd;
- Ok(ControlToken::Continue)
- }
- c => {
- self.emitter.emit_error(Error::NestedComment);
- self.unread_char(c);
- self.state = State::CommentEnd;
- Ok(ControlToken::Continue)
- }
- },
- State::CommentEndDash => match self.read_char()? {
- Some('-') => {
- self.state = State::CommentEnd;
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInComment);
- self.emitter.emit_current_comment();
- Ok(ControlToken::Eof)
- }
- c => {
- self.emitter.push_comment("-");
- self.unread_char(c);
- self.state = State::Comment;
- Ok(ControlToken::Continue)
- }
- },
- State::CommentEnd => match self.read_char()? {
- Some('>') => {
- self.state = State::Data;
- self.emitter.emit_current_comment();
- Ok(ControlToken::Continue)
- }
- Some('!') => {
- self.state = State::CommentEndBang;
- Ok(ControlToken::Continue)
- }
- Some('-') => {
- self.emitter.push_comment("-");
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInComment);
- self.emitter.emit_current_comment();
- Ok(ControlToken::Eof)
- }
- c @ Some(_) => {
- self.emitter.push_comment("-");
- self.emitter.push_comment("-");
- self.unread_char(c);
- self.state = State::Comment;
- Ok(ControlToken::Continue)
- }
- },
- State::CommentEndBang => match self.read_char()? {
- Some('-') => {
- self.emitter.push_comment("-");
- self.emitter.push_comment("-");
- self.emitter.push_comment("!");
- self.state = State::CommentEndDash;
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.emitter.emit_error(Error::IncorrectlyClosedComment);
- self.state = State::Data;
- self.emitter.emit_current_comment();
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInComment);
- self.emitter.emit_current_comment();
- Ok(ControlToken::Eof)
- }
- c @ Some(_) => {
- self.emitter.push_comment("-");
- self.emitter.push_comment("-");
- self.emitter.push_comment("!");
- self.state = State::Comment;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::Doctype => match self.read_char()? {
- Some(whitespace_pat!()) => {
- self.state = State::BeforeDoctypeName;
- Ok(ControlToken::Continue)
- }
- c @ Some('>') => {
- self.unread_char(c);
- self.state = State::BeforeDoctypeName;
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInDoctype);
- self.emitter.init_doctype();
- self.emitter.set_force_quirks();
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Eof)
- }
- c @ Some(_) => {
- self.emitter
- .emit_error(Error::MissingWhitespaceBeforeDoctypeName);
- self.unread_char(c);
- self.state = State::BeforeDoctypeName;
- Ok(ControlToken::Continue)
- }
- },
- State::BeforeDoctypeName => match self.read_char()? {
- Some(whitespace_pat!()) => Ok(ControlToken::Continue),
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.emitter.init_doctype();
- self.emitter.push_doctype_name("\u{fffd}");
- self.state = State::DoctypeName;
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.emitter.emit_error(Error::MissingDoctypeName);
- self.emitter.init_doctype();
- self.emitter.set_force_quirks();
- self.state = State::Data;
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInDoctype);
- self.emitter.init_doctype();
- self.emitter.set_force_quirks();
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Eof)
- }
- Some(x) => {
- self.emitter.init_doctype();
- self.emitter
- .push_doctype_name(ctostr!(x.to_ascii_lowercase()));
- self.state = State::DoctypeName;
- Ok(ControlToken::Continue)
- }
- },
- State::DoctypeName => match self.read_char()? {
- Some(whitespace_pat!()) => {
- self.state = State::AfterDoctypeName;
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.state = State::Data;
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Continue)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.emitter.push_doctype_name("\u{fffd}");
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInDoctype);
- self.emitter.set_force_quirks();
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Eof)
- }
- Some(x) => {
- self.emitter
- .push_doctype_name(ctostr!(x.to_ascii_lowercase()));
- Ok(ControlToken::Continue)
- }
- },
- State::AfterDoctypeName => match self.read_char()? {
- Some(whitespace_pat!()) => Ok(ControlToken::Continue),
- Some('>') => {
- self.state = State::Data;
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInDoctype);
- self.emitter.set_force_quirks();
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Eof)
- }
- Some('p' | 'P') if self.try_read_string("ublic", false)? => {
- self.state = State::AfterDoctypePublicKeyword;
- Ok(ControlToken::Continue)
- }
- Some('s' | 'S') if self.try_read_string("ystem", false)? => {
- self.state = State::AfterDoctypeSystemKeyword;
- Ok(ControlToken::Continue)
- }
- c @ Some(_) => {
- self.emitter
- .emit_error(Error::InvalidCharacterSequenceAfterDoctypeName);
- self.emitter.set_force_quirks();
- self.unread_char(c);
- self.state = State::BogusDoctype;
- Ok(ControlToken::Continue)
- }
- },
- State::AfterDoctypePublicKeyword => match self.read_char()? {
- Some(whitespace_pat!()) => {
- self.state = State::BeforeDoctypePublicIdentifier;
- Ok(ControlToken::Continue)
- }
- Some('"') => {
- self.emitter
- .emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword);
- self.emitter.set_doctype_public_identifier("");
- self.state = State::DoctypePublicIdentifierDoubleQuoted;
- Ok(ControlToken::Continue)
- }
- Some('\'') => {
- self.emitter
- .emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword);
- self.emitter.set_doctype_public_identifier("");
- self.state = State::DoctypePublicIdentifierSingleQuoted;
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.emitter
- .emit_error(Error::MissingDoctypePublicIdentifier);
- self.emitter.set_force_quirks();
- self.state = State::Data;
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInDoctype);
- self.emitter.set_force_quirks();
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Eof)
- }
- c @ Some(_) => {
- self.emitter
- .emit_error(Error::MissingQuoteBeforeDoctypePublicIdentifier);
- self.emitter.set_force_quirks();
- self.unread_char(c);
- self.state = State::BogusDoctype;
- Ok(ControlToken::Continue)
- }
- },
- State::BeforeDoctypePublicIdentifier => match self.read_char()? {
- Some(whitespace_pat!()) => Ok(ControlToken::Continue),
- Some('"') => {
- self.emitter.set_doctype_public_identifier("");
- self.state = State::DoctypePublicIdentifierDoubleQuoted;
- Ok(ControlToken::Continue)
- }
- Some('\'') => {
- self.emitter.set_doctype_public_identifier("");
- self.state = State::DoctypePublicIdentifierSingleQuoted;
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.emitter
- .emit_error(Error::MissingDoctypePublicIdentifier);
- self.emitter.set_force_quirks();
- self.state = State::Data;
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInDoctype);
- self.emitter.set_force_quirks();
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Eof)
- }
- c @ Some(_) => {
- self.emitter
- .emit_error(Error::MissingQuoteBeforeDoctypePublicIdentifier);
- self.emitter.set_force_quirks();
- self.unread_char(c);
- self.state = State::BogusDoctype;
- Ok(ControlToken::Continue)
- }
- },
- State::DoctypePublicIdentifierDoubleQuoted => match self.read_char()? {
- Some('"') => {
- self.state = State::AfterDoctypePublicIdentifier;
- Ok(ControlToken::Continue)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.emitter.push_doctype_public_identifier("\u{fffd}");
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.emitter
- .emit_error(Error::AbruptDoctypePublicIdentifier);
- self.emitter.set_force_quirks();
- self.state = State::Data;
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInDoctype);
- self.emitter.set_force_quirks();
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Eof)
- }
- Some(x) => {
- self.emitter.push_doctype_public_identifier(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- },
- State::DoctypePublicIdentifierSingleQuoted => match self.read_char()? {
- Some('\'') => {
- self.state = State::AfterDoctypePublicIdentifier;
- Ok(ControlToken::Continue)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.emitter.push_doctype_public_identifier("\u{fffd}");
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.emitter
- .emit_error(Error::AbruptDoctypePublicIdentifier);
- self.emitter.set_force_quirks();
- self.state = State::Data;
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInDoctype);
- self.emitter.set_force_quirks();
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Eof)
- }
- Some(x) => {
- self.emitter.push_doctype_public_identifier(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- },
- State::AfterDoctypePublicIdentifier => match self.read_char()? {
- Some(whitespace_pat!()) => {
- self.state = State::BetweenDoctypePublicAndSystemIdentifiers;
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.state = State::Data;
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Continue)
- }
- Some('"') => {
- self.emitter.emit_error(
- Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers,
- );
- self.emitter.set_doctype_system_identifier("");
- self.state = State::DoctypeSystemIdentifierDoubleQuoted;
- Ok(ControlToken::Continue)
- }
- Some('\'') => {
- self.emitter.emit_error(
- Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers,
- );
- self.emitter.set_doctype_system_identifier("");
- self.state = State::DoctypeSystemIdentifierSingleQuoted;
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInDoctype);
- self.emitter.set_force_quirks();
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Eof)
- }
- c @ Some(_) => {
- self.emitter
- .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier);
- self.emitter.set_force_quirks();
- self.unread_char(c);
- self.state = State::BogusDoctype;
- Ok(ControlToken::Continue)
- }
- },
- State::BetweenDoctypePublicAndSystemIdentifiers => match self.read_char()? {
- Some(whitespace_pat!()) => Ok(ControlToken::Continue),
- Some('>') => {
- self.state = State::Data;
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Continue)
- }
- Some('"') => {
- self.emitter.set_doctype_system_identifier("");
- self.state = State::DoctypeSystemIdentifierDoubleQuoted;
- Ok(ControlToken::Continue)
- }
- Some('\'') => {
- self.emitter.set_doctype_system_identifier("");
- self.state = State::DoctypeSystemIdentifierSingleQuoted;
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInDoctype);
- self.emitter.set_force_quirks();
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Eof)
- }
- c @ Some(_) => {
- self.emitter
- .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier);
- self.emitter.set_force_quirks();
- self.state = State::BogusDoctype;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::AfterDoctypeSystemKeyword => match self.read_char()? {
- Some(whitespace_pat!()) => {
- self.state = State::BeforeDoctypeSystemIdentifier;
- Ok(ControlToken::Continue)
- }
- Some('"') => {
- self.emitter
- .emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword);
- self.emitter.set_doctype_system_identifier("");
- self.state = State::DoctypeSystemIdentifierDoubleQuoted;
- Ok(ControlToken::Continue)
- }
- Some('\'') => {
- self.emitter
- .emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword);
- self.emitter.set_doctype_system_identifier("");
- self.state = State::DoctypeSystemIdentifierSingleQuoted;
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.emitter
- .emit_error(Error::MissingDoctypeSystemIdentifier);
- self.emitter.set_force_quirks();
- self.state = State::Data;
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInDoctype);
- self.emitter.set_force_quirks();
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Eof)
- }
- c @ Some(_) => {
- self.emitter
- .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier);
- self.emitter.set_force_quirks();
- self.state = State::BogusDoctype;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::BeforeDoctypeSystemIdentifier => match self.read_char()? {
- Some(whitespace_pat!()) => Ok(ControlToken::Continue),
- Some('"') => {
- self.emitter.set_doctype_system_identifier("");
- self.state = State::DoctypeSystemIdentifierDoubleQuoted;
- Ok(ControlToken::Continue)
- }
- Some('\'') => {
- self.emitter.set_doctype_system_identifier("");
- self.state = State::DoctypeSystemIdentifierSingleQuoted;
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.emitter
- .emit_error(Error::MissingDoctypeSystemIdentifier);
- self.emitter.set_force_quirks();
- self.state = State::Data;
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInDoctype);
- self.emitter.set_force_quirks();
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Eof)
- }
- c @ Some(_) => {
- self.emitter
- .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier);
- self.emitter.set_force_quirks();
- self.state = State::BogusDoctype;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::DoctypeSystemIdentifierDoubleQuoted => match self.read_char()? {
- Some('"') => {
- self.state = State::AfterDoctypeSystemIdentifier;
- Ok(ControlToken::Continue)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.emitter.push_doctype_system_identifier("\u{fffd}");
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.emitter
- .emit_error(Error::AbruptDoctypeSystemIdentifier);
- self.emitter.set_force_quirks();
- self.state = State::Data;
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInDoctype);
- self.emitter.set_force_quirks();
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Eof)
- }
- Some(x) => {
- self.emitter.push_doctype_system_identifier(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- },
- State::DoctypeSystemIdentifierSingleQuoted => match self.read_char()? {
- Some('\'') => {
- self.state = State::AfterDoctypeSystemIdentifier;
- Ok(ControlToken::Continue)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- self.emitter.push_doctype_system_identifier("\u{fffd}");
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.emitter
- .emit_error(Error::AbruptDoctypeSystemIdentifier);
- self.emitter.set_force_quirks();
- self.state = State::Data;
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInDoctype);
- self.emitter.set_force_quirks();
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Eof)
- }
- Some(x) => {
- self.emitter.push_doctype_system_identifier(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- },
- State::AfterDoctypeSystemIdentifier => match self.read_char()? {
- Some(whitespace_pat!()) => Ok(ControlToken::Continue),
- Some('>') => {
- self.state = State::Data;
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInDoctype);
- self.emitter.set_force_quirks();
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Eof)
- }
- c @ Some(_) => {
- self.emitter
- .emit_error(Error::UnexpectedCharacterAfterDoctypeSystemIdentifier);
- self.unread_char(c);
- self.state = State::BogusDoctype;
- Ok(ControlToken::Continue)
- }
- },
- State::BogusDoctype => match self.read_char()? {
- Some('>') => {
- self.state = State::Data;
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Continue)
- }
- Some('\0') => {
- self.emitter.emit_error(Error::UnexpectedNullCharacter);
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_current_doctype();
- Ok(ControlToken::Eof)
- }
- Some(_) => Ok(ControlToken::Continue),
- },
- State::CdataSection => match self.read_char()? {
- Some(']') => {
- self.state = State::CdataSectionBracket;
- Ok(ControlToken::Continue)
- }
- None => {
- self.emitter.emit_error(Error::EofInCdata);
- Ok(ControlToken::Eof)
- }
- Some(x) => {
- self.emitter.emit_string(ctostr!(x));
- Ok(ControlToken::Continue)
- }
- },
- State::CdataSectionBracket => match self.read_char()? {
- Some(']') => {
- self.state = State::CdataSectionEnd;
- Ok(ControlToken::Continue)
- }
- c => {
- self.emitter.emit_string("]");
- self.state = State::CdataSection;
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- },
- State::CdataSectionEnd => match self.read_char()? {
- Some(']') => {
- self.emitter.emit_string("]");
- Ok(ControlToken::Continue)
- }
- Some('>') => {
- self.state = State::Data;
- Ok(ControlToken::Continue)
- }
- c => {
- self.emitter.emit_string("]]");
- self.unread_char(c);
- self.state = State::CdataSection;
- Ok(ControlToken::Continue)
- }
- },
- State::CharacterReference => {
- self.temporary_buffer.clear();
- self.temporary_buffer.push('&');
- match self.read_char()? {
- Some(x) if x.is_ascii_alphanumeric() => {
- self.unread_char(Some(x));
- self.state = State::NamedCharacterReference;
- Ok(ControlToken::Continue)
- }
- Some('#') => {
- self.temporary_buffer.push('#');
- self.state = State::NumericCharacterReference;
- Ok(ControlToken::Continue)
- }
- c => {
- self.flush_code_points_consumed_as_character_reference();
- self.state = self.return_state.take().unwrap();
- self.unread_char(c);
- Ok(ControlToken::Continue)
- }
- }
- }
- State::NamedCharacterReference => {
- let c = self.read_char()?;
-
- let char_ref = match c {
- Some(x) => entities::try_read_character_reference(x, |x| {
- self.try_read_string(x, true)
- })?
- .map(|char_ref| (x, char_ref)),
-
- None => None,
- };
-
- if let Some((x, char_ref)) = char_ref {
- self.temporary_buffer.push(x);
- self.temporary_buffer.push_str(char_ref.name);
- let char_ref_name_last_character = char_ref.name.chars().last();
- let next_character = self.next_input_character()?;
- if self.is_consumed_as_part_of_an_attribute()
- && char_ref_name_last_character != Some(';')
- && matches!(next_character, Some(x) if x == '=' || x.is_ascii_alphanumeric())
- {
- self.flush_code_points_consumed_as_character_reference();
- self.state = self.return_state.take().unwrap();
- Ok(ControlToken::Continue)
- } else {
- if char_ref_name_last_character != Some(';') {
- self.emitter
- .emit_error(Error::MissingSemicolonAfterCharacterReference);
- }
-
- self.temporary_buffer.clear();
- self.temporary_buffer.push_str(char_ref.characters);
- self.flush_code_points_consumed_as_character_reference();
- self.state = self.return_state.take().unwrap();
- Ok(ControlToken::Continue)
- }
- } else {
- self.unread_char(c);
- self.flush_code_points_consumed_as_character_reference();
- self.state = State::AmbiguousAmpersand;
- Ok(ControlToken::Continue)
- }
- }
- State::AmbiguousAmpersand => match self.read_char()? {
- Some(x) if x.is_ascii_alphanumeric() => {
- if self.is_consumed_as_part_of_an_attribute() {
- self.emitter.push_attribute_value(ctostr!(x));
- } else {
- self.emitter.emit_string(ctostr!(x));
- }
-
- Ok(ControlToken::Continue)
- }
- c @ Some(';') => {
- self.emitter
- .emit_error(Error::UnknownNamedCharacterReference);
- self.unread_char(c);
- self.state = self.return_state.take().unwrap();
- Ok(ControlToken::Continue)
- }
- c => {
- self.unread_char(c);
- self.state = self.return_state.take().unwrap();
- Ok(ControlToken::Continue)
- }
- },
- State::NumericCharacterReference => {
- self.character_reference_code = 0;
- match self.read_char()? {
- Some(x @ 'x' | x @ 'X') => {
- self.temporary_buffer.push(x);
- self.state = State::HexadecimalCharacterReferenceStart;
- Ok(ControlToken::Continue)
- }
- c => {
- self.unread_char(c);
- self.state = State::DecimalCharacterReferenceStart;
- Ok(ControlToken::Continue)
- }
- }
- }
- State::HexadecimalCharacterReferenceStart => match self.read_char()? {
- c @ Some('0'..='9' | 'A'..='F' | 'a'..='f') => {
- self.unread_char(c);
- self.state = State::HexadecimalCharacterReference;
- Ok(ControlToken::Continue)
- }
- c => {
- self.emitter
- .emit_error(Error::AbsenceOfDigitsInNumericCharacterReference);
- self.flush_code_points_consumed_as_character_reference();
- self.unread_char(c);
- self.state = self.return_state.take().unwrap();
- Ok(ControlToken::Continue)
- }
- },
- State::DecimalCharacterReferenceStart => match self.read_char()? {
- Some(x @ ascii_digit_pat!()) => {
- self.unread_char(Some(x));
- self.state = State::DecimalCharacterReference;
- Ok(ControlToken::Continue)
- }
- c => {
- self.emitter
- .emit_error(Error::AbsenceOfDigitsInNumericCharacterReference);
- self.flush_code_points_consumed_as_character_reference();
- self.unread_char(c);
- self.state = self.return_state.take().unwrap();
- Ok(ControlToken::Continue)
- }
- },
- State::HexadecimalCharacterReference => match self.read_char()? {
- Some(x @ ascii_digit_pat!()) => {
- mutate_character_reference!(*16 + x - 0x0030);
- Ok(ControlToken::Continue)
- }
- Some(x @ 'A'..='F') => {
- mutate_character_reference!(*16 + x - 0x0037);
- Ok(ControlToken::Continue)
- }
- Some(x @ 'a'..='f') => {
- mutate_character_reference!(*16 + x - 0x0057);
- Ok(ControlToken::Continue)
- }
- Some(';') => {
- self.state = State::NumericCharacterReferenceEnd;
- Ok(ControlToken::Continue)
- }
- c => {
- self.emitter
- .emit_error(Error::MissingSemicolonAfterCharacterReference);
- self.unread_char(c);
- self.state = State::NumericCharacterReferenceEnd;
- Ok(ControlToken::Continue)
- }
- },
- State::DecimalCharacterReference => match self.read_char()? {
- Some(x @ ascii_digit_pat!()) => {
- mutate_character_reference!(*10 + x - 0x0030);
- Ok(ControlToken::Continue)
- }
- Some(';') => {
- self.state = State::NumericCharacterReferenceEnd;
- Ok(ControlToken::Continue)
- }
- c => {
- self.emitter
- .emit_error(Error::MissingSemicolonAfterCharacterReference);
- self.unread_char(c);
- self.state = State::NumericCharacterReferenceEnd;
- Ok(ControlToken::Continue)
- }
- },
- State::NumericCharacterReferenceEnd => {
- match self.character_reference_code {
- 0x00 => {
- self.emitter.emit_error(Error::NullCharacterReference);
- self.character_reference_code = 0xfffd;
- }
- 0x110000.. => {
- self.emitter
- .emit_error(Error::CharacterReferenceOutsideUnicodeRange);
- self.character_reference_code = 0xfffd;
- }
- surrogate_pat!() => {
- self.emitter.emit_error(Error::SurrogateCharacterReference);
- self.character_reference_code = 0xfffd;
- }
- // noncharacter
- noncharacter_pat!() => {
- self.emitter
- .emit_error(Error::NoncharacterCharacterReference);
- }
- // 0x000d, or a control that is not whitespace
- x @ 0x000d | x @ control_pat!()
- if !matches!(x, 0x0009 | 0x000a | 0x000c | 0x0020) =>
- {
- self.emitter.emit_error(Error::ControlCharacterReference);
- self.character_reference_code = match x {
- 0x80 => 0x20AC, // EURO SIGN (€)
- 0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK (‚)
- 0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK (ƒ)
- 0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK („)
- 0x85 => 0x2026, // HORIZONTAL ELLIPSIS (…)
- 0x86 => 0x2020, // DAGGER (†)
- 0x87 => 0x2021, // DOUBLE DAGGER (‡)
- 0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
- 0x89 => 0x2030, // PER MILLE SIGN (‰)
- 0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON (Š)
- 0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
- 0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE (Œ)
- 0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON (Ž)
- 0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK (‘)
- 0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK (’)
- 0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK (“)
- 0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK (”)
- 0x95 => 0x2022, // BULLET (•)
- 0x96 => 0x2013, // EN DASH (–)
- 0x97 => 0x2014, // EM DASH (—)
- 0x98 => 0x02DC, // SMALL TILDE (˜)
- 0x99 => 0x2122, // TRADE MARK SIGN (™)
- 0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON (š)
- 0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
- 0x9C => 0x0153, // LATIN SMALL LIGATURE OE (œ)
- 0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON (ž)
- 0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
- _ => self.character_reference_code,
- };
- }
- _ => (),
- }
-
- self.temporary_buffer.clear();
- self.temporary_buffer
- .push(std::char::from_u32(self.character_reference_code).unwrap());
- self.flush_code_points_consumed_as_character_reference();
- self.state = self.return_state.take().unwrap();
- Ok(ControlToken::Continue)
- }
- }
- }
-}
-
-impl<R: Reader, E: Emitter> Iterator for Tokenizer<R, E> {
- type Item = Result<E::Token, R::Error>;
-
- fn next(&mut self) -> Option<Self::Item> {
- loop {
- if let Some(token) = self.emitter.pop_token() {
- break Some(Ok(token));
- } else if !self.eof {
- match self.consume() {
- Ok(ControlToken::Continue) => (),
- Ok(ControlToken::Eof) => {
- self.eof = true;
- self.emitter.emit_eof();
- }
- Err(e) => break Some(Err(e)),
- }
- } else {
- break None;
- }
- }
- }
-}
-
-/// A kind of tokenizer that directly yields tokens when used as an iterator, so `Token` instead of
-/// `Result<Token, _>`.
-///
-/// This is the return value of [`Tokenizer::infallible`].
-pub struct InfallibleTokenizer<R: Reader<Error = Never>, E: Emitter>(Tokenizer<R, E>);
-
-impl<R: Reader<Error = Never>, E: Emitter> Tokenizer<R, E> {
- /// Statically assert that this iterator is infallible.
- ///
- /// Call this to get rid of error handling when parsing HTML from strings.
- pub fn infallible(self) -> InfallibleTokenizer<R, E> {
- InfallibleTokenizer(self)
- }
-}
-
-impl<R: Reader<Error = Never>, E: Emitter> Iterator for InfallibleTokenizer<R, E> {
- type Item = E::Token;
-
- fn next(&mut self) -> Option<Self::Item> {
- match self.0.next()? {
- Ok(token) => Some(token),
- Err(e) => match e {},
- }
- }
-}
+pub use tokenizer::{InfallibleTokenizer, Tokenizer};
diff --git a/src/machine.rs b/src/machine.rs
index 67db1b9..5991912 100644
--- a/src/machine.rs
+++ b/src/machine.rs
@@ -1,164 +1,1926 @@
-macro_rules! surrogate_pat {
- () => {
- 0xd800..=0xdfff
+use crate::entities::try_read_character_reference;
+use crate::utils::{
+ ascii_digit_pat, control_pat, noncharacter_pat, surrogate_pat, whitespace_pat, ControlToken,
+ State,
+};
+use crate::{Emitter, Error, Reader, Tokenizer};
+
+macro_rules! ctostr {
+ ($c:expr) => {
+ &*$c.encode_utf8(&mut [0; 4])
};
}
-pub(crate) use surrogate_pat;
+// Note: This is not implemented as a method on Tokenizer because there's fields on Tokenizer that
+// should not be available in this method, such as Tokenizer.to_reconsume or the Reader instance
+#[inline]
+pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<ControlToken, R::Error> {
+ macro_rules! mutate_character_reference {
+ (* $mul:literal + $x:ident - $sub:literal) => {
+ match slf
+ .character_reference_code
+ .checked_mul($mul)
+ .and_then(|cr| cr.checked_add($x as u32 - $sub))
+ {
+ Some(cr) => slf.character_reference_code = cr,
+ None => {
+ // provoke err
+ slf.character_reference_code = 0x110000;
+ }
+ };
+ };
+ }
-macro_rules! control_pat {
- () => (0x0d | 0x0000..=0x001f | 0x007f..=0x009f)
-}
+ match slf.state {
+ State::Data => match slf.read_char()? {
+ Some('&') => {
+ slf.return_state = Some(slf.state);
+ slf.state = State::CharacterReference;
+ Ok(ControlToken::Continue)
+ }
+ Some('<') => {
+ slf.state = State::TagOpen;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.emit_string("\0");
+ Ok(ControlToken::Continue)
+ }
+ Some(x) => {
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ None => Ok(ControlToken::Eof),
+ },
+ State::RcData => match slf.read_char()? {
+ Some('&') => {
+ slf.return_state = Some(State::RcData);
+ slf.state = State::CharacterReference;
+ Ok(ControlToken::Continue)
+ }
+ Some('<') => {
+ slf.state = State::RcDataLessThanSign;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.emit_string("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some(x) => {
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ None => Ok(ControlToken::Eof),
+ },
+ State::RawText => match slf.read_char()? {
+ Some('<') => {
+ slf.state = State::RawTextLessThanSign;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.emit_string("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some(x) => {
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ None => Ok(ControlToken::Eof),
+ },
+ State::ScriptData => match slf.read_char()? {
+ Some('<') => {
+ slf.state = State::ScriptDataLessThanSign;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.emit_string("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some(x) => {
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ None => Ok(ControlToken::Eof),
+ },
+ State::PlainText => match slf.read_char()? {
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.emit_string("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some(x) => {
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ None => Ok(ControlToken::Eof),
+ },
+ State::TagOpen => match slf.read_char()? {
+ Some('!') => {
+ slf.state = State::MarkupDeclarationOpen;
+ Ok(ControlToken::Continue)
+ }
+ Some('/') => {
+ slf.state = State::EndTagOpen;
+ Ok(ControlToken::Continue)
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.emitter.init_start_tag();
+ slf.state = State::TagName;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ c @ Some('?') => {
+ slf.emitter
+ .emit_error(Error::UnexpectedQuestionMarkInsteadOfTagName);
+ slf.emitter.init_comment();
+ slf.state = State::BogusComment;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofBeforeTagName);
+ slf.emitter.emit_string("<");
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emitter
+ .emit_error(Error::InvalidFirstCharacterOfTagName);
+ slf.state = State::Data;
+ slf.emitter.emit_string("<");
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::EndTagOpen => match slf.read_char()? {
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.emitter.init_end_tag();
+ slf.state = State::TagName;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emitter.emit_error(Error::MissingEndTagName);
+ slf.state = State::Data;
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofBeforeTagName);
+ slf.emitter.emit_string("</");
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter
+ .emit_error(Error::InvalidFirstCharacterOfTagName);
+ slf.emitter.init_comment();
+ slf.state = State::BogusComment;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::TagName => match slf.read_char()? {
+ Some(whitespace_pat!()) => {
+ slf.state = State::BeforeAttributeName;
+ Ok(ControlToken::Continue)
+ }
+ Some('/') => {
+ slf.state = State::SelfClosingStartTag;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_tag();
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_tag_name("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some(x) => {
+ slf.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase()));
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInTag);
+ Ok(ControlToken::Eof)
+ }
+ },
+ State::RcDataLessThanSign => match slf.read_char()? {
+ Some('/') => {
+ slf.temporary_buffer.clear();
+ slf.state = State::RcDataEndTagOpen;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("<");
+ slf.state = State::RcData;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::RcDataEndTagOpen => match slf.read_char()? {
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.emitter.init_end_tag();
+ slf.state = State::RcDataEndTagName;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("</");
+ slf.state = State::RcData;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::RcDataEndTagName => match slf.read_char()? {
+ Some(whitespace_pat!()) if slf.emitter.current_is_appropriate_end_tag_token() => {
+ slf.state = State::BeforeAttributeName;
+ Ok(ControlToken::Continue)
+ }
+ Some('/') if slf.emitter.current_is_appropriate_end_tag_token() => {
+ slf.state = State::SelfClosingStartTag;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') if slf.emitter.current_is_appropriate_end_tag_token() => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_tag();
+ Ok(ControlToken::Continue)
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase()));
+ slf.temporary_buffer.push(x);
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("</");
+ slf.flush_buffer_characters();
-pub(crate) use control_pat;
+ slf.state = State::RcData;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::RawTextLessThanSign => match slf.read_char()? {
+ Some('/') => {
+ slf.temporary_buffer.clear();
+ slf.state = State::RawTextEndTagOpen;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("<");
+ slf.state = State::RawText;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::RawTextEndTagOpen => match slf.read_char()? {
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.emitter.init_end_tag();
+ slf.state = State::RawTextEndTagName;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("</");
+ slf.state = State::RawText;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::RawTextEndTagName => match slf.read_char()? {
+ Some(whitespace_pat!()) if slf.emitter.current_is_appropriate_end_tag_token() => {
+ slf.state = State::BeforeAttributeName;
+ Ok(ControlToken::Continue)
+ }
+ Some('/') if slf.emitter.current_is_appropriate_end_tag_token() => {
+ slf.state = State::SelfClosingStartTag;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') if slf.emitter.current_is_appropriate_end_tag_token() => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_tag();
+ Ok(ControlToken::Continue)
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase()));
+ slf.temporary_buffer.push(x);
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("</");
+ slf.flush_buffer_characters();
-macro_rules! ascii_digit_pat {
- () => {
- '0'..='9'
- };
-}
+ slf.state = State::RawText;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataLessThanSign => match slf.read_char()? {
+ Some('/') => {
+ slf.temporary_buffer.clear();
+ slf.state = State::ScriptDataEndTagOpen;
+ Ok(ControlToken::Continue)
+ }
+ Some('!') => {
+ slf.state = State::ScriptDataEscapeStart;
+ slf.emitter.emit_string("<!");
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("<");
+ slf.state = State::Data;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataEndTagOpen => match slf.read_char()? {
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.emitter.init_end_tag();
+ slf.state = State::ScriptDataEndTagName;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("</");
+ slf.state = State::ScriptData;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataEndTagName => match slf.read_char()? {
+ Some(whitespace_pat!()) if slf.emitter.current_is_appropriate_end_tag_token() => {
+ slf.state = State::BeforeAttributeName;
+ Ok(ControlToken::Continue)
+ }
+ Some('/') if slf.emitter.current_is_appropriate_end_tag_token() => {
+ slf.state = State::SelfClosingStartTag;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') if slf.emitter.current_is_appropriate_end_tag_token() => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_tag();
+ Ok(ControlToken::Continue)
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase()));
+ slf.temporary_buffer.push(x.to_ascii_lowercase());
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("</");
+ slf.flush_buffer_characters();
+ slf.state = State::Data;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataEscapeStart => match slf.read_char()? {
+ Some('-') => {
+ slf.state = State::ScriptDataEscapeStartDash;
+ slf.emitter.emit_string("-");
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.state = State::ScriptData;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataEscapeStartDash => match slf.read_char()? {
+ Some('-') => {
+ slf.state = State::ScriptDataEscapedDashDash;
+ slf.emitter.emit_string("-");
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.state = State::ScriptData;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataEscaped => match slf.read_char()? {
+ Some('-') => {
+ slf.state = State::ScriptDataEscapedDash;
+ slf.emitter.emit_string("-");
+ Ok(ControlToken::Continue)
+ }
+ Some('<') => {
+ slf.state = State::ScriptDataEscapedLessThanSign;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.emit_string("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter
+ .emit_error(Error::EofInScriptHtmlCommentLikeText);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataEscapedDash => match slf.read_char()? {
+ Some('-') => {
+ slf.state = State::ScriptDataEscapedDashDash;
+ slf.emitter.emit_string("-");
+ Ok(ControlToken::Continue)
+ }
+ Some('<') => {
+ slf.state = State::ScriptDataEscapedLessThanSign;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.state = State::ScriptDataEscaped;
+ slf.emitter.emit_string("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter
+ .emit_error(Error::EofInScriptHtmlCommentLikeText);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.state = State::ScriptDataEscaped;
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataEscapedDashDash => match slf.read_char()? {
+ Some('-') => {
+ slf.emitter.emit_string("-");
+ Ok(ControlToken::Continue)
+ }
+ Some('<') => {
+ slf.state = State::ScriptDataEscapedLessThanSign;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.state = State::ScriptData;
+ slf.emitter.emit_string(">");
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.state = State::ScriptDataEscaped;
+ slf.emitter.emit_string("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter
+ .emit_error(Error::EofInScriptHtmlCommentLikeText);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.state = State::ScriptDataEscaped;
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataEscapedLessThanSign => match slf.read_char()? {
+ Some('/') => {
+ slf.temporary_buffer.clear();
+ slf.state = State::ScriptDataEscapedEndTagOpen;
+ Ok(ControlToken::Continue)
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.temporary_buffer.clear();
+ slf.emitter.emit_string("<");
+ slf.state = State::ScriptDataDoubleEscapeStart;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("<");
+ slf.state = State::ScriptDataEscaped;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataEscapedEndTagOpen => match slf.read_char()? {
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.emitter.init_end_tag();
+ slf.state = State::ScriptDataEscapedEndTagName;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("</");
+ slf.unread_char(c);
+ slf.state = State::ScriptDataEscaped;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataEscapedEndTagName => match slf.read_char()? {
+ Some(whitespace_pat!()) if slf.emitter.current_is_appropriate_end_tag_token() => {
+ slf.state = State::BeforeAttributeName;
+ Ok(ControlToken::Continue)
+ }
+ Some('/') if slf.emitter.current_is_appropriate_end_tag_token() => {
+ slf.state = State::SelfClosingStartTag;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') if slf.emitter.current_is_appropriate_end_tag_token() => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_tag();
+ Ok(ControlToken::Continue)
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.emitter.push_tag_name(ctostr!(x.to_ascii_lowercase()));
+ slf.temporary_buffer.push(x);
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("</");
+ slf.flush_buffer_characters();
+ slf.state = State::ScriptDataEscaped;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataDoubleEscapeStart => match slf.read_char()? {
+ Some(x @ whitespace_pat!() | x @ '/' | x @ '>') => {
+ if slf.temporary_buffer == "script" {
+ slf.state = State::ScriptDataDoubleEscaped;
+ } else {
+ slf.state = State::ScriptDataEscaped;
+ }
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.temporary_buffer.push(x.to_ascii_lowercase());
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.state = State::ScriptDataEscaped;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataDoubleEscaped => match slf.read_char()? {
+ Some('-') => {
+ slf.state = State::ScriptDataDoubleEscapedDash;
+ slf.emitter.emit_string("-");
+ Ok(ControlToken::Continue)
+ }
+ Some('<') => {
+ slf.state = State::ScriptDataDoubleEscapedLessThanSign;
+ slf.emitter.emit_string("<");
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.emit_string("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter
+ .emit_error(Error::EofInScriptHtmlCommentLikeText);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataDoubleEscapedDash => match slf.read_char()? {
+ Some('-') => {
+ slf.state = State::ScriptDataDoubleEscapedDashDash;
+ slf.emitter.emit_string("-");
+ Ok(ControlToken::Continue)
+ }
+ Some('<') => {
+ slf.state = State::ScriptDataDoubleEscapedLessThanSign;
+ slf.emitter.emit_string("<");
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.state = State::ScriptDataDoubleEscaped;
+ slf.emitter.emit_string("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter
+ .emit_error(Error::EofInScriptHtmlCommentLikeText);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.state = State::ScriptDataDoubleEscaped;
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataDoubleEscapedDashDash => match slf.read_char()? {
+ Some('-') => {
+ slf.emitter.emit_string("-");
+ Ok(ControlToken::Continue)
+ }
+ Some('<') => {
+ slf.emitter.emit_string("<");
+ slf.state = State::ScriptDataDoubleEscapedLessThanSign;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emitter.emit_string(">");
+ slf.state = State::ScriptData;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.state = State::ScriptDataDoubleEscaped;
+ slf.emitter.emit_string("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter
+ .emit_error(Error::EofInScriptHtmlCommentLikeText);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.state = State::ScriptDataDoubleEscaped;
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataDoubleEscapedLessThanSign => match slf.read_char()? {
+ Some('/') => {
+ slf.temporary_buffer.clear();
+ slf.state = State::ScriptDataDoubleEscapeEnd;
+ slf.emitter.emit_string("/");
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.state = State::ScriptDataDoubleEscaped;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataDoubleEscapeEnd => match slf.read_char()? {
+ Some(x @ whitespace_pat!() | x @ '/' | x @ '>') => {
+ if slf.temporary_buffer == "script" {
+ slf.state = State::ScriptDataEscaped;
+ } else {
+ slf.state = State::ScriptDataDoubleEscaped;
+ }
-pub(crate) use ascii_digit_pat;
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.temporary_buffer.push(x.to_ascii_lowercase());
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.state = State::ScriptDataDoubleEscaped;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::BeforeAttributeName => match slf.read_char()? {
+ Some(whitespace_pat!()) => Ok(ControlToken::Continue),
+ c @ Some('/' | '>') | c @ None => {
+ slf.state = State::AfterAttributeName;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ Some('=') => {
+ slf.emitter
+ .emit_error(Error::UnexpectedEqualsSignBeforeAttributeName);
+ slf.emitter.init_attribute();
+ slf.emitter.push_attribute_name("=");
+ slf.state = State::AttributeName;
+ Ok(ControlToken::Continue)
+ }
+ Some(x) => {
+ slf.emitter.init_attribute();
+ slf.state = State::AttributeName;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::AttributeName => match slf.read_char()? {
+ c @ Some(whitespace_pat!() | '/' | '>') | c @ None => {
+ slf.state = State::AfterAttributeName;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ Some('=') => {
+ slf.state = State::BeforeAttributeValue;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_attribute_name("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some(x @ '"' | x @ '\'' | x @ '<') => {
+ slf.emitter
+ .emit_error(Error::UnexpectedCharacterInAttributeName);
+ slf.emitter
+ .push_attribute_name(ctostr!(x.to_ascii_lowercase()));
+ Ok(ControlToken::Continue)
+ }
+ Some(x) => {
+ slf.emitter
+ .push_attribute_name(ctostr!(x.to_ascii_lowercase()));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::AfterAttributeName => match slf.read_char()? {
+ Some(whitespace_pat!()) => Ok(ControlToken::Continue),
+ Some('/') => {
+ slf.state = State::SelfClosingStartTag;
+ Ok(ControlToken::Continue)
+ }
+ Some('=') => {
+ slf.state = State::BeforeAttributeValue;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_tag();
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInTag);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.init_attribute();
+ slf.state = State::AttributeName;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::BeforeAttributeValue => match slf.read_char()? {
+ Some(whitespace_pat!()) => Ok(ControlToken::Continue),
+ Some('"') => {
+ slf.state = State::AttributeValueDoubleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('\'') => {
+ slf.state = State::AttributeValueSingleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emitter.emit_error(Error::MissingAttributeValue);
+ slf.state = State::Data;
+ slf.emitter.emit_current_tag();
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.state = State::AttributeValueUnquoted;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::AttributeValueDoubleQuoted => match slf.read_char()? {
+ Some('"') => {
+ slf.state = State::AfterAttributeValueQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('&') => {
+ slf.return_state = Some(State::AttributeValueDoubleQuoted);
+ slf.state = State::CharacterReference;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_attribute_value("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInTag);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.push_attribute_value(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::AttributeValueSingleQuoted => match slf.read_char()? {
+ Some('\'') => {
+ slf.state = State::AfterAttributeValueQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('&') => {
+ slf.return_state = Some(State::AttributeValueSingleQuoted);
+ slf.state = State::CharacterReference;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_attribute_value("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInTag);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.push_attribute_value(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::AttributeValueUnquoted => match slf.read_char()? {
+ Some(whitespace_pat!()) => {
+ slf.state = State::BeforeAttributeName;
+ Ok(ControlToken::Continue)
+ }
+ Some('&') => {
+ slf.return_state = Some(State::AttributeValueUnquoted);
+ slf.state = State::CharacterReference;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_tag();
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_attribute_value("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some(x @ '"' | x @ '\'' | x @ '<' | x @ '=' | x @ '\u{60}') => {
+ slf.emitter
+ .emit_error(Error::UnexpectedCharacterInUnquotedAttributeValue);
+ slf.emitter.push_attribute_value(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInTag);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.push_attribute_value(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::AfterAttributeValueQuoted => match slf.read_char()? {
+ Some(whitespace_pat!()) => {
+ slf.state = State::BeforeAttributeName;
+ Ok(ControlToken::Continue)
+ }
+ Some('/') => {
+ slf.state = State::SelfClosingStartTag;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_tag();
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInTag);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter
+ .emit_error(Error::MissingWhitespaceBetweenAttributes);
+ slf.state = State::BeforeAttributeName;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::SelfClosingStartTag => match slf.read_char()? {
+ Some('>') => {
+ slf.emitter.set_self_closing();
+ slf.state = State::Data;
+ slf.emitter.emit_current_tag();
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInTag);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.emit_error(Error::UnexpectedSolidusInTag);
+ slf.state = State::BeforeAttributeName;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::BogusComment => match slf.read_char()? {
+ Some('>') => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_comment();
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_current_comment();
+ Ok(ControlToken::Eof)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_comment("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some(x) => {
+ slf.emitter.push_comment(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::MarkupDeclarationOpen => match slf.read_char()? {
+ Some('-') if slf.try_read_string("-", true)? => {
+ slf.emitter.init_comment();
+ slf.state = State::CommentStart;
+ Ok(ControlToken::Continue)
+ }
+ Some('d' | 'D') if slf.try_read_string("octype", false)? => {
+ slf.state = State::Doctype;
+ Ok(ControlToken::Continue)
+ }
+ Some('[') if slf.try_read_string("CDATA[", true)? => {
+ // missing: check for adjusted current element: we don't have an element stack
+ // at all
+ //
+ // missing: cdata transition
+ //
+ // let's hope that bogus comment can just sort of skip over cdata
+ slf.emitter.emit_error(Error::CdataInHtmlContent);
-macro_rules! whitespace_pat {
- () => {
- '\t' | '\u{0A}' | '\u{0C}' | ' '
- };
-}
+ slf.emitter.init_comment();
+ slf.emitter.push_comment("[CDATA[");
+ slf.state = State::BogusComment;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_error(Error::IncorrectlyOpenedComment);
+ slf.emitter.init_comment();
+ slf.state = State::BogusComment;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CommentStart => match slf.read_char()? {
+ Some('-') => {
+ slf.state = State::CommentStartDash;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emitter.emit_error(Error::AbruptClosingOfEmptyComment);
+ slf.state = State::Data;
+ slf.emitter.emit_current_comment();
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.unread_char(c);
+ slf.state = State::Comment;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CommentStartDash => match slf.read_char()? {
+ Some('-') => {
+ slf.state = State::CommentEnd;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emitter.emit_error(Error::AbruptClosingOfEmptyComment);
+ slf.state = State::Data;
+ slf.emitter.emit_current_comment();
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInComment);
+ slf.emitter.emit_current_comment();
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emitter.push_comment("-");
+ slf.unread_char(c);
+ slf.state = State::Comment;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::Comment => match slf.read_char()? {
+ Some('<') => {
+ slf.emitter.push_comment("<");
+ slf.state = State::CommentLessThanSign;
+ Ok(ControlToken::Continue)
+ }
+ Some('-') => {
+ slf.state = State::CommentEndDash;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_comment("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInComment);
+ slf.emitter.emit_current_comment();
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.push_comment(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CommentLessThanSign => match slf.read_char()? {
+ Some('!') => {
+ slf.emitter.push_comment("!");
+ slf.state = State::CommentLessThanSignBang;
+ Ok(ControlToken::Continue)
+ }
+ Some('<') => {
+ slf.emitter.push_comment("<");
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.unread_char(c);
+ slf.state = State::Comment;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CommentLessThanSignBang => match slf.read_char()? {
+ Some('-') => {
+ slf.state = State::CommentLessThanSignBangDash;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.unread_char(c);
+ slf.state = State::Comment;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CommentLessThanSignBangDash => match slf.read_char()? {
+ Some('-') => {
+ slf.state = State::CommentLessThanSignBangDashDash;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.unread_char(c);
+ slf.state = State::CommentEndDash;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CommentLessThanSignBangDashDash => match slf.read_char()? {
+ c @ Some('>') | c @ None => {
+ slf.unread_char(c);
+ slf.state = State::CommentEnd;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_error(Error::NestedComment);
+ slf.unread_char(c);
+ slf.state = State::CommentEnd;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CommentEndDash => match slf.read_char()? {
+ Some('-') => {
+ slf.state = State::CommentEnd;
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInComment);
+ slf.emitter.emit_current_comment();
+ Ok(ControlToken::Eof)
+ }
+ c => {
+ slf.emitter.push_comment("-");
+ slf.unread_char(c);
+ slf.state = State::Comment;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CommentEnd => match slf.read_char()? {
+ Some('>') => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_comment();
+ Ok(ControlToken::Continue)
+ }
+ Some('!') => {
+ slf.state = State::CommentEndBang;
+ Ok(ControlToken::Continue)
+ }
+ Some('-') => {
+ slf.emitter.push_comment("-");
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInComment);
+ slf.emitter.emit_current_comment();
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emitter.push_comment("-");
+ slf.emitter.push_comment("-");
+ slf.unread_char(c);
+ slf.state = State::Comment;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CommentEndBang => match slf.read_char()? {
+ Some('-') => {
+ slf.emitter.push_comment("-");
+ slf.emitter.push_comment("-");
+ slf.emitter.push_comment("!");
+ slf.state = State::CommentEndDash;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emitter.emit_error(Error::IncorrectlyClosedComment);
+ slf.state = State::Data;
+ slf.emitter.emit_current_comment();
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInComment);
+ slf.emitter.emit_current_comment();
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emitter.push_comment("-");
+ slf.emitter.push_comment("-");
+ slf.emitter.push_comment("!");
+ slf.state = State::Comment;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::Doctype => match slf.read_char()? {
+ Some(whitespace_pat!()) => {
+ slf.state = State::BeforeDoctypeName;
+ Ok(ControlToken::Continue)
+ }
+ c @ Some('>') => {
+ slf.unread_char(c);
+ slf.state = State::BeforeDoctypeName;
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInDoctype);
+ slf.emitter.init_doctype();
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emitter
+ .emit_error(Error::MissingWhitespaceBeforeDoctypeName);
+ slf.unread_char(c);
+ slf.state = State::BeforeDoctypeName;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::BeforeDoctypeName => match slf.read_char()? {
+ Some(whitespace_pat!()) => Ok(ControlToken::Continue),
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.init_doctype();
+ slf.emitter.push_doctype_name("\u{fffd}");
+ slf.state = State::DoctypeName;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emitter.emit_error(Error::MissingDoctypeName);
+ slf.emitter.init_doctype();
+ slf.emitter.set_force_quirks();
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInDoctype);
+ slf.emitter.init_doctype();
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.init_doctype();
+ slf.emitter
+ .push_doctype_name(ctostr!(x.to_ascii_lowercase()));
+ slf.state = State::DoctypeName;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::DoctypeName => match slf.read_char()? {
+ Some(whitespace_pat!()) => {
+ slf.state = State::AfterDoctypeName;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_doctype_name("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter
+ .push_doctype_name(ctostr!(x.to_ascii_lowercase()));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::AfterDoctypeName => match slf.read_char()? {
+ Some(whitespace_pat!()) => Ok(ControlToken::Continue),
+ Some('>') => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Eof)
+ }
+ Some('p' | 'P') if slf.try_read_string("ublic", false)? => {
+ slf.state = State::AfterDoctypePublicKeyword;
+ Ok(ControlToken::Continue)
+ }
+ Some('s' | 'S') if slf.try_read_string("ystem", false)? => {
+ slf.state = State::AfterDoctypeSystemKeyword;
+ Ok(ControlToken::Continue)
+ }
+ c @ Some(_) => {
+ slf.emitter
+ .emit_error(Error::InvalidCharacterSequenceAfterDoctypeName);
+ slf.emitter.set_force_quirks();
+ slf.unread_char(c);
+ slf.state = State::BogusDoctype;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::AfterDoctypePublicKeyword => match slf.read_char()? {
+ Some(whitespace_pat!()) => {
+ slf.state = State::BeforeDoctypePublicIdentifier;
+ Ok(ControlToken::Continue)
+ }
+ Some('"') => {
+ slf.emitter
+ .emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword);
+ slf.emitter.set_doctype_public_identifier("");
+ slf.state = State::DoctypePublicIdentifierDoubleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('\'') => {
+ slf.emitter
+ .emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword);
+ slf.emitter.set_doctype_public_identifier("");
+ slf.state = State::DoctypePublicIdentifierSingleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emitter
+ .emit_error(Error::MissingDoctypePublicIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emitter
+ .emit_error(Error::MissingQuoteBeforeDoctypePublicIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.unread_char(c);
+ slf.state = State::BogusDoctype;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::BeforeDoctypePublicIdentifier => match slf.read_char()? {
+ Some(whitespace_pat!()) => Ok(ControlToken::Continue),
+ Some('"') => {
+ slf.emitter.set_doctype_public_identifier("");
+ slf.state = State::DoctypePublicIdentifierDoubleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('\'') => {
+ slf.emitter.set_doctype_public_identifier("");
+ slf.state = State::DoctypePublicIdentifierSingleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emitter
+ .emit_error(Error::MissingDoctypePublicIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emitter
+ .emit_error(Error::MissingQuoteBeforeDoctypePublicIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.unread_char(c);
+ slf.state = State::BogusDoctype;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::DoctypePublicIdentifierDoubleQuoted => match slf.read_char()? {
+ Some('"') => {
+ slf.state = State::AfterDoctypePublicIdentifier;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_doctype_public_identifier("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emitter.emit_error(Error::AbruptDoctypePublicIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.push_doctype_public_identifier(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::DoctypePublicIdentifierSingleQuoted => match slf.read_char()? {
+ Some('\'') => {
+ slf.state = State::AfterDoctypePublicIdentifier;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_doctype_public_identifier("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emitter.emit_error(Error::AbruptDoctypePublicIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.push_doctype_public_identifier(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::AfterDoctypePublicIdentifier => match slf.read_char()? {
+ Some(whitespace_pat!()) => {
+ slf.state = State::BetweenDoctypePublicAndSystemIdentifiers;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Continue)
+ }
+ Some('"') => {
+ slf.emitter
+ .emit_error(Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers);
+ slf.emitter.set_doctype_system_identifier("");
+ slf.state = State::DoctypeSystemIdentifierDoubleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('\'') => {
+ slf.emitter
+ .emit_error(Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers);
+ slf.emitter.set_doctype_system_identifier("");
+ slf.state = State::DoctypeSystemIdentifierSingleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emitter
+ .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.unread_char(c);
+ slf.state = State::BogusDoctype;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::BetweenDoctypePublicAndSystemIdentifiers => match slf.read_char()? {
+ Some(whitespace_pat!()) => Ok(ControlToken::Continue),
+ Some('>') => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Continue)
+ }
+ Some('"') => {
+ slf.emitter.set_doctype_system_identifier("");
+ slf.state = State::DoctypeSystemIdentifierDoubleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('\'') => {
+ slf.emitter.set_doctype_system_identifier("");
+ slf.state = State::DoctypeSystemIdentifierSingleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emitter
+ .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.state = State::BogusDoctype;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::AfterDoctypeSystemKeyword => match slf.read_char()? {
+ Some(whitespace_pat!()) => {
+ slf.state = State::BeforeDoctypeSystemIdentifier;
+ Ok(ControlToken::Continue)
+ }
+ Some('"') => {
+ slf.emitter
+ .emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword);
+ slf.emitter.set_doctype_system_identifier("");
+ slf.state = State::DoctypeSystemIdentifierDoubleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('\'') => {
+ slf.emitter
+ .emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword);
+ slf.emitter.set_doctype_system_identifier("");
+ slf.state = State::DoctypeSystemIdentifierSingleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emitter
+ .emit_error(Error::MissingDoctypeSystemIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emitter
+ .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.state = State::BogusDoctype;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::BeforeDoctypeSystemIdentifier => match slf.read_char()? {
+ Some(whitespace_pat!()) => Ok(ControlToken::Continue),
+ Some('"') => {
+ slf.emitter.set_doctype_system_identifier("");
+ slf.state = State::DoctypeSystemIdentifierDoubleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('\'') => {
+ slf.emitter.set_doctype_system_identifier("");
+ slf.state = State::DoctypeSystemIdentifierSingleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emitter
+ .emit_error(Error::MissingDoctypeSystemIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emitter
+ .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.state = State::BogusDoctype;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::DoctypeSystemIdentifierDoubleQuoted => match slf.read_char()? {
+ Some('"') => {
+ slf.state = State::AfterDoctypeSystemIdentifier;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_doctype_system_identifier("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emitter.emit_error(Error::AbruptDoctypeSystemIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.push_doctype_system_identifier(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::DoctypeSystemIdentifierSingleQuoted => match slf.read_char()? {
+ Some('\'') => {
+ slf.state = State::AfterDoctypeSystemIdentifier;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_doctype_system_identifier("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emitter.emit_error(Error::AbruptDoctypeSystemIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.push_doctype_system_identifier(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::AfterDoctypeSystemIdentifier => match slf.read_char()? {
+ Some(whitespace_pat!()) => Ok(ControlToken::Continue),
+ Some('>') => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emitter
+ .emit_error(Error::UnexpectedCharacterAfterDoctypeSystemIdentifier);
+ slf.unread_char(c);
+ slf.state = State::BogusDoctype;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::BogusDoctype => match slf.read_char()? {
+ Some('>') => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emitter.emit_error(Error::UnexpectedNullCharacter);
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_current_doctype();
+ Ok(ControlToken::Eof)
+ }
+ Some(_) => Ok(ControlToken::Continue),
+ },
+ State::CdataSection => match slf.read_char()? {
+ Some(']') => {
+ slf.state = State::CdataSectionBracket;
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_error(Error::EofInCdata);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CdataSectionBracket => match slf.read_char()? {
+ Some(']') => {
+ slf.state = State::CdataSectionEnd;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("]");
+ slf.state = State::CdataSection;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CdataSectionEnd => match slf.read_char()? {
+ Some(']') => {
+ slf.emitter.emit_string("]");
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.state = State::Data;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("]]");
+ slf.unread_char(c);
+ slf.state = State::CdataSection;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CharacterReference => {
+ slf.temporary_buffer.clear();
+ slf.temporary_buffer.push('&');
+ match slf.read_char()? {
+ Some(x) if x.is_ascii_alphanumeric() => {
+ slf.unread_char(Some(x));
+ slf.state = State::NamedCharacterReference;
+ Ok(ControlToken::Continue)
+ }
+ Some('#') => {
+ slf.temporary_buffer.push('#');
+ slf.state = State::NumericCharacterReference;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.flush_code_points_consumed_as_character_reference();
+ slf.state = slf.return_state.take().unwrap();
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ }
+ }
+ State::NamedCharacterReference => {
+ let c = slf.read_char()?;
-pub(crate) use whitespace_pat;
+ let char_ref = match c {
+ Some(x) => try_read_character_reference(x, |x| slf.try_read_string(x, true))?
+ .map(|char_ref| (x, char_ref)),
-macro_rules! noncharacter_pat {
- () => {
- 0xfdd0
- ..=0xfdef
- | 0xfffe
- | 0xffff
- | 0x1fffe
- | 0x1ffff
- | 0x2fffe
- | 0x2ffff
- | 0x3fffe
- | 0x3ffff
- | 0x4fffe
- | 0x4ffff
- | 0x5fffe
- | 0x5ffff
- | 0x6fffe
- | 0x6ffff
- | 0x7fffe
- | 0x7ffff
- | 0x8fffe
- | 0x8ffff
- | 0x9fffe
- | 0x9ffff
- | 0xafffe
- | 0xaffff
- | 0xbfffe
- | 0xbffff
- | 0xcfffe
- | 0xcffff
- | 0xdfffe
- | 0xdffff
- | 0xefffe
- | 0xeffff
- | 0xffffe
- | 0xfffff
- | 0x10fffe
- | 0x10ffff
- };
-}
+ None => None,
+ };
-pub(crate) use noncharacter_pat;
+ if let Some((x, char_ref)) = char_ref {
+ slf.temporary_buffer.push(x);
+ slf.temporary_buffer.push_str(char_ref.name);
+ let char_ref_name_last_character = char_ref.name.chars().last();
+ let next_character = slf.next_input_character()?;
+ if slf.is_consumed_as_part_of_an_attribute()
+ && char_ref_name_last_character != Some(';')
+ && matches!(next_character, Some(x) if x == '=' || x.is_ascii_alphanumeric())
+ {
+ slf.flush_code_points_consumed_as_character_reference();
+ slf.state = slf.return_state.take().unwrap();
+ Ok(ControlToken::Continue)
+ } else {
+ if char_ref_name_last_character != Some(';') {
+ slf.emitter
+ .emit_error(Error::MissingSemicolonAfterCharacterReference);
+ }
-// When integration tests are running, this enum is public and we get warnings about missing docs.
-// However, it's not actually part of public API.
-#[allow(missing_docs)]
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
-pub enum State {
- Data,
- RcData,
- RawText,
- ScriptData,
- PlainText,
- TagOpen,
- EndTagOpen,
- TagName,
- RcDataLessThanSign,
- RcDataEndTagOpen,
- RcDataEndTagName,
- RawTextLessThanSign,
- RawTextEndTagOpen,
- RawTextEndTagName,
- ScriptDataLessThanSign,
- ScriptDataEndTagOpen,
- ScriptDataEndTagName,
- ScriptDataEscapeStart,
- ScriptDataEscapeStartDash,
- ScriptDataEscaped,
- ScriptDataEscapedDash,
- ScriptDataEscapedDashDash,
- ScriptDataEscapedLessThanSign,
- ScriptDataEscapedEndTagOpen,
- ScriptDataEscapedEndTagName,
- ScriptDataDoubleEscapeStart,
- ScriptDataDoubleEscaped,
- ScriptDataDoubleEscapedDash,
- ScriptDataDoubleEscapedDashDash,
- ScriptDataDoubleEscapedLessThanSign,
- ScriptDataDoubleEscapeEnd,
- BeforeAttributeName,
- AttributeName,
- AfterAttributeName,
- BeforeAttributeValue,
- AttributeValueDoubleQuoted,
- AttributeValueSingleQuoted,
- AttributeValueUnquoted,
- AfterAttributeValueQuoted,
- SelfClosingStartTag,
- BogusComment,
- MarkupDeclarationOpen,
- CommentStart,
- CommentStartDash,
- Comment,
- CommentLessThanSign,
- CommentLessThanSignBang,
- CommentLessThanSignBangDash,
- CommentLessThanSignBangDashDash,
- CommentEndDash,
- CommentEnd,
- CommentEndBang,
- Doctype,
- BeforeDoctypeName,
- DoctypeName,
- AfterDoctypeName,
- AfterDoctypePublicKeyword,
- BeforeDoctypePublicIdentifier,
- DoctypePublicIdentifierDoubleQuoted,
- DoctypePublicIdentifierSingleQuoted,
- AfterDoctypePublicIdentifier,
- BetweenDoctypePublicAndSystemIdentifiers,
- AfterDoctypeSystemKeyword,
- BeforeDoctypeSystemIdentifier,
- DoctypeSystemIdentifierDoubleQuoted,
- DoctypeSystemIdentifierSingleQuoted,
- AfterDoctypeSystemIdentifier,
- BogusDoctype,
- CdataSection,
- CdataSectionBracket,
- CdataSectionEnd,
- CharacterReference,
- NamedCharacterReference,
- AmbiguousAmpersand,
- NumericCharacterReference,
- HexadecimalCharacterReferenceStart,
- DecimalCharacterReferenceStart,
- HexadecimalCharacterReference,
- DecimalCharacterReference,
- NumericCharacterReferenceEnd,
-}
+ slf.temporary_buffer.clear();
+ slf.temporary_buffer.push_str(char_ref.characters);
+ slf.flush_code_points_consumed_as_character_reference();
+ slf.state = slf.return_state.take().unwrap();
+ Ok(ControlToken::Continue)
+ }
+ } else {
+ slf.unread_char(c);
+ slf.flush_code_points_consumed_as_character_reference();
+ slf.state = State::AmbiguousAmpersand;
+ Ok(ControlToken::Continue)
+ }
+ }
+ State::AmbiguousAmpersand => match slf.read_char()? {
+ Some(x) if x.is_ascii_alphanumeric() => {
+ if slf.is_consumed_as_part_of_an_attribute() {
+ slf.emitter.push_attribute_value(ctostr!(x));
+ } else {
+ slf.emitter.emit_string(ctostr!(x));
+ }
+
+ Ok(ControlToken::Continue)
+ }
+ c @ Some(';') => {
+ slf.emitter
+ .emit_error(Error::UnknownNamedCharacterReference);
+ slf.unread_char(c);
+ slf.state = slf.return_state.take().unwrap();
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.unread_char(c);
+ slf.state = slf.return_state.take().unwrap();
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::NumericCharacterReference => {
+ slf.character_reference_code = 0;
+ match slf.read_char()? {
+ Some(x @ 'x' | x @ 'X') => {
+ slf.temporary_buffer.push(x);
+ slf.state = State::HexadecimalCharacterReferenceStart;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.unread_char(c);
+ slf.state = State::DecimalCharacterReferenceStart;
+ Ok(ControlToken::Continue)
+ }
+ }
+ }
+ State::HexadecimalCharacterReferenceStart => match slf.read_char()? {
+ c @ Some('0'..='9' | 'A'..='F' | 'a'..='f') => {
+ slf.unread_char(c);
+ slf.state = State::HexadecimalCharacterReference;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter
+ .emit_error(Error::AbsenceOfDigitsInNumericCharacterReference);
+ slf.flush_code_points_consumed_as_character_reference();
+ slf.unread_char(c);
+ slf.state = slf.return_state.take().unwrap();
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::DecimalCharacterReferenceStart => match slf.read_char()? {
+ Some(x @ ascii_digit_pat!()) => {
+ slf.unread_char(Some(x));
+ slf.state = State::DecimalCharacterReference;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter
+ .emit_error(Error::AbsenceOfDigitsInNumericCharacterReference);
+ slf.flush_code_points_consumed_as_character_reference();
+ slf.unread_char(c);
+ slf.state = slf.return_state.take().unwrap();
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::HexadecimalCharacterReference => match slf.read_char()? {
+ Some(x @ ascii_digit_pat!()) => {
+ mutate_character_reference!(*16 + x - 0x0030);
+ Ok(ControlToken::Continue)
+ }
+ Some(x @ 'A'..='F') => {
+ mutate_character_reference!(*16 + x - 0x0037);
+ Ok(ControlToken::Continue)
+ }
+ Some(x @ 'a'..='f') => {
+ mutate_character_reference!(*16 + x - 0x0057);
+ Ok(ControlToken::Continue)
+ }
+ Some(';') => {
+ slf.state = State::NumericCharacterReferenceEnd;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter
+ .emit_error(Error::MissingSemicolonAfterCharacterReference);
+ slf.unread_char(c);
+ slf.state = State::NumericCharacterReferenceEnd;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::DecimalCharacterReference => match slf.read_char()? {
+ Some(x @ ascii_digit_pat!()) => {
+ mutate_character_reference!(*10 + x - 0x0030);
+ Ok(ControlToken::Continue)
+ }
+ Some(';') => {
+ slf.state = State::NumericCharacterReferenceEnd;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter
+ .emit_error(Error::MissingSemicolonAfterCharacterReference);
+ slf.unread_char(c);
+ slf.state = State::NumericCharacterReferenceEnd;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::NumericCharacterReferenceEnd => {
+ match slf.character_reference_code {
+ 0x00 => {
+ slf.emitter.emit_error(Error::NullCharacterReference);
+ slf.character_reference_code = 0xfffd;
+ }
+ 0x110000.. => {
+ slf.emitter
+ .emit_error(Error::CharacterReferenceOutsideUnicodeRange);
+ slf.character_reference_code = 0xfffd;
+ }
+ surrogate_pat!() => {
+ slf.emitter.emit_error(Error::SurrogateCharacterReference);
+ slf.character_reference_code = 0xfffd;
+ }
+ // noncharacter
+ noncharacter_pat!() => {
+ slf.emitter
+ .emit_error(Error::NoncharacterCharacterReference);
+ }
+ // 0x000d, or a control that is not whitespace
+ x @ 0x000d | x @ control_pat!()
+ if !matches!(x, 0x0009 | 0x000a | 0x000c | 0x0020) =>
+ {
+ slf.emitter.emit_error(Error::ControlCharacterReference);
+ slf.character_reference_code = match x {
+ 0x80 => 0x20AC, // EURO SIGN (€)
+ 0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK (‚)
+ 0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK (ƒ)
+ 0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK („)
+ 0x85 => 0x2026, // HORIZONTAL ELLIPSIS (…)
+ 0x86 => 0x2020, // DAGGER (†)
+ 0x87 => 0x2021, // DOUBLE DAGGER (‡)
+ 0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
+ 0x89 => 0x2030, // PER MILLE SIGN (‰)
+ 0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON (Š)
+ 0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
+ 0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE (Œ)
+ 0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON (Ž)
+ 0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK (‘)
+ 0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK (’)
+ 0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK (“)
+ 0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK (”)
+ 0x95 => 0x2022, // BULLET (•)
+ 0x96 => 0x2013, // EN DASH (–)
+ 0x97 => 0x2014, // EM DASH (—)
+ 0x98 => 0x02DC, // SMALL TILDE (˜)
+ 0x99 => 0x2122, // TRADE MARK SIGN (™)
+ 0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON (š)
+ 0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
+ 0x9C => 0x0153, // LATIN SMALL LIGATURE OE (œ)
+ 0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON (ž)
+ 0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
+ _ => slf.character_reference_code,
+ };
+ }
+ _ => (),
+ }
-pub enum ControlToken {
- Eof,
- Continue,
+ slf.temporary_buffer.clear();
+ slf.temporary_buffer
+ .push(std::char::from_u32(slf.character_reference_code).unwrap());
+ slf.flush_code_points_consumed_as_character_reference();
+ slf.state = slf.return_state.take().unwrap();
+ Ok(ControlToken::Continue)
+ }
+ }
}
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
new file mode 100644
index 0000000..d7e60ac
--- /dev/null
+++ b/src/tokenizer.rs
@@ -0,0 +1,244 @@
+use crate::machine;
+use crate::utils::{control_pat, noncharacter_pat, surrogate_pat, ControlToken, State};
+use crate::{DefaultEmitter, Emitter, Error, Never, Readable, Reader};
+
+// this is a stack that can hold 0 to 2 Ts
+#[derive(Debug, Default)]
+struct Stack2<T: Copy>(Option<(T, Option<T>)>);
+
+impl<T: Copy> Stack2<T> {
+ #[inline]
+ fn push(&mut self, c: T) {
+ self.0 = match self.0 {
+ None => Some((c, None)),
+ Some((c1, None)) => Some((c1, Some(c))),
+ Some((_c1, Some(_c2))) => panic!("stack full!"),
+ }
+ }
+
+ #[inline]
+ fn pop(&mut self) -> Option<T> {
+ let (new_self, rv) = match self.0 {
+ Some((c1, Some(c2))) => (Some((c1, None)), Some(c2)),
+ Some((c1, None)) => (None, Some(c1)),
+ None => (None, None),
+ };
+ self.0 = new_self;
+ rv
+ }
+
+ #[inline]
+ fn is_empty(&self) -> bool {
+ matches!(self.0, None)
+ }
+}
+
+/// A HTML tokenizer. See crate-level docs for basic usage.
+pub struct Tokenizer<R: Reader, E: Emitter = DefaultEmitter> {
+ eof: bool,
+ pub(crate) state: State,
+ pub(crate) emitter: E,
+ pub(crate) temporary_buffer: String,
+ reader: R,
+ to_reconsume: Stack2<Option<char>>,
+ pub(crate) character_reference_code: u32,
+ pub(crate) return_state: Option<State>,
+}
+
+impl<R: Reader> Tokenizer<R> {
+ /// Create a new tokenizer from some input.
+ ///
+ /// `input` can be `&String` or `&str` at the moment, as those are the types for which
+ /// [`crate::Readable`] is implemented, but you can implement that trait on your own types.
+ ///
+ /// Patches are welcome for providing an efficient implementation over async streams,
+ /// iterators, files, etc, as long as any dependencies come behind featureflags.
+ pub fn new<'a, S: Readable<'a, Reader = R>>(input: S) -> Self {
+ Tokenizer::<S::Reader>::new_with_emitter(input, DefaultEmitter::default())
+ }
+}
+
+impl<R: Reader, E: Emitter> Tokenizer<R, E> {
+ /// Construct a new tokenizer from some input and a custom emitter.
+ ///
+ /// Use this method over [`Tokenizer::new`] when you want to have more control over string allocation for
+ /// tokens.
+ pub fn new_with_emitter<'a, S: Readable<'a, Reader = R>>(input: S, emitter: E) -> Self {
+ Tokenizer {
+ eof: false,
+ state: State::Data,
+ emitter,
+ temporary_buffer: String::new(),
+ to_reconsume: Stack2::default(),
+ reader: input.to_reader(),
+ character_reference_code: 0,
+ return_state: None,
+ }
+ }
+
+ /// Test-internal function to override internal state.
+ ///
+ /// Only available with the `integration-tests` feature which is not public API.
+ #[cfg(feature = "integration-tests")]
+ pub fn set_state(&mut self, state: State) {
+ self.state = state;
+ }
+
+ /// Set the statemachine to start/continue in [plaintext
+ /// state](https://html.spec.whatwg.org/#plaintext-state).
+ ///
+ /// This tokenizer never gets into that state naturally.
+ pub fn set_plaintext_state(&mut self) {
+ self.state = State::PlainText;
+ }
+
+ /// Test-internal function to override internal state.
+ ///
+ /// Only available with the `integration-tests` feature which is not public API.
+ #[cfg(feature = "integration-tests")]
+ pub fn set_last_start_tag(&mut self, last_start_tag: Option<&str>) {
+ self.emitter.set_last_start_tag(last_start_tag);
+ }
+
+ #[inline]
+ pub(crate) fn unread_char(&mut self, c: Option<char>) {
+ self.to_reconsume.push(c);
+ }
+
+ #[inline]
+ fn validate_char(&mut self, c: char) {
+ match c as u32 {
+ surrogate_pat!() => {
+ self.emitter.emit_error(Error::SurrogateInInputStream);
+ }
+ noncharacter_pat!() => {
+ self.emitter.emit_error(Error::NoncharacterInInputStream);
+ }
+ // control without whitespace or nul
+ x @ control_pat!()
+ if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) =>
+ {
+ self.emitter
+ .emit_error(Error::ControlCharacterInInputStream);
+ }
+ _ => (),
+ }
+ }
+
+ pub(crate) fn read_char(&mut self) -> Result<Option<char>, R::Error> {
+ let (c_res, reconsumed) = match self.to_reconsume.pop() {
+ Some(c) => (Ok(c), true),
+ None => (self.reader.read_char(), false),
+ };
+
+ let mut c = match c_res {
+ Ok(Some(c)) => c,
+ res => return res,
+ };
+
+ if c == '\r' {
+ c = '\n';
+ let c2 = self.reader.read_char()?;
+ if c2 != Some('\n') {
+ self.unread_char(c2);
+ }
+ }
+
+ if !reconsumed {
+ self.validate_char(c);
+ }
+
+ Ok(Some(c))
+ }
+
+ #[inline]
+ pub(crate) fn try_read_string(
+ &mut self,
+ s: &str,
+ case_sensitive: bool,
+ ) -> Result<bool, R::Error> {
+ debug_assert!(!s.is_empty());
+ debug_assert!(self.to_reconsume.is_empty());
+ self.reader.try_read_string(s, case_sensitive)
+ }
+
+ pub(crate) fn is_consumed_as_part_of_an_attribute(&self) -> bool {
+ matches!(
+ self.return_state,
+ Some(
+ State::AttributeValueDoubleQuoted
+ | State::AttributeValueSingleQuoted
+ | State::AttributeValueUnquoted
+ )
+ )
+ }
+
+ pub(crate) fn flush_code_points_consumed_as_character_reference(&mut self) {
+ if self.is_consumed_as_part_of_an_attribute() {
+ self.emitter.push_attribute_value(&self.temporary_buffer);
+ self.temporary_buffer.clear();
+ } else {
+ self.flush_buffer_characters();
+ }
+ }
+
+ pub(crate) fn next_input_character(&mut self) -> Result<Option<char>, R::Error> {
+ let rv = self.read_char()?;
+ self.unread_char(rv);
+ Ok(rv)
+ }
+
+ pub(crate) fn flush_buffer_characters(&mut self) {
+ self.emitter.emit_string(&self.temporary_buffer);
+ self.temporary_buffer.clear();
+ }
+}
+
+impl<R: Reader, E: Emitter> Iterator for Tokenizer<R, E> {
+ type Item = Result<E::Token, R::Error>;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ loop {
+ if let Some(token) = self.emitter.pop_token() {
+ break Some(Ok(token));
+ } else if !self.eof {
+ match machine::consume(self) {
+ Ok(ControlToken::Continue) => (),
+ Ok(ControlToken::Eof) => {
+ self.eof = true;
+ self.emitter.emit_eof();
+ }
+ Err(e) => break Some(Err(e)),
+ }
+ } else {
+ break None;
+ }
+ }
+ }
+}
+
+/// A kind of tokenizer that directly yields tokens when used as an iterator, so `Token` instead of
+/// `Result<Token, _>`.
+///
+/// This is the return value of [`Tokenizer::infallible`].
+pub struct InfallibleTokenizer<R: Reader<Error = Never>, E: Emitter>(Tokenizer<R, E>);
+
+impl<R: Reader<Error = Never>, E: Emitter> Tokenizer<R, E> {
+ /// Statically assert that this iterator is infallible.
+ ///
+ /// Call this to get rid of error handling when parsing HTML from strings.
+ pub fn infallible(self) -> InfallibleTokenizer<R, E> {
+ InfallibleTokenizer(self)
+ }
+}
+
+impl<R: Reader<Error = Never>, E: Emitter> Iterator for InfallibleTokenizer<R, E> {
+ type Item = E::Token;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ match self.0.next()? {
+ Ok(token) => Some(token),
+ Err(e) => match e {},
+ }
+ }
+}
diff --git a/src/utils.rs b/src/utils.rs
new file mode 100644
index 0000000..67db1b9
--- /dev/null
+++ b/src/utils.rs
@@ -0,0 +1,164 @@
+macro_rules! surrogate_pat {
+ () => {
+ 0xd800..=0xdfff
+ };
+}
+
+pub(crate) use surrogate_pat;
+
+macro_rules! control_pat {
+ () => (0x0d | 0x0000..=0x001f | 0x007f..=0x009f)
+}
+
+pub(crate) use control_pat;
+
+macro_rules! ascii_digit_pat {
+ () => {
+ '0'..='9'
+ };
+}
+
+pub(crate) use ascii_digit_pat;
+
+macro_rules! whitespace_pat {
+ () => {
+ '\t' | '\u{0A}' | '\u{0C}' | ' '
+ };
+}
+
+pub(crate) use whitespace_pat;
+
+macro_rules! noncharacter_pat {
+ () => {
+ 0xfdd0
+ ..=0xfdef
+ | 0xfffe
+ | 0xffff
+ | 0x1fffe
+ | 0x1ffff
+ | 0x2fffe
+ | 0x2ffff
+ | 0x3fffe
+ | 0x3ffff
+ | 0x4fffe
+ | 0x4ffff
+ | 0x5fffe
+ | 0x5ffff
+ | 0x6fffe
+ | 0x6ffff
+ | 0x7fffe
+ | 0x7ffff
+ | 0x8fffe
+ | 0x8ffff
+ | 0x9fffe
+ | 0x9ffff
+ | 0xafffe
+ | 0xaffff
+ | 0xbfffe
+ | 0xbffff
+ | 0xcfffe
+ | 0xcffff
+ | 0xdfffe
+ | 0xdffff
+ | 0xefffe
+ | 0xeffff
+ | 0xffffe
+ | 0xfffff
+ | 0x10fffe
+ | 0x10ffff
+ };
+}
+
+pub(crate) use noncharacter_pat;
+
+// When integration tests are running, this enum is public and we get warnings about missing docs.
+// However, it's not actually part of public API.
+#[allow(missing_docs)]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum State {
+ Data,
+ RcData,
+ RawText,
+ ScriptData,
+ PlainText,
+ TagOpen,
+ EndTagOpen,
+ TagName,
+ RcDataLessThanSign,
+ RcDataEndTagOpen,
+ RcDataEndTagName,
+ RawTextLessThanSign,
+ RawTextEndTagOpen,
+ RawTextEndTagName,
+ ScriptDataLessThanSign,
+ ScriptDataEndTagOpen,
+ ScriptDataEndTagName,
+ ScriptDataEscapeStart,
+ ScriptDataEscapeStartDash,
+ ScriptDataEscaped,
+ ScriptDataEscapedDash,
+ ScriptDataEscapedDashDash,
+ ScriptDataEscapedLessThanSign,
+ ScriptDataEscapedEndTagOpen,
+ ScriptDataEscapedEndTagName,
+ ScriptDataDoubleEscapeStart,
+ ScriptDataDoubleEscaped,
+ ScriptDataDoubleEscapedDash,
+ ScriptDataDoubleEscapedDashDash,
+ ScriptDataDoubleEscapedLessThanSign,
+ ScriptDataDoubleEscapeEnd,
+ BeforeAttributeName,
+ AttributeName,
+ AfterAttributeName,
+ BeforeAttributeValue,
+ AttributeValueDoubleQuoted,
+ AttributeValueSingleQuoted,
+ AttributeValueUnquoted,
+ AfterAttributeValueQuoted,
+ SelfClosingStartTag,
+ BogusComment,
+ MarkupDeclarationOpen,
+ CommentStart,
+ CommentStartDash,
+ Comment,
+ CommentLessThanSign,
+ CommentLessThanSignBang,
+ CommentLessThanSignBangDash,
+ CommentLessThanSignBangDashDash,
+ CommentEndDash,
+ CommentEnd,
+ CommentEndBang,
+ Doctype,
+ BeforeDoctypeName,
+ DoctypeName,
+ AfterDoctypeName,
+ AfterDoctypePublicKeyword,
+ BeforeDoctypePublicIdentifier,
+ DoctypePublicIdentifierDoubleQuoted,
+ DoctypePublicIdentifierSingleQuoted,
+ AfterDoctypePublicIdentifier,
+ BetweenDoctypePublicAndSystemIdentifiers,
+ AfterDoctypeSystemKeyword,
+ BeforeDoctypeSystemIdentifier,
+ DoctypeSystemIdentifierDoubleQuoted,
+ DoctypeSystemIdentifierSingleQuoted,
+ AfterDoctypeSystemIdentifier,
+ BogusDoctype,
+ CdataSection,
+ CdataSectionBracket,
+ CdataSectionEnd,
+ CharacterReference,
+ NamedCharacterReference,
+ AmbiguousAmpersand,
+ NumericCharacterReference,
+ HexadecimalCharacterReferenceStart,
+ DecimalCharacterReferenceStart,
+ HexadecimalCharacterReference,
+ DecimalCharacterReference,
+ NumericCharacterReferenceEnd,
+}
+
+pub enum ControlToken {
+ Eof,
+ Continue,
+}