summaryrefslogtreecommitdiff
path: root/src/tokenizer
diff options
context:
space:
mode:
authorMartin Fischer <martin@push-f.com>2023-09-09 20:38:51 +0200
committerMartin Fischer <martin@push-f.com>2023-09-28 10:36:08 +0200
commita4846482707412f3cbacabeb082f25762a2636d5 (patch)
treebbde44d58c0e1f63110e008ac3001acc11de84cf /src/tokenizer
parent595985f5e17fafedc1cf6a72691ea7eb7dc20174 (diff)
refactor: move machine module under tokenizer
Diffstat (limited to 'src/tokenizer')
-rw-r--r--src/tokenizer/machine.rs1987
1 files changed, 1987 insertions, 0 deletions
diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs
new file mode 100644
index 0000000..fd4b36b
--- /dev/null
+++ b/src/tokenizer/machine.rs
@@ -0,0 +1,1987 @@
+use crate::entities::try_read_character_reference;
+use crate::offset::{Offset, Position};
+use crate::token::AttrValueSyntax;
+use crate::tokenizer::CdataAction;
+use crate::utils::{
+ ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat, State,
+};
+use crate::{reader::Reader, Emitter, Error, Tokenizer};
+
+/// Outcome of a single `consume` call, i.e. of one step of the tokenizer
+/// state machine in this module.
+pub enum ControlToken {
+ /// Returned from the end-of-input arms (`read_char` yielded `None`) of the
+ /// states that stop there, after any pending error/token has been emitted.
+ Eof,
+ /// Returned after a step that emitted output, changed `slf.state` and/or
+ /// un-read a character. Presumably tells the caller to invoke `consume`
+ /// again -- confirm against the caller, which is not part of this module.
+ Continue,
+ /// Returned from `State::MarkupDeclarationOpen` once `[` followed by
+ /// `CDATA[` has been read. That arm performs no state change itself;
+ /// NOTE(review): the follow-up is presumably chosen by the caller
+ /// (cf. the `CdataAction` import) -- confirm.
+ CdataOpen,
+}
+
+#[inline]
+pub fn consume<O, R, E>(slf: &mut Tokenizer<R, O, E>) -> Result<ControlToken, R::Error>
+where
+ O: Offset,
+ R: Reader + Position<O>,
+ E: Emitter<O>,
+{
+ // Updates `slf.character_reference_code` in place to
+ // `code * $mul + ($x as u32 - $sub)`.
+ //
+ // Invocation shape: `mutate_character_reference!(* <mul> + <ident> - <sub>)`.
+ // NOTE(review): the call sites are outside this chunk; presumably `$mul` is
+ // the numeric base, `$x` the digit character just read and `$sub` the code
+ // point that maps the digit to its value -- confirm.
+ macro_rules! mutate_character_reference {
+ (* $mul:literal + $x:ident - $sub:literal) => {
+ // The multiplication and the addition are overflow-checked; the inner
+ // `$x as u32 - $sub` is a plain subtraction.
+ // NOTE(review): assumes callers only pass `$x >= $sub` -- TODO confirm.
+ match slf
+ .character_reference_code
+ .checked_mul($mul)
+ .and_then(|cr| cr.checked_add($x as u32 - $sub))
+ {
+ Some(cr) => slf.character_reference_code = cr,
+ None => {
+ // Overflow: store 0x110000 (one past the largest Unicode code
+ // point, 0x10FFFF) to provoke an error. NOTE(review): the check
+ // that reports this error is outside this chunk -- confirm.
+ slf.character_reference_code = 0x110000;
+ }
+ };
+ };
+ }
+
+ slf.position_before_match = slf.reader.position();
+
+ match slf.state {
+ State::Data => match slf.read_char()? {
+ Some('&') => {
+ slf.return_state = Some(slf.state);
+ slf.state = State::CharacterReference;
+ Ok(ControlToken::Continue)
+ }
+ Some('<') => {
+ slf.some_offset = slf.position_before_match;
+ slf.state = State::TagOpen;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.emit_string("\0");
+ Ok(ControlToken::Continue)
+ }
+ Some(x) => {
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ None => Ok(ControlToken::Eof),
+ },
+ State::RcData => match slf.read_char()? {
+ Some('&') => {
+ slf.return_state = Some(State::RcData);
+ slf.state = State::CharacterReference;
+ Ok(ControlToken::Continue)
+ }
+ Some('<') => {
+ slf.state = State::RcDataLessThanSign;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.emit_string("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some(x) => {
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ None => Ok(ControlToken::Eof),
+ },
+ State::RawText => match slf.read_char()? {
+ Some('<') => {
+ slf.state = State::RawTextLessThanSign;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.emit_string("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some(x) => {
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ None => Ok(ControlToken::Eof),
+ },
+ State::ScriptData => match slf.read_char()? {
+ Some('<') => {
+ slf.state = State::ScriptDataLessThanSign;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.emit_string("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some(x) => {
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ None => Ok(ControlToken::Eof),
+ },
+ State::PlainText => match slf.read_char()? {
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.emit_string("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some(x) => {
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ None => Ok(ControlToken::Eof),
+ },
+ State::TagOpen => match slf.read_char()? {
+ Some('!') => {
+ slf.state = State::MarkupDeclarationOpen;
+ Ok(ControlToken::Continue)
+ }
+ Some('/') => {
+ slf.state = State::EndTagOpen;
+ Ok(ControlToken::Continue)
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.init_start_tag();
+ slf.state = State::TagName;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ c @ Some('?') => {
+ slf.emit_error(Error::UnexpectedQuestionMarkInsteadOfTagName);
+ slf.emitter.init_comment(slf.reader.position());
+ slf.state = State::BogusComment;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofBeforeTagName);
+ slf.emitter.emit_string("<");
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emit_error(Error::InvalidFirstCharacterOfTagName);
+ slf.state = State::Data;
+ slf.emitter.emit_string("<");
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::EndTagOpen => match slf.read_char()? {
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.init_end_tag();
+ slf.state = State::TagName;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emit_error(Error::MissingEndTagName);
+ slf.state = State::Data;
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofBeforeTagName);
+ slf.emitter.emit_string("</");
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emit_error(Error::InvalidFirstCharacterOfTagName);
+ slf.emitter.init_comment(slf.reader.position());
+ slf.state = State::BogusComment;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::TagName => match slf.read_char()? {
+ Some(whitespace_pat!()) => {
+ slf.emitter.terminate_tag_name(slf.position_before_match);
+ slf.state = State::BeforeAttributeName;
+ Ok(ControlToken::Continue)
+ }
+ Some('/') => {
+ slf.emitter.terminate_tag_name(slf.position_before_match);
+ slf.state = State::SelfClosingStartTag;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emitter.terminate_tag_name(slf.position_before_match);
+ slf.state = State::Data;
+ slf.emit_current_tag();
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.push_tag_name("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some(x) => {
+ slf.push_tag_name(ctostr!(x.to_ascii_lowercase()));
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInTag);
+ Ok(ControlToken::Eof)
+ }
+ },
+ State::RcDataLessThanSign => match slf.read_char()? {
+ Some('/') => {
+ slf.temporary_buffer.clear();
+ slf.state = State::RcDataEndTagOpen;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("<");
+ slf.state = State::RcData;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::RcDataEndTagOpen => match slf.read_char()? {
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.init_end_tag();
+ slf.state = State::RcDataEndTagName;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("</");
+ slf.state = State::RcData;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::RcDataEndTagName => match slf.read_char()? {
+ Some(whitespace_pat!()) if slf.current_end_tag_is_appropriate() => {
+ slf.state = State::BeforeAttributeName;
+ Ok(ControlToken::Continue)
+ }
+ Some('/') if slf.current_end_tag_is_appropriate() => {
+ slf.state = State::SelfClosingStartTag;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') if slf.current_end_tag_is_appropriate() => {
+ slf.state = State::Data;
+ slf.emit_current_tag();
+ Ok(ControlToken::Continue)
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.push_tag_name(ctostr!(x.to_ascii_lowercase()));
+ slf.temporary_buffer.push(x);
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("</");
+ slf.flush_buffer_characters();
+
+ slf.state = State::RcData;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::RawTextLessThanSign => match slf.read_char()? {
+ Some('/') => {
+ slf.temporary_buffer.clear();
+ slf.state = State::RawTextEndTagOpen;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("<");
+ slf.state = State::RawText;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::RawTextEndTagOpen => match slf.read_char()? {
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.init_end_tag();
+ slf.state = State::RawTextEndTagName;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("</");
+ slf.state = State::RawText;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::RawTextEndTagName => match slf.read_char()? {
+ Some(whitespace_pat!()) if slf.current_end_tag_is_appropriate() => {
+ slf.state = State::BeforeAttributeName;
+ Ok(ControlToken::Continue)
+ }
+ Some('/') if slf.current_end_tag_is_appropriate() => {
+ slf.state = State::SelfClosingStartTag;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') if slf.current_end_tag_is_appropriate() => {
+ slf.state = State::Data;
+ slf.emit_current_tag();
+ Ok(ControlToken::Continue)
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.push_tag_name(ctostr!(x.to_ascii_lowercase()));
+ slf.temporary_buffer.push(x);
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("</");
+ slf.flush_buffer_characters();
+
+ slf.state = State::RawText;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataLessThanSign => match slf.read_char()? {
+ Some('/') => {
+ slf.temporary_buffer.clear();
+ slf.state = State::ScriptDataEndTagOpen;
+ Ok(ControlToken::Continue)
+ }
+ Some('!') => {
+ slf.state = State::ScriptDataEscapeStart;
+ slf.emitter.emit_string("<!");
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("<");
+ slf.state = State::ScriptData;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataEndTagOpen => match slf.read_char()? {
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.init_end_tag();
+ slf.state = State::ScriptDataEndTagName;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("</");
+ slf.state = State::ScriptData;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataEndTagName => match slf.read_char()? {
+ Some(whitespace_pat!()) if slf.current_end_tag_is_appropriate() => {
+ slf.state = State::BeforeAttributeName;
+ Ok(ControlToken::Continue)
+ }
+ Some('/') if slf.current_end_tag_is_appropriate() => {
+ slf.state = State::SelfClosingStartTag;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') if slf.current_end_tag_is_appropriate() => {
+ slf.state = State::Data;
+ slf.emit_current_tag();
+ Ok(ControlToken::Continue)
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.push_tag_name(ctostr!(x.to_ascii_lowercase()));
+ slf.temporary_buffer.push(x.to_ascii_lowercase());
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("</");
+ slf.flush_buffer_characters();
+ slf.state = State::Data;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataEscapeStart => match slf.read_char()? {
+ Some('-') => {
+ slf.state = State::ScriptDataEscapeStartDash;
+ slf.emitter.emit_string("-");
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.state = State::ScriptData;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataEscapeStartDash => match slf.read_char()? {
+ Some('-') => {
+ slf.state = State::ScriptDataEscapedDashDash;
+ slf.emitter.emit_string("-");
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.state = State::ScriptData;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataEscaped => match slf.read_char()? {
+ Some('-') => {
+ slf.state = State::ScriptDataEscapedDash;
+ slf.emitter.emit_string("-");
+ Ok(ControlToken::Continue)
+ }
+ Some('<') => {
+ slf.state = State::ScriptDataEscapedLessThanSign;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.emit_string("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInScriptHtmlCommentLikeText);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataEscapedDash => match slf.read_char()? {
+ Some('-') => {
+ slf.state = State::ScriptDataEscapedDashDash;
+ slf.emitter.emit_string("-");
+ Ok(ControlToken::Continue)
+ }
+ Some('<') => {
+ slf.state = State::ScriptDataEscapedLessThanSign;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.state = State::ScriptDataEscaped;
+ slf.emitter.emit_string("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInScriptHtmlCommentLikeText);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.state = State::ScriptDataEscaped;
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataEscapedDashDash => match slf.read_char()? {
+ Some('-') => {
+ slf.emitter.emit_string("-");
+ Ok(ControlToken::Continue)
+ }
+ Some('<') => {
+ slf.state = State::ScriptDataEscapedLessThanSign;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.state = State::ScriptData;
+ slf.emitter.emit_string(">");
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.state = State::ScriptDataEscaped;
+ slf.emitter.emit_string("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInScriptHtmlCommentLikeText);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.state = State::ScriptDataEscaped;
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataEscapedLessThanSign => match slf.read_char()? {
+ Some('/') => {
+ slf.temporary_buffer.clear();
+ slf.state = State::ScriptDataEscapedEndTagOpen;
+ Ok(ControlToken::Continue)
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.temporary_buffer.clear();
+ slf.emitter.emit_string("<");
+ slf.state = State::ScriptDataDoubleEscapeStart;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("<");
+ slf.state = State::ScriptDataEscaped;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataEscapedEndTagOpen => match slf.read_char()? {
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.init_end_tag();
+ slf.state = State::ScriptDataEscapedEndTagName;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("</");
+ slf.unread_char(c);
+ slf.state = State::ScriptDataEscaped;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataEscapedEndTagName => match slf.read_char()? {
+ Some(whitespace_pat!()) if slf.current_end_tag_is_appropriate() => {
+ slf.state = State::BeforeAttributeName;
+ Ok(ControlToken::Continue)
+ }
+ Some('/') if slf.current_end_tag_is_appropriate() => {
+ slf.state = State::SelfClosingStartTag;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') if slf.current_end_tag_is_appropriate() => {
+ slf.state = State::Data;
+ slf.emit_current_tag();
+ Ok(ControlToken::Continue)
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.push_tag_name(ctostr!(x.to_ascii_lowercase()));
+ slf.temporary_buffer.push(x);
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("</");
+ slf.flush_buffer_characters();
+ slf.state = State::ScriptDataEscaped;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataDoubleEscapeStart => match slf.read_char()? {
+ Some(x @ whitespace_pat!() | x @ '/' | x @ '>') => {
+ if slf.temporary_buffer == "script" {
+ slf.state = State::ScriptDataDoubleEscaped;
+ } else {
+ slf.state = State::ScriptDataEscaped;
+ }
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.temporary_buffer.push(x.to_ascii_lowercase());
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.state = State::ScriptDataEscaped;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataDoubleEscaped => match slf.read_char()? {
+ Some('-') => {
+ slf.state = State::ScriptDataDoubleEscapedDash;
+ slf.emitter.emit_string("-");
+ Ok(ControlToken::Continue)
+ }
+ Some('<') => {
+ slf.state = State::ScriptDataDoubleEscapedLessThanSign;
+ slf.emitter.emit_string("<");
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.emit_string("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInScriptHtmlCommentLikeText);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataDoubleEscapedDash => match slf.read_char()? {
+ Some('-') => {
+ slf.state = State::ScriptDataDoubleEscapedDashDash;
+ slf.emitter.emit_string("-");
+ Ok(ControlToken::Continue)
+ }
+ Some('<') => {
+ slf.state = State::ScriptDataDoubleEscapedLessThanSign;
+ slf.emitter.emit_string("<");
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.state = State::ScriptDataDoubleEscaped;
+ slf.emitter.emit_string("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInScriptHtmlCommentLikeText);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.state = State::ScriptDataDoubleEscaped;
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataDoubleEscapedDashDash => match slf.read_char()? {
+ Some('-') => {
+ slf.emitter.emit_string("-");
+ Ok(ControlToken::Continue)
+ }
+ Some('<') => {
+ slf.emitter.emit_string("<");
+ slf.state = State::ScriptDataDoubleEscapedLessThanSign;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emitter.emit_string(">");
+ slf.state = State::ScriptData;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.state = State::ScriptDataDoubleEscaped;
+ slf.emitter.emit_string("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInScriptHtmlCommentLikeText);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.state = State::ScriptDataDoubleEscaped;
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataDoubleEscapedLessThanSign => match slf.read_char()? {
+ Some('/') => {
+ slf.temporary_buffer.clear();
+ slf.state = State::ScriptDataDoubleEscapeEnd;
+ slf.emitter.emit_string("/");
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.state = State::ScriptDataDoubleEscaped;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::ScriptDataDoubleEscapeEnd => match slf.read_char()? {
+ Some(x @ whitespace_pat!() | x @ '/' | x @ '>') => {
+ if slf.temporary_buffer == "script" {
+ slf.state = State::ScriptDataEscaped;
+ } else {
+ slf.state = State::ScriptDataDoubleEscaped;
+ }
+
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ Some(x) if x.is_ascii_alphabetic() => {
+ slf.temporary_buffer.push(x.to_ascii_lowercase());
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.state = State::ScriptDataDoubleEscaped;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::BeforeAttributeName => match slf.read_char()? {
+ Some(whitespace_pat!()) => Ok(ControlToken::Continue),
+ c @ Some('/' | '>') | c @ None => {
+ slf.state = State::AfterAttributeName;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ Some('=') => {
+ slf.emit_error(Error::UnexpectedEqualsSignBeforeAttributeName);
+ slf.emitter.init_attribute_name(slf.reader.position());
+ slf.emitter.push_attribute_name("=");
+ slf.state = State::AttributeName;
+ Ok(ControlToken::Continue)
+ }
+ Some(x) => {
+ slf.emitter.init_attribute_name(slf.position_before_match);
+ slf.state = State::AttributeName;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::AttributeName => match slf.read_char()? {
+ c @ Some(whitespace_pat!() | '/' | '>') | c @ None => {
+ slf.emitter
+ .terminate_attribute_name(slf.position_before_match);
+ slf.state = State::AfterAttributeName;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ Some('=') => {
+ slf.emitter
+ .terminate_attribute_name(slf.position_before_match);
+ slf.state = State::BeforeAttributeValue;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_attribute_name("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some(x @ '"' | x @ '\'' | x @ '<') => {
+ slf.emit_error(Error::UnexpectedCharacterInAttributeName);
+ slf.emitter
+ .push_attribute_name(ctostr!(x.to_ascii_lowercase()));
+ Ok(ControlToken::Continue)
+ }
+ Some(x) => {
+ slf.emitter
+ .push_attribute_name(ctostr!(x.to_ascii_lowercase()));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::AfterAttributeName => match slf.read_char()? {
+ Some(whitespace_pat!()) => Ok(ControlToken::Continue),
+ Some('/') => {
+ slf.state = State::SelfClosingStartTag;
+ Ok(ControlToken::Continue)
+ }
+ Some('=') => {
+ slf.state = State::BeforeAttributeValue;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.state = State::Data;
+ slf.emit_current_tag();
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInTag);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.init_attribute_name(slf.position_before_match);
+ slf.state = State::AttributeName;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::BeforeAttributeValue => match slf.read_char()? {
+ Some(whitespace_pat!()) => Ok(ControlToken::Continue),
+ Some('"') => {
+ slf.emitter
+ .init_attribute_value(AttrValueSyntax::DoubleQuoted, slf.reader.position());
+ slf.state = State::AttributeValueDoubleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('\'') => {
+ slf.emitter
+ .init_attribute_value(AttrValueSyntax::SingleQuoted, slf.reader.position());
+ slf.state = State::AttributeValueSingleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emit_error(Error::MissingAttributeValue);
+ slf.state = State::Data;
+ slf.emit_current_tag();
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter
+ .init_attribute_value(AttrValueSyntax::Unquoted, slf.position_before_match);
+ slf.state = State::AttributeValueUnquoted;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::AttributeValueDoubleQuoted => match slf.read_char()? {
+ Some('"') => {
+ slf.emitter.terminate_attribute_value(
+ // We cannot simply pass slf.position_before_match because
+ // State::NamedCharacterReference calls Tokenizer::unread_char
+ // which Reader::position doesn't account for.
+ // TODO: pass slf.position_before_match once CharacterReference has been converted to a function call
+ slf.reader.position() - slf.reader.len_of_char_in_current_encoding('"'),
+ );
+ slf.state = State::AfterAttributeValueQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('&') => {
+ slf.return_state = Some(State::AttributeValueDoubleQuoted);
+ slf.state = State::CharacterReference;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_attribute_value("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInTag);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.push_attribute_value(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::AttributeValueSingleQuoted => match slf.read_char()? {
+ Some('\'') => {
+ slf.emitter.terminate_attribute_value(
+ // We cannot simply pass slf.position_before_match because
+ // State::NamedCharacterReference calls Tokenizer::unread_char
+ // which Reader::position doesn't account for.
+ // TODO: pass slf.position_before_match once CharacterReference has been converted to a function call
+ slf.reader.position() - slf.reader.len_of_char_in_current_encoding('\''),
+ );
+ slf.state = State::AfterAttributeValueQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('&') => {
+ slf.return_state = Some(State::AttributeValueSingleQuoted);
+ slf.state = State::CharacterReference;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_attribute_value("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInTag);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.push_attribute_value(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::AttributeValueUnquoted => match slf.read_char()? {
+ Some(whitespace_pat!()) => {
+ slf.emitter.terminate_attribute_value(
+ // We cannot simply pass slf.position_before_match because
+ // State::NamedCharacterReference calls Tokenizer::unread_char
+ // which Reader::position doesn't account for.
+ // TODO: pass slf.position_before_match once CharacterReference has been converted to a function call
+ slf.reader.position() - slf.reader.len_of_char_in_current_encoding(' '),
+ );
+ slf.state = State::BeforeAttributeName;
+ Ok(ControlToken::Continue)
+ }
+ Some('&') => {
+ slf.return_state = Some(State::AttributeValueUnquoted);
+ slf.state = State::CharacterReference;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.state = State::Data;
+ slf.emit_current_tag();
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_attribute_value("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some(x @ '"' | x @ '\'' | x @ '<' | x @ '=' | x @ '\u{60}') => {
+ slf.emit_error(Error::UnexpectedCharacterInUnquotedAttributeValue);
+ slf.emitter.push_attribute_value(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInTag);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.push_attribute_value(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::AfterAttributeValueQuoted => match slf.read_char()? {
+ Some(whitespace_pat!()) => {
+ slf.state = State::BeforeAttributeName;
+ Ok(ControlToken::Continue)
+ }
+ Some('/') => {
+ slf.state = State::SelfClosingStartTag;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.state = State::Data;
+ slf.emit_current_tag();
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInTag);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emit_error(Error::MissingWhitespaceBetweenAttributes);
+ slf.state = State::BeforeAttributeName;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::SelfClosingStartTag => match slf.read_char()? {
+ Some('>') => {
+ slf.emitter.set_self_closing(
+ slf.position_before_match - slf.reader.len_of_char_in_current_encoding('/')
+ ..slf.position_before_match,
+ );
+ slf.state = State::Data;
+ slf.emit_current_tag();
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInTag);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emit_error(Error::UnexpectedSolidusInTag);
+ slf.state = State::BeforeAttributeName;
+ slf.unread_char(Some(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::BogusComment => match slf.read_char()? {
+ Some('>') => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_comment(slf.position_before_match);
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_current_comment(slf.position_before_match);
+ Ok(ControlToken::Eof)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_comment("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some(x) => {
+ slf.emitter.push_comment(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::MarkupDeclarationOpen => match slf.read_char()? {
+ Some('-') if slf.try_read_string("-", true)? => {
+ slf.emitter.init_comment(slf.reader.position());
+ slf.state = State::CommentStart;
+ Ok(ControlToken::Continue)
+ }
+ Some('d' | 'D') if slf.try_read_string("octype", false)? => {
+ slf.state = State::Doctype;
+ Ok(ControlToken::Continue)
+ }
+ Some('[') if slf.try_read_string("CDATA[", true)? => Ok(ControlToken::CdataOpen),
+ c => {
+ slf.emit_error(Error::IncorrectlyOpenedComment);
+ slf.emitter.init_comment(slf.position_before_match);
+ slf.state = State::BogusComment;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CommentStart => match slf.read_char()? {
+ Some('-') => {
+ slf.state = State::CommentStartDash;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emit_error(Error::AbruptClosingOfEmptyComment);
+ slf.state = State::Data;
+ slf.emitter.emit_current_comment(slf.position_before_match);
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.unread_char(c);
+ slf.state = State::Comment;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CommentStartDash => match slf.read_char()? {
+ Some('-') => {
+ slf.state = State::CommentEnd;
+ Ok(ControlToken::Continue)
+ }
+ Some(c @ '>') => {
+ slf.emit_error(Error::AbruptClosingOfEmptyComment);
+ slf.state = State::Data;
+ slf.emitter.emit_current_comment(
+ slf.position_before_match - slf.reader.len_of_char_in_current_encoding(c),
+ );
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInComment);
+ slf.emitter.emit_current_comment(
+ slf.position_before_match - slf.reader.len_of_char_in_current_encoding('-'),
+ );
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emitter.push_comment("-");
+ slf.unread_char(c);
+ slf.state = State::Comment;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::Comment => match slf.read_char()? {
+ Some('<') => {
+ slf.emitter.push_comment("<");
+ slf.state = State::CommentLessThanSign;
+ Ok(ControlToken::Continue)
+ }
+ Some('-') => {
+ slf.some_offset = slf.position_before_match;
+ slf.state = State::CommentEndDash;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_comment("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInComment);
+ slf.emitter.emit_current_comment(slf.reader.position());
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.push_comment(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CommentLessThanSign => match slf.read_char()? {
+ Some('!') => {
+ slf.emitter.push_comment("!");
+ slf.state = State::CommentLessThanSignBang;
+ Ok(ControlToken::Continue)
+ }
+ Some('<') => {
+ slf.emitter.push_comment("<");
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.unread_char(c);
+ slf.state = State::Comment;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CommentLessThanSignBang => match slf.read_char()? {
+ Some('-') => {
+ slf.state = State::CommentLessThanSignBangDash;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.unread_char(c);
+ slf.state = State::Comment;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CommentLessThanSignBangDash => match slf.read_char()? {
+ Some('-') => {
+ slf.state = State::CommentLessThanSignBangDashDash;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.unread_char(c);
+ slf.state = State::CommentEndDash;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CommentLessThanSignBangDashDash => match slf.read_char()? {
+ c @ Some('>') | c @ None => {
+ slf.unread_char(c);
+ slf.state = State::CommentEnd;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emit_error(Error::NestedComment);
+ slf.unread_char(c);
+ slf.state = State::CommentEnd;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CommentEndDash => match slf.read_char()? {
+ Some('-') => {
+ slf.state = State::CommentEnd;
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInComment);
+ slf.emitter.emit_current_comment(slf.some_offset);
+ Ok(ControlToken::Eof)
+ }
+ c => {
+ slf.emitter.push_comment("-");
+ slf.unread_char(c);
+ slf.state = State::Comment;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CommentEnd => match slf.read_char()? {
+ Some('>') => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_comment(slf.some_offset);
+ Ok(ControlToken::Continue)
+ }
+ Some('!') => {
+ slf.state = State::CommentEndBang;
+ Ok(ControlToken::Continue)
+ }
+ Some('-') => {
+ slf.emitter.push_comment("-");
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInComment);
+ slf.emitter.emit_current_comment(slf.some_offset);
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emitter.push_comment("-");
+ slf.emitter.push_comment("-");
+ slf.unread_char(c);
+ slf.state = State::Comment;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CommentEndBang => match slf.read_char()? {
+ Some('-') => {
+ slf.emitter.push_comment("-");
+ slf.emitter.push_comment("-");
+ slf.emitter.push_comment("!");
+ slf.state = State::CommentEndDash;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emit_error(Error::IncorrectlyClosedComment);
+ slf.state = State::Data;
+ slf.emitter.emit_current_comment(slf.some_offset);
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInComment);
+ slf.emitter.emit_current_comment(slf.some_offset);
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emitter.push_comment("-");
+ slf.emitter.push_comment("-");
+ slf.emitter.push_comment("!");
+ slf.state = State::Comment;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::Doctype => match slf.read_char()? {
+ Some(whitespace_pat!()) => {
+ slf.state = State::BeforeDoctypeName;
+ Ok(ControlToken::Continue)
+ }
+ c @ Some('>') => {
+ slf.unread_char(c);
+ slf.state = State::BeforeDoctypeName;
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInDoctype);
+ slf.init_doctype();
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emit_error(Error::MissingWhitespaceBeforeDoctypeName);
+ slf.unread_char(c);
+ slf.state = State::BeforeDoctypeName;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::BeforeDoctypeName => match slf.read_char()? {
+ Some(whitespace_pat!()) => Ok(ControlToken::Continue),
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.init_doctype();
+ slf.emitter.init_doctype_name(slf.position_before_match);
+ slf.emitter.push_doctype_name("\u{fffd}");
+ slf.state = State::DoctypeName;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emit_error(Error::MissingDoctypeName);
+ slf.init_doctype();
+ slf.emitter.set_force_quirks();
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInDoctype);
+ slf.init_doctype();
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.init_doctype();
+ slf.emitter.init_doctype_name(slf.position_before_match);
+ slf.emitter
+ .push_doctype_name(ctostr!(x.to_ascii_lowercase()));
+ slf.state = State::DoctypeName;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::DoctypeName => match slf.read_char()? {
+ Some(whitespace_pat!()) => {
+ slf.emitter
+ .terminate_doctype_name(slf.position_before_match);
+ slf.state = State::AfterDoctypeName;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emitter
+ .terminate_doctype_name(slf.position_before_match);
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_doctype_name("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInDoctype);
+ slf.emitter
+ .terminate_doctype_name(slf.position_before_match);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter
+ .push_doctype_name(ctostr!(x.to_ascii_lowercase()));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::AfterDoctypeName => match slf.read_char()? {
+ Some(whitespace_pat!()) => Ok(ControlToken::Continue),
+ Some('>') => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Eof)
+ }
+ Some('p' | 'P') if slf.try_read_string("ublic", false)? => {
+ slf.state = State::AfterDoctypePublicKeyword;
+ Ok(ControlToken::Continue)
+ }
+ Some('s' | 'S') if slf.try_read_string("ystem", false)? => {
+ slf.state = State::AfterDoctypeSystemKeyword;
+ Ok(ControlToken::Continue)
+ }
+ c @ Some(_) => {
+ slf.emit_error(Error::InvalidCharacterSequenceAfterDoctypeName);
+ slf.emitter.set_force_quirks();
+ slf.unread_char(c);
+ slf.state = State::BogusDoctype;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::AfterDoctypePublicKeyword => match slf.read_char()? {
+ Some(whitespace_pat!()) => {
+ slf.state = State::BeforeDoctypePublicIdentifier;
+ Ok(ControlToken::Continue)
+ }
+ Some('"') => {
+ slf.emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword);
+ slf.emitter.init_doctype_public_id(slf.reader.position());
+ slf.state = State::DoctypePublicIdentifierDoubleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('\'') => {
+ slf.emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword);
+ slf.emitter.init_doctype_public_id(slf.reader.position());
+ slf.state = State::DoctypePublicIdentifierSingleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emit_error(Error::MissingDoctypePublicIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emit_error(Error::MissingQuoteBeforeDoctypePublicIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.unread_char(c);
+ slf.state = State::BogusDoctype;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::BeforeDoctypePublicIdentifier => match slf.read_char()? {
+ Some(whitespace_pat!()) => Ok(ControlToken::Continue),
+ Some('"') => {
+ slf.emitter.init_doctype_public_id(slf.reader.position());
+ slf.state = State::DoctypePublicIdentifierDoubleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('\'') => {
+ slf.emitter.init_doctype_public_id(slf.reader.position());
+ slf.state = State::DoctypePublicIdentifierSingleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emit_error(Error::MissingDoctypePublicIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emit_error(Error::MissingQuoteBeforeDoctypePublicIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.unread_char(c);
+ slf.state = State::BogusDoctype;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::DoctypePublicIdentifierDoubleQuoted => match slf.read_char()? {
+ Some('"') => {
+ slf.emitter
+ .terminate_doctype_public_id(slf.position_before_match);
+ slf.state = State::AfterDoctypePublicIdentifier;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_doctype_public_id("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emitter
+ .terminate_doctype_public_id(slf.position_before_match);
+ slf.emit_error(Error::AbruptDoctypePublicIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter
+ .terminate_doctype_public_id(slf.reader.position());
+ slf.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.push_doctype_public_id(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::DoctypePublicIdentifierSingleQuoted => match slf.read_char()? {
+ Some('\'') => {
+ slf.emitter
+ .terminate_doctype_public_id(slf.position_before_match);
+ slf.state = State::AfterDoctypePublicIdentifier;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_doctype_public_id("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emitter
+ .terminate_doctype_public_id(slf.position_before_match);
+ slf.emit_error(Error::AbruptDoctypePublicIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter
+ .terminate_doctype_public_id(slf.reader.position());
+ slf.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.push_doctype_public_id(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::AfterDoctypePublicIdentifier => match slf.read_char()? {
+ Some(whitespace_pat!()) => {
+ slf.state = State::BetweenDoctypePublicAndSystemIdentifiers;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Continue)
+ }
+ Some('"') => {
+ slf.emit_error(Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers);
+ slf.emitter.init_doctype_system_id(slf.reader.position());
+ slf.state = State::DoctypeSystemIdentifierDoubleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('\'') => {
+ slf.emit_error(Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers);
+ slf.emitter.init_doctype_system_id(slf.reader.position());
+ slf.state = State::DoctypeSystemIdentifierSingleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.unread_char(c);
+ slf.state = State::BogusDoctype;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::BetweenDoctypePublicAndSystemIdentifiers => match slf.read_char()? {
+ Some(whitespace_pat!()) => Ok(ControlToken::Continue),
+ Some('>') => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Continue)
+ }
+ Some('"') => {
+ slf.emitter.init_doctype_system_id(slf.reader.position());
+ slf.state = State::DoctypeSystemIdentifierDoubleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('\'') => {
+ slf.emitter.init_doctype_system_id(slf.reader.position());
+ slf.state = State::DoctypeSystemIdentifierSingleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.state = State::BogusDoctype;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::AfterDoctypeSystemKeyword => match slf.read_char()? {
+ Some(whitespace_pat!()) => {
+ slf.state = State::BeforeDoctypeSystemIdentifier;
+ Ok(ControlToken::Continue)
+ }
+ Some('"') => {
+ slf.emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword);
+ slf.emitter.init_doctype_system_id(slf.reader.position());
+ slf.state = State::DoctypeSystemIdentifierDoubleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('\'') => {
+ slf.emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword);
+ slf.emitter.init_doctype_system_id(slf.reader.position());
+ slf.state = State::DoctypeSystemIdentifierSingleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emit_error(Error::MissingDoctypeSystemIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.state = State::BogusDoctype;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::BeforeDoctypeSystemIdentifier => match slf.read_char()? {
+ Some(whitespace_pat!()) => Ok(ControlToken::Continue),
+ Some('"') => {
+ slf.emitter.init_doctype_system_id(slf.reader.position());
+ slf.state = State::DoctypeSystemIdentifierDoubleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('\'') => {
+ slf.emitter.init_doctype_system_id(slf.reader.position());
+ slf.state = State::DoctypeSystemIdentifierSingleQuoted;
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emit_error(Error::MissingDoctypeSystemIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.state = State::BogusDoctype;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::DoctypeSystemIdentifierDoubleQuoted => match slf.read_char()? {
+ Some('"') => {
+ slf.emitter
+ .terminate_doctype_system_id(slf.position_before_match);
+ slf.state = State::AfterDoctypeSystemIdentifier;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_doctype_system_id("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emitter
+ .terminate_doctype_system_id(slf.position_before_match);
+ slf.emit_error(Error::AbruptDoctypeSystemIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter
+ .terminate_doctype_system_id(slf.reader.position());
+ slf.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.push_doctype_system_id(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::DoctypeSystemIdentifierSingleQuoted => match slf.read_char()? {
+ Some('\'') => {
+ slf.emitter
+ .terminate_doctype_system_id(slf.position_before_match);
+ slf.state = State::AfterDoctypeSystemIdentifier;
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ slf.emitter.push_doctype_system_id("\u{fffd}");
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.emitter
+ .terminate_doctype_system_id(slf.position_before_match);
+ slf.emit_error(Error::AbruptDoctypeSystemIdentifier);
+ slf.emitter.set_force_quirks();
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter
+ .terminate_doctype_system_id(slf.reader.position());
+ slf.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.push_doctype_system_id(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::AfterDoctypeSystemIdentifier => match slf.read_char()? {
+ Some(whitespace_pat!()) => Ok(ControlToken::Continue),
+ Some('>') => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInDoctype);
+ slf.emitter.set_force_quirks();
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Eof)
+ }
+ c @ Some(_) => {
+ slf.emit_error(Error::UnexpectedCharacterAfterDoctypeSystemIdentifier);
+ slf.unread_char(c);
+ slf.state = State::BogusDoctype;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::BogusDoctype => match slf.read_char()? {
+ Some('>') => {
+ slf.state = State::Data;
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Continue)
+ }
+ Some('\0') => {
+ slf.emit_error(Error::UnexpectedNullCharacter);
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emitter.emit_current_doctype(slf.reader.position());
+ Ok(ControlToken::Eof)
+ }
+ Some(_) => Ok(ControlToken::Continue),
+ },
+ State::CdataSection => match slf.read_char()? {
+ Some(']') => {
+ slf.state = State::CdataSectionBracket;
+ Ok(ControlToken::Continue)
+ }
+ None => {
+ slf.emit_error(Error::EofInCdata);
+ Ok(ControlToken::Eof)
+ }
+ Some(x) => {
+ slf.emitter.emit_string(ctostr!(x));
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CdataSectionBracket => match slf.read_char()? {
+ Some(']') => {
+ slf.state = State::CdataSectionEnd;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("]");
+ slf.state = State::CdataSection;
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CdataSectionEnd => match slf.read_char()? {
+ Some(']') => {
+ slf.emitter.emit_string("]");
+ Ok(ControlToken::Continue)
+ }
+ Some('>') => {
+ slf.state = State::Data;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emitter.emit_string("]]");
+ slf.unread_char(c);
+ slf.state = State::CdataSection;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::CharacterReference => {
+ // TODO: we can avoid these Reader method calls by changing CharacterReference to be a function instead of a state
+ slf.some_offset =
+ slf.reader.position() - slf.reader.len_of_char_in_current_encoding('&');
+ slf.temporary_buffer.clear();
+ slf.temporary_buffer.push('&');
+ match slf.read_char()? {
+ Some(x) if x.is_ascii_alphanumeric() => {
+ slf.unread_char(Some(x));
+ slf.state = State::NamedCharacterReference;
+ Ok(ControlToken::Continue)
+ }
+ Some('#') => {
+ slf.temporary_buffer.push('#');
+ slf.state = State::NumericCharacterReference;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.flush_code_points_consumed_as_character_reference();
+ slf.state = slf.return_state.take().unwrap();
+ slf.unread_char(c);
+ Ok(ControlToken::Continue)
+ }
+ }
+ }
+ State::NamedCharacterReference => {
+ let c = slf.read_char()?;
+
+ let char_ref = match c {
+ Some(x) => try_read_character_reference(x, |x| slf.try_read_string(x, true))?
+ .map(|char_ref| (x, char_ref)),
+
+ None => None,
+ };
+
+ if let Some((x, char_ref)) = char_ref {
+ slf.temporary_buffer.push(x);
+ slf.temporary_buffer.push_str(char_ref.name);
+ let char_ref_name_last_character = char_ref.name.chars().last();
+
+ let next_character = slf.read_char()?;
+ slf.unread_char(next_character);
+
+ if slf.is_consumed_as_part_of_an_attribute()
+ && char_ref_name_last_character != Some(';')
+ && matches!(next_character, Some(x) if x == '=' || x.is_ascii_alphanumeric())
+ {
+ slf.flush_code_points_consumed_as_character_reference();
+ slf.state = slf.return_state.take().unwrap();
+ Ok(ControlToken::Continue)
+ } else {
+ if char_ref_name_last_character != Some(';') {
+ slf.emit_error(Error::MissingSemicolonAfterCharacterReference);
+ }
+
+ slf.temporary_buffer.clear();
+ slf.temporary_buffer.push_str(char_ref.characters);
+ slf.flush_code_points_consumed_as_character_reference();
+ slf.state = slf.return_state.take().unwrap();
+ Ok(ControlToken::Continue)
+ }
+ } else {
+ slf.unread_char(c);
+ slf.flush_code_points_consumed_as_character_reference();
+ slf.state = State::AmbiguousAmpersand;
+ Ok(ControlToken::Continue)
+ }
+ }
+ State::AmbiguousAmpersand => match slf.read_char()? {
+ Some(x) if x.is_ascii_alphanumeric() => {
+ if slf.is_consumed_as_part_of_an_attribute() {
+ slf.emitter.push_attribute_value(ctostr!(x));
+ } else {
+ slf.emitter.emit_string(ctostr!(x));
+ }
+
+ Ok(ControlToken::Continue)
+ }
+ c @ Some(';') => {
+ slf.emit_error(Error::UnknownNamedCharacterReference);
+ slf.unread_char(c);
+ slf.state = slf.return_state.take().unwrap();
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.unread_char(c);
+ slf.state = slf.return_state.take().unwrap();
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::NumericCharacterReference => {
+ slf.character_reference_code = 0;
+ match slf.read_char()? {
+ Some(x @ 'x' | x @ 'X') => {
+ slf.temporary_buffer.push(x);
+ slf.state = State::HexadecimalCharacterReferenceStart;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.unread_char(c);
+ slf.state = State::DecimalCharacterReferenceStart;
+ Ok(ControlToken::Continue)
+ }
+ }
+ }
+ State::HexadecimalCharacterReferenceStart => match slf.read_char()? {
+ c @ Some('0'..='9' | 'A'..='F' | 'a'..='f') => {
+ slf.unread_char(c);
+ slf.state = State::HexadecimalCharacterReference;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emit_error(Error::AbsenceOfDigitsInNumericCharacterReference);
+ slf.flush_code_points_consumed_as_character_reference();
+ slf.unread_char(c);
+ slf.state = slf.return_state.take().unwrap();
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::DecimalCharacterReferenceStart => match slf.read_char()? {
+ Some(x @ ascii_digit_pat!()) => {
+ slf.unread_char(Some(x));
+ slf.state = State::DecimalCharacterReference;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emit_error(Error::AbsenceOfDigitsInNumericCharacterReference);
+ slf.flush_code_points_consumed_as_character_reference();
+ slf.unread_char(c);
+ slf.state = slf.return_state.take().unwrap();
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::HexadecimalCharacterReference => match slf.read_char()? {
+ Some(x @ ascii_digit_pat!()) => {
+ mutate_character_reference!(*16 + x - 0x0030);
+ Ok(ControlToken::Continue)
+ }
+ Some(x @ 'A'..='F') => {
+ mutate_character_reference!(*16 + x - 0x0037);
+ Ok(ControlToken::Continue)
+ }
+ Some(x @ 'a'..='f') => {
+ mutate_character_reference!(*16 + x - 0x0057);
+ Ok(ControlToken::Continue)
+ }
+ Some(';') => {
+ slf.state = State::NumericCharacterReferenceEnd;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emit_error(Error::MissingSemicolonAfterCharacterReference);
+ slf.unread_char(c);
+ slf.state = State::NumericCharacterReferenceEnd;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::DecimalCharacterReference => match slf.read_char()? {
+ Some(x @ ascii_digit_pat!()) => {
+ mutate_character_reference!(*10 + x - 0x0030);
+ Ok(ControlToken::Continue)
+ }
+ Some(';') => {
+ slf.state = State::NumericCharacterReferenceEnd;
+ Ok(ControlToken::Continue)
+ }
+ c => {
+ slf.emit_error(Error::MissingSemicolonAfterCharacterReference);
+ slf.unread_char(c);
+ slf.state = State::NumericCharacterReferenceEnd;
+ Ok(ControlToken::Continue)
+ }
+ },
+ State::NumericCharacterReferenceEnd => {
+ match slf.character_reference_code {
+ 0x00 => {
+ slf.emit_error(Error::NullCharacterReference);
+ slf.character_reference_code = 0xfffd;
+ }
+ 0x110000.. => {
+ slf.emit_error(Error::CharacterReferenceOutsideUnicodeRange);
+ slf.character_reference_code = 0xfffd;
+ }
+ surrogate_pat!() => {
+ slf.emit_error(Error::SurrogateCharacterReference);
+ slf.character_reference_code = 0xfffd;
+ }
+ // noncharacter
+ noncharacter_pat!() => {
+ slf.emit_error(Error::NoncharacterCharacterReference);
+ }
+ // 0x000d, or a control that is not whitespace
+ x @ 0x000d | x @ control_pat!()
+ if !matches!(x, 0x0009 | 0x000a | 0x000c | 0x0020) =>
+ {
+ slf.emit_error(Error::ControlCharacterReference);
+ slf.character_reference_code = match x {
+ 0x80 => 0x20AC, // EURO SIGN (€)
+ 0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK (‚)
+ 0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK (ƒ)
+ 0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK („)
+ 0x85 => 0x2026, // HORIZONTAL ELLIPSIS (…)
+ 0x86 => 0x2020, // DAGGER (†)
+ 0x87 => 0x2021, // DOUBLE DAGGER (‡)
+ 0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
+ 0x89 => 0x2030, // PER MILLE SIGN (‰)
+ 0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON (Š)
+ 0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
+ 0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE (Œ)
+ 0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON (Ž)
+ 0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK (‘)
+ 0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK (’)
+ 0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK (“)
+ 0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK (”)
+ 0x95 => 0x2022, // BULLET (•)
+ 0x96 => 0x2013, // EN DASH (–)
+ 0x97 => 0x2014, // EM DASH (—)
+ 0x98 => 0x02DC, // SMALL TILDE (˜)
+ 0x99 => 0x2122, // TRADE MARK SIGN (™)
+ 0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON (š)
+ 0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
+ 0x9C => 0x0153, // LATIN SMALL LIGATURE OE (œ)
+ 0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON (ž)
+ 0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
+ _ => slf.character_reference_code,
+ };
+ }
+ _ => (),
+ }
+
+ slf.temporary_buffer.clear();
+ slf.temporary_buffer
+ .push(std::char::from_u32(slf.character_reference_code).unwrap());
+ slf.flush_code_points_consumed_as_character_reference();
+ slf.state = slf.return_state.take().unwrap();
+ Ok(ControlToken::Continue)
+ }
+ }
+}
+
+impl<R, O, E> Tokenizer<R, O, E>
+where
+    R: Reader + Position<O>,
+    O: Offset,
+    E: Emitter<O>,
+{
+    /// Asks the emitter to initialize a doctype, handing it the offset
+    /// currently stored in `some_offset`.
+    #[inline]
+    fn init_doctype(&mut self) {
+        let doctype_offset = self.some_offset;
+        self.emitter.init_doctype(doctype_offset);
+    }
+}
+
+/// Applies `action` to the tokenizer.
+///
+/// * [`CdataAction::Cdata`] switches the state to `State::CdataSection`.
+/// * [`CdataAction::BogusComment`] emits `Error::CdataInHtmlContent`,
+///   initializes a comment at the reader's current position, pushes the
+///   literal `[CDATA[` into it and switches the state to
+///   `State::BogusComment`.
+#[inline]
+pub fn handle_cdata_open<O, R, E>(slf: &mut Tokenizer<R, O, E>, action: CdataAction)
+where
+    O: Offset,
+    R: Reader + Position<O>,
+    E: Emitter<O>,
+{
+    match action {
+        CdataAction::BogusComment => {
+            slf.emit_error(Error::CdataInHtmlContent);
+
+            // The comment is initialized at the reader's position as it is
+            // right after the error has been emitted.
+            let comment_offset = slf.reader.position();
+            slf.emitter.init_comment(comment_offset);
+            slf.emitter.push_comment("[CDATA[");
+
+            slf.state = State::BogusComment;
+        }
+        CdataAction::Cdata => {
+            slf.state = State::CdataSection;
+        }
+    }
+}