use crate::{ naive_parser::naive_next_state, offset::{Offset, Position}, reader::Reader, Emitter, Error, }; use super::Machine; impl Machine where R: Reader + Position, O: Offset, E: Emitter, { pub(crate) fn reader_position(&self) -> O { self.reader.position() } /// Emits the given character as a character token. #[inline] pub(super) fn emit_char(&mut self, c: char) { self.emitter.emit_char(c); } /// Emits every byte of the given byte slice as a character token. /// /// (We're operating on bytes to enable compiler optimization, /// since [`str::chars`] isn't `const`.) #[inline] pub(super) fn emit_chars(&mut self, s: &[u8]) { for c in s { self.emit_char(*c as char); } } #[inline] pub(crate) fn emit_error(&mut self, error: Error) { let span = match error { Error::EofBeforeTagName | Error::EofInCdata | Error::EofInComment | Error::EofInDoctype | Error::EofInScriptHtmlCommentLikeText | Error::EofInTag | Error::MissingSemicolonAfterCharacterReference => { self.reader.position()..self.reader.position() } Error::AbsenceOfDigitsInNumericCharacterReference | Error::NullCharacterReference | Error::CharacterReferenceOutsideUnicodeRange | Error::SurrogateCharacterReference | Error::NoncharacterCharacterReference | Error::ControlCharacterReference | Error::UnknownNamedCharacterReference => self.some_offset..self.reader.position(), _ => self.position_before_match..self.reader.position(), }; self.emitter.report_error(error, span); } /// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise. /// /// * the _last start tag_ exists /// * the current end tag token's name equals to the last start tag's name. /// /// See also WHATWG's definition of [appropriate end tag token]. /// /// [appropriate end tag token]: https://html.spec.whatwg.org/multipage/parsing.html#appropriate-end-tag-token #[inline] pub(super) fn current_end_tag_is_appropriate(&mut self) -> bool { self.current_tag_name == self.last_start_tag_name } #[inline] pub(super) fn init_start_tag(&mut self) { self.emitter .init_start_tag(self.some_offset, self.position_before_match); self.current_tag_name.clear(); self.is_start_tag = true; } #[inline] pub(super) fn init_end_tag(&mut self) { self.emitter .init_end_tag(self.some_offset, self.position_before_match); self.current_tag_name.clear(); self.is_start_tag = false; } #[inline] pub(super) fn init_doctype(&mut self) { self.emitter.init_doctype(self.some_offset); } #[inline] pub(super) fn push_tag_name(&mut self, s: &str) { self.emitter.push_tag_name(s); self.current_tag_name.push_str(s); } #[inline] pub(super) fn emit_current_tag(&mut self) { self.emitter.emit_current_tag(self.reader.position()); if self.is_start_tag { if self.naively_switch_state { self.state = naive_next_state(&self.current_tag_name).into(); } std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name); } } #[inline] pub(super) fn unread_char(&mut self, c: Option) { self.to_reconsume.push(c); } #[inline] fn validate_char(&mut self, c: char) { match c as u32 { surrogate_pat!() => { self.emit_error(Error::SurrogateInInputStream); } noncharacter_pat!() => { self.emit_error(Error::NoncharacterInInputStream); } // control without whitespace or nul x @ control_pat!() if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) => { self.emit_error(Error::ControlCharacterInInputStream); } _ => (), } } pub(super) fn read_char(&mut self) -> Result, R::Error> { let (c_res, reconsumed) = match self.to_reconsume.pop() { Some(c) => (Ok(c), true), None => (self.reader.read_char(), false), }; let mut c = match c_res { Ok(Some(c)) => c, res => return res, }; if c == '\r' { c = '\n'; let c2 = self.reader.read_char()?; if c2 != Some('\n') { self.unread_char(c2); } } if !reconsumed { self.validate_char(c); } Ok(Some(c)) } #[inline] pub(super) fn try_read_string( &mut self, mut s: &str, case_sensitive: bool, ) -> Result { debug_assert!(!s.is_empty()); let to_reconsume_bak = self.to_reconsume; let mut chars = s.chars(); while let Some(c) = self.to_reconsume.pop() { if let (Some(x), Some(x2)) = (c, chars.next()) { if x == x2 || (!case_sensitive && x.to_ascii_lowercase() == x2.to_ascii_lowercase()) { s = &s[x.len_utf8()..]; continue; } } self.to_reconsume = to_reconsume_bak; return Ok(false); } self.reader.try_read_string(s, case_sensitive) } pub(super) fn is_consumed_as_part_of_an_attribute(&self) -> bool { matches!( self.return_state, Some( State::AttributeValueDoubleQuoted | State::AttributeValueSingleQuoted | State::AttributeValueUnquoted ) ) } pub(super) fn flush_code_points_consumed_as_character_reference(&mut self) { if self.is_consumed_as_part_of_an_attribute() { self.emitter.push_attribute_value(&self.temporary_buffer); self.temporary_buffer.clear(); } else { self.flush_buffer_characters(); } } pub(super) fn flush_buffer_characters(&mut self) { for c in self.temporary_buffer.chars() { self.emitter.emit_char(c); } self.temporary_buffer.clear(); } } macro_rules! surrogate_pat { () => { 0xd800..=0xdfff }; } pub(crate) use surrogate_pat; macro_rules! control_pat { () => (0x0d | 0x0000..=0x001f | 0x007f..=0x009f) } pub(crate) use control_pat; macro_rules! ascii_digit_pat { () => { '0'..='9' }; } pub(crate) use ascii_digit_pat; macro_rules! whitespace_pat { () => { '\t' | '\u{0A}' | '\u{0C}' | ' ' }; } pub(crate) use whitespace_pat; macro_rules! noncharacter_pat { () => { 0xfdd0 ..=0xfdef | 0xfffe | 0xffff | 0x1fffe | 0x1ffff | 0x2fffe | 0x2ffff | 0x3fffe | 0x3ffff | 0x4fffe | 0x4ffff | 0x5fffe | 0x5ffff | 0x6fffe | 0x6ffff | 0x7fffe | 0x7ffff | 0x8fffe | 0x8ffff | 0x9fffe | 0x9ffff | 0xafffe | 0xaffff | 0xbfffe | 0xbffff | 0xcfffe | 0xcffff | 0xdfffe | 0xdffff | 0xefffe | 0xeffff | 0xffffe | 0xfffff | 0x10fffe | 0x10ffff }; } pub(crate) use noncharacter_pat; // When integration tests are running, this enum is public and we get warnings about missing docs. // However, it's not actually part of public API. #[allow(missing_docs)] #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum State { Data, RcData, RawText, ScriptData, PlainText, TagOpen, EndTagOpen, TagName, RcDataLessThanSign, RcDataEndTagOpen, RcDataEndTagName, RawTextLessThanSign, RawTextEndTagOpen, RawTextEndTagName, ScriptDataLessThanSign, ScriptDataEndTagOpen, ScriptDataEndTagName, ScriptDataEscapeStart, ScriptDataEscapeStartDash, ScriptDataEscaped, ScriptDataEscapedDash, ScriptDataEscapedDashDash, ScriptDataEscapedLessThanSign, ScriptDataEscapedEndTagOpen, ScriptDataEscapedEndTagName, ScriptDataDoubleEscapeStart, ScriptDataDoubleEscaped, ScriptDataDoubleEscapedDash, ScriptDataDoubleEscapedDashDash, ScriptDataDoubleEscapedLessThanSign, ScriptDataDoubleEscapeEnd, BeforeAttributeName, AttributeName, AfterAttributeName, BeforeAttributeValue, AttributeValueDoubleQuoted, AttributeValueSingleQuoted, AttributeValueUnquoted, AfterAttributeValueQuoted, SelfClosingStartTag, BogusComment, MarkupDeclarationOpen, CommentStart, CommentStartDash, Comment, CommentLessThanSign, CommentLessThanSignBang, CommentLessThanSignBangDash, CommentLessThanSignBangDashDash, CommentEndDash, CommentEnd, CommentEndBang, Doctype, BeforeDoctypeName, DoctypeName, AfterDoctypeName, AfterDoctypePublicKeyword, BeforeDoctypePublicIdentifier, DoctypePublicIdentifierDoubleQuoted, DoctypePublicIdentifierSingleQuoted, AfterDoctypePublicIdentifier, BetweenDoctypePublicAndSystemIdentifiers, AfterDoctypeSystemKeyword, BeforeDoctypeSystemIdentifier, DoctypeSystemIdentifierDoubleQuoted, DoctypeSystemIdentifierSingleQuoted, AfterDoctypeSystemIdentifier, BogusDoctype, CdataSection, CdataSectionBracket, CdataSectionEnd, CharacterReference, NamedCharacterReference, AmbiguousAmpersand, NumericCharacterReference, HexadecimalCharacterReferenceStart, DecimalCharacterReferenceStart, HexadecimalCharacterReference, DecimalCharacterReference, NumericCharacterReferenceEnd, } macro_rules! ctostr { ($c:expr) => { &*$c.encode_utf8(&mut [0; 4]) }; } pub(crate) use ctostr;