diff options
Diffstat (limited to 'src/tokenizer/machine')
-rw-r--r-- | src/tokenizer/machine/utils.rs | 193 |
1 files changed, 193 insertions, 0 deletions
diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs index 7d220cf..6e45f4d 100644 --- a/src/tokenizer/machine/utils.rs +++ b/src/tokenizer/machine/utils.rs @@ -1,3 +1,196 @@ +use crate::{ + naive_parser::naive_next_state, + offset::{Offset, Position}, + reader::Reader, + Emitter, Error, +}; + +use super::Machine; + +impl<R, O, E> Machine<R, O, E> +where + R: Reader + Position<O>, + O: Offset, + E: Emitter<O>, +{ + #[inline] + pub(crate) fn emit_error(&mut self, error: Error) { + let span = match error { + Error::EofBeforeTagName + | Error::EofInCdata + | Error::EofInComment + | Error::EofInDoctype + | Error::EofInScriptHtmlCommentLikeText + | Error::EofInTag + | Error::MissingSemicolonAfterCharacterReference => { + self.reader.position()..self.reader.position() + } + Error::AbsenceOfDigitsInNumericCharacterReference + | Error::NullCharacterReference + | Error::CharacterReferenceOutsideUnicodeRange + | Error::SurrogateCharacterReference + | Error::NoncharacterCharacterReference + | Error::ControlCharacterReference + | Error::UnknownNamedCharacterReference => self.some_offset..self.reader.position(), + + _ => self.position_before_match..self.reader.position(), + }; + self.emitter.report_error(error, span); + } + + /// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise. + /// + /// * the _last start tag_ exists + /// * the current end tag token's name equals to the last start tag's name. + /// + /// See also WHATWG's definition of [appropriate end tag token]. + /// + /// [appropriate end tag token]: https://html.spec.whatwg.org/multipage/parsing.html#appropriate-end-tag-token + #[inline] + pub(super) fn current_end_tag_is_appropriate(&mut self) -> bool { + self.current_tag_name == self.last_start_tag_name + } + + #[inline] + pub(super) fn init_start_tag(&mut self) { + self.emitter + .init_start_tag(self.some_offset, self.position_before_match); + self.current_tag_name.clear(); + self.is_start_tag = true; + } + + #[inline] + pub(super) fn init_end_tag(&mut self) { + self.emitter + .init_end_tag(self.some_offset, self.position_before_match); + self.current_tag_name.clear(); + self.is_start_tag = false; + } + + #[inline] + pub(super) fn init_doctype(&mut self) { + self.emitter.init_doctype(self.some_offset); + } + + #[inline] + pub(super) fn push_tag_name(&mut self, s: &str) { + self.emitter.push_tag_name(s); + self.current_tag_name.push_str(s); + } + + #[inline] + pub(super) fn emit_current_tag(&mut self) { + self.emitter.emit_current_tag(self.reader.position()); + if self.is_start_tag { + if self.naively_switch_state { + self.state = naive_next_state(&self.current_tag_name).into(); + } + std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name); + } + } + + #[inline] + pub(super) fn unread_char(&mut self, c: Option<char>) { + self.to_reconsume.push(c); + } + + #[inline] + fn validate_char(&mut self, c: char) { + match c as u32 { + surrogate_pat!() => { + self.emit_error(Error::SurrogateInInputStream); + } + noncharacter_pat!() => { + self.emit_error(Error::NoncharacterInInputStream); + } + // control without whitespace or nul + x @ control_pat!() + if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) => + { + self.emit_error(Error::ControlCharacterInInputStream); + } + _ => (), + } + } + + pub(super) fn read_char(&mut self) -> Result<Option<char>, R::Error> { + let (c_res, reconsumed) = match self.to_reconsume.pop() { + Some(c) => (Ok(c), true), + None => (self.reader.read_char(), false), + }; + + let mut c = match c_res { + Ok(Some(c)) => c, + res => return res, + }; + + if c == '\r' { + c = '\n'; + let c2 = self.reader.read_char()?; + if c2 != Some('\n') { + self.unread_char(c2); + } + } + + if !reconsumed { + self.validate_char(c); + } + + Ok(Some(c)) + } + + #[inline] + pub(super) fn try_read_string( + &mut self, + mut s: &str, + case_sensitive: bool, + ) -> Result<bool, R::Error> { + debug_assert!(!s.is_empty()); + + let to_reconsume_bak = self.to_reconsume; + let mut chars = s.chars(); + while let Some(c) = self.to_reconsume.pop() { + if let (Some(x), Some(x2)) = (c, chars.next()) { + if x == x2 || (!case_sensitive && x.to_ascii_lowercase() == x2.to_ascii_lowercase()) + { + s = &s[x.len_utf8()..]; + continue; + } + } + + self.to_reconsume = to_reconsume_bak; + return Ok(false); + } + + self.reader.try_read_string(s, case_sensitive) + } + + pub(super) fn is_consumed_as_part_of_an_attribute(&self) -> bool { + matches!( + self.return_state, + Some( + State::AttributeValueDoubleQuoted + | State::AttributeValueSingleQuoted + | State::AttributeValueUnquoted + ) + ) + } + + pub(super) fn flush_code_points_consumed_as_character_reference(&mut self) { + if self.is_consumed_as_part_of_an_attribute() { + self.emitter.push_attribute_value(&self.temporary_buffer); + self.temporary_buffer.clear(); + } else { + self.flush_buffer_characters(); + } + } + + pub(super) fn flush_buffer_characters(&mut self) { + self.emitter.emit_string(&self.temporary_buffer); + self.temporary_buffer.clear(); + } +} + macro_rules! surrogate_pat { () => { 0xd800..=0xdfff |