summaryrefslogtreecommitdiff
path: root/src/tokenizer/machine
diff options
context:
space:
mode:
Diffstat (limited to 'src/tokenizer/machine')
-rw-r--r--src/tokenizer/machine/utils.rs193
1 files changed, 193 insertions, 0 deletions
diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs
index 7d220cf..6e45f4d 100644
--- a/src/tokenizer/machine/utils.rs
+++ b/src/tokenizer/machine/utils.rs
@@ -1,3 +1,196 @@
+use crate::{
+ naive_parser::naive_next_state,
+ offset::{Offset, Position},
+ reader::Reader,
+ Emitter, Error,
+};
+
+use super::Machine;
+
+impl<R, O, E> Machine<R, O, E>
+where
+ R: Reader + Position<O>,
+ O: Offset,
+ E: Emitter<O>,
+{
+ #[inline]
+ pub(crate) fn emit_error(&mut self, error: Error) {
+ let span = match error {
+ Error::EofBeforeTagName
+ | Error::EofInCdata
+ | Error::EofInComment
+ | Error::EofInDoctype
+ | Error::EofInScriptHtmlCommentLikeText
+ | Error::EofInTag
+ | Error::MissingSemicolonAfterCharacterReference => {
+ self.reader.position()..self.reader.position()
+ }
+ Error::AbsenceOfDigitsInNumericCharacterReference
+ | Error::NullCharacterReference
+ | Error::CharacterReferenceOutsideUnicodeRange
+ | Error::SurrogateCharacterReference
+ | Error::NoncharacterCharacterReference
+ | Error::ControlCharacterReference
+ | Error::UnknownNamedCharacterReference => self.some_offset..self.reader.position(),
+
+ _ => self.position_before_match..self.reader.position(),
+ };
+ self.emitter.report_error(error, span);
+ }
+
+ /// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise.
+ ///
+ /// * the _last start tag_ exists
+ /// * the current end tag token's name equals to the last start tag's name.
+ ///
+ /// See also WHATWG's definition of [appropriate end tag token].
+ ///
+ /// [appropriate end tag token]: https://html.spec.whatwg.org/multipage/parsing.html#appropriate-end-tag-token
+ #[inline]
+ pub(super) fn current_end_tag_is_appropriate(&mut self) -> bool {
+ self.current_tag_name == self.last_start_tag_name
+ }
+
+ #[inline]
+ pub(super) fn init_start_tag(&mut self) {
+ self.emitter
+ .init_start_tag(self.some_offset, self.position_before_match);
+ self.current_tag_name.clear();
+ self.is_start_tag = true;
+ }
+
+ #[inline]
+ pub(super) fn init_end_tag(&mut self) {
+ self.emitter
+ .init_end_tag(self.some_offset, self.position_before_match);
+ self.current_tag_name.clear();
+ self.is_start_tag = false;
+ }
+
+ #[inline]
+ pub(super) fn init_doctype(&mut self) {
+ self.emitter.init_doctype(self.some_offset);
+ }
+
+ #[inline]
+ pub(super) fn push_tag_name(&mut self, s: &str) {
+ self.emitter.push_tag_name(s);
+ self.current_tag_name.push_str(s);
+ }
+
+ #[inline]
+ pub(super) fn emit_current_tag(&mut self) {
+ self.emitter.emit_current_tag(self.reader.position());
+ if self.is_start_tag {
+ if self.naively_switch_state {
+ self.state = naive_next_state(&self.current_tag_name).into();
+ }
+ std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name);
+ }
+ }
+
+ #[inline]
+ pub(super) fn unread_char(&mut self, c: Option<char>) {
+ self.to_reconsume.push(c);
+ }
+
+ #[inline]
+ fn validate_char(&mut self, c: char) {
+ match c as u32 {
+ surrogate_pat!() => {
+ self.emit_error(Error::SurrogateInInputStream);
+ }
+ noncharacter_pat!() => {
+ self.emit_error(Error::NoncharacterInInputStream);
+ }
+ // control without whitespace or nul
+ x @ control_pat!()
+ if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) =>
+ {
+ self.emit_error(Error::ControlCharacterInInputStream);
+ }
+ _ => (),
+ }
+ }
+
+ pub(super) fn read_char(&mut self) -> Result<Option<char>, R::Error> {
+ let (c_res, reconsumed) = match self.to_reconsume.pop() {
+ Some(c) => (Ok(c), true),
+ None => (self.reader.read_char(), false),
+ };
+
+ let mut c = match c_res {
+ Ok(Some(c)) => c,
+ res => return res,
+ };
+
+ if c == '\r' {
+ c = '\n';
+ let c2 = self.reader.read_char()?;
+ if c2 != Some('\n') {
+ self.unread_char(c2);
+ }
+ }
+
+ if !reconsumed {
+ self.validate_char(c);
+ }
+
+ Ok(Some(c))
+ }
+
+ #[inline]
+ pub(super) fn try_read_string(
+ &mut self,
+ mut s: &str,
+ case_sensitive: bool,
+ ) -> Result<bool, R::Error> {
+ debug_assert!(!s.is_empty());
+
+ let to_reconsume_bak = self.to_reconsume;
+ let mut chars = s.chars();
+ while let Some(c) = self.to_reconsume.pop() {
+ if let (Some(x), Some(x2)) = (c, chars.next()) {
+ if x == x2 || (!case_sensitive && x.to_ascii_lowercase() == x2.to_ascii_lowercase())
+ {
+ s = &s[x.len_utf8()..];
+ continue;
+ }
+ }
+
+ self.to_reconsume = to_reconsume_bak;
+ return Ok(false);
+ }
+
+ self.reader.try_read_string(s, case_sensitive)
+ }
+
+ pub(super) fn is_consumed_as_part_of_an_attribute(&self) -> bool {
+ matches!(
+ self.return_state,
+ Some(
+ State::AttributeValueDoubleQuoted
+ | State::AttributeValueSingleQuoted
+ | State::AttributeValueUnquoted
+ )
+ )
+ }
+
+ pub(super) fn flush_code_points_consumed_as_character_reference(&mut self) {
+ if self.is_consumed_as_part_of_an_attribute() {
+ self.emitter.push_attribute_value(&self.temporary_buffer);
+ self.temporary_buffer.clear();
+ } else {
+ self.flush_buffer_characters();
+ }
+ }
+
+ pub(super) fn flush_buffer_characters(&mut self) {
+ self.emitter.emit_string(&self.temporary_buffer);
+ self.temporary_buffer.clear();
+ }
+}
+
macro_rules! surrogate_pat {
() => {
0xd800..=0xdfff