refactor: move machine impl details to machine module

This commit separates the public API (the "Tokenizer") from the internal implementation (the "Machine") to make the code more readable.
author: Martin Fischer <martin@push-f.com> 2023-09-09 21:53:22 +0200
committer: Martin Fischer <martin@push-f.com> 2023-09-28 10:36:08 +0200
commit: 5aa3b82fbe62882da8007b0a4548b979c845aa97 (patch)
tree: 9788640728ea7894a7ff53c561ed10bff3a611c1 /src/tokenizer/machine
parent: 2c73901944e2d22747a2a4ebcc11881b3f8c2ad3 (diff)
1 files changed, 193 insertions, 0 deletions
diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs
index 7d220cf..6e45f4d 100644
--- a/src/tokenizer/machine/utils.rs
+++ b/src/tokenizer/machine/utils.rs
@@ -1,3 +1,196 @@
+use crate::{
+    naive_parser::naive_next_state,
+    offset::{Offset, Position},
+    reader::Reader,
+    Emitter, Error,
+};
+
+use super::Machine;
+
+impl<R, O, E> Machine<R, O, E>
+where
+    R: Reader + Position<O>,
+    O: Offset,
+    E: Emitter<O>,
+{
+    #[inline]
+    pub(crate) fn emit_error(&mut self, error: Error) {
+        let span = match error {
+            Error::EofBeforeTagName
+            | Error::EofInCdata
+            | Error::EofInComment
+            | Error::EofInDoctype
+            | Error::EofInScriptHtmlCommentLikeText
+            | Error::EofInTag
+            | Error::MissingSemicolonAfterCharacterReference => {
+                self.reader.position()..self.reader.position()
+            }
+            Error::AbsenceOfDigitsInNumericCharacterReference
+            | Error::NullCharacterReference
+            | Error::CharacterReferenceOutsideUnicodeRange
+            | Error::SurrogateCharacterReference
+            | Error::NoncharacterCharacterReference
+            | Error::ControlCharacterReference
+            | Error::UnknownNamedCharacterReference => self.some_offset..self.reader.position(),
+
+            _ => self.position_before_match..self.reader.position(),
+        };
+        self.emitter.report_error(error, span);
+    }
+
+    /// Returns true if the _current token_, assumed to be an end tag, is an appropriate end tag token.
+    ///
+    /// That is the case when its name equals the name of the _last start tag_. Only the two names
+    /// are compared. NOTE(review): presumably an unset last start tag name never matches - confirm.
+    ///
+    /// See also WHATWG's definition of [appropriate end tag token].
+    ///
+    /// [appropriate end tag token]: https://html.spec.whatwg.org/multipage/parsing.html#appropriate-end-tag-token
+    #[inline]
+    pub(super) fn current_end_tag_is_appropriate(&self) -> bool {
+        self.current_tag_name == self.last_start_tag_name
+    }
+
+    #[inline] // Starts a new start tag in the emitter and resets the tracked tag name.
+    pub(super) fn init_start_tag(&mut self) {
+        let (offset, match_start) = (self.some_offset, self.position_before_match);
+        self.emitter.init_start_tag(offset, match_start);
+        self.current_tag_name.clear();
+        self.is_start_tag = true;
+    }
+
+    #[inline] // Starts a new end tag in the emitter and resets the tracked tag name.
+    pub(super) fn init_end_tag(&mut self) {
+        let (offset, match_start) = (self.some_offset, self.position_before_match);
+        self.emitter.init_end_tag(offset, match_start);
+        self.current_tag_name.clear();
+        self.is_start_tag = false;
+    }
+
+    #[inline] // Tells the emitter to start a new doctype, passing `some_offset` along.
+    pub(super) fn init_doctype(&mut self) {
+        self.emitter.init_doctype(self.some_offset);
+    }
+
+    #[inline] // Appends `s` to the emitter's tag name and to the copy kept on the machine.
+    pub(super) fn push_tag_name(&mut self, s: &str) {
+        self.emitter.push_tag_name(s);
+        self.current_tag_name.push_str(s); // this copy is later compared and fed to `naive_next_state`
+    }
+
+    #[inline] // Emits the current tag; start tags also update the state and last start tag.
+    pub(super) fn emit_current_tag(&mut self) {
+        self.emitter.emit_current_tag(self.reader.position());
+        if self.is_start_tag && self.naively_switch_state {
+            self.state = naive_next_state(&self.current_tag_name).into();
+        }
+        if self.is_start_tag {
+            std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name);
+        }
+    }
+
+    #[inline] // Queues `c` on `to_reconsume`; `read_char` pops from there before using the reader.
+    pub(super) fn unread_char(&mut self, c: Option<char>) {
+        self.to_reconsume.push(c);
+    }
+
+    #[inline]
+    fn validate_char(&mut self, c: char) {
+        match c as u32 {
+            surrogate_pat!() => {
+                self.emit_error(Error::SurrogateInInputStream);
+            }
+            noncharacter_pat!() => {
+                self.emit_error(Error::NoncharacterInInputStream);
+            }
+            // control without whitespace or nul
+            x @ control_pat!()
+                if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) =>
+            {
+                self.emit_error(Error::ControlCharacterInInputStream);
+            }
+            _ => (),
+        }
+    }
+
+    /// Returns the next character, taking queued `to_reconsume` entries before the reader.
+    ///
+    /// A `\r` is turned into `\n`, and a `\n` directly following it in the reader is dropped.
+    pub(super) fn read_char(&mut self) -> Result<Option<char>, R::Error> {
+        let (next, reconsumed) = match self.to_reconsume.pop() {
+            Some(queued) => (queued, true),
+            None => (self.reader.read_char()?, false),
+        };
+        let mut c = match next {
+            Some(c) => c,
+            None => return Ok(None),
+        };
+
+        if c == '\r' {
+            c = '\n';
+            match self.reader.read_char()? {
+                Some('\n') => {}
+                other => self.unread_char(other),
+            }
+        }
+        if !reconsumed { // only characters fresh from the reader are validated
+            self.validate_char(c);
+        }
+        Ok(Some(c))
+    }
+
+    #[inline] // Tries to consume `s`, first from the `to_reconsume` stack, then from the reader.
+    pub(super) fn try_read_string(
+        &mut self,
+        mut s: &str,
+        case_sensitive: bool, // when false, characters are compared ASCII-case-insensitively
+    ) -> Result<bool, R::Error> {
+        debug_assert!(!s.is_empty());
+
+        let to_reconsume_bak = self.to_reconsume; // snapshot, restored if matching fails
+        let mut chars = s.chars();
+        while let Some(c) = self.to_reconsume.pop() {
+            if let (Some(x), Some(x2)) = (c, chars.next()) {
+                if x == x2 || (!case_sensitive && x.to_ascii_lowercase() == x2.to_ascii_lowercase())
+                {
+                    s = &s[x.len_utf8()..]; // drop the matched character from what is left to read
+                    continue;
+                }
+            }
+
+            self.to_reconsume = to_reconsume_bak; // mismatch, queued `None`, or `s` used up: undo pops
+            return Ok(false);
+        }
+
+        self.reader.try_read_string(s, case_sensitive) // NOTE(review): `s` may be empty here - confirm ok
+    }
+
+    pub(super) fn is_consumed_as_part_of_an_attribute(&self) -> bool { // checks `return_state` only
+        matches!(
+            self.return_state,
+            Some(
+                State::AttributeValueDoubleQuoted
+                    | State::AttributeValueSingleQuoted
+                    | State::AttributeValueUnquoted
+            ) // true exactly for these three attribute value states; `None` yields false
+        )
+    }
+
+    /// Flushes the temporary buffer into the attribute value or, outside attributes, as a string.
+    pub(super) fn flush_code_points_consumed_as_character_reference(&mut self) {
+        if !self.is_consumed_as_part_of_an_attribute() {
+            return self.flush_buffer_characters();
+        }
+        self.emitter.push_attribute_value(&self.temporary_buffer);
+        self.temporary_buffer.clear();
+    }
+
+    pub(super) fn flush_buffer_characters(&mut self) { // hands the temporary buffer to the emitter
+        self.emitter.emit_string(&self.temporary_buffer);
+        self.temporary_buffer.clear(); // emptied afterwards so the contents are not emitted twice
+    }
+}
+
 macro_rules! surrogate_pat {
     () => {
         0xd800..=0xdfff
author	Martin Fischer <martin@push-f.com>	2023-09-09 21:53:22 +0200
committer	Martin Fischer <martin@push-f.com>	2023-09-28 10:36:08 +0200
commit	5aa3b82fbe62882da8007b0a4548b979c845aa97 (patch)
tree	9788640728ea7894a7ff53c561ed10bff3a611c1 /src/tokenizer/machine
parent	2c73901944e2d22747a2a4ebcc11881b3f8c2ad3 (diff)