refactor: move machine impl details to machine module

This commit separates the public API (the "Tokenizer") from the internal implementation (the "Machine") to make the code more readable.
author: Martin Fischer <martin@push-f.com> 2023-09-09 21:53:22 +0200
committer: Martin Fischer <martin@push-f.com> 2023-09-28 10:36:08 +0200
commit: 5aa3b82fbe62882da8007b0a4548b979c845aa97 (patch)
tree: 9788640728ea7894a7ff53c561ed10bff3a611c1 /src/tokenizer
parent: 2c73901944e2d22747a2a4ebcc11881b3f8c2ad3 (diff)
2 files changed, 272 insertions, 11 deletions
diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs
index fc31a42..e9a3e68 100644
--- a/src/tokenizer/machine.rs
+++ b/src/tokenizer/machine.rs
@@ -1,16 +1,65 @@
-pub(super) mod utils;
+mod utils;
 
 use crate::entities::try_read_character_reference;
 use crate::offset::{Offset, Position};
 use crate::token::AttrValueSyntax;
 use crate::tokenizer::CdataAction;
-use crate::{reader::Reader, Emitter, Error, Tokenizer};
+use crate::{reader::Reader, Emitter, Error};
 use utils::{
     ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat,
 };
 
 pub use utils::State;
 
+pub(super) struct Machine<R, O, E> {
+    pub(super) state: State,
+    pub(super) emitter: E,
+    temporary_buffer: String,
+    reader: R,
+    to_reconsume: Stack2<Option<char>>,
+    character_reference_code: u32,
+    return_state: Option<State>,
+    current_tag_name: String,
+    pub(super) last_start_tag_name: String,
+    is_start_tag: bool,
+    /// The reader position before the match block in [`consume`].
+    position_before_match: O,
+    /// * Set to the offset of `<` in [`State::Data`].
+    /// * Set to the offset of `-` in [`State::Comment`].
+    /// * Set to the offset of `&` in [`State::CharacterReference`].
+    some_offset: O,
+    /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`]
+    /// (it cannot call [`Tokenizer::set_state`] using the emitted start tags since they can be of an arbitrary type).
+    ///
+    /// [`Tokenizer::set_state`]: super::Tokenizer::set_state
+    pub(crate) naively_switch_state: bool,
+}
+
+impl<R, O, E> Machine<R, O, E>
+where
+    R: Reader + Position<O>,
+    O: Offset,
+    E: Emitter<O>,
+{
+    pub fn new(reader: R, emitter: E) -> Self {
+        Self {
+            reader,
+            emitter,
+            state: State::Data,
+            to_reconsume: Stack2::default(),
+            return_state: None,
+            temporary_buffer: String::new(),
+            character_reference_code: 0,
+            current_tag_name: String::new(),
+            last_start_tag_name: String::new(),
+            is_start_tag: false,
+            position_before_match: O::default(),
+            some_offset: O::default(),
+            naively_switch_state: false,
+        }
+    }
+}
+
 pub enum ControlToken {
     Eof,
     Continue,
@@ -18,7 +67,7 @@ pub enum ControlToken {
 }
 
 #[inline]
-pub fn consume<O, R, E>(slf: &mut Tokenizer<R, O, E>) -> Result<ControlToken, R::Error>
+pub(super) fn consume<O, R, E>(slf: &mut Machine<R, O, E>) -> Result<ControlToken, R::Error>
 where
     O: Offset,
     R: Reader + Position<O>,
@@ -1964,15 +2013,8 @@ where
     }
 }
 
-impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
-    #[inline]
-    fn init_doctype(&mut self) {
-        self.emitter.init_doctype(self.some_offset);
-    }
-}
-
 #[inline]
-pub fn handle_cdata_open<O, R, E>(slf: &mut Tokenizer<R, O, E>, action: CdataAction)
+pub(super) fn handle_cdata_open<O, R, E>(slf: &mut Machine<R, O, E>, action: CdataAction)
 where
     O: Offset,
     R: Reader + Position<O>,
@@ -1989,3 +2031,29 @@ where
         }
     }
 }
+
+// A stack that can hold at most two values of type `T`; pushing a third panics.
+#[derive(Debug, Default, Clone, Copy)]
+struct Stack2<T: Copy>(Option<(T, Option<T>)>);
+
+impl<T: Copy> Stack2<T> {
+    #[inline]
+    fn push(&mut self, c: T) {
+        self.0 = match self.0 {
+            None => Some((c, None)),
+            Some((c1, None)) => Some((c1, Some(c))),
+            Some((_c1, Some(_c2))) => panic!("stack full!"),
+        }
+    }
+
+    #[inline]
+    fn pop(&mut self) -> Option<T> {
+        let (new_self, rv) = match self.0 {
+            Some((c1, Some(c2))) => (Some((c1, None)), Some(c2)),
+            Some((c1, None)) => (None, Some(c1)),
+            None => (None, None),
+        };
+        self.0 = new_self;
+        rv
+    }
+}
diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs
index 7d220cf..6e45f4d 100644
--- a/src/tokenizer/machine/utils.rs
+++ b/src/tokenizer/machine/utils.rs
@@ -1,3 +1,196 @@
+use crate::{
+    naive_parser::naive_next_state,
+    offset::{Offset, Position},
+    reader::Reader,
+    Emitter, Error,
+};
+
+use super::Machine;
+
+impl<R, O, E> Machine<R, O, E>
+where
+    R: Reader + Position<O>,
+    O: Offset,
+    E: Emitter<O>,
+{
+    #[inline]
+    pub(crate) fn emit_error(&mut self, error: Error) {
+        let span = match error {
+            Error::EofBeforeTagName
+            | Error::EofInCdata
+            | Error::EofInComment
+            | Error::EofInDoctype
+            | Error::EofInScriptHtmlCommentLikeText
+            | Error::EofInTag
+            | Error::MissingSemicolonAfterCharacterReference => {
+                self.reader.position()..self.reader.position()
+            }
+            Error::AbsenceOfDigitsInNumericCharacterReference
+            | Error::NullCharacterReference
+            | Error::CharacterReferenceOutsideUnicodeRange
+            | Error::SurrogateCharacterReference
+            | Error::NoncharacterCharacterReference
+            | Error::ControlCharacterReference
+            | Error::UnknownNamedCharacterReference => self.some_offset..self.reader.position(),
+
+            _ => self.position_before_match..self.reader.position(),
+        };
+        self.emitter.report_error(error, span);
+    }
+
+    /// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise.
+    ///
+    /// * the _last start tag_ exists
+    /// * the current end tag token's name equals the last start tag's name.
+    ///
+    /// See also WHATWG's definition of [appropriate end tag token].
+    ///
+    /// [appropriate end tag token]: https://html.spec.whatwg.org/multipage/parsing.html#appropriate-end-tag-token
+    #[inline]
+    pub(super) fn current_end_tag_is_appropriate(&mut self) -> bool {
+        self.current_tag_name == self.last_start_tag_name
+    }
+
+    #[inline]
+    pub(super) fn init_start_tag(&mut self) {
+        self.emitter
+            .init_start_tag(self.some_offset, self.position_before_match);
+        self.current_tag_name.clear();
+        self.is_start_tag = true;
+    }
+
+    #[inline]
+    pub(super) fn init_end_tag(&mut self) {
+        self.emitter
+            .init_end_tag(self.some_offset, self.position_before_match);
+        self.current_tag_name.clear();
+        self.is_start_tag = false;
+    }
+
+    #[inline]
+    pub(super) fn init_doctype(&mut self) {
+        self.emitter.init_doctype(self.some_offset);
+    }
+
+    #[inline]
+    pub(super) fn push_tag_name(&mut self, s: &str) {
+        self.emitter.push_tag_name(s);
+        self.current_tag_name.push_str(s);
+    }
+
+    #[inline]
+    pub(super) fn emit_current_tag(&mut self) {
+        self.emitter.emit_current_tag(self.reader.position());
+        if self.is_start_tag {
+            if self.naively_switch_state {
+                self.state = naive_next_state(&self.current_tag_name).into();
+            }
+            std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name);
+        }
+    }
+
+    #[inline]
+    pub(super) fn unread_char(&mut self, c: Option<char>) {
+        self.to_reconsume.push(c);
+    }
+
+    #[inline]
+    fn validate_char(&mut self, c: char) {
+        match c as u32 {
+            surrogate_pat!() => {
+                self.emit_error(Error::SurrogateInInputStream);
+            }
+            noncharacter_pat!() => {
+                self.emit_error(Error::NoncharacterInInputStream);
+            }
+            // control characters, excluding NUL and ASCII whitespace
+            x @ control_pat!()
+                if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) =>
+            {
+                self.emit_error(Error::ControlCharacterInInputStream);
+            }
+            _ => (),
+        }
+    }
+
+    pub(super) fn read_char(&mut self) -> Result<Option<char>, R::Error> {
+        let (c_res, reconsumed) = match self.to_reconsume.pop() {
+            Some(c) => (Ok(c), true),
+            None => (self.reader.read_char(), false),
+        };
+
+        let mut c = match c_res {
+            Ok(Some(c)) => c,
+            res => return res,
+        };
+
+        if c == '\r' {
+            c = '\n';
+            let c2 = self.reader.read_char()?;
+            if c2 != Some('\n') {
+                self.unread_char(c2);
+            }
+        }
+
+        if !reconsumed {
+            self.validate_char(c);
+        }
+
+        Ok(Some(c))
+    }
+
+    #[inline]
+    pub(super) fn try_read_string(
+        &mut self,
+        mut s: &str,
+        case_sensitive: bool,
+    ) -> Result<bool, R::Error> {
+        debug_assert!(!s.is_empty());
+
+        let to_reconsume_bak = self.to_reconsume;
+        let mut chars = s.chars();
+        while let Some(c) = self.to_reconsume.pop() {
+            if let (Some(x), Some(x2)) = (c, chars.next()) {
+                if x == x2 || (!case_sensitive && x.to_ascii_lowercase() == x2.to_ascii_lowercase())
+                {
+                    s = &s[x.len_utf8()..];
+                    continue;
+                }
+            }
+
+            self.to_reconsume = to_reconsume_bak;
+            return Ok(false);
+        }
+
+        self.reader.try_read_string(s, case_sensitive)
+    }
+
+    pub(super) fn is_consumed_as_part_of_an_attribute(&self) -> bool {
+        matches!(
+            self.return_state,
+            Some(
+                State::AttributeValueDoubleQuoted
+                    | State::AttributeValueSingleQuoted
+                    | State::AttributeValueUnquoted
+            )
+        )
+    }
+
+    pub(super) fn flush_code_points_consumed_as_character_reference(&mut self) {
+        if self.is_consumed_as_part_of_an_attribute() {
+            self.emitter.push_attribute_value(&self.temporary_buffer);
+            self.temporary_buffer.clear();
+        } else {
+            self.flush_buffer_characters();
+        }
+    }
+
+    pub(super) fn flush_buffer_characters(&mut self) {
+        self.emitter.emit_string(&self.temporary_buffer);
+        self.temporary_buffer.clear();
+    }
+}
+
 macro_rules! surrogate_pat {
     () => {
         0xd800..=0xdfff
author	Martin Fischer <martin@push-f.com>	2023-09-09 21:53:22 +0200
committer	Martin Fischer <martin@push-f.com>	2023-09-28 10:36:08 +0200
commit	5aa3b82fbe62882da8007b0a4548b979c845aa97 (patch)
tree	9788640728ea7894a7ff53c561ed10bff3a611c1 /src/tokenizer
parent	2c73901944e2d22747a2a4ebcc11881b3f8c2ad3 (diff)