about summary refs log tree commit diff
path: root/src/tokenizer/machine.rs
diff options
context:
space:
mode:
author Martin Fischer <martin@push-f.com> 2023-09-09 21:53:22 +0200
committer Martin Fischer <martin@push-f.com> 2023-09-28 10:36:08 +0200
commit 5aa3b82fbe62882da8007b0a4548b979c845aa97 (patch)
tree 9788640728ea7894a7ff53c561ed10bff3a611c1 /src/tokenizer/machine.rs
parent 2c73901944e2d22747a2a4ebcc11881b3f8c2ad3 (diff)
refactor: move machine impl details to machine module
This commit separates the public API (the "Tokenizer") from the internal implementation (the "Machine") to make the code more readable.
Diffstat (limited to 'src/tokenizer/machine.rs')
-rw-r--r-- src/tokenizer/machine.rs 90
1 files changed, 79 insertions, 11 deletions
diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs
index fc31a42..e9a3e68 100644
--- a/src/tokenizer/machine.rs
+++ b/src/tokenizer/machine.rs
@@ -1,16 +1,65 @@
-pub(super) mod utils;
+mod utils;
use crate::entities::try_read_character_reference;
use crate::offset::{Offset, Position};
use crate::token::AttrValueSyntax;
use crate::tokenizer::CdataAction;
-use crate::{reader::Reader, Emitter, Error, Tokenizer};
+use crate::{reader::Reader, Emitter, Error};
use utils::{
ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat,
};
pub use utils::State;
+pub(super) struct Machine<R, O, E> { // internal tokenizer implementation, kept separate from the public `Tokenizer` API
+ pub(super) state: State, // current state; `new` starts it at `State::Data`
+ pub(super) emitter: E, // receives emitter calls such as `init_doctype`
+ temporary_buffer: String, // NOTE(review): presumably the HTML spec's "temporary buffer" - confirm against `consume`
+ reader: R, // input source, moved in by `new`
+ to_reconsume: Stack2<Option<char>>, // `Stack2` holds at most two pending entries
+ character_reference_code: u32, // NOTE(review): presumably the spec's "character reference code" - confirm
+ return_state: Option<State>, // `None` until set; NOTE(review): presumably the spec's "return state" - confirm
+ current_tag_name: String,
+ pub(super) last_start_tag_name: String,
+ is_start_tag: bool,
+ /// The reader position before the match block in [`consume`].
+ position_before_match: O,
+ /// * Set to the offset of `<` in [`State::Data`].
+ /// * Set to the offset of `-` in [`State::Comment`].
+ /// * Set to the offset of `&` in [`State::CharacterReference`].
+ some_offset: O,
+ /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`]
+ /// (it cannot call [`Tokenizer::set_state`] using the emitted start tags since they can be of an arbitrary type).
+ ///
+ /// [`Tokenizer::set_state`]: super::Tokenizer::set_state
+ pub(crate) naively_switch_state: bool,
+}
+
+impl<R, O, E> Machine<R, O, E>
+where
+ R: Reader + Position<O>,
+ O: Offset,
+ E: Emitter<O>,
+{
+ pub fn new(reader: R, emitter: E) -> Self { // takes ownership of `reader` and `emitter`; all other fields get fixed initial values
+ Self {
+ reader,
+ emitter,
+ state: State::Data, // the machine always starts in the data state
+ to_reconsume: Stack2::default(), // empty stack: nothing queued for reconsumption
+ return_state: None,
+ temporary_buffer: String::new(),
+ character_reference_code: 0,
+ current_tag_name: String::new(),
+ last_start_tag_name: String::new(),
+ is_start_tag: false,
+ position_before_match: O::default(), // placeholder offset until something is recorded
+ some_offset: O::default(), // placeholder offset until something is recorded
+ naively_switch_state: false, // naive state switching is off unless enabled elsewhere
+ }
+ }
+}
+
pub enum ControlToken {
Eof,
Continue,
@@ -18,7 +67,7 @@ pub enum ControlToken {
}
#[inline]
-pub fn consume<O, R, E>(slf: &mut Tokenizer<R, O, E>) -> Result<ControlToken, R::Error>
+pub(super) fn consume<O, R, E>(slf: &mut Machine<R, O, E>) -> Result<ControlToken, R::Error>
where
O: Offset,
R: Reader + Position<O>,
@@ -1964,15 +2013,8 @@ where
}
}
-impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
- #[inline]
- fn init_doctype(&mut self) {
- self.emitter.init_doctype(self.some_offset);
- }
-}
-
#[inline]
-pub fn handle_cdata_open<O, R, E>(slf: &mut Tokenizer<R, O, E>, action: CdataAction)
+pub(super) fn handle_cdata_open<O, R, E>(slf: &mut Machine<R, O, E>, action: CdataAction)
where
O: Offset,
R: Reader + Position<O>,
@@ -1989,3 +2031,29 @@ where
}
}
}
+
+// A fixed-capacity LIFO stack that stores between zero and two values of type `T` inline.
+#[derive(Debug, Default, Clone, Copy)]
+struct Stack2<T: Copy>(Option<(T, Option<T>)>); // None = empty; (a, None) = one value; (a, Some(b)) = two values, `b` on top
+
+impl<T: Copy> Stack2<T> {
+ #[inline]
+ fn push(&mut self, c: T) { // places `c` on top; panics if two values are already stored
+ self.0 = match self.0 {
+ None => Some((c, None)), // empty -> one value
+ Some((c1, None)) => Some((c1, Some(c))), // one value -> two, `c` becomes the top
+ Some((_c1, Some(_c2))) => panic!("stack full!"), // capacity is two; a third push panics
+ }
+ }
+
+ #[inline]
+ fn pop(&mut self) -> Option<T> { // removes and returns the most recently pushed value, or `None` when empty
+ let (new_self, rv) = match self.0 {
+ Some((c1, Some(c2))) => (Some((c1, None)), Some(c2)), // two values: hand out the top, keep the bottom
+ Some((c1, None)) => (None, Some(c1)), // one value: hand it out, stack becomes empty
+ None => (None, None), // already empty: nothing to return
+ };
+ self.0 = new_self;
+ rv
+ }
+}