aboutsummaryrefslogtreecommitdiff
path: root/src/tokenizer/machine.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/tokenizer/machine.rs')
-rw-r--r--src/tokenizer/machine.rs90
1 files changed, 79 insertions, 11 deletions
diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs
index fc31a42..e9a3e68 100644
--- a/src/tokenizer/machine.rs
+++ b/src/tokenizer/machine.rs
@@ -1,16 +1,65 @@
-pub(super) mod utils;
+mod utils;
use crate::entities::try_read_character_reference;
use crate::offset::{Offset, Position};
use crate::token::AttrValueSyntax;
use crate::tokenizer::CdataAction;
-use crate::{reader::Reader, Emitter, Error, Tokenizer};
+use crate::{reader::Reader, Emitter, Error};
use utils::{
ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat,
};
pub use utils::State;
+pub(super) struct Machine<R, O, E> {
+ pub(super) state: State,
+ pub(super) emitter: E,
+ temporary_buffer: String,
+ reader: R,
+ to_reconsume: Stack2<Option<char>>,
+ character_reference_code: u32,
+ return_state: Option<State>,
+ current_tag_name: String,
+ pub(super) last_start_tag_name: String,
+ is_start_tag: bool,
+ /// The reader position before the match block in [`consume`].
+ position_before_match: O,
+ /// * Set to the offset of `<` in [`State::Data`].
+ /// * Set to the offset of `-` in [`State::Comment`].
+ /// * Set to the offset of `&` in [`State::CharacterReference`].
+ some_offset: O,
+ /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`]
+ /// (it cannot call [`Tokenizer::set_state`] using the emitted start tags since they can be of an arbitrary type).
+ ///
+ /// [`Tokenizer::set_state`]: super::Tokenizer::set_state
+ pub(crate) naively_switch_state: bool,
+}
+
+impl<R, O, E> Machine<R, O, E>
+where
+ R: Reader + Position<O>,
+ O: Offset,
+ E: Emitter<O>,
+{
+ pub fn new(reader: R, emitter: E) -> Self {
+ Self {
+ reader,
+ emitter,
+ state: State::Data,
+ to_reconsume: Stack2::default(),
+ return_state: None,
+ temporary_buffer: String::new(),
+ character_reference_code: 0,
+ current_tag_name: String::new(),
+ last_start_tag_name: String::new(),
+ is_start_tag: false,
+ position_before_match: O::default(),
+ some_offset: O::default(),
+ naively_switch_state: false,
+ }
+ }
+}
+
pub enum ControlToken {
Eof,
Continue,
@@ -18,7 +67,7 @@ pub enum ControlToken {
}
#[inline]
-pub fn consume<O, R, E>(slf: &mut Tokenizer<R, O, E>) -> Result<ControlToken, R::Error>
+pub(super) fn consume<O, R, E>(slf: &mut Machine<R, O, E>) -> Result<ControlToken, R::Error>
where
O: Offset,
R: Reader + Position<O>,
@@ -1964,15 +2013,8 @@ where
}
}
-impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
- #[inline]
- fn init_doctype(&mut self) {
- self.emitter.init_doctype(self.some_offset);
- }
-}
-
#[inline]
-pub fn handle_cdata_open<O, R, E>(slf: &mut Tokenizer<R, O, E>, action: CdataAction)
+pub(super) fn handle_cdata_open<O, R, E>(slf: &mut Machine<R, O, E>, action: CdataAction)
where
O: Offset,
R: Reader + Position<O>,
@@ -1989,3 +2031,29 @@ where
}
}
}
+
+// this is a stack that can hold 0 to 2 Ts
+#[derive(Debug, Default, Clone, Copy)]
+struct Stack2<T: Copy>(Option<(T, Option<T>)>);
+
+impl<T: Copy> Stack2<T> {
+ #[inline]
+ fn push(&mut self, c: T) {
+ self.0 = match self.0 {
+ None => Some((c, None)),
+ Some((c1, None)) => Some((c1, Some(c))),
+ Some((_c1, Some(_c2))) => panic!("stack full!"),
+ }
+ }
+
+ #[inline]
+ fn pop(&mut self) -> Option<T> {
+ let (new_self, rv) = match self.0 {
+ Some((c1, Some(c2))) => (Some((c1, None)), Some(c2)),
+ Some((c1, None)) => (None, Some(c1)),
+ None => (None, None),
+ };
+ self.0 = new_self;
+ rv
+ }
+}