diff options
Diffstat (limited to 'src/tokenizer/machine.rs')
-rw-r--r-- | src/tokenizer/machine.rs | 90 |
1 files changed, 79 insertions, 11 deletions
diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs index fc31a42..e9a3e68 100644 --- a/src/tokenizer/machine.rs +++ b/src/tokenizer/machine.rs @@ -1,16 +1,65 @@ -pub(super) mod utils; +mod utils; use crate::entities::try_read_character_reference; use crate::offset::{Offset, Position}; use crate::token::AttrValueSyntax; use crate::tokenizer::CdataAction; -use crate::{reader::Reader, Emitter, Error, Tokenizer}; +use crate::{reader::Reader, Emitter, Error}; use utils::{ ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat, }; pub use utils::State; +pub(super) struct Machine<R, O, E> { + pub(super) state: State, + pub(super) emitter: E, + temporary_buffer: String, + reader: R, + to_reconsume: Stack2<Option<char>>, + character_reference_code: u32, + return_state: Option<State>, + current_tag_name: String, + pub(super) last_start_tag_name: String, + is_start_tag: bool, + /// The reader position before the match block in [`consume`]. + position_before_match: O, + /// * Set to the offset of `<` in [`State::Data`]. + /// * Set to the offset of `-` in [`State::Comment`]. + /// * Set to the offset of `&` in [`State::CharacterReference`]. + some_offset: O, + /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`] + /// (it cannot call [`Tokenizer::set_state`] using the emitted start tags since they can be of an arbitrary type). + /// + /// [`Tokenizer::set_state`]: super::Tokenizer::set_state + pub(crate) naively_switch_state: bool, +} + +impl<R, O, E> Machine<R, O, E> +where + R: Reader + Position<O>, + O: Offset, + E: Emitter<O>, +{ + pub fn new(reader: R, emitter: E) -> Self { + Self { + reader, + emitter, + state: State::Data, + to_reconsume: Stack2::default(), + return_state: None, + temporary_buffer: String::new(), + character_reference_code: 0, + current_tag_name: String::new(), + last_start_tag_name: String::new(), + is_start_tag: false, + position_before_match: O::default(), + some_offset: O::default(), + naively_switch_state: false, + } + } +} + pub enum ControlToken { Eof, Continue, @@ -18,7 +67,7 @@ pub enum ControlToken { } #[inline] -pub fn consume<O, R, E>(slf: &mut Tokenizer<R, O, E>) -> Result<ControlToken, R::Error> +pub(super) fn consume<O, R, E>(slf: &mut Machine<R, O, E>) -> Result<ControlToken, R::Error> where O: Offset, R: Reader + Position<O>, @@ -1964,15 +2013,8 @@ where } } -impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { - #[inline] - fn init_doctype(&mut self) { - self.emitter.init_doctype(self.some_offset); - } -} - #[inline] -pub fn handle_cdata_open<O, R, E>(slf: &mut Tokenizer<R, O, E>, action: CdataAction) +pub(super) fn handle_cdata_open<O, R, E>(slf: &mut Machine<R, O, E>, action: CdataAction) where O: Offset, R: Reader + Position<O>, @@ -1989,3 +2031,29 @@ where } } } + +// this is a stack that can hold 0 to 2 Ts +#[derive(Debug, Default, Clone, Copy)] +struct Stack2<T: Copy>(Option<(T, Option<T>)>); + +impl<T: Copy> Stack2<T> { + #[inline] + fn push(&mut self, c: T) { + self.0 = match self.0 { + None => Some((c, None)), + Some((c1, None)) => Some((c1, Some(c))), + Some((_c1, Some(_c2))) => panic!("stack full!"), + } + } + + #[inline] + fn pop(&mut self) -> Option<T> { + let (new_self, rv) = match self.0 { + Some((c1, Some(c2))) => (Some((c1, None)), Some(c2)), + Some((c1, None)) => (None, Some(c1)), + None => (None, None), + }; + self.0 = new_self; + rv + } +} |