diff options
| -rw-r--r-- | src/tokenizer.rs | 39 | ||||
| -rw-r--r-- | src/tokenizer/machine.rs | 4 | 
2 files changed, 24 insertions, 19 deletions
| diff --git a/src/tokenizer.rs b/src/tokenizer.rs index b46bf45..7e05477 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -3,10 +3,13 @@ mod machine;  use crate::naive_parser::naive_next_state;  use crate::offset::{Offset, Position};  use crate::reader::{IntoReader, Reader}; -use crate::utils::{control_pat, noncharacter_pat, surrogate_pat, State as InternalState}; +use crate::utils::{control_pat, noncharacter_pat, surrogate_pat};  use crate::{Emitter, Error};  use machine::ControlToken; +#[cfg(feature = "integration-tests")] +use crate::utils::State as InternalState; +  // this is a stack that can hold 0 to 2 Ts  #[derive(Debug, Default, Clone, Copy)]  struct Stack2<T: Copy>(Option<(T, Option<T>)>); @@ -55,21 +58,21 @@ impl<T: Copy> Stack2<T> {  /// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction  pub struct Tokenizer<R: Reader, O, E: Emitter<O>> {      eof: bool, -    pub(crate) state: InternalState, +    pub(crate) state: machine::State,      pub(crate) emitter: E,      pub(crate) temporary_buffer: String,      pub(crate) reader: R,      to_reconsume: Stack2<Option<char>>,      pub(crate) character_reference_code: u32, -    pub(crate) return_state: Option<InternalState>, +    pub(crate) return_state: Option<machine::State>,      current_tag_name: String,      last_start_tag_name: String,      is_start_tag: bool,      /// The reader position before the match block in [`machine::consume`].      pub(crate) position_before_match: O, -    /// * Set to the offset of `<` in [`InternalState::Data`]. -    /// * Set to the offset of `-` in [`InternalState::Comment`]. -    /// * Set to the offset of `&` in [`InternalState::CharacterReference`]. +    /// * Set to the offset of `<` in [`machine::State::Data`]. +    /// * Set to the offset of `-` in [`machine::State::Comment`]. +    /// * Set to the offset of `&` in [`machine::State::CharacterReference`].      pub(crate) some_offset: O,      /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`]      /// (it cannot call [`Tokenizer::set_state`] using the emitted start tags since they can be of an arbitrary type). @@ -87,7 +90,7 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {          Tokenizer {              reader: reader.into_reader(),              emitter, -            state: InternalState::Data, +            state: machine::State::Data,              to_reconsume: Stack2::default(),              return_state: None,              temporary_buffer: String::new(), @@ -174,16 +177,16 @@ pub enum State {      ScriptDataDoubleEscaped,  } -impl From<State> for InternalState { +impl From<State> for machine::State {      fn from(state: State) -> Self {          match state { -            State::Data => InternalState::Data, -            State::PlainText => InternalState::PlainText, -            State::RcData => InternalState::RcData, -            State::RawText => InternalState::RawText, -            State::ScriptData => InternalState::ScriptData, -            State::ScriptDataEscaped => InternalState::ScriptDataEscaped, -            State::ScriptDataDoubleEscaped => InternalState::ScriptDataDoubleEscaped, +            State::Data => machine::State::Data, +            State::PlainText => machine::State::PlainText, +            State::RcData => machine::State::RcData, +            State::RawText => machine::State::RawText, +            State::ScriptData => machine::State::ScriptData, +            State::ScriptDataEscaped => machine::State::ScriptDataEscaped, +            State::ScriptDataDoubleEscaped => machine::State::ScriptDataDoubleEscaped,          }      }  } @@ -354,9 +357,9 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {          matches!(              self.return_state,              Some( -                InternalState::AttributeValueDoubleQuoted -                    | InternalState::AttributeValueSingleQuoted -                    | InternalState::AttributeValueUnquoted +                machine::State::AttributeValueDoubleQuoted +                    | machine::State::AttributeValueSingleQuoted +                    | machine::State::AttributeValueUnquoted              )          )      } diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs index fd4b36b..07d4c05 100644 --- a/src/tokenizer/machine.rs +++ b/src/tokenizer/machine.rs @@ -3,10 +3,12 @@ use crate::offset::{Offset, Position};  use crate::token::AttrValueSyntax;  use crate::tokenizer::CdataAction;  use crate::utils::{ -    ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat, State, +    ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat,  };  use crate::{reader::Reader, Emitter, Error, Tokenizer}; +pub use crate::utils::State; +  pub enum ControlToken {      Eof,      Continue, | 
