diff options
| -rw-r--r-- | src/naive_parser.rs | 6 | ||||
| -rw-r--r-- | src/tokenizer.rs | 263 | ||||
| -rw-r--r-- | src/tokenizer/machine.rs | 90 | ||||
| -rw-r--r-- | src/tokenizer/machine/utils.rs | 193 | 
4 files changed, 293 insertions, 259 deletions
| diff --git a/src/naive_parser.rs b/src/naive_parser.rs index 5bf002b..c5e9568 100644 --- a/src/naive_parser.rs +++ b/src/naive_parser.rs @@ -35,7 +35,7 @@ impl<R: Reader + Position<O>, O: Offset> NaiveParser<R, O, DefaultEmitter<O>> {      // TODO: add example for NaiveParser::new      pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self {          let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default()); -        tokenizer.naively_switch_state = true; +        tokenizer.enable_naive_state_switching();          NaiveParser { tokenizer }      }  } @@ -45,7 +45,7 @@ impl<R: Reader + Position<usize>> NaiveParser<R, usize, DefaultEmitter<usize>> {      // TODO: add example for NaiveParser::new_with_spans      pub fn new_with_spans<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self {          let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default()); -        tokenizer.naively_switch_state = true; +        tokenizer.enable_naive_state_switching();          NaiveParser { tokenizer }      }  } @@ -55,7 +55,7 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> NaiveParser<R, O, E> {      // TODO: add example for NaiveParser::new_with_emitter      pub fn new_with_emitter<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self {          let mut tokenizer = Tokenizer::new(reader, emitter); -        tokenizer.naively_switch_state = true; +        tokenizer.enable_naive_state_switching();          NaiveParser { tokenizer }      } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 6f698f6..7c38e49 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1,41 +1,13 @@ -mod machine; +pub(crate) mod machine; -use crate::naive_parser::naive_next_state;  use crate::offset::{Offset, Position};  use crate::reader::{IntoReader, Reader}; -use crate::{Emitter, Error}; -use machine::utils::{control_pat, noncharacter_pat, surrogate_pat}; +use crate::Emitter;  use machine::ControlToken;  #[cfg(feature = "integration-tests")]  pub use machine::State as InternalState; -// this is a stack that can hold 0 to 2 Ts -#[derive(Debug, Default, Clone, Copy)] -struct Stack2<T: Copy>(Option<(T, Option<T>)>); - -impl<T: Copy> Stack2<T> { -    #[inline] -    fn push(&mut self, c: T) { -        self.0 = match self.0 { -            None => Some((c, None)), -            Some((c1, None)) => Some((c1, Some(c))), -            Some((_c1, Some(_c2))) => panic!("stack full!"), -        } -    } - -    #[inline] -    fn pop(&mut self) -> Option<T> { -        let (new_self, rv) = match self.0 { -            Some((c1, Some(c2))) => (Some((c1, None)), Some(c2)), -            Some((c1, None)) => (None, Some(c1)), -            None => (None, None), -        }; -        self.0 = new_self; -        rv -    } -} -  /// An HTML tokenizer.  ///  /// # Warning @@ -56,27 +28,9 @@ impl<T: Copy> Stack2<T> {  ///  /// [`NaiveParser`]: crate::NaiveParser  /// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction -pub struct Tokenizer<R: Reader, O, E: Emitter<O>> { +pub struct Tokenizer<R, O, E> { +    machine: machine::Machine<R, O, E>,      eof: bool, -    pub(crate) state: machine::State, -    pub(crate) emitter: E, -    pub(crate) temporary_buffer: String, -    pub(crate) reader: R, -    to_reconsume: Stack2<Option<char>>, -    pub(crate) character_reference_code: u32, -    pub(crate) return_state: Option<machine::State>, -    current_tag_name: String, -    last_start_tag_name: String, -    is_start_tag: bool, -    /// The reader position before the match block in [`machine::consume`]. -    pub(crate) position_before_match: O, -    /// * Set to the offset of `<` in [`machine::State::Data`]. -    /// * Set to the offset of `-` in [`machine::State::Comment`]. -    /// * Set to the offset of `&` in [`machine::State::CharacterReference`]. -    pub(crate) some_offset: O, -    /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`] -    /// (it cannot call [`Tokenizer::set_state`] using the emitted start tags since they can be of an arbitrary type). -    pub(crate) naively_switch_state: bool,  }  impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { @@ -88,20 +42,8 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {      /// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction      pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self {          Tokenizer { -            reader: reader.into_reader(), -            emitter, -            state: machine::State::Data, -            to_reconsume: Stack2::default(), -            return_state: None, -            temporary_buffer: String::new(), -            character_reference_code: 0, +            machine: machine::Machine::new(reader.into_reader(), emitter),              eof: false, -            current_tag_name: String::new(), -            last_start_tag_name: String::new(), -            is_start_tag: false, -            position_before_match: O::default(), -            some_offset: O::default(), -            naively_switch_state: false,          }      } @@ -114,12 +56,12 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {      ///      /// [Markup declaration open state]: https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state      pub fn handle_cdata_open(&mut self, action: CdataAction) { -        machine::handle_cdata_open(self, action); +        machine::handle_cdata_open(&mut self.machine, action);      }      /// Returns a mutable reference to the emitter.      pub fn emitter_mut(&mut self) -> &mut E { -        &mut self.emitter +        &mut self.machine.emitter      }  } @@ -197,185 +139,12 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {      /// Only available with the `integration-tests` feature which is not public API.      #[cfg(feature = "integration-tests")]      pub fn set_internal_state(&mut self, state: InternalState) { -        self.state = state; +        self.machine.state = state;      }      /// Set the statemachine to start/continue in the given state.      pub fn set_state(&mut self, state: State) { -        self.state = state.into(); -    } - -    /// Just a helper method for the machine. -    #[inline] -    pub(crate) fn emit_error(&mut self, error: Error) { -        let span = match error { -            Error::EofBeforeTagName -            | Error::EofInCdata -            | Error::EofInComment -            | Error::EofInDoctype -            | Error::EofInScriptHtmlCommentLikeText -            | Error::EofInTag -            | Error::MissingSemicolonAfterCharacterReference => { -                self.reader.position()..self.reader.position() -            } -            Error::AbsenceOfDigitsInNumericCharacterReference -            | Error::NullCharacterReference -            | Error::CharacterReferenceOutsideUnicodeRange -            | Error::SurrogateCharacterReference -            | Error::NoncharacterCharacterReference -            | Error::ControlCharacterReference -            | Error::UnknownNamedCharacterReference => self.some_offset..self.reader.position(), - -            _ => self.position_before_match..self.reader.position(), -        }; -        self.emitter.report_error(error, span); -    } - -    /// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise. -    /// -    /// * the _last start tag_ exists -    /// * the current end tag token's name equals to the last start tag's name. -    /// -    /// See also WHATWG's definition of [appropriate end tag token]. -    /// -    /// [appropriate end tag token]: https://html.spec.whatwg.org/multipage/parsing.html#appropriate-end-tag-token -    #[inline] -    pub(crate) fn current_end_tag_is_appropriate(&mut self) -> bool { -        self.current_tag_name == self.last_start_tag_name -    } - -    #[inline] -    pub(crate) fn init_start_tag(&mut self) { -        self.emitter -            .init_start_tag(self.some_offset, self.position_before_match); -        self.current_tag_name.clear(); -        self.is_start_tag = true; -    } - -    #[inline] -    pub(crate) fn init_end_tag(&mut self) { -        self.emitter -            .init_end_tag(self.some_offset, self.position_before_match); -        self.current_tag_name.clear(); -        self.is_start_tag = false; -    } - -    #[inline] -    pub(crate) fn push_tag_name(&mut self, s: &str) { -        self.emitter.push_tag_name(s); -        self.current_tag_name.push_str(s); -    } - -    #[inline] -    pub(crate) fn emit_current_tag(&mut self) { -        self.emitter.emit_current_tag(self.reader.position()); -        if self.is_start_tag { -            if self.naively_switch_state { -                self.state = naive_next_state(&self.current_tag_name).into(); -            } -            std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name); -        } -    } - -    #[inline] -    pub(crate) fn unread_char(&mut self, c: Option<char>) { -        self.to_reconsume.push(c); -    } - -    #[inline] -    fn validate_char(&mut self, c: char) { -        match c as u32 { -            surrogate_pat!() => { -                self.emit_error(Error::SurrogateInInputStream); -            } -            noncharacter_pat!() => { -                self.emit_error(Error::NoncharacterInInputStream); -            } -            // control without whitespace or nul -            x @ control_pat!() -                if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) => -            { -                self.emit_error(Error::ControlCharacterInInputStream); -            } -            _ => (), -        } -    } - -    pub(crate) fn read_char(&mut self) -> Result<Option<char>, R::Error> { -        let (c_res, reconsumed) = match self.to_reconsume.pop() { -            Some(c) => (Ok(c), true), -            None => (self.reader.read_char(), false), -        }; - -        let mut c = match c_res { -            Ok(Some(c)) => c, -            res => return res, -        }; - -        if c == '\r' { -            c = '\n'; -            let c2 = self.reader.read_char()?; -            if c2 != Some('\n') { -                self.unread_char(c2); -            } -        } - -        if !reconsumed { -            self.validate_char(c); -        } - -        Ok(Some(c)) -    } - -    #[inline] -    pub(crate) fn try_read_string( -        &mut self, -        mut s: &str, -        case_sensitive: bool, -    ) -> Result<bool, R::Error> { -        debug_assert!(!s.is_empty()); - -        let to_reconsume_bak = self.to_reconsume; -        let mut chars = s.chars(); -        while let Some(c) = self.to_reconsume.pop() { -            if let (Some(x), Some(x2)) = (c, chars.next()) { -                if x == x2 || (!case_sensitive && x.to_ascii_lowercase() == x2.to_ascii_lowercase()) -                { -                    s = &s[x.len_utf8()..]; -                    continue; -                } -            } - -            self.to_reconsume = to_reconsume_bak; -            return Ok(false); -        } - -        self.reader.try_read_string(s, case_sensitive) -    } - -    pub(crate) fn is_consumed_as_part_of_an_attribute(&self) -> bool { -        matches!( -            self.return_state, -            Some( -                machine::State::AttributeValueDoubleQuoted -                    | machine::State::AttributeValueSingleQuoted -                    | machine::State::AttributeValueUnquoted -            ) -        ) -    } - -    pub(crate) fn flush_code_points_consumed_as_character_reference(&mut self) { -        if self.is_consumed_as_part_of_an_attribute() { -            self.emitter.push_attribute_value(&self.temporary_buffer); -            self.temporary_buffer.clear(); -        } else { -            self.flush_buffer_characters(); -        } -    } - -    pub(crate) fn flush_buffer_characters(&mut self) { -        self.emitter.emit_string(&self.temporary_buffer); -        self.temporary_buffer.clear(); +        self.machine.state = state.into();      }  } @@ -389,7 +158,7 @@ where      fn next(&mut self) -> Option<Self::Item> {          loop { -            if let Some(token) = self.emitter.next() { +            if let Some(token) = self.machine.emitter.next() {                  return Some(Ok(Event::Token(token)));              } @@ -397,12 +166,12 @@ where                  return None;              } -            match machine::consume(self) { +            match machine::consume(&mut self.machine) {                  Err(e) => return Some(Err(e)),                  Ok(ControlToken::Continue) => (),                  Ok(ControlToken::Eof) => {                      self.eof = true; -                    self.emitter.emit_eof(); +                    self.machine.emitter.emit_eof();                  }                  Ok(ControlToken::CdataOpen) => return Some(Ok(Event::CdataOpen)),              } @@ -411,12 +180,16 @@ where  }  impl<R: Reader, O, E: Emitter<O>> Tokenizer<R, O, E> { +    pub(crate) fn enable_naive_state_switching(&mut self) { +        self.machine.naively_switch_state = true; +    } +      /// Test-internal function to override internal state.      ///      /// Only available with the `integration-tests` feature which is not public API.      #[cfg(feature = "integration-tests")]      pub fn set_last_start_tag(&mut self, last_start_tag: &str) { -        self.last_start_tag_name.clear(); -        self.last_start_tag_name.push_str(last_start_tag); +        self.machine.last_start_tag_name.clear(); +        self.machine.last_start_tag_name.push_str(last_start_tag);      }  } diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs index fc31a42..e9a3e68 100644 --- a/src/tokenizer/machine.rs +++ b/src/tokenizer/machine.rs @@ -1,16 +1,65 @@ -pub(super) mod utils; +mod utils;  use crate::entities::try_read_character_reference;  use crate::offset::{Offset, Position};  use crate::token::AttrValueSyntax;  use crate::tokenizer::CdataAction; -use crate::{reader::Reader, Emitter, Error, Tokenizer}; +use crate::{reader::Reader, Emitter, Error};  use utils::{      ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat,  };  pub use utils::State; +pub(super) struct Machine<R, O, E> { +    pub(super) state: State, +    pub(super) emitter: E, +    temporary_buffer: String, +    reader: R, +    to_reconsume: Stack2<Option<char>>, +    character_reference_code: u32, +    return_state: Option<State>, +    current_tag_name: String, +    pub(super) last_start_tag_name: String, +    is_start_tag: bool, +    /// The reader position before the match block in [`consume`]. +    position_before_match: O, +    /// * Set to the offset of `<` in [`State::Data`]. +    /// * Set to the offset of `-` in [`State::Comment`]. +    /// * Set to the offset of `&` in [`State::CharacterReference`]. +    some_offset: O, +    /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`] +    /// (it cannot call [`Tokenizer::set_state`] using the emitted start tags since they can be of an arbitrary type). +    /// +    /// [`Tokenizer::set_state`]: super::Tokenizer::set_state +    pub(crate) naively_switch_state: bool, +} + +impl<R, O, E> Machine<R, O, E> +where +    R: Reader + Position<O>, +    O: Offset, +    E: Emitter<O>, +{ +    pub fn new(reader: R, emitter: E) -> Self { +        Self { +            reader, +            emitter, +            state: State::Data, +            to_reconsume: Stack2::default(), +            return_state: None, +            temporary_buffer: String::new(), +            character_reference_code: 0, +            current_tag_name: String::new(), +            last_start_tag_name: String::new(), +            is_start_tag: false, +            position_before_match: O::default(), +            some_offset: O::default(), +            naively_switch_state: false, +        } +    } +} +  pub enum ControlToken {      Eof,      Continue, @@ -18,7 +67,7 @@ pub enum ControlToken {  }  #[inline] -pub fn consume<O, R, E>(slf: &mut Tokenizer<R, O, E>) -> Result<ControlToken, R::Error> +pub(super) fn consume<O, R, E>(slf: &mut Machine<R, O, E>) -> Result<ControlToken, R::Error>  where      O: Offset,      R: Reader + Position<O>, @@ -1964,15 +2013,8 @@ where      }  } -impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { -    #[inline] -    fn init_doctype(&mut self) { -        self.emitter.init_doctype(self.some_offset); -    } -} -  #[inline] -pub fn handle_cdata_open<O, R, E>(slf: &mut Tokenizer<R, O, E>, action: CdataAction) +pub(super) fn handle_cdata_open<O, R, E>(slf: &mut Machine<R, O, E>, action: CdataAction)  where      O: Offset,      R: Reader + Position<O>, @@ -1989,3 +2031,29 @@ where          }      }  } + +// this is a stack that can hold 0 to 2 Ts +#[derive(Debug, Default, Clone, Copy)] +struct Stack2<T: Copy>(Option<(T, Option<T>)>); + +impl<T: Copy> Stack2<T> { +    #[inline] +    fn push(&mut self, c: T) { +        self.0 = match self.0 { +            None => Some((c, None)), +            Some((c1, None)) => Some((c1, Some(c))), +            Some((_c1, Some(_c2))) => panic!("stack full!"), +        } +    } + +    #[inline] +    fn pop(&mut self) -> Option<T> { +        let (new_self, rv) = match self.0 { +            Some((c1, Some(c2))) => (Some((c1, None)), Some(c2)), +            Some((c1, None)) => (None, Some(c1)), +            None => (None, None), +        }; +        self.0 = new_self; +        rv +    } +} diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs index 7d220cf..6e45f4d 100644 --- a/src/tokenizer/machine/utils.rs +++ b/src/tokenizer/machine/utils.rs @@ -1,3 +1,196 @@ +use crate::{ +    naive_parser::naive_next_state, +    offset::{Offset, Position}, +    reader::Reader, +    Emitter, Error, +}; + +use super::Machine; + +impl<R, O, E> Machine<R, O, E> +where +    R: Reader + Position<O>, +    O: Offset, +    E: Emitter<O>, +{ +    #[inline] +    pub(crate) fn emit_error(&mut self, error: Error) { +        let span = match error { +            Error::EofBeforeTagName +            | Error::EofInCdata +            | Error::EofInComment +            | Error::EofInDoctype +            | Error::EofInScriptHtmlCommentLikeText +            | Error::EofInTag +            | Error::MissingSemicolonAfterCharacterReference => { +                self.reader.position()..self.reader.position() +            } +            Error::AbsenceOfDigitsInNumericCharacterReference +            | Error::NullCharacterReference +            | Error::CharacterReferenceOutsideUnicodeRange +            | Error::SurrogateCharacterReference +            | Error::NoncharacterCharacterReference +            | Error::ControlCharacterReference +            | Error::UnknownNamedCharacterReference => self.some_offset..self.reader.position(), + +            _ => self.position_before_match..self.reader.position(), +        }; +        self.emitter.report_error(error, span); +    } + +    /// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise. +    /// +    /// * the _last start tag_ exists +    /// * the current end tag token's name equals to the last start tag's name. +    /// +    /// See also WHATWG's definition of [appropriate end tag token]. +    /// +    /// [appropriate end tag token]: https://html.spec.whatwg.org/multipage/parsing.html#appropriate-end-tag-token +    #[inline] +    pub(super) fn current_end_tag_is_appropriate(&mut self) -> bool { +        self.current_tag_name == self.last_start_tag_name +    } + +    #[inline] +    pub(super) fn init_start_tag(&mut self) { +        self.emitter +            .init_start_tag(self.some_offset, self.position_before_match); +        self.current_tag_name.clear(); +        self.is_start_tag = true; +    } + +    #[inline] +    pub(super) fn init_end_tag(&mut self) { +        self.emitter +            .init_end_tag(self.some_offset, self.position_before_match); +        self.current_tag_name.clear(); +        self.is_start_tag = false; +    } + +    #[inline] +    pub(super) fn init_doctype(&mut self) { +        self.emitter.init_doctype(self.some_offset); +    } + +    #[inline] +    pub(super) fn push_tag_name(&mut self, s: &str) { +        self.emitter.push_tag_name(s); +        self.current_tag_name.push_str(s); +    } + +    #[inline] +    pub(super) fn emit_current_tag(&mut self) { +        self.emitter.emit_current_tag(self.reader.position()); +        if self.is_start_tag { +            if self.naively_switch_state { +                self.state = naive_next_state(&self.current_tag_name).into(); +            } +            std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name); +        } +    } + +    #[inline] +    pub(super) fn unread_char(&mut self, c: Option<char>) { +        self.to_reconsume.push(c); +    } + +    #[inline] +    fn validate_char(&mut self, c: char) { +        match c as u32 { +            surrogate_pat!() => { +                self.emit_error(Error::SurrogateInInputStream); +            } +            noncharacter_pat!() => { +                self.emit_error(Error::NoncharacterInInputStream); +            } +            // control without whitespace or nul +            x @ control_pat!() +                if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) => +            { +                self.emit_error(Error::ControlCharacterInInputStream); +            } +            _ => (), +        } +    } + +    pub(super) fn read_char(&mut self) -> Result<Option<char>, R::Error> { +        let (c_res, reconsumed) = match self.to_reconsume.pop() { +            Some(c) => (Ok(c), true), +            None => (self.reader.read_char(), false), +        }; + +        let mut c = match c_res { +            Ok(Some(c)) => c, +            res => return res, +        }; + +        if c == '\r' { +            c = '\n'; +            let c2 = self.reader.read_char()?; +            if c2 != Some('\n') { +                self.unread_char(c2); +            } +        } + +        if !reconsumed { +            self.validate_char(c); +        } + +        Ok(Some(c)) +    } + +    #[inline] +    pub(super) fn try_read_string( +        &mut self, +        mut s: &str, +        case_sensitive: bool, +    ) -> Result<bool, R::Error> { +        debug_assert!(!s.is_empty()); + +        let to_reconsume_bak = self.to_reconsume; +        let mut chars = s.chars(); +        while let Some(c) = self.to_reconsume.pop() { +            if let (Some(x), Some(x2)) = (c, chars.next()) { +                if x == x2 || (!case_sensitive && x.to_ascii_lowercase() == x2.to_ascii_lowercase()) +                { +                    s = &s[x.len_utf8()..]; +                    continue; +                } +            } + +            self.to_reconsume = to_reconsume_bak; +            return Ok(false); +        } + +        self.reader.try_read_string(s, case_sensitive) +    } + +    pub(super) fn is_consumed_as_part_of_an_attribute(&self) -> bool { +        matches!( +            self.return_state, +            Some( +                State::AttributeValueDoubleQuoted +                    | State::AttributeValueSingleQuoted +                    | State::AttributeValueUnquoted +            ) +        ) +    } + +    pub(super) fn flush_code_points_consumed_as_character_reference(&mut self) { +        if self.is_consumed_as_part_of_an_attribute() { +            self.emitter.push_attribute_value(&self.temporary_buffer); +            self.temporary_buffer.clear(); +        } else { +            self.flush_buffer_characters(); +        } +    } + +    pub(super) fn flush_buffer_characters(&mut self) { +        self.emitter.emit_string(&self.temporary_buffer); +        self.temporary_buffer.clear(); +    } +} +  macro_rules! surrogate_pat {      () => {          0xd800..=0xdfff | 
