diff options
-rw-r--r-- | src/naive_parser.rs | 6 | ||||
-rw-r--r-- | src/tokenizer.rs | 263 | ||||
-rw-r--r-- | src/tokenizer/machine.rs | 90 | ||||
-rw-r--r-- | src/tokenizer/machine/utils.rs | 193 |
4 files changed, 293 insertions, 259 deletions
diff --git a/src/naive_parser.rs b/src/naive_parser.rs index 5bf002b..c5e9568 100644 --- a/src/naive_parser.rs +++ b/src/naive_parser.rs @@ -35,7 +35,7 @@ impl<R: Reader + Position<O>, O: Offset> NaiveParser<R, O, DefaultEmitter<O>> { // TODO: add example for NaiveParser::new pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self { let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default()); - tokenizer.naively_switch_state = true; + tokenizer.enable_naive_state_switching(); NaiveParser { tokenizer } } } @@ -45,7 +45,7 @@ impl<R: Reader + Position<usize>> NaiveParser<R, usize, DefaultEmitter<usize>> { // TODO: add example for NaiveParser::new_with_spans pub fn new_with_spans<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self { let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default()); - tokenizer.naively_switch_state = true; + tokenizer.enable_naive_state_switching(); NaiveParser { tokenizer } } } @@ -55,7 +55,7 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> NaiveParser<R, O, E> { // TODO: add example for NaiveParser::new_with_emitter pub fn new_with_emitter<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self { let mut tokenizer = Tokenizer::new(reader, emitter); - tokenizer.naively_switch_state = true; + tokenizer.enable_naive_state_switching(); NaiveParser { tokenizer } } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 6f698f6..7c38e49 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1,41 +1,13 @@ -mod machine; +pub(crate) mod machine; -use crate::naive_parser::naive_next_state; use crate::offset::{Offset, Position}; use crate::reader::{IntoReader, Reader}; -use crate::{Emitter, Error}; -use machine::utils::{control_pat, noncharacter_pat, surrogate_pat}; +use crate::Emitter; use machine::ControlToken; #[cfg(feature = "integration-tests")] pub use machine::State as InternalState; -// this is a stack that can hold 0 to 2 Ts -#[derive(Debug, Default, Clone, Copy)] -struct Stack2<T: Copy>(Option<(T, Option<T>)>); - -impl<T: Copy> Stack2<T> { - #[inline] - fn push(&mut self, c: T) { - self.0 = match self.0 { - None => Some((c, None)), - Some((c1, None)) => Some((c1, Some(c))), - Some((_c1, Some(_c2))) => panic!("stack full!"), - } - } - - #[inline] - fn pop(&mut self) -> Option<T> { - let (new_self, rv) = match self.0 { - Some((c1, Some(c2))) => (Some((c1, None)), Some(c2)), - Some((c1, None)) => (None, Some(c1)), - None => (None, None), - }; - self.0 = new_self; - rv - } -} - /// An HTML tokenizer. /// /// # Warning @@ -56,27 +28,9 @@ impl<T: Copy> Stack2<T> { /// /// [`NaiveParser`]: crate::NaiveParser /// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction -pub struct Tokenizer<R: Reader, O, E: Emitter<O>> { +pub struct Tokenizer<R, O, E> { + machine: machine::Machine<R, O, E>, eof: bool, - pub(crate) state: machine::State, - pub(crate) emitter: E, - pub(crate) temporary_buffer: String, - pub(crate) reader: R, - to_reconsume: Stack2<Option<char>>, - pub(crate) character_reference_code: u32, - pub(crate) return_state: Option<machine::State>, - current_tag_name: String, - last_start_tag_name: String, - is_start_tag: bool, - /// The reader position before the match block in [`machine::consume`]. - pub(crate) position_before_match: O, - /// * Set to the offset of `<` in [`machine::State::Data`]. - /// * Set to the offset of `-` in [`machine::State::Comment`]. - /// * Set to the offset of `&` in [`machine::State::CharacterReference`]. - pub(crate) some_offset: O, - /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`] - /// (it cannot call [`Tokenizer::set_state`] using the emitted start tags since they can be of an arbitrary type). - pub(crate) naively_switch_state: bool, } impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { @@ -88,20 +42,8 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { /// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self { Tokenizer { - reader: reader.into_reader(), - emitter, - state: machine::State::Data, - to_reconsume: Stack2::default(), - return_state: None, - temporary_buffer: String::new(), - character_reference_code: 0, + machine: machine::Machine::new(reader.into_reader(), emitter), eof: false, - current_tag_name: String::new(), - last_start_tag_name: String::new(), - is_start_tag: false, - position_before_match: O::default(), - some_offset: O::default(), - naively_switch_state: false, } } @@ -114,12 +56,12 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { /// /// [Markup declaration open state]: https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state pub fn handle_cdata_open(&mut self, action: CdataAction) { - machine::handle_cdata_open(self, action); + machine::handle_cdata_open(&mut self.machine, action); } /// Returns a mutable reference to the emitter. pub fn emitter_mut(&mut self) -> &mut E { - &mut self.emitter + &mut self.machine.emitter } } @@ -197,185 +139,12 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { /// Only available with the `integration-tests` feature which is not public API. #[cfg(feature = "integration-tests")] pub fn set_internal_state(&mut self, state: InternalState) { - self.state = state; + self.machine.state = state; } /// Set the statemachine to start/continue in the given state. pub fn set_state(&mut self, state: State) { - self.state = state.into(); - } - - /// Just a helper method for the machine. - #[inline] - pub(crate) fn emit_error(&mut self, error: Error) { - let span = match error { - Error::EofBeforeTagName - | Error::EofInCdata - | Error::EofInComment - | Error::EofInDoctype - | Error::EofInScriptHtmlCommentLikeText - | Error::EofInTag - | Error::MissingSemicolonAfterCharacterReference => { - self.reader.position()..self.reader.position() - } - Error::AbsenceOfDigitsInNumericCharacterReference - | Error::NullCharacterReference - | Error::CharacterReferenceOutsideUnicodeRange - | Error::SurrogateCharacterReference - | Error::NoncharacterCharacterReference - | Error::ControlCharacterReference - | Error::UnknownNamedCharacterReference => self.some_offset..self.reader.position(), - - _ => self.position_before_match..self.reader.position(), - }; - self.emitter.report_error(error, span); - } - - /// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise. - /// - /// * the _last start tag_ exists - /// * the current end tag token's name equals to the last start tag's name. - /// - /// See also WHATWG's definition of [appropriate end tag token]. - /// - /// [appropriate end tag token]: https://html.spec.whatwg.org/multipage/parsing.html#appropriate-end-tag-token - #[inline] - pub(crate) fn current_end_tag_is_appropriate(&mut self) -> bool { - self.current_tag_name == self.last_start_tag_name - } - - #[inline] - pub(crate) fn init_start_tag(&mut self) { - self.emitter - .init_start_tag(self.some_offset, self.position_before_match); - self.current_tag_name.clear(); - self.is_start_tag = true; - } - - #[inline] - pub(crate) fn init_end_tag(&mut self) { - self.emitter - .init_end_tag(self.some_offset, self.position_before_match); - self.current_tag_name.clear(); - self.is_start_tag = false; - } - - #[inline] - pub(crate) fn push_tag_name(&mut self, s: &str) { - self.emitter.push_tag_name(s); - self.current_tag_name.push_str(s); - } - - #[inline] - pub(crate) fn emit_current_tag(&mut self) { - self.emitter.emit_current_tag(self.reader.position()); - if self.is_start_tag { - if self.naively_switch_state { - self.state = naive_next_state(&self.current_tag_name).into(); - } - std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name); - } - } - - #[inline] - pub(crate) fn unread_char(&mut self, c: Option<char>) { - self.to_reconsume.push(c); - } - - #[inline] - fn validate_char(&mut self, c: char) { - match c as u32 { - surrogate_pat!() => { - self.emit_error(Error::SurrogateInInputStream); - } - noncharacter_pat!() => { - self.emit_error(Error::NoncharacterInInputStream); - } - // control without whitespace or nul - x @ control_pat!() - if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) => - { - self.emit_error(Error::ControlCharacterInInputStream); - } - _ => (), - } - } - - pub(crate) fn read_char(&mut self) -> Result<Option<char>, R::Error> { - let (c_res, reconsumed) = match self.to_reconsume.pop() { - Some(c) => (Ok(c), true), - None => (self.reader.read_char(), false), - }; - - let mut c = match c_res { - Ok(Some(c)) => c, - res => return res, - }; - - if c == '\r' { - c = '\n'; - let c2 = self.reader.read_char()?; - if c2 != Some('\n') { - self.unread_char(c2); - } - } - - if !reconsumed { - self.validate_char(c); - } - - Ok(Some(c)) - } - - #[inline] - pub(crate) fn try_read_string( - &mut self, - mut s: &str, - case_sensitive: bool, - ) -> Result<bool, R::Error> { - debug_assert!(!s.is_empty()); - - let to_reconsume_bak = self.to_reconsume; - let mut chars = s.chars(); - while let Some(c) = self.to_reconsume.pop() { - if let (Some(x), Some(x2)) = (c, chars.next()) { - if x == x2 || (!case_sensitive && x.to_ascii_lowercase() == x2.to_ascii_lowercase()) - { - s = &s[x.len_utf8()..]; - continue; - } - } - - self.to_reconsume = to_reconsume_bak; - return Ok(false); - } - - self.reader.try_read_string(s, case_sensitive) - } - - pub(crate) fn is_consumed_as_part_of_an_attribute(&self) -> bool { - matches!( - self.return_state, - Some( - machine::State::AttributeValueDoubleQuoted - | machine::State::AttributeValueSingleQuoted - | machine::State::AttributeValueUnquoted - ) - ) - } - - pub(crate) fn flush_code_points_consumed_as_character_reference(&mut self) { - if self.is_consumed_as_part_of_an_attribute() { - self.emitter.push_attribute_value(&self.temporary_buffer); - self.temporary_buffer.clear(); - } else { - self.flush_buffer_characters(); - } - } - - pub(crate) fn flush_buffer_characters(&mut self) { - self.emitter.emit_string(&self.temporary_buffer); - self.temporary_buffer.clear(); + self.machine.state = state.into(); } } @@ -389,7 +158,7 @@ where fn next(&mut self) -> Option<Self::Item> { loop { - if let Some(token) = self.emitter.next() { + if let Some(token) = self.machine.emitter.next() { return Some(Ok(Event::Token(token))); } @@ -397,12 +166,12 @@ where return None; } - match machine::consume(self) { + match machine::consume(&mut self.machine) { Err(e) => return Some(Err(e)), Ok(ControlToken::Continue) => (), Ok(ControlToken::Eof) => { self.eof = true; - self.emitter.emit_eof(); + self.machine.emitter.emit_eof(); } Ok(ControlToken::CdataOpen) => return Some(Ok(Event::CdataOpen)), } @@ -411,12 +180,16 @@ where } impl<R: Reader, O, E: Emitter<O>> Tokenizer<R, O, E> { + pub(crate) fn enable_naive_state_switching(&mut self) { + self.machine.naively_switch_state = true; + } + /// Test-internal function to override internal state. /// /// Only available with the `integration-tests` feature which is not public API. #[cfg(feature = "integration-tests")] pub fn set_last_start_tag(&mut self, last_start_tag: &str) { - self.last_start_tag_name.clear(); - self.last_start_tag_name.push_str(last_start_tag); + self.machine.last_start_tag_name.clear(); + self.machine.last_start_tag_name.push_str(last_start_tag); } } diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs index fc31a42..e9a3e68 100644 --- a/src/tokenizer/machine.rs +++ b/src/tokenizer/machine.rs @@ -1,16 +1,65 @@ -pub(super) mod utils; +mod utils; use crate::entities::try_read_character_reference; use crate::offset::{Offset, Position}; use crate::token::AttrValueSyntax; use crate::tokenizer::CdataAction; -use crate::{reader::Reader, Emitter, Error, Tokenizer}; +use crate::{reader::Reader, Emitter, Error}; use utils::{ ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat, }; pub use utils::State; +pub(super) struct Machine<R, O, E> { + pub(super) state: State, + pub(super) emitter: E, + temporary_buffer: String, + reader: R, + to_reconsume: Stack2<Option<char>>, + character_reference_code: u32, + return_state: Option<State>, + current_tag_name: String, + pub(super) last_start_tag_name: String, + is_start_tag: bool, + /// The reader position before the match block in [`consume`]. + position_before_match: O, + /// * Set to the offset of `<` in [`State::Data`]. + /// * Set to the offset of `-` in [`State::Comment`]. + /// * Set to the offset of `&` in [`State::CharacterReference`]. + some_offset: O, + /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`] + /// (it cannot call [`Tokenizer::set_state`] using the emitted start tags since they can be of an arbitrary type). + /// + /// [`Tokenizer::set_state`]: super::Tokenizer::set_state + pub(crate) naively_switch_state: bool, +} + +impl<R, O, E> Machine<R, O, E> +where + R: Reader + Position<O>, + O: Offset, + E: Emitter<O>, +{ + pub fn new(reader: R, emitter: E) -> Self { + Self { + reader, + emitter, + state: State::Data, + to_reconsume: Stack2::default(), + return_state: None, + temporary_buffer: String::new(), + character_reference_code: 0, + current_tag_name: String::new(), + last_start_tag_name: String::new(), + is_start_tag: false, + position_before_match: O::default(), + some_offset: O::default(), + naively_switch_state: false, + } + } +} + pub enum ControlToken { Eof, Continue, @@ -18,7 +67,7 @@ pub enum ControlToken { } #[inline] -pub fn consume<O, R, E>(slf: &mut Tokenizer<R, O, E>) -> Result<ControlToken, R::Error> +pub(super) fn consume<O, R, E>(slf: &mut Machine<R, O, E>) -> Result<ControlToken, R::Error> where O: Offset, R: Reader + Position<O>, @@ -1964,15 +2013,8 @@ where } } -impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { - #[inline] - fn init_doctype(&mut self) { - self.emitter.init_doctype(self.some_offset); - } -} - #[inline] -pub fn handle_cdata_open<O, R, E>(slf: &mut Tokenizer<R, O, E>, action: CdataAction) +pub(super) fn handle_cdata_open<O, R, E>(slf: &mut Machine<R, O, E>, action: CdataAction) where O: Offset, R: Reader + Position<O>, @@ -1989,3 +2031,29 @@ where } } } + +// this is a stack that can hold 0 to 2 Ts +#[derive(Debug, Default, Clone, Copy)] +struct Stack2<T: Copy>(Option<(T, Option<T>)>); + +impl<T: Copy> Stack2<T> { + #[inline] + fn push(&mut self, c: T) { + self.0 = match self.0 { + None => Some((c, None)), + Some((c1, None)) => Some((c1, Some(c))), + Some((_c1, Some(_c2))) => panic!("stack full!"), + } + } + + #[inline] + fn pop(&mut self) -> Option<T> { + let (new_self, rv) = match self.0 { + Some((c1, Some(c2))) => (Some((c1, None)), Some(c2)), + Some((c1, None)) => (None, Some(c1)), + None => (None, None), + }; + self.0 = new_self; + rv + } +} diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs index 7d220cf..6e45f4d 100644 --- a/src/tokenizer/machine/utils.rs +++ b/src/tokenizer/machine/utils.rs @@ -1,3 +1,196 @@ +use crate::{ + naive_parser::naive_next_state, + offset::{Offset, Position}, + reader::Reader, + Emitter, Error, +}; + +use super::Machine; + +impl<R, O, E> Machine<R, O, E> +where + R: Reader + Position<O>, + O: Offset, + E: Emitter<O>, +{ + #[inline] + pub(crate) fn emit_error(&mut self, error: Error) { + let span = match error { + Error::EofBeforeTagName + | Error::EofInCdata + | Error::EofInComment + | Error::EofInDoctype + | Error::EofInScriptHtmlCommentLikeText + | Error::EofInTag + | Error::MissingSemicolonAfterCharacterReference => { + self.reader.position()..self.reader.position() + } + Error::AbsenceOfDigitsInNumericCharacterReference + | Error::NullCharacterReference + | Error::CharacterReferenceOutsideUnicodeRange + | Error::SurrogateCharacterReference + | Error::NoncharacterCharacterReference + | Error::ControlCharacterReference + | Error::UnknownNamedCharacterReference => self.some_offset..self.reader.position(), + + _ => self.position_before_match..self.reader.position(), + }; + self.emitter.report_error(error, span); + } + + /// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise. + /// + /// * the _last start tag_ exists + /// * the current end tag token's name equals to the last start tag's name. + /// + /// See also WHATWG's definition of [appropriate end tag token]. + /// + /// [appropriate end tag token]: https://html.spec.whatwg.org/multipage/parsing.html#appropriate-end-tag-token + #[inline] + pub(super) fn current_end_tag_is_appropriate(&mut self) -> bool { + self.current_tag_name == self.last_start_tag_name + } + + #[inline] + pub(super) fn init_start_tag(&mut self) { + self.emitter + .init_start_tag(self.some_offset, self.position_before_match); + self.current_tag_name.clear(); + self.is_start_tag = true; + } + + #[inline] + pub(super) fn init_end_tag(&mut self) { + self.emitter + .init_end_tag(self.some_offset, self.position_before_match); + self.current_tag_name.clear(); + self.is_start_tag = false; + } + + #[inline] + pub(super) fn init_doctype(&mut self) { + self.emitter.init_doctype(self.some_offset); + } + + #[inline] + pub(super) fn push_tag_name(&mut self, s: &str) { + self.emitter.push_tag_name(s); + self.current_tag_name.push_str(s); + } + + #[inline] + pub(super) fn emit_current_tag(&mut self) { + self.emitter.emit_current_tag(self.reader.position()); + if self.is_start_tag { + if self.naively_switch_state { + self.state = naive_next_state(&self.current_tag_name).into(); + } + std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name); + } + } + + #[inline] + pub(super) fn unread_char(&mut self, c: Option<char>) { + self.to_reconsume.push(c); + } + + #[inline] + fn validate_char(&mut self, c: char) { + match c as u32 { + surrogate_pat!() => { + self.emit_error(Error::SurrogateInInputStream); + } + noncharacter_pat!() => { + self.emit_error(Error::NoncharacterInInputStream); + } + // control without whitespace or nul + x @ control_pat!() + if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) => + { + self.emit_error(Error::ControlCharacterInInputStream); + } + _ => (), + } + } + + pub(super) fn read_char(&mut self) -> Result<Option<char>, R::Error> { + let (c_res, reconsumed) = match self.to_reconsume.pop() { + Some(c) => (Ok(c), true), + None => (self.reader.read_char(), false), + }; + + let mut c = match c_res { + Ok(Some(c)) => c, + res => return res, + }; + + if c == '\r' { + c = '\n'; + let c2 = self.reader.read_char()?; + if c2 != Some('\n') { + self.unread_char(c2); + } + } + + if !reconsumed { + self.validate_char(c); + } + + Ok(Some(c)) + } + + #[inline] + pub(super) fn try_read_string( + &mut self, + mut s: &str, + case_sensitive: bool, + ) -> Result<bool, R::Error> { + debug_assert!(!s.is_empty()); + + let to_reconsume_bak = self.to_reconsume; + let mut chars = s.chars(); + while let Some(c) = self.to_reconsume.pop() { + if let (Some(x), Some(x2)) = (c, chars.next()) { + if x == x2 || (!case_sensitive && x.to_ascii_lowercase() == x2.to_ascii_lowercase()) + { + s = &s[x.len_utf8()..]; + continue; + } + } + + self.to_reconsume = to_reconsume_bak; + return Ok(false); + } + + self.reader.try_read_string(s, case_sensitive) + } + + pub(super) fn is_consumed_as_part_of_an_attribute(&self) -> bool { + matches!( + self.return_state, + Some( + State::AttributeValueDoubleQuoted + | State::AttributeValueSingleQuoted + | State::AttributeValueUnquoted + ) + ) + } + + pub(super) fn flush_code_points_consumed_as_character_reference(&mut self) { + if self.is_consumed_as_part_of_an_attribute() { + self.emitter.push_attribute_value(&self.temporary_buffer); + self.temporary_buffer.clear(); + } else { + self.flush_buffer_characters(); + } + } + + pub(super) fn flush_buffer_characters(&mut self) { + self.emitter.emit_string(&self.temporary_buffer); + self.temporary_buffer.clear(); + } +} + macro_rules! surrogate_pat { () => { 0xd800..=0xdfff |