 CHANGELOG.md                             |  7 +-
 integration_tests/tests/test_html5lib.rs | 13 +-
 src/emitter.rs                           | 35 +-
 src/lib.rs                               |  2 +-
 src/machine.rs                           | 37 +-
 src/naive_parser.rs                      | 21 +-
 src/tokenizer.rs                         | 37 +-
 7 files changed, 97 insertions(+), 55 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md index d266c7f..a401bfe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,13 @@ #### Breaking changes +* Iterating over `Tokenizer` now yields values of a new `Event` enum. + `Event::CdataOpen` signals that `Tokenizer::handle_cdata_open` has to be called. + +* `Emitter` trait + + * Removed `adjusted_current_node_present_and_not_in_html_namespace`. + * Added missing `R: Position<O>` bounds for `Tokenizer`/`NaiveParser` constructors. (If you are able to construct a Tokenizer/NaiveParser, you should be able to iterate over it.) diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs index fd69524..f351f85 100644 --- a/integration_tests/tests/test_html5lib.rs +++ b/integration_tests/tests/test_html5lib.rs @@ -4,7 +4,8 @@ use html5lib_tests::{ parse_tests, Error as TestError, InitialState, Output, Test, Token as TestToken, }; use html5tokenizer::{ - offset::NoopOffset, reader::Reader, DefaultEmitter, InternalState, Token, Tokenizer, + offset::NoopOffset, reader::Reader, CdataAction, DefaultEmitter, Event, InternalState, Token, + Tokenizer, }; use similar_asserts::assert_eq; @@ -119,8 +120,14 @@ fn run_test_inner<R: Reader>( tokens: Vec::new(), }; - for token in tokenizer { - let token = token.unwrap(); + while let Some(event) = tokenizer.next() { + let token = match event.unwrap() { + Event::CdataOpen => { + tokenizer.handle_cdata_open(CdataAction::BogusComment); + continue; + } + Event::Token(token) => token, + }; match token { Token::Error { error, .. } => actual.errors.push(TestError { diff --git a/src/emitter.rs b/src/emitter.rs index a5ecd55..dee0aa0 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -156,13 +156,6 @@ pub trait Emitter<O> { /// /// If the current token is not a doctype, this method may panic. fn push_doctype_system_id(&mut self, s: &str); - - /// Returns true if there is an _adjusted current node_ and it is not an element in the HTML namespace. 
- /// - /// See the third list item under [Markup declaration open state]. - /// - /// [Markup declaration open state]: https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state - fn adjusted_current_node_present_and_not_in_html_namespace(&mut self) -> bool; } /// The DefaultEmitter is not exposed in the public API because: @@ -172,26 +165,12 @@ pub trait Emitter<O> { /// #[cfg_attr(not(feature = "integration-tests"), doc = "```ignore")] #[cfg_attr(feature = "integration-tests", doc = "```")] -/// # use html5tokenizer::{DefaultEmitter, Tokenizer, Token}; +/// # use html5tokenizer::{DefaultEmitter, Event, Tokenizer, Token}; /// let emitter = DefaultEmitter::default(); /// let html = "<script><b>"; /// let mut tokens = Tokenizer::new(html, emitter).flatten(); -/// assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "script")); -/// assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "b")); -/// ``` -/// -/// * The DefaultEmitter implements [`Emitter::adjusted_current_node_present_and_not_in_html_namespace`] -/// by returning false, which results in all CDATA sections being tokenized as bogus comments. 
-/// -#[cfg_attr(not(feature = "integration-tests"), doc = "```ignore")] -#[cfg_attr(feature = "integration-tests", doc = "```")] -/// # use html5tokenizer::{DefaultEmitter, Tokenizer, Token}; -/// let emitter = DefaultEmitter::default(); -/// let html = "<svg><![CDATA[I love SVG]]>"; -/// let mut tokens = Tokenizer::new(html, emitter).flatten(); -/// assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "svg")); -/// assert!(matches!(tokens.next().unwrap(), Token::Error {..})); -/// assert!(matches!(tokens.next().unwrap(), Token::Comment(_bogus_comment))); +/// assert!(matches!(tokens.next().unwrap(), Event::Token(Token::StartTag(tag)) if tag.name == "script")); +/// assert!(matches!(tokens.next().unwrap(), Event::Token(Token::StartTag(tag)) if tag.name == "b")); /// ``` /// /// [`Tokenizer::set_state`]: crate::Tokenizer::set_state @@ -467,10 +446,6 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> { debug_assert!(false); } } - - fn adjusted_current_node_present_and_not_in_html_namespace(&mut self) -> bool { - false - } } /// An HTML start tag, such as `<p>` or `<a>`. 
@@ -613,7 +588,7 @@ pub enum Token<O> { #[cfg(test)] mod tests { use super::{DefaultEmitter, Token}; - use crate::{attr::AttrValueSyntax, Tokenizer}; + use crate::{attr::AttrValueSyntax, Event, Tokenizer}; #[test] fn test_attribute_value_syntax() { @@ -622,7 +597,7 @@ mod tests { DefaultEmitter::default(), ) .flatten(); - let Token::StartTag(start_tag) = tokenizer.next().unwrap() else { + let Event::Token(Token::StartTag(start_tag)) = tokenizer.next().unwrap() else { panic!("expected start tag"); }; assert_eq!( @@ -20,7 +20,7 @@ mod utils; pub use emitter::{Comment, Doctype, Emitter, EndTag, StartTag, Token}; pub use error::Error; pub use naive_parser::NaiveParser; -pub use tokenizer::{State, Tokenizer}; +pub use tokenizer::{CdataAction, Event, State, Tokenizer}; #[cfg(feature = "integration-tests")] pub use utils::State as InternalState; diff --git a/src/machine.rs b/src/machine.rs index c5bf021..509dae5 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -1,6 +1,7 @@ use crate::attr::AttrValueSyntax; use crate::entities::try_read_character_reference; use crate::offset::{Offset, Position}; +use crate::tokenizer::CdataAction; use crate::utils::{ ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat, State, }; @@ -9,6 +10,7 @@ use crate::{reader::Reader, Emitter, Error, Tokenizer}; pub enum ControlToken { Eof, Continue, + CdataOpen, } // Note: This is not implemented as a method on Tokenizer because there's fields on Tokenizer that @@ -942,21 +944,7 @@ where slf.doctype_offset = slf.reader.position() - b"<!doctype".len(); Ok(ControlToken::Continue) } - Some('[') if slf.try_read_string("CDATA[", true)? 
=> { - if slf - .emitter - .adjusted_current_node_present_and_not_in_html_namespace() - { - slf.state = State::CdataSection; - } else { - slf.emit_error(Error::CdataInHtmlContent); - - slf.emitter.init_comment(slf.reader.position()); - slf.emitter.push_comment("[CDATA["); - slf.state = State::BogusComment; - } - Ok(ControlToken::Continue) - } + Some('[') if slf.try_read_string("CDATA[", true)? => Ok(ControlToken::CdataOpen), c => { slf.emit_error(Error::IncorrectlyOpenedComment); slf.emitter.init_comment(slf.reader.position() - 1); @@ -1897,3 +1885,22 @@ where } } } + +#[inline] +pub fn handle_cdata_open<O, R, E>(slf: &mut Tokenizer<R, O, E>, action: CdataAction) +where + O: Offset, + R: Reader + Position<O>, + E: Emitter<O>, +{ + match action { + CdataAction::Cdata => slf.state = State::CdataSection, + CdataAction::BogusComment => { + slf.emit_error(Error::CdataInHtmlContent); + + slf.emitter.init_comment(slf.reader.position()); + slf.emitter.push_comment("[CDATA["); + slf.state = State::BogusComment; + } + } +} diff --git a/src/naive_parser.rs b/src/naive_parser.rs index 2626209..f126dfd 100644 --- a/src/naive_parser.rs +++ b/src/naive_parser.rs @@ -1,7 +1,8 @@ use crate::emitter::DefaultEmitter; use crate::offset::{Offset, Position}; use crate::reader::{IntoReader, Reader}; -use crate::{Emitter, State, Tokenizer}; +use crate::tokenizer::CdataAction; +use crate::{Emitter, Event, State, Tokenizer}; /// A naive HTML parser (**not** spec-compliant since it doesn't do tree construction). /// @@ -53,9 +54,21 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Iterator for NaiveParser type Item = Result<E::Token, R::Error>; fn next(&mut self) -> Option<Self::Item> { - // A proper parser would follow the steps described under section '13.2.6 Tree construction' - // of the spec. Since this parser is naive, we directly return the token instead. 
- self.tokenizer.next() + loop { + let event = self.tokenizer.next()?; + match event { + Err(e) => return Some(Err(e)), + Ok(Event::Token(t)) => { + // A proper parser would follow the steps described under section '13.2.6 Tree construction' + // of the spec. Since this parser is naive, we directly return the token instead. + return Some(Ok(t)); + } + Ok(Event::CdataOpen) => { + // Naively parse any CDATA sections as bogus comments. + self.tokenizer.handle_cdata_open(CdataAction::BogusComment) + } + } + } } } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 96d1c34..5b11db0 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -79,6 +79,38 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { naively_switch_state: false, } } + + /// To be called when the tokenizer iterator implementation yields [`Event::CdataOpen`]. + /// + /// For spec-compliant parsing *action* must be [`CdataAction::Cdata`], + /// if there is an _adjusted current node_ and it is not an element in + /// the HTML namespace, or [`CdataAction::BogusComment`] otherwise + /// (as per the third condition under [Markup declaration open state]). + /// + /// [Markup declaration open state]: https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state + pub fn handle_cdata_open(&mut self, action: CdataAction) { + machine::handle_cdata_open(self, action); + } +} + +/// Used by [`Tokenizer::handle_cdata_open`] to determine how to process `<![CDATA[` +/// +/// (Since as per the spec this depends on the _adjusted current node_). +pub enum CdataAction { + /// Process it as CDATA. + Cdata, + /// Process it as a bogus comment. + BogusComment, +} + +/// An event yielded by the [`Iterator`] implementation for the [`Tokenizer`]. +#[derive(Debug)] +pub enum Event<T> { + /// A token emitted by the [`Emitter`]. + Token(T), + /// The state machine encountered `<![CDATA[`. 
You must call [`Tokenizer::handle_cdata_open`], + /// before advancing the tokenizer iterator again. + CdataOpen, } /// The states you can set the tokenizer to. @@ -307,12 +339,12 @@ where R: Reader + Position<O>, E: Emitter<O>, { - type Item = Result<E::Token, R::Error>; + type Item = Result<Event<E::Token>, R::Error>; fn next(&mut self) -> Option<Self::Item> { loop { if let Some(token) = self.emitter.pop_token() { - return Some(Ok(token)); + return Some(Ok(Event::Token(token))); } if self.eof { @@ -326,6 +358,7 @@ where self.eof = true; self.emitter.emit_eof(); } + Ok(ControlToken::CdataOpen) => return Some(Ok(Event::CdataOpen)), } } } |