diff options
| -rw-r--r-- | CHANGELOG.md | 7 |
| -rw-r--r-- | integration_tests/tests/test_html5lib.rs | 13 |
| -rw-r--r-- | src/emitter.rs | 35 |
| -rw-r--r-- | src/lib.rs | 2 |
| -rw-r--r-- | src/machine.rs | 37 |
| -rw-r--r-- | src/naive_parser.rs | 21 |
| -rw-r--r-- | src/tokenizer.rs | 37 |
7 files changed, 97 insertions(+), 55 deletions(-)
| diff --git a/CHANGELOG.md b/CHANGELOG.md index d266c7f..a401bfe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,13 @@  #### Breaking changes +* Iterating over `Tokenizer` now yields values of a new `Event` enum. +  `Event::CdataOpen` signals that `Tokenizer::handle_cdata_open` has to be called. + +* `Emitter` trait + +  * Removed `adjusted_current_node_present_and_not_in_html_namespace`. +  * Added missing `R: Position<O>` bounds for `Tokenizer`/`NaiveParser` constructors.      (If you are able to construct a Tokenizer/NaiveParser,    you should be able to iterate over it.) diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs index fd69524..f351f85 100644 --- a/integration_tests/tests/test_html5lib.rs +++ b/integration_tests/tests/test_html5lib.rs @@ -4,7 +4,8 @@ use html5lib_tests::{      parse_tests, Error as TestError, InitialState, Output, Test, Token as TestToken,  };  use html5tokenizer::{ -    offset::NoopOffset, reader::Reader, DefaultEmitter, InternalState, Token, Tokenizer, +    offset::NoopOffset, reader::Reader, CdataAction, DefaultEmitter, Event, InternalState, Token, +    Tokenizer,  };  use similar_asserts::assert_eq; @@ -119,8 +120,14 @@ fn run_test_inner<R: Reader>(          tokens: Vec::new(),      }; -    for token in tokenizer { -        let token = token.unwrap(); +    while let Some(event) = tokenizer.next() { +        let token = match event.unwrap() { +            Event::CdataOpen => { +                tokenizer.handle_cdata_open(CdataAction::BogusComment); +                continue; +            } +            Event::Token(token) => token, +        };          match token {              Token::Error { error, .. } => actual.errors.push(TestError { diff --git a/src/emitter.rs b/src/emitter.rs index a5ecd55..dee0aa0 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -156,13 +156,6 @@ pub trait Emitter<O> {      ///      /// If the current token is not a doctype, this method may panic.      
fn push_doctype_system_id(&mut self, s: &str); - -    /// Returns true if there is an _adjusted current node_ and it is not an element in the HTML namespace. -    /// -    /// See the third list item under [Markup declaration open state]. -    /// -    /// [Markup declaration open state]: https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state -    fn adjusted_current_node_present_and_not_in_html_namespace(&mut self) -> bool;  }  /// The DefaultEmitter is not exposed in the public API because: @@ -172,26 +165,12 @@ pub trait Emitter<O> {  ///  #[cfg_attr(not(feature = "integration-tests"), doc = "```ignore")]  #[cfg_attr(feature = "integration-tests", doc = "```")] -///   # use html5tokenizer::{DefaultEmitter, Tokenizer, Token}; +///   # use html5tokenizer::{DefaultEmitter, Event, Tokenizer, Token};  ///   let emitter = DefaultEmitter::default();  ///   let html = "<script><b>";  ///   let mut tokens = Tokenizer::new(html, emitter).flatten(); -///   assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "script")); -///   assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "b")); -///   ``` -/// -/// * The DefaultEmitter implements [`Emitter::adjusted_current_node_present_and_not_in_html_namespace`] -///   by returning false, which results in all CDATA sections being tokenized as bogus comments. 
-/// -#[cfg_attr(not(feature = "integration-tests"), doc = "```ignore")] -#[cfg_attr(feature = "integration-tests", doc = "```")] -///   # use html5tokenizer::{DefaultEmitter, Tokenizer, Token}; -///   let emitter = DefaultEmitter::default(); -///   let html = "<svg><![CDATA[I love SVG]]>"; -///   let mut tokens = Tokenizer::new(html, emitter).flatten(); -///   assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "svg")); -///   assert!(matches!(tokens.next().unwrap(), Token::Error {..})); -///   assert!(matches!(tokens.next().unwrap(), Token::Comment(_bogus_comment))); +///   assert!(matches!(tokens.next().unwrap(), Event::Token(Token::StartTag(tag)) if tag.name == "script")); +///   assert!(matches!(tokens.next().unwrap(), Event::Token(Token::StartTag(tag)) if tag.name == "b"));  ///   ```  ///  /// [`Tokenizer::set_state`]: crate::Tokenizer::set_state @@ -467,10 +446,6 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {              debug_assert!(false);          }      } - -    fn adjusted_current_node_present_and_not_in_html_namespace(&mut self) -> bool { -        false -    }  }  /// An HTML start tag, such as `<p>` or `<a>`. 
@@ -613,7 +588,7 @@ pub enum Token<O> {  #[cfg(test)]  mod tests {      use super::{DefaultEmitter, Token}; -    use crate::{attr::AttrValueSyntax, Tokenizer}; +    use crate::{attr::AttrValueSyntax, Event, Tokenizer};      #[test]      fn test_attribute_value_syntax() { @@ -622,7 +597,7 @@ mod tests {              DefaultEmitter::default(),          )          .flatten(); -        let Token::StartTag(start_tag) = tokenizer.next().unwrap() else { +        let Event::Token(Token::StartTag(start_tag)) = tokenizer.next().unwrap() else {              panic!("expected start tag");          };          assert_eq!( @@ -20,7 +20,7 @@ mod utils;  pub use emitter::{Comment, Doctype, Emitter, EndTag, StartTag, Token};  pub use error::Error;  pub use naive_parser::NaiveParser; -pub use tokenizer::{State, Tokenizer}; +pub use tokenizer::{CdataAction, Event, State, Tokenizer};  #[cfg(feature = "integration-tests")]  pub use utils::State as InternalState; diff --git a/src/machine.rs b/src/machine.rs index c5bf021..509dae5 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -1,6 +1,7 @@  use crate::attr::AttrValueSyntax;  use crate::entities::try_read_character_reference;  use crate::offset::{Offset, Position}; +use crate::tokenizer::CdataAction;  use crate::utils::{      ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat, State,  }; @@ -9,6 +10,7 @@ use crate::{reader::Reader, Emitter, Error, Tokenizer};  pub enum ControlToken {      Eof,      Continue, +    CdataOpen,  }  // Note: This is not implemented as a method on Tokenizer because there's fields on Tokenizer that @@ -942,21 +944,7 @@ where                  slf.doctype_offset = slf.reader.position() - b"<!doctype".len();                  Ok(ControlToken::Continue)              } -            Some('[') if slf.try_read_string("CDATA[", true)? 
=> { -                if slf -                    .emitter -                    .adjusted_current_node_present_and_not_in_html_namespace() -                { -                    slf.state = State::CdataSection; -                } else { -                    slf.emit_error(Error::CdataInHtmlContent); - -                    slf.emitter.init_comment(slf.reader.position()); -                    slf.emitter.push_comment("[CDATA["); -                    slf.state = State::BogusComment; -                } -                Ok(ControlToken::Continue) -            } +            Some('[') if slf.try_read_string("CDATA[", true)? => Ok(ControlToken::CdataOpen),              c => {                  slf.emit_error(Error::IncorrectlyOpenedComment);                  slf.emitter.init_comment(slf.reader.position() - 1); @@ -1897,3 +1885,22 @@ where          }      }  } + +#[inline] +pub fn handle_cdata_open<O, R, E>(slf: &mut Tokenizer<R, O, E>, action: CdataAction) +where +    O: Offset, +    R: Reader + Position<O>, +    E: Emitter<O>, +{ +    match action { +        CdataAction::Cdata => slf.state = State::CdataSection, +        CdataAction::BogusComment => { +            slf.emit_error(Error::CdataInHtmlContent); + +            slf.emitter.init_comment(slf.reader.position()); +            slf.emitter.push_comment("[CDATA["); +            slf.state = State::BogusComment; +        } +    } +} diff --git a/src/naive_parser.rs b/src/naive_parser.rs index 2626209..f126dfd 100644 --- a/src/naive_parser.rs +++ b/src/naive_parser.rs @@ -1,7 +1,8 @@  use crate::emitter::DefaultEmitter;  use crate::offset::{Offset, Position};  use crate::reader::{IntoReader, Reader}; -use crate::{Emitter, State, Tokenizer}; +use crate::tokenizer::CdataAction; +use crate::{Emitter, Event, State, Tokenizer};  /// A naive HTML parser (**not** spec-compliant since it doesn't do tree construction).  
/// @@ -53,9 +54,21 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Iterator for NaiveParser      type Item = Result<E::Token, R::Error>;      fn next(&mut self) -> Option<Self::Item> { -        // A proper parser would follow the steps described under section '13.2.6 Tree construction' -        // of the spec. Since this parser is naive, we directly return the token instead. -        self.tokenizer.next() +        loop { +            let event = self.tokenizer.next()?; +            match event { +                Err(e) => return Some(Err(e)), +                Ok(Event::Token(t)) => { +                    // A proper parser would follow the steps described under section '13.2.6 Tree construction' +                    // of the spec. Since this parser is naive, we directly return the token instead. +                    return Some(Ok(t)); +                } +                Ok(Event::CdataOpen) => { +                    // Naively parse any CDATA sections as bogus comments. +                    self.tokenizer.handle_cdata_open(CdataAction::BogusComment) +                } +            } +        }      }  } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 96d1c34..5b11db0 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -79,6 +79,38 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {              naively_switch_state: false,          }      } + +    /// To be called when the tokenizer iterator implementation yields [`Event::CdataOpen`]. +    /// +    /// For spec-compliant parsing *action* must be [`CdataAction::Cdata`], +    /// if there is an _adjusted current node_ and it is not an element in +    /// the HTML namespace, or [`CdataAction::BogusComment`] otherwise +    /// (as per the third condition under [Markup declaration open state]). 
+    /// +    /// [Markup declaration open state]: https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state +    pub fn handle_cdata_open(&mut self, action: CdataAction) { +        machine::handle_cdata_open(self, action); +    } +} + +/// Used by [`Tokenizer::handle_cdata_open`] to determine how to process `<![CDATA[` +/// +/// (Since as per the spec this depends on the _adjusted current node_). +pub enum CdataAction { +    /// Process it as CDATA. +    Cdata, +    /// Process it as a bogus comment. +    BogusComment, +} + +/// An event yielded by the [`Iterator`] implementation for the [`Tokenizer`]. +#[derive(Debug)] +pub enum Event<T> { +    /// A token emitted by the [`Emitter`]. +    Token(T), +    /// The state machine encountered `<![CDATA[`. You must call [`Tokenizer::handle_cdata_open`], +    /// before advancing the tokenizer iterator again. +    CdataOpen,  }  /// The states you can set the tokenizer to. @@ -307,12 +339,12 @@ where      R: Reader + Position<O>,      E: Emitter<O>,  { -    type Item = Result<E::Token, R::Error>; +    type Item = Result<Event<E::Token>, R::Error>;      fn next(&mut self) -> Option<Self::Item> {          loop {              if let Some(token) = self.emitter.pop_token() { -                return Some(Ok(token)); +                return Some(Ok(Event::Token(token)));              }              if self.eof { @@ -326,6 +358,7 @@ where                      self.eof = true;                      self.emitter.emit_eof();                  } +                Ok(ControlToken::CdataOpen) => return Some(Ok(Event::CdataOpen)),              }          }      } | 
