diff options
| -rw-r--r-- | src/emitter.rs | 24 | ||||
| -rw-r--r-- | src/machine.rs | 21 | 
2 files changed, 35 insertions, 10 deletions
| diff --git a/src/emitter.rs b/src/emitter.rs index ed8d9e9..30e1d17 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -156,6 +156,13 @@ pub trait Emitter<O> {      ///      /// If the current token is not a doctype, this method may panic.      fn push_doctype_system_id(&mut self, s: &str); + +    /// Returns true if there is an _adjusted current node_ and it is not an element in the HTML namespace. +    /// +    /// See the third list item under [Markup declaration open state]. +    /// +    /// [Markup declaration open state]: https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state +    fn adjusted_current_node_present_and_not_in_html_namespace(&mut self) -> bool;  }  /// The default implementation of [`Emitter`], used to produce tokens. @@ -174,6 +181,19 @@ pub trait Emitter<O> {  ///   assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "b"));  ///   ```  /// +/// * The DefaultEmitter implements [`Emitter::adjusted_current_node_present_and_not_in_html_namespace`] +///   by returning false, which results in all CDATA sections being tokenized as bogus comments. +/// +///   ``` +///   # use html5tokenizer::{DefaultEmitter, Tokenizer, Token}; +///   let emitter = DefaultEmitter::default(); +///   let html = "<svg><![CDATA[I love SVG]]>"; +///   let mut tokens = Tokenizer::new(html, emitter).flatten(); +///   assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "svg")); +///   assert!(matches!(tokens.next().unwrap(), Token::Error {..})); +///   assert!(matches!(tokens.next().unwrap(), Token::Comment(_bogus_comment))); +///   ``` +///  /// [`Tokenizer::set_state`]: crate::Tokenizer::set_state  pub struct DefaultEmitter<O = NoopOffset> {      current_characters: String, @@ -447,6 +467,10 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {              debug_assert!(false);          }      } + +    fn adjusted_current_node_present_and_not_in_html_namespace(&mut self) -> bool { +        false +    }  }  /// An HTML start tag, such as `<p>` or `<a>`. diff --git a/src/machine.rs b/src/machine.rs index 0d99ab8..007f22f 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -939,17 +939,18 @@ where                  Ok(ControlToken::Continue)              }              Some('[') if slf.try_read_string("CDATA[", true)? => { -                // missing: check for adjusted current element: we don't have an element stack -                // at all -                // -                // missing: cdata transition -                // -                // let's hope that bogus comment can just sort of skip over cdata -                slf.emit_error(Error::CdataInHtmlContent); +                if slf +                    .emitter +                    .adjusted_current_node_present_and_not_in_html_namespace() +                { +                    slf.state = State::CdataSection; +                } else { +                    slf.emit_error(Error::CdataInHtmlContent); -                slf.emitter.init_comment(slf.reader.position()); -                slf.emitter.push_comment("[CDATA["); -                slf.state = State::BogusComment; +                    slf.emitter.init_comment(slf.reader.position()); +                    slf.emitter.push_comment("[CDATA["); +                    slf.state = State::BogusComment; +                }                  Ok(ControlToken::Continue)              }              c => { | 
