From 82898967320f90116bbc686ab7ffc2f61ff456c4 Mon Sep 17 00:00:00 2001
From: Martin Fischer <martin@push-f.com>
Date: Tue, 15 Aug 2023 11:26:08 +0200
Subject: fix!: add adjusted_current_node_present_and_not_in_html_namespace to
 Emitter

---
 src/emitter.rs | 24 ++++++++++++++++++++++++
 src/machine.rs | 21 +++++++++++----------
 2 files changed, 35 insertions(+), 10 deletions(-)

(limited to 'src')

diff --git a/src/emitter.rs b/src/emitter.rs
index ed8d9e9..30e1d17 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -156,6 +156,13 @@ pub trait Emitter<O> {
     ///
     /// If the current token is not a doctype, this method may panic.
     fn push_doctype_system_id(&mut self, s: &str);
+
+    /// Returns `true` if there is an _adjusted current node_ and it is not an element in the HTML namespace.
+    ///
+    /// See the third list item under [Markup declaration open state].
+    ///
+    /// [Markup declaration open state]: https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
+    fn adjusted_current_node_present_and_not_in_html_namespace(&mut self) -> bool;
 }
 
 /// The default implementation of [`Emitter`], used to produce tokens.
@@ -174,6 +181,19 @@ pub trait Emitter<O> {
 ///   assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "b"));
 ///   ```
 ///
+/// * The `DefaultEmitter` implements [`Emitter::adjusted_current_node_present_and_not_in_html_namespace`]
+///   by always returning `false`, so every CDATA section is tokenized as a parse error followed by a bogus comment.
+///
+///   ```
+///   # use html5tokenizer::{DefaultEmitter, Tokenizer, Token};
+///   let emitter = DefaultEmitter::default();
+///   let html = "<svg><![CDATA[I love SVG]]>";
+///   let mut tokens = Tokenizer::new(html, emitter).flatten();
+///   assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "svg"));
+///   assert!(matches!(tokens.next().unwrap(), Token::Error {..}));
+///   assert!(matches!(tokens.next().unwrap(), Token::Comment(_bogus_comment)));
+///   ```
+///
 /// [`Tokenizer::set_state`]: crate::Tokenizer::set_state
 pub struct DefaultEmitter<O = NoopOffset> {
     current_characters: String,
@@ -447,6 +467,10 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
             debug_assert!(false);
         }
     }
+
+    fn adjusted_current_node_present_and_not_in_html_namespace(&mut self) -> bool {
+        false
+    }
 }
 
 /// An HTML start tag, such as `<p>` or `<a>`.
diff --git a/src/machine.rs b/src/machine.rs
index 0d99ab8..007f22f 100644
--- a/src/machine.rs
+++ b/src/machine.rs
@@ -939,17 +939,18 @@ where
                 Ok(ControlToken::Continue)
             }
             Some('[') if slf.try_read_string("CDATA[", true)? => {
-                // missing: check for adjusted current element: we don't have an element stack
-                // at all
-                //
-                // missing: cdata transition
-                //
-                // let's hope that bogus comment can just sort of skip over cdata
-                slf.emit_error(Error::CdataInHtmlContent);
+                if slf
+                    .emitter
+                    .adjusted_current_node_present_and_not_in_html_namespace()
+                {
+                    slf.state = State::CdataSection;
+                } else {
+                    slf.emit_error(Error::CdataInHtmlContent);
 
-                slf.emitter.init_comment(slf.reader.position());
-                slf.emitter.push_comment("[CDATA[");
-                slf.state = State::BogusComment;
+                    slf.emitter.init_comment(slf.reader.position());
+                    slf.emitter.push_comment("[CDATA[");
+                    slf.state = State::BogusComment;
+                }
                 Ok(ControlToken::Continue)
             }
             c => {
-- 
cgit v1.2.3