5 files changed, 80 insertions, 52 deletions
diff --git a/src/emitter.rs b/src/emitter.rs
index a5ecd55..dee0aa0 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -156,13 +156,6 @@ pub trait Emitter<O> {
     ///
     /// If the current token is not a doctype, this method may panic.
     fn push_doctype_system_id(&mut self, s: &str);
-
-    /// Returns true if there is an _adjusted current node_ and it is not an element in the HTML namespace.
-    ///
-    /// See the third list item under [Markup declaration open state].
-    ///
-    /// [Markup declaration open state]: https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
-    fn adjusted_current_node_present_and_not_in_html_namespace(&mut self) -> bool;
 }
 
 /// The DefaultEmitter is not exposed in the public API because:
@@ -172,26 +165,12 @@ pub trait Emitter<O> {
 ///
 #[cfg_attr(not(feature = "integration-tests"), doc = "```ignore")]
 #[cfg_attr(feature = "integration-tests", doc = "```")]
-///   # use html5tokenizer::{DefaultEmitter, Tokenizer, Token};
+///   # use html5tokenizer::{DefaultEmitter, Event, Tokenizer, Token};
 ///   let emitter = DefaultEmitter::default();
 ///   let html = "<script><b>";
 ///   let mut tokens = Tokenizer::new(html, emitter).flatten();
-///   assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "script"));
-///   assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "b"));
-///   ```
-///
-/// * The DefaultEmitter implements [`Emitter::adjusted_current_node_present_and_not_in_html_namespace`]
-///   by returning false, which results in all CDATA sections being tokenized as bogus comments.
-///
-#[cfg_attr(not(feature = "integration-tests"), doc = "```ignore")]
-#[cfg_attr(feature = "integration-tests", doc = "```")]
-///   # use html5tokenizer::{DefaultEmitter, Tokenizer, Token};
-///   let emitter = DefaultEmitter::default();
-///   let html = "<svg><![CDATA[I love SVG]]>";
-///   let mut tokens = Tokenizer::new(html, emitter).flatten();
-///   assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "svg"));
-///   assert!(matches!(tokens.next().unwrap(), Token::Error {..}));
-///   assert!(matches!(tokens.next().unwrap(), Token::Comment(_bogus_comment)));
+///   assert!(matches!(tokens.next().unwrap(), Event::Token(Token::StartTag(tag)) if tag.name == "script"));
+///   assert!(matches!(tokens.next().unwrap(), Event::Token(Token::StartTag(tag)) if tag.name == "b"));
 ///   ```
 ///
 /// [`Tokenizer::set_state`]: crate::Tokenizer::set_state
@@ -467,10 +446,6 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
             debug_assert!(false);
         }
     }
-
-    fn adjusted_current_node_present_and_not_in_html_namespace(&mut self) -> bool {
-        false
-    }
 }
 
 /// An HTML start tag, such as `<p>` or `<a>`.
@@ -613,7 +588,7 @@ pub enum Token<O> {
 #[cfg(test)]
 mod tests {
     use super::{DefaultEmitter, Token};
-    use crate::{attr::AttrValueSyntax, Tokenizer};
+    use crate::{attr::AttrValueSyntax, Event, Tokenizer};
 
     #[test]
     fn test_attribute_value_syntax() {
@@ -622,7 +597,7 @@ mod tests {
             DefaultEmitter::default(),
         )
         .flatten();
-        let Token::StartTag(start_tag) = tokenizer.next().unwrap() else {
+        let Event::Token(Token::StartTag(start_tag)) = tokenizer.next().unwrap() else {
             panic!("expected start tag");
         };
         assert_eq!(
diff --git a/src/lib.rs b/src/lib.rs
index 9dd878c..7bc17c3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -20,7 +20,7 @@ mod utils;
 pub use emitter::{Comment, Doctype, Emitter, EndTag, StartTag, Token};
 pub use error::Error;
 pub use naive_parser::NaiveParser;
-pub use tokenizer::{State, Tokenizer};
+pub use tokenizer::{CdataAction, Event, State, Tokenizer};
 
 #[cfg(feature = "integration-tests")]
 pub use utils::State as InternalState;
diff --git a/src/machine.rs b/src/machine.rs
index c5bf021..509dae5 100644
--- a/src/machine.rs
+++ b/src/machine.rs
@@ -1,6 +1,7 @@
 use crate::attr::AttrValueSyntax;
 use crate::entities::try_read_character_reference;
 use crate::offset::{Offset, Position};
+use crate::tokenizer::CdataAction;
 use crate::utils::{
     ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat, State,
 };
@@ -9,6 +10,7 @@ use crate::{reader::Reader, Emitter, Error, Tokenizer};
 pub enum ControlToken {
     Eof,
     Continue,
+    CdataOpen,
 }
 
 // Note: This is not implemented as a method on Tokenizer because there's fields on Tokenizer that
@@ -942,21 +944,7 @@ where
                 slf.doctype_offset = slf.reader.position() - b"<!doctype".len();
                 Ok(ControlToken::Continue)
             }
-            Some('[') if slf.try_read_string("CDATA[", true)? => {
-                if slf
-                    .emitter
-                    .adjusted_current_node_present_and_not_in_html_namespace()
-                {
-                    slf.state = State::CdataSection;
-                } else {
-                    slf.emit_error(Error::CdataInHtmlContent);
-
-                    slf.emitter.init_comment(slf.reader.position());
-                    slf.emitter.push_comment("[CDATA[");
-                    slf.state = State::BogusComment;
-                }
-                Ok(ControlToken::Continue)
-            }
+            Some('[') if slf.try_read_string("CDATA[", true)? => Ok(ControlToken::CdataOpen),
             c => {
                 slf.emit_error(Error::IncorrectlyOpenedComment);
                 slf.emitter.init_comment(slf.reader.position() - 1);
@@ -1897,3 +1885,22 @@ where
         }
     }
 }
+
+#[inline]
+pub fn handle_cdata_open<O, R, E>(slf: &mut Tokenizer<R, O, E>, action: CdataAction)
+where
+    O: Offset,
+    R: Reader + Position<O>,
+    E: Emitter<O>,
+{
+    match action {
+        CdataAction::Cdata => slf.state = State::CdataSection,
+        CdataAction::BogusComment => {
+            slf.emit_error(Error::CdataInHtmlContent);
+
+            slf.emitter.init_comment(slf.reader.position());
+            slf.emitter.push_comment("[CDATA[");
+            slf.state = State::BogusComment;
+        }
+    }
+}
diff --git a/src/naive_parser.rs b/src/naive_parser.rs
index 2626209..f126dfd 100644
--- a/src/naive_parser.rs
+++ b/src/naive_parser.rs
@@ -1,7 +1,8 @@
 use crate::emitter::DefaultEmitter;
 use crate::offset::{Offset, Position};
 use crate::reader::{IntoReader, Reader};
-use crate::{Emitter, State, Tokenizer};
+use crate::tokenizer::CdataAction;
+use crate::{Emitter, Event, State, Tokenizer};
 
 /// A naive HTML parser (**not** spec-compliant since it doesn't do tree construction).
 ///
@@ -53,9 +54,21 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Iterator for NaiveParser
     type Item = Result<E::Token, R::Error>;
 
     fn next(&mut self) -> Option<Self::Item> {
-        // A proper parser would follow the steps described under section '13.2.6 Tree construction'
-        // of the spec. Since this parser is naive, we directly return the token instead.
-        self.tokenizer.next()
+        loop {
+            let event = self.tokenizer.next()?;
+            match event {
+                Err(e) => return Some(Err(e)),
+                Ok(Event::Token(t)) => {
+                    // A proper parser would follow the steps described under section '13.2.6 Tree construction'
+                    // of the spec. Since this parser is naive, we directly return the token instead.
+                    return Some(Ok(t));
+                }
+                Ok(Event::CdataOpen) => {
+                    // Naively parse any CDATA sections as bogus comments.
+                    self.tokenizer.handle_cdata_open(CdataAction::BogusComment)
+                }
+            }
+        }
     }
 }
 
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 96d1c34..5b11db0 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -79,6 +79,38 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
             naively_switch_state: false,
         }
     }
+
+    /// To be called when the tokenizer iterator implementation yields [`Event::CdataOpen`].
+    ///
+    /// For spec-compliant parsing, *action* must be [`CdataAction::Cdata`]
+    /// if there is an _adjusted current node_ and it is not an element in
+    /// the HTML namespace, and [`CdataAction::BogusComment`] otherwise
+    /// (as per the third condition under [Markup declaration open state]).
+    ///
+    /// [Markup declaration open state]: https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
+    pub fn handle_cdata_open(&mut self, action: CdataAction) {
+        machine::handle_cdata_open(self, action);
+    }
+}
+
+/// Used by [`Tokenizer::handle_cdata_open`] to determine how to process `<![CDATA[`.
+///
+/// (As per the spec, this depends on the _adjusted current node_.)
+pub enum CdataAction {
+    /// Process it as CDATA.
+    Cdata,
+    /// Process it as a bogus comment.
+    BogusComment,
+}
+
+/// An event yielded by the [`Iterator`] implementation for the [`Tokenizer`].
+#[derive(Debug)]
+pub enum Event<T> {
+    /// A token emitted by the [`Emitter`].
+    Token(T),
+    /// The state machine encountered `<![CDATA[`. You must call [`Tokenizer::handle_cdata_open`]
+    /// before advancing the tokenizer iterator again.
+    CdataOpen,
 }
 
 /// The states you can set the tokenizer to.
@@ -307,12 +339,12 @@ where
     R: Reader + Position<O>,
     E: Emitter<O>,
 {
-    type Item = Result<E::Token, R::Error>;
+    type Item = Result<Event<E::Token>, R::Error>;
 
     fn next(&mut self) -> Option<Self::Item> {
         loop {
             if let Some(token) = self.emitter.pop_token() {
-                return Some(Ok(token));
+                return Some(Ok(Event::Token(token)));
             }
 
             if self.eof {
@@ -326,6 +358,7 @@ where
                     self.eof = true;
                     self.emitter.emit_eof();
                 }
+                Ok(ControlToken::CdataOpen) => return Some(Ok(Event::CdataOpen)),
             }
         }
     }