about summary refs log tree commit diff
diff options
context:
space:
mode:
author Martin Fischer <martin@push-f.com> 2023-08-15 11:26:08 +0200
committer Martin Fischer <martin@push-f.com> 2023-08-19 13:41:55 +0200
commit 82898967320f90116bbc686ab7ffc2f61ff456c4 (patch)
tree 36c37d200945e20b331d271576e3255cfcc48d16
parent a83e64e81de66ff40cc8a6293f0b5650d431689c (diff)
fix!: add adjusted_current_node_present_and_not_in_html_namespace to Emitter
-rw-r--r-- src/emitter.rs 24
-rw-r--r-- src/machine.rs 21
2 files changed, 35 insertions, 10 deletions
diff --git a/src/emitter.rs b/src/emitter.rs
index ed8d9e9..30e1d17 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -156,6 +156,13 @@ pub trait Emitter<O> {
///
/// If the current token is not a doctype, this method may panic.
fn push_doctype_system_id(&mut self, s: &str);
+
+ /// Returns true if there is an _adjusted current node_ and it is not an element in the HTML namespace.
+ ///
+ /// See the third list item under [Markup declaration open state].
+ ///
+ /// [Markup declaration open state]: https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
+ fn adjusted_current_node_present_and_not_in_html_namespace(&mut self) -> bool;
}
/// The default implementation of [`Emitter`], used to produce tokens.
@@ -174,6 +181,19 @@ pub trait Emitter<O> {
/// assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "b"));
/// ```
///
+/// * The DefaultEmitter implements [`Emitter::adjusted_current_node_present_and_not_in_html_namespace`]
+/// by returning false, which results in all CDATA sections being tokenized as bogus comments.
+///
+/// ```
+/// # use html5tokenizer::{DefaultEmitter, Tokenizer, Token};
+/// let emitter = DefaultEmitter::default();
+/// let html = "<svg><![CDATA[I love SVG]]>";
+/// let mut tokens = Tokenizer::new(html, emitter).flatten();
+/// assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "svg"));
+/// assert!(matches!(tokens.next().unwrap(), Token::Error {..}));
+/// assert!(matches!(tokens.next().unwrap(), Token::Comment(_bogus_comment)));
+/// ```
+///
/// [`Tokenizer::set_state`]: crate::Tokenizer::set_state
pub struct DefaultEmitter<O = NoopOffset> {
current_characters: String,
@@ -447,6 +467,10 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
debug_assert!(false);
}
}
+
+ fn adjusted_current_node_present_and_not_in_html_namespace(&mut self) -> bool {
+ false
+ }
}
/// An HTML start tag, such as `<p>` or `<a>`.
diff --git a/src/machine.rs b/src/machine.rs
index 0d99ab8..007f22f 100644
--- a/src/machine.rs
+++ b/src/machine.rs
@@ -939,17 +939,18 @@ where
Ok(ControlToken::Continue)
}
Some('[') if slf.try_read_string("CDATA[", true)? => {
- // missing: check for adjusted current element: we don't have an element stack
- // at all
- //
- // missing: cdata transition
- //
- // let's hope that bogus comment can just sort of skip over cdata
- slf.emit_error(Error::CdataInHtmlContent);
+ if slf
+ .emitter
+ .adjusted_current_node_present_and_not_in_html_namespace()
+ {
+ slf.state = State::CdataSection;
+ } else {
+ slf.emit_error(Error::CdataInHtmlContent);
- slf.emitter.init_comment(slf.reader.position());
- slf.emitter.push_comment("[CDATA[");
- slf.state = State::BogusComment;
+ slf.emitter.init_comment(slf.reader.position());
+ slf.emitter.push_comment("[CDATA[");
+ slf.state = State::BogusComment;
+ }
Ok(ControlToken::Continue)
}
c => {