diff options
author | Martin Fischer <martin@push-f.com> | 2023-08-15 11:26:08 +0200 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2023-08-19 13:41:55 +0200 |
commit | 82898967320f90116bbc686ab7ffc2f61ff456c4 (patch) | |
tree | 36c37d200945e20b331d271576e3255cfcc48d16 /src | |
parent | a83e64e81de66ff40cc8a6293f0b5650d431689c (diff) |
fix!: add adjusted_current_node_present_and_not_in_html_namespace to Emitter
Diffstat (limited to 'src')
-rw-r--r-- | src/emitter.rs | 24 | ||||
-rw-r--r-- | src/machine.rs | 21 |
2 files changed, 35 insertions, 10 deletions
diff --git a/src/emitter.rs b/src/emitter.rs
index ed8d9e9..30e1d17 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -156,6 +156,13 @@ pub trait Emitter<O> {
     ///
     /// If the current token is not a doctype, this method may panic.
     fn push_doctype_system_id(&mut self, s: &str);
+
+    /// Returns true if there is an _adjusted current node_ and it is not an element in the HTML namespace.
+    ///
+    /// See the third list item under [Markup declaration open state].
+    ///
+    /// [Markup declaration open state]: https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
+    fn adjusted_current_node_present_and_not_in_html_namespace(&mut self) -> bool;
 }
 
 /// The default implementation of [`Emitter`], used to produce tokens.
@@ -174,6 +181,19 @@ pub trait Emitter<O> {
 /// assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "b"));
 /// ```
 ///
+/// * The DefaultEmitter implements [`Emitter::adjusted_current_node_present_and_not_in_html_namespace`]
+/// by returning false, which results in all CDATA sections being tokenized as bogus comments.
+///
+/// ```
+/// # use html5tokenizer::{DefaultEmitter, Tokenizer, Token};
+/// let emitter = DefaultEmitter::default();
+/// let html = "<svg><![CDATA[I love SVG]]>";
+/// let mut tokens = Tokenizer::new(html, emitter).flatten();
+/// assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "svg"));
+/// assert!(matches!(tokens.next().unwrap(), Token::Error {..}));
+/// assert!(matches!(tokens.next().unwrap(), Token::Comment(_bogus_comment)));
+/// ```
+///
 /// [`Tokenizer::set_state`]: crate::Tokenizer::set_state
 pub struct DefaultEmitter<O = NoopOffset> {
     current_characters: String,
@@ -447,6 +467,10 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
             debug_assert!(false);
         }
     }
+
+    fn adjusted_current_node_present_and_not_in_html_namespace(&mut self) -> bool {
+        false
+    }
 }
 
 /// An HTML start tag, such as `<p>` or `<a>`.
diff --git a/src/machine.rs b/src/machine.rs
index 0d99ab8..007f22f 100644
--- a/src/machine.rs
+++ b/src/machine.rs
@@ -939,17 +939,18 @@ where
                 Ok(ControlToken::Continue)
             }
             Some('[') if slf.try_read_string("CDATA[", true)? => {
-                // missing: check for adjusted current element: we don't have an element stack
-                // at all
-                //
-                // missing: cdata transition
-                //
-                // let's hope that bogus comment can just sort of skip over cdata
-                slf.emit_error(Error::CdataInHtmlContent);
+                if slf
+                    .emitter
+                    .adjusted_current_node_present_and_not_in_html_namespace()
+                {
+                    slf.state = State::CdataSection;
+                } else {
+                    slf.emit_error(Error::CdataInHtmlContent);
 
-                slf.emitter.init_comment(slf.reader.position());
-                slf.emitter.push_comment("[CDATA[");
-                slf.state = State::BogusComment;
+                    slf.emitter.init_comment(slf.reader.position());
+                    slf.emitter.push_comment("[CDATA[");
+                    slf.state = State::BogusComment;
+                }
                 Ok(ControlToken::Continue)
             }
             c => {