summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/emitter.rs35
-rw-r--r--src/lib.rs2
-rw-r--r--src/machine.rs37
-rw-r--r--src/naive_parser.rs21
-rw-r--r--src/tokenizer.rs37
5 files changed, 80 insertions, 52 deletions
diff --git a/src/emitter.rs b/src/emitter.rs
index a5ecd55..dee0aa0 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -156,13 +156,6 @@ pub trait Emitter<O> {
///
/// If the current token is not a doctype, this method may panic.
fn push_doctype_system_id(&mut self, s: &str);
-
- /// Returns true if there is an _adjusted current node_ and it is not an element in the HTML namespace.
- ///
- /// See the third list item under [Markup declaration open state].
- ///
- /// [Markup declaration open state]: https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
- fn adjusted_current_node_present_and_not_in_html_namespace(&mut self) -> bool;
}
/// The DefaultEmitter is not exposed in the public API because:
@@ -172,26 +165,12 @@ pub trait Emitter<O> {
///
#[cfg_attr(not(feature = "integration-tests"), doc = "```ignore")]
#[cfg_attr(feature = "integration-tests", doc = "```")]
-/// # use html5tokenizer::{DefaultEmitter, Tokenizer, Token};
+/// # use html5tokenizer::{DefaultEmitter, Event, Tokenizer, Token};
/// let emitter = DefaultEmitter::default();
/// let html = "<script><b>";
/// let mut tokens = Tokenizer::new(html, emitter).flatten();
-/// assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "script"));
-/// assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "b"));
-/// ```
-///
-/// * The DefaultEmitter implements [`Emitter::adjusted_current_node_present_and_not_in_html_namespace`]
-/// by returning false, which results in all CDATA sections being tokenized as bogus comments.
-///
-#[cfg_attr(not(feature = "integration-tests"), doc = "```ignore")]
-#[cfg_attr(feature = "integration-tests", doc = "```")]
-/// # use html5tokenizer::{DefaultEmitter, Tokenizer, Token};
-/// let emitter = DefaultEmitter::default();
-/// let html = "<svg><![CDATA[I love SVG]]>";
-/// let mut tokens = Tokenizer::new(html, emitter).flatten();
-/// assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "svg"));
-/// assert!(matches!(tokens.next().unwrap(), Token::Error {..}));
-/// assert!(matches!(tokens.next().unwrap(), Token::Comment(_bogus_comment)));
+/// assert!(matches!(tokens.next().unwrap(), Event::Token(Token::StartTag(tag)) if tag.name == "script"));
+/// assert!(matches!(tokens.next().unwrap(), Event::Token(Token::StartTag(tag)) if tag.name == "b"));
/// ```
///
/// [`Tokenizer::set_state`]: crate::Tokenizer::set_state
@@ -467,10 +446,6 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
debug_assert!(false);
}
}
-
- fn adjusted_current_node_present_and_not_in_html_namespace(&mut self) -> bool {
- false
- }
}
/// An HTML start tag, such as `<p>` or `<a>`.
@@ -613,7 +588,7 @@ pub enum Token<O> {
#[cfg(test)]
mod tests {
use super::{DefaultEmitter, Token};
- use crate::{attr::AttrValueSyntax, Tokenizer};
+ use crate::{attr::AttrValueSyntax, Event, Tokenizer};
#[test]
fn test_attribute_value_syntax() {
@@ -622,7 +597,7 @@ mod tests {
DefaultEmitter::default(),
)
.flatten();
- let Token::StartTag(start_tag) = tokenizer.next().unwrap() else {
+ let Event::Token(Token::StartTag(start_tag)) = tokenizer.next().unwrap() else {
panic!("expected start tag");
};
assert_eq!(
diff --git a/src/lib.rs b/src/lib.rs
index 9dd878c..7bc17c3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -20,7 +20,7 @@ mod utils;
pub use emitter::{Comment, Doctype, Emitter, EndTag, StartTag, Token};
pub use error::Error;
pub use naive_parser::NaiveParser;
-pub use tokenizer::{State, Tokenizer};
+pub use tokenizer::{CdataAction, Event, State, Tokenizer};
#[cfg(feature = "integration-tests")]
pub use utils::State as InternalState;
diff --git a/src/machine.rs b/src/machine.rs
index c5bf021..509dae5 100644
--- a/src/machine.rs
+++ b/src/machine.rs
@@ -1,6 +1,7 @@
use crate::attr::AttrValueSyntax;
use crate::entities::try_read_character_reference;
use crate::offset::{Offset, Position};
+use crate::tokenizer::CdataAction;
use crate::utils::{
ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat, State,
};
@@ -9,6 +10,7 @@ use crate::{reader::Reader, Emitter, Error, Tokenizer};
pub enum ControlToken {
Eof,
Continue,
+ CdataOpen,
}
// Note: This is not implemented as a method on Tokenizer because there's fields on Tokenizer that
@@ -942,21 +944,7 @@ where
slf.doctype_offset = slf.reader.position() - b"<!doctype".len();
Ok(ControlToken::Continue)
}
- Some('[') if slf.try_read_string("CDATA[", true)? => {
- if slf
- .emitter
- .adjusted_current_node_present_and_not_in_html_namespace()
- {
- slf.state = State::CdataSection;
- } else {
- slf.emit_error(Error::CdataInHtmlContent);
-
- slf.emitter.init_comment(slf.reader.position());
- slf.emitter.push_comment("[CDATA[");
- slf.state = State::BogusComment;
- }
- Ok(ControlToken::Continue)
- }
+ Some('[') if slf.try_read_string("CDATA[", true)? => Ok(ControlToken::CdataOpen),
c => {
slf.emit_error(Error::IncorrectlyOpenedComment);
slf.emitter.init_comment(slf.reader.position() - 1);
@@ -1897,3 +1885,22 @@ where
}
}
}
+
+#[inline]
+pub fn handle_cdata_open<O, R, E>(slf: &mut Tokenizer<R, O, E>, action: CdataAction)
+where
+ O: Offset,
+ R: Reader + Position<O>,
+ E: Emitter<O>,
+{
+ match action {
+ CdataAction::Cdata => slf.state = State::CdataSection,
+ CdataAction::BogusComment => {
+ slf.emit_error(Error::CdataInHtmlContent);
+
+ slf.emitter.init_comment(slf.reader.position());
+ slf.emitter.push_comment("[CDATA[");
+ slf.state = State::BogusComment;
+ }
+ }
+}
diff --git a/src/naive_parser.rs b/src/naive_parser.rs
index 2626209..f126dfd 100644
--- a/src/naive_parser.rs
+++ b/src/naive_parser.rs
@@ -1,7 +1,8 @@
use crate::emitter::DefaultEmitter;
use crate::offset::{Offset, Position};
use crate::reader::{IntoReader, Reader};
-use crate::{Emitter, State, Tokenizer};
+use crate::tokenizer::CdataAction;
+use crate::{Emitter, Event, State, Tokenizer};
/// A naive HTML parser (**not** spec-compliant since it doesn't do tree construction).
///
@@ -53,9 +54,21 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Iterator for NaiveParser
type Item = Result<E::Token, R::Error>;
fn next(&mut self) -> Option<Self::Item> {
- // A proper parser would follow the steps described under section '13.2.6 Tree construction'
- // of the spec. Since this parser is naive, we directly return the token instead.
- self.tokenizer.next()
+ loop {
+ let event = self.tokenizer.next()?;
+ match event {
+ Err(e) => return Some(Err(e)),
+ Ok(Event::Token(t)) => {
+ // A proper parser would follow the steps described under section '13.2.6 Tree construction'
+ // of the spec. Since this parser is naive, we directly return the token instead.
+ return Some(Ok(t));
+ }
+ Ok(Event::CdataOpen) => {
+ // Naively parse any CDATA sections as bogus comments.
+ self.tokenizer.handle_cdata_open(CdataAction::BogusComment)
+ }
+ }
+ }
}
}
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 96d1c34..5b11db0 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -79,6 +79,38 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
naively_switch_state: false,
}
}
+
+ /// To be called when the tokenizer iterator implementation yields [`Event::CdataOpen`].
+ ///
+ /// For spec-compliant parsing, *action* must be [`CdataAction::Cdata`]
+ /// if there is an _adjusted current node_ and it is not an element in
+ /// the HTML namespace, or [`CdataAction::BogusComment`] otherwise
+ /// (as per the third condition under [Markup declaration open state]).
+ ///
+ /// [Markup declaration open state]: https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
+ pub fn handle_cdata_open(&mut self, action: CdataAction) {
+ machine::handle_cdata_open(self, action);
+ }
+}
+
+/// Used by [`Tokenizer::handle_cdata_open`] to determine how to process `<![CDATA[`
+///
+/// (since, as per the spec, this depends on the _adjusted current node_).
+pub enum CdataAction {
+ /// Process it as CDATA.
+ Cdata,
+ /// Process it as a bogus comment.
+ BogusComment,
+}
+
+/// An event yielded by the [`Iterator`] implementation for the [`Tokenizer`].
+#[derive(Debug)]
+pub enum Event<T> {
+ /// A token emitted by the [`Emitter`].
+ Token(T),
+ /// The state machine encountered `<![CDATA[`. You must call [`Tokenizer::handle_cdata_open`]
+ /// before advancing the tokenizer iterator again.
+ CdataOpen,
}
/// The states you can set the tokenizer to.
@@ -307,12 +339,12 @@ where
R: Reader + Position<O>,
E: Emitter<O>,
{
- type Item = Result<E::Token, R::Error>;
+ type Item = Result<Event<E::Token>, R::Error>;
fn next(&mut self) -> Option<Self::Item> {
loop {
if let Some(token) = self.emitter.pop_token() {
- return Some(Ok(token));
+ return Some(Ok(Event::Token(token)));
}
if self.eof {
@@ -326,6 +358,7 @@ where
self.eof = true;
self.emitter.emit_eof();
}
+ Ok(ControlToken::CdataOpen) => return Some(Ok(Event::CdataOpen)),
}
}
}