-rw-r--r--  CHANGELOG.md                              |  6
-rw-r--r--  examples/tokenize.rs                      | 11
-rw-r--r--  integration_tests/tests/test_html5lib.rs  | 12
-rw-r--r--  src/default_emitter.rs                    | 11
-rw-r--r--  src/naive_parser.rs                       |  8
-rw-r--r--  src/token.rs                              | 11
-rw-r--r--  src/tokenizer.rs                          |  5
-rw-r--r--  tests/test_spans.rs                       | 46
8 files changed, 53 insertions, 57 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 961665c..c4acbb2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,12 @@
#### Breaking changes
+* `Token` enum
+
+  * Removed the `Error` variant.
+    (Errors now have to be queried separately with
+    `DefaultEmitter::drain_errors`.)
+
* `Emitter` trait
* Removed `pop_token` method and `Token` associated type.
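For crate consumers, the migration looks roughly like this (a minimal sketch, not part of this commit, assuming `NaiveParser::new` wires up a `DefaultEmitter` so that the `emitter_mut` and `drain_errors` APIs added below are available):

    use html5tokenizer::NaiveParser;

    fn main() {
        let html = "<svg><![CDATA[I love SVG]]>";
        let mut parser = NaiveParser::new(html);
        // The token stream no longer contains Token::Error items, so it can
        // be consumed without filtering...
        for token in parser.by_ref().flatten() {
            println!("{token:?}");
        }
        // ...and the errors are queried separately, after the fact.
        for (error, _span) in parser.emitter_mut().drain_errors() {
            eprintln!("error: {error:?}");
        }
    }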
diff --git a/examples/tokenize.rs b/examples/tokenize.rs
index da99dd3..f8859e4 100644
--- a/examples/tokenize.rs
+++ b/examples/tokenize.rs
@@ -5,12 +5,15 @@ use html5tokenizer::{DefaultEmitter, Tokenizer};
use std::io::BufReader;
fn main() {
- for token in Tokenizer::new(
+ let mut tokenizer = Tokenizer::new(
BufReader::new(std::io::stdin().lock()),
DefaultEmitter::default(),
- )
- .flatten()
- {
+ );
+ while let Some(token) = tokenizer.next() {
+ for (error, _) in tokenizer.emitter_mut().drain_errors() {
+ eprintln!("error: {:?}", error);
+ }
+ let token = token.unwrap();
println!("{:?}", token);
}
}
diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs
index 0cf5868..2e404c5 100644
--- a/integration_tests/tests/test_html5lib.rs
+++ b/integration_tests/tests/test_html5lib.rs
@@ -115,7 +115,6 @@ fn run_test_inner<R: Reader>(
tokenizer.set_last_start_tag(last_start_tag);
}
- let mut actual_errors = Vec::new();
let mut actual_tokens = Vec::new();
while let Some(event) = tokenizer.next() {
@@ -128,9 +127,6 @@ fn run_test_inner<R: Reader>(
};
match token {
- Token::Error { error, .. } => actual_errors.push(TestError {
- code: error.code().to_string(),
- }),
Token::StartTag(tag) => actual_tokens.push(TestToken::StartTag {
name: tag.name,
attributes: tag
@@ -154,7 +150,13 @@ fn run_test_inner<R: Reader>(
assert_eq!(
Output {
- errors: actual_errors,
+ errors: tokenizer
+ .emitter_mut()
+ .drain_errors()
+ .map(|(e, _)| TestError {
+ code: e.code().to_string()
+ })
+ .collect(),
tokens: actual_tokens,
},
test.output,
diff --git a/src/default_emitter.rs b/src/default_emitter.rs
index a4c5a63..e89fa5e 100644
--- a/src/default_emitter.rs
+++ b/src/default_emitter.rs
@@ -17,6 +17,7 @@ pub struct DefaultEmitter<O = NoopOffset> {
current_attribute: Option<(String, crate::token::AttrInternal<O>)>,
seen_attributes: BTreeSet<String>,
emitted_tokens: VecDeque<Token<O>>,
+ errors: VecDeque<(Error, Range<O>)>,
attr_in_end_tag_span: Option<Range<O>>,
}
@@ -28,11 +29,19 @@ impl<O> Default for DefaultEmitter<O> {
current_attribute: None,
seen_attributes: BTreeSet::new(),
emitted_tokens: VecDeque::new(),
+ errors: VecDeque::new(),
attr_in_end_tag_span: None,
}
}
}
+impl<O> DefaultEmitter<O> {
+ /// Removes all encountered tokenizer errors and returns them as an iterator.
+ pub fn drain_errors(&mut self) -> impl Iterator<Item = (Error, Range<O>)> + '_ {
+ self.errors.drain(0..)
+ }
+}
+
impl<O> Iterator for DefaultEmitter<O> {
type Item = Token<O>;
@@ -43,7 +52,7 @@ impl<O> Iterator for DefaultEmitter<O> {
impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
fn report_error(&mut self, error: Error, span: Range<O>) {
- self.emitted_tokens.push_front(Token::Error { error, span });
+ self.errors.push_back((error, span));
}
fn emit_eof(&mut self) {
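One property of the new API worth noting: `drain_errors` empties the internal `VecDeque`, so each reported error is observed exactly once. A hypothetical check (not in this commit; it assumes `<!doctype>` without a name is reported as a parse error, per WHATWG's missing-doctype-name):

    use html5tokenizer::NaiveParser;

    let mut parser = NaiveParser::new("<!doctype>");
    for _ in parser.by_ref() {} // drive the tokenizer to end of input
    assert!(parser.emitter_mut().drain_errors().next().is_some());
    // The first drain removed the queued errors, so a second one is empty.
    assert_eq!(parser.emitter_mut().drain_errors().count(), 0);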
diff --git a/src/naive_parser.rs b/src/naive_parser.rs
index 10eb98d..5bf002b 100644
--- a/src/naive_parser.rs
+++ b/src/naive_parser.rs
@@ -14,11 +14,10 @@ use crate::{Emitter, Event, State, Tokenizer};
/// * it naively emits any CDATA sections as bogus comments, for example:
///
/// ```
-/// # use html5tokenizer::{Error, NaiveParser, Tokenizer, Token};
+/// # use html5tokenizer::{NaiveParser, Token};
/// let html = "<svg><![CDATA[I love SVG]]>";
/// let mut tokens = NaiveParser::new(html).flatten();
/// assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "svg"));
-/// assert!(matches!(tokens.next().unwrap(), Token::Error {error: Error::CdataInHtmlContent, ..}));
/// assert!(matches!(tokens.next().unwrap(), Token::Comment(_bogus_comment)));
/// ```
///
@@ -59,6 +58,11 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> NaiveParser<R, O, E> {
tokenizer.naively_switch_state = true;
NaiveParser { tokenizer }
}
+
+ /// Returns a mutable reference to the emitter.
+ pub fn emitter_mut(&mut self) -> &mut E {
+ self.tokenizer.emitter_mut()
+ }
}
impl<R, O, E> Iterator for NaiveParser<R, O, E>
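`emitter_mut` also allows interleaving error reporting with token handling, mirroring what the updated examples/tokenize.rs does for `Tokenizer` (a sketch, not part of this commit; per the doc comment above, the CDATA section is reported as `Error::CdataInHtmlContent`):

    use html5tokenizer::NaiveParser;

    let mut parser = NaiveParser::new("<svg><![CDATA[I love SVG]]>");
    while let Some(token) = parser.next() {
        // Print any errors reported so far before handling the token.
        for (error, _span) in parser.emitter_mut().drain_errors() {
            eprintln!("error: {error:?}");
        }
        println!("{:?}", token.unwrap());
    }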
diff --git a/src/token.rs b/src/token.rs
index 48c90f7..c599cd5 100644
--- a/src/token.rs
+++ b/src/token.rs
@@ -5,7 +5,6 @@ use std::iter::FromIterator;
use std::ops::{Index, Range};
use crate::offset::Offset;
-use crate::Error;
/// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer.
#[derive(Debug, Eq, PartialEq)]
@@ -20,16 +19,6 @@ pub enum Token<O> {
Comment(Comment<O>),
/// An HTML doctype declaration.
Doctype(Doctype<O>),
- /// An HTML parsing error.
- ///
- /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with
- /// more tokens afterward.
- Error {
- /// What kind of error occurred.
- error: Error,
- /// The source code span of the error.
- span: Range<O>,
- },
}
/// An HTML start tag, such as `<p>` or `<a>`.
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 7e1e85f..270d3d0 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -111,6 +111,11 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
pub fn handle_cdata_open(&mut self, action: CdataAction) {
machine::handle_cdata_open(self, action);
}
+
+ /// Returns a mutable reference to the emitter.
+ pub fn emitter_mut(&mut self) -> &mut E {
+ &mut self.emitter
+ }
}
/// Used by [`Tokenizer::handle_cdata_open`] to determine how to process `<![CDATA[`
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
index f2cdc5f..64cc250 100644
--- a/tests/test_spans.rs
+++ b/tests/test_spans.rs
@@ -221,12 +221,7 @@ fn comment_data_span() {
let mut annotated = String::new();
for case in cases {
let labeler = |parser: Parser| {
- let Token::Comment(comment) = parser
- .flatten()
- .filter(|t| !matches!(t, Token::Error { .. }))
- .next()
- .unwrap()
- else {
+ let Token::Comment(comment) = parser.flatten().next().unwrap() else {
panic!("expected comment");
};
vec![(comment.data_span(), "")]
@@ -265,12 +260,7 @@ fn comment_data_span() {
"###);
for (idx, case) in cases.iter().enumerate() {
- let Token::Comment(comment) = parser(*case)
- .flatten()
- .filter(|t| !matches!(t, Token::Error { .. }))
- .next()
- .unwrap()
- else {
+ let Token::Comment(comment) = parser(*case).flatten().next().unwrap() else {
panic!("expected comment");
};
assert_eq!(case[comment.data_span()], comment.data, "case {idx}");
@@ -287,12 +277,7 @@ fn doctype_span() {
let mut annotated = String::new();
for case in cases {
let labeler = |parser: Parser| {
- let Token::Doctype(doctype) = parser
- .flatten()
- .filter(|t| !matches!(t, Token::Error { .. }))
- .next()
- .unwrap()
- else {
+ let Token::Doctype(doctype) = parser.flatten().next().unwrap() else {
panic!("expected doctype");
};
vec![(doctype.span, "")]
@@ -316,12 +301,7 @@ fn doctype_id_spans() {
let mut annotated = String::new();
for case in cases {
let labeler = |parser: Parser| {
- let Token::Doctype(doctype) = parser
- .flatten()
- .filter(|t| !matches!(t, Token::Error { .. }))
- .next()
- .unwrap()
- else {
+ let Token::Doctype(doctype) = parser.flatten().next().unwrap() else {
panic!("expected doctype");
};
@@ -351,10 +331,11 @@ fn doctype_id_spans() {
}
fn annotate_errors(html: &'static str) -> String {
- for token in parser(html).flatten() {
- let Token::Error { span, .. } = token else {
- continue;
- };
+ let mut parser = parser(html);
+ for _ in parser.by_ref() {}
+ let errors: Vec<_> = parser.emitter_mut().drain_errors().collect();
+
+ for (_, span) in errors {
if span.start == span.end {
if span.start != html.len() {
panic!("empty error spans are only allowed at the very end of the source (for eof errors)");
@@ -365,13 +346,10 @@ fn annotate_errors(html: &'static str) -> String {
}
}
- let labeler = |parser: Parser| {
+ let labeler = |mut parser: Parser| {
let mut labels = Vec::new();
- for token in parser.flatten() {
- let Token::Error { error, span } = token else {
- continue;
- };
-
+ for _ in parser.by_ref() {}
+ for (error, span) in parser.emitter_mut().drain_errors() {
labels.push((span, error.code()));
}
labels