summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMartin Fischer <martin@push-f.com>2023-08-29 13:09:44 +0200
committerMartin Fischer <martin@push-f.com>2023-09-28 10:36:01 +0200
commit826907487e2b593f1c54e98b59fe2f6eb8cb6937 (patch)
treede48a91090a240033a6f02eb8e984da133b71025
parent2b4c52758c503b08d3299ad2d1ee369ad5f597f1 (diff)
break!: remove Token::Error
An error isn't a token (in general, and also according to the spec). You shouldn't have to filter out errors when you're just interested in tokens. Most importantly, having errors in the Token enum is annoying when implementing tree construction, since the spec conditions exhaustively cover all Token variants except Token::Error.
-rw-r--r--CHANGELOG.md6
-rw-r--r--examples/tokenize.rs11
-rw-r--r--integration_tests/tests/test_html5lib.rs12
-rw-r--r--src/default_emitter.rs11
-rw-r--r--src/naive_parser.rs8
-rw-r--r--src/token.rs11
-rw-r--r--src/tokenizer.rs5
-rw-r--r--tests/test_spans.rs46
8 files changed, 53 insertions, 57 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 961665c..c4acbb2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,12 @@
#### Breaking changes
+* `Token` enum
+
+ * Removed the `Error` variant.
+ (Errors now have to be queried separately with
+ `DefaultEmitter::drain_errors`.)
+
* `Emitter` trait
* Removed `pop_token` method and `Token` associated type.
diff --git a/examples/tokenize.rs b/examples/tokenize.rs
index da99dd3..f8859e4 100644
--- a/examples/tokenize.rs
+++ b/examples/tokenize.rs
@@ -5,12 +5,15 @@ use html5tokenizer::{DefaultEmitter, Tokenizer};
use std::io::BufReader;
fn main() {
- for token in Tokenizer::new(
+ let mut tokenizer = Tokenizer::new(
BufReader::new(std::io::stdin().lock()),
DefaultEmitter::default(),
- )
- .flatten()
- {
+ );
+ while let Some(token) = tokenizer.next() {
+ for (error, _) in tokenizer.emitter_mut().drain_errors() {
+ eprintln!("error: {:?}", error);
+ }
+ let token = token.unwrap();
println!("{:?}", token);
}
}
diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs
index 0cf5868..2e404c5 100644
--- a/integration_tests/tests/test_html5lib.rs
+++ b/integration_tests/tests/test_html5lib.rs
@@ -115,7 +115,6 @@ fn run_test_inner<R: Reader>(
tokenizer.set_last_start_tag(last_start_tag);
}
- let mut actual_errors = Vec::new();
let mut actual_tokens = Vec::new();
while let Some(event) = tokenizer.next() {
@@ -128,9 +127,6 @@ fn run_test_inner<R: Reader>(
};
match token {
- Token::Error { error, .. } => actual_errors.push(TestError {
- code: error.code().to_string(),
- }),
Token::StartTag(tag) => actual_tokens.push(TestToken::StartTag {
name: tag.name,
attributes: tag
@@ -154,7 +150,13 @@ fn run_test_inner<R: Reader>(
assert_eq!(
Output {
- errors: actual_errors,
+ errors: tokenizer
+ .emitter_mut()
+ .drain_errors()
+ .map(|(e, _)| TestError {
+ code: e.code().to_string()
+ })
+ .collect(),
tokens: actual_tokens,
},
test.output,
diff --git a/src/default_emitter.rs b/src/default_emitter.rs
index a4c5a63..e89fa5e 100644
--- a/src/default_emitter.rs
+++ b/src/default_emitter.rs
@@ -17,6 +17,7 @@ pub struct DefaultEmitter<O = NoopOffset> {
current_attribute: Option<(String, crate::token::AttrInternal<O>)>,
seen_attributes: BTreeSet<String>,
emitted_tokens: VecDeque<Token<O>>,
+ errors: VecDeque<(Error, Range<O>)>,
attr_in_end_tag_span: Option<Range<O>>,
}
@@ -28,11 +29,19 @@ impl<O> Default for DefaultEmitter<O> {
current_attribute: None,
seen_attributes: BTreeSet::new(),
emitted_tokens: VecDeque::new(),
+ errors: VecDeque::new(),
attr_in_end_tag_span: None,
}
}
}
+impl<O> DefaultEmitter<O> {
+ /// Removes all encountered tokenizer errors and returns them as an iterator.
+ pub fn drain_errors(&mut self) -> impl Iterator<Item = (Error, Range<O>)> + '_ {
+ self.errors.drain(0..)
+ }
+}
+
impl<O> Iterator for DefaultEmitter<O> {
type Item = Token<O>;
@@ -43,7 +52,7 @@ impl<O> Iterator for DefaultEmitter<O> {
impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
fn report_error(&mut self, error: Error, span: Range<O>) {
- self.emitted_tokens.push_front(Token::Error { error, span });
+ self.errors.push_back((error, span));
}
fn emit_eof(&mut self) {
diff --git a/src/naive_parser.rs b/src/naive_parser.rs
index 10eb98d..5bf002b 100644
--- a/src/naive_parser.rs
+++ b/src/naive_parser.rs
@@ -14,11 +14,10 @@ use crate::{Emitter, Event, State, Tokenizer};
/// * it naively emits any CDATA sections as bogus comments, for example:
///
/// ```
-/// # use html5tokenizer::{Error, NaiveParser, Tokenizer, Token};
+/// # use html5tokenizer::{NaiveParser, Token};
/// let html = "<svg><![CDATA[I love SVG]]>";
/// let mut tokens = NaiveParser::new(html).flatten();
/// assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "svg"));
-/// assert!(matches!(tokens.next().unwrap(), Token::Error {error: Error::CdataInHtmlContent, ..}));
/// assert!(matches!(tokens.next().unwrap(), Token::Comment(_bogus_comment)));
/// ```
///
@@ -59,6 +58,11 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> NaiveParser<R, O, E> {
tokenizer.naively_switch_state = true;
NaiveParser { tokenizer }
}
+
+ /// Returns a mutable reference to the emitter.
+ pub fn emitter_mut(&mut self) -> &mut E {
+ self.tokenizer.emitter_mut()
+ }
}
impl<R, O, E> Iterator for NaiveParser<R, O, E>
diff --git a/src/token.rs b/src/token.rs
index 48c90f7..c599cd5 100644
--- a/src/token.rs
+++ b/src/token.rs
@@ -5,7 +5,6 @@ use std::iter::FromIterator;
use std::ops::{Index, Range};
use crate::offset::Offset;
-use crate::Error;
/// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer.
#[derive(Debug, Eq, PartialEq)]
@@ -20,16 +19,6 @@ pub enum Token<O> {
Comment(Comment<O>),
/// An HTML doctype declaration.
Doctype(Doctype<O>),
- /// An HTML parsing error.
- ///
- /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with
- /// more tokens afterward.
- Error {
- /// What kind of error occurred.
- error: Error,
- /// The source code span of the error.
- span: Range<O>,
- },
}
/// An HTML start tag, such as `<p>` or `<a>`.
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 7e1e85f..270d3d0 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -111,6 +111,11 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
pub fn handle_cdata_open(&mut self, action: CdataAction) {
machine::handle_cdata_open(self, action);
}
+
+ /// Returns a mutable reference to the emitter.
+ pub fn emitter_mut(&mut self) -> &mut E {
+ &mut self.emitter
+ }
}
/// Used by [`Tokenizer::handle_cdata_open`] to determine how to process `<![CDATA[`
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
index f2cdc5f..64cc250 100644
--- a/tests/test_spans.rs
+++ b/tests/test_spans.rs
@@ -221,12 +221,7 @@ fn comment_data_span() {
let mut annotated = String::new();
for case in cases {
let labeler = |parser: Parser| {
- let Token::Comment(comment) = parser
- .flatten()
- .filter(|t| !matches!(t, Token::Error { .. }))
- .next()
- .unwrap()
- else {
+ let Token::Comment(comment) = parser.flatten().next().unwrap() else {
panic!("expected comment");
};
vec![(comment.data_span(), "")]
@@ -265,12 +260,7 @@ fn comment_data_span() {
"###);
for (idx, case) in cases.iter().enumerate() {
- let Token::Comment(comment) = parser(*case)
- .flatten()
- .filter(|t| !matches!(t, Token::Error { .. }))
- .next()
- .unwrap()
- else {
+ let Token::Comment(comment) = parser(*case).flatten().next().unwrap() else {
panic!("expected comment");
};
assert_eq!(case[comment.data_span()], comment.data, "case {idx}");
@@ -287,12 +277,7 @@ fn doctype_span() {
let mut annotated = String::new();
for case in cases {
let labeler = |parser: Parser| {
- let Token::Doctype(doctype) = parser
- .flatten()
- .filter(|t| !matches!(t, Token::Error { .. }))
- .next()
- .unwrap()
- else {
+ let Token::Doctype(doctype) = parser.flatten().next().unwrap() else {
panic!("expected doctype");
};
vec![(doctype.span, "")]
@@ -316,12 +301,7 @@ fn doctype_id_spans() {
let mut annotated = String::new();
for case in cases {
let labeler = |parser: Parser| {
- let Token::Doctype(doctype) = parser
- .flatten()
- .filter(|t| !matches!(t, Token::Error { .. }))
- .next()
- .unwrap()
- else {
+ let Token::Doctype(doctype) = parser.flatten().next().unwrap() else {
panic!("expected doctype");
};
@@ -351,10 +331,11 @@ fn doctype_id_spans() {
}
fn annotate_errors(html: &'static str) -> String {
- for token in parser(html).flatten() {
- let Token::Error { span, .. } = token else {
- continue;
- };
+ let mut parser = parser(html);
+ for _ in parser.by_ref() {}
+ let errors: Vec<_> = parser.emitter_mut().drain_errors().collect();
+
+ for (_, span) in errors {
if span.start == span.end {
if span.start != html.len() {
panic!("empty error spans are only allowed at the very end of the source (for eof errors)");
@@ -365,13 +346,10 @@ fn annotate_errors(html: &'static str) -> String {
}
}
- let labeler = |parser: Parser| {
+ let labeler = |mut parser: Parser| {
let mut labels = Vec::new();
- for token in parser.flatten() {
- let Token::Error { error, span } = token else {
- continue;
- };
-
+ for _ in parser.by_ref() {}
+ for (error, span) in parser.emitter_mut().drain_errors() {
labels.push((span, error.code()));
}
labels