diff options
-rw-r--r-- | CHANGELOG.md | 6 | ||||
-rw-r--r-- | examples/tokenize.rs | 11 | ||||
-rw-r--r-- | integration_tests/tests/test_html5lib.rs | 12 | ||||
-rw-r--r-- | src/default_emitter.rs | 11 | ||||
-rw-r--r-- | src/naive_parser.rs | 8 | ||||
-rw-r--r-- | src/token.rs | 11 | ||||
-rw-r--r-- | src/tokenizer.rs | 5 | ||||
-rw-r--r-- | tests/test_spans.rs | 46 |
8 files changed, 53 insertions, 57 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index 961665c..c4acbb2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,12 @@ #### Breaking changes +* `Token` enum + + * Removed the `Error` variant. + (Errors now have to be queried separately with + `DefaultEmitter::drain_errors`.) + * `Emitter` trait * Removed `pop_token` method and `Token` associated type. diff --git a/examples/tokenize.rs b/examples/tokenize.rs index da99dd3..f8859e4 100644 --- a/examples/tokenize.rs +++ b/examples/tokenize.rs @@ -5,12 +5,15 @@ use html5tokenizer::{DefaultEmitter, Tokenizer}; use std::io::BufReader; fn main() { - for token in Tokenizer::new( + let mut tokenizer = Tokenizer::new( BufReader::new(std::io::stdin().lock()), DefaultEmitter::default(), - ) - .flatten() - { + ); + while let Some(token) = tokenizer.next() { + for (error, _) in tokenizer.emitter_mut().drain_errors() { + eprintln!("error: {:?}", error); + } + let token = token.unwrap(); println!("{:?}", token); } } diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs index 0cf5868..2e404c5 100644 --- a/integration_tests/tests/test_html5lib.rs +++ b/integration_tests/tests/test_html5lib.rs @@ -115,7 +115,6 @@ fn run_test_inner<R: Reader>( tokenizer.set_last_start_tag(last_start_tag); } - let mut actual_errors = Vec::new(); let mut actual_tokens = Vec::new(); while let Some(event) = tokenizer.next() { @@ -128,9 +127,6 @@ fn run_test_inner<R: Reader>( }; match token { - Token::Error { error, .. } => actual_errors.push(TestError { - code: error.code().to_string(), - }), Token::StartTag(tag) => actual_tokens.push(TestToken::StartTag { name: tag.name, attributes: tag @@ -154,7 +150,13 @@ fn run_test_inner<R: Reader>( assert_eq!( Output { - errors: actual_errors, + errors: tokenizer + .emitter_mut() + .drain_errors() + .map(|(e, _)| TestError { + code: e.code().to_string() + }) + .collect(), tokens: actual_tokens, }, test.output, diff --git a/src/default_emitter.rs b/src/default_emitter.rs index a4c5a63..e89fa5e 100644 --- a/src/default_emitter.rs +++ b/src/default_emitter.rs @@ -17,6 +17,7 @@ pub struct DefaultEmitter<O = NoopOffset> { current_attribute: Option<(String, crate::token::AttrInternal<O>)>, seen_attributes: BTreeSet<String>, emitted_tokens: VecDeque<Token<O>>, + errors: VecDeque<(Error, Range<O>)>, attr_in_end_tag_span: Option<Range<O>>, } @@ -28,11 +29,19 @@ impl<O> Default for DefaultEmitter<O> { current_attribute: None, seen_attributes: BTreeSet::new(), emitted_tokens: VecDeque::new(), + errors: VecDeque::new(), attr_in_end_tag_span: None, } } } +impl<O> DefaultEmitter<O> { + /// Removes all encountered tokenizer errors and returns them as an iterator. + pub fn drain_errors(&mut self) -> impl Iterator<Item = (Error, Range<O>)> + '_ { + self.errors.drain(0..) + } +} + impl<O> Iterator for DefaultEmitter<O> { type Item = Token<O>; @@ -43,7 +52,7 @@ impl<O> Iterator for DefaultEmitter<O> { impl<O: Offset> Emitter<O> for DefaultEmitter<O> { fn report_error(&mut self, error: Error, span: Range<O>) { - self.emitted_tokens.push_front(Token::Error { error, span }); + self.errors.push_back((error, span)); } fn emit_eof(&mut self) { diff --git a/src/naive_parser.rs b/src/naive_parser.rs index 10eb98d..5bf002b 100644 --- a/src/naive_parser.rs +++ b/src/naive_parser.rs @@ -14,11 +14,10 @@ use crate::{Emitter, Event, State, Tokenizer}; /// * it naively emits any CDATA sections as bogus comments, for example: /// /// ``` -/// # use html5tokenizer::{Error, NaiveParser, Tokenizer, Token}; +/// # use html5tokenizer::{NaiveParser, Token}; /// let html = "<svg><![CDATA[I love SVG]]>"; /// let mut tokens = NaiveParser::new(html).flatten(); /// assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "svg")); -/// assert!(matches!(tokens.next().unwrap(), Token::Error {error: Error::CdataInHtmlContent, ..})); /// assert!(matches!(tokens.next().unwrap(), Token::Comment(_bogus_comment))); /// ``` /// @@ -59,6 +58,11 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> NaiveParser<R, O, E> { tokenizer.naively_switch_state = true; NaiveParser { tokenizer } } + + /// Returns a mutable reference to the emitter. + pub fn emitter_mut(&mut self) -> &mut E { + self.tokenizer.emitter_mut() + } } impl<R, O, E> Iterator for NaiveParser<R, O, E> diff --git a/src/token.rs b/src/token.rs index 48c90f7..c599cd5 100644 --- a/src/token.rs +++ b/src/token.rs @@ -5,7 +5,6 @@ use std::iter::FromIterator; use std::ops::{Index, Range}; use crate::offset::Offset; -use crate::Error; /// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer. #[derive(Debug, Eq, PartialEq)] @@ -20,16 +19,6 @@ pub enum Token<O> { Comment(Comment<O>), /// An HTML doctype declaration. Doctype(Doctype<O>), - /// An HTML parsing error. - /// - /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with - /// more tokens afterward. - Error { - /// What kind of error occurred. - error: Error, - /// The source code span of the error. - span: Range<O>, - }, } /// An HTML start tag, such as `<p>` or `<a>`. diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 7e1e85f..270d3d0 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -111,6 +111,11 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { pub fn handle_cdata_open(&mut self, action: CdataAction) { machine::handle_cdata_open(self, action); } + + /// Returns a mutable reference to the emitter. + pub fn emitter_mut(&mut self) -> &mut E { + &mut self.emitter + } } /// Used by [`Tokenizer::handle_cdata_open`] to determine how to process `<![CDATA[` diff --git a/tests/test_spans.rs b/tests/test_spans.rs index f2cdc5f..64cc250 100644 --- a/tests/test_spans.rs +++ b/tests/test_spans.rs @@ -221,12 +221,7 @@ fn comment_data_span() { let mut annotated = String::new(); for case in cases { let labeler = |parser: Parser| { - let Token::Comment(comment) = parser - .flatten() - .filter(|t| !matches!(t, Token::Error { .. })) - .next() - .unwrap() - else { + let Token::Comment(comment) = parser.flatten().next().unwrap() else { panic!("expected comment"); }; vec![(comment.data_span(), "")] @@ -265,12 +260,7 @@ fn comment_data_span() { "###); for (idx, case) in cases.iter().enumerate() { - let Token::Comment(comment) = parser(*case) - .flatten() - .filter(|t| !matches!(t, Token::Error { .. })) - .next() - .unwrap() - else { + let Token::Comment(comment) = parser(*case).flatten().next().unwrap() else { panic!("expected comment"); }; assert_eq!(case[comment.data_span()], comment.data, "case {idx}"); @@ -287,12 +277,7 @@ fn doctype_span() { let mut annotated = String::new(); for case in cases { let labeler = |parser: Parser| { - let Token::Doctype(doctype) = parser - .flatten() - .filter(|t| !matches!(t, Token::Error { .. })) - .next() - .unwrap() - else { + let Token::Doctype(doctype) = parser.flatten().next().unwrap() else { panic!("expected doctype"); }; vec![(doctype.span, "")] @@ -316,12 +301,7 @@ fn doctype_id_spans() { let mut annotated = String::new(); for case in cases { let labeler = |parser: Parser| { - let Token::Doctype(doctype) = parser - .flatten() - .filter(|t| !matches!(t, Token::Error { .. })) - .next() - .unwrap() - else { + let Token::Doctype(doctype) = parser.flatten().next().unwrap() else { panic!("expected doctype"); }; @@ -351,10 +331,11 @@ fn doctype_id_spans() { } fn annotate_errors(html: &'static str) -> String { - for token in parser(html).flatten() { - let Token::Error { span, .. } = token else { - continue; - }; + let mut parser = parser(html); + for _ in parser.by_ref() {} + let errors: Vec<_> = parser.emitter_mut().drain_errors().collect(); + + for (_, span) in errors { if span.start == span.end { if span.start != html.len() { panic!("empty error spans are only allowed at the very end of the source (for eof errors)"); @@ -365,13 +346,10 @@ fn annotate_errors(html: &'static str) -> String { } } - let labeler = |parser: Parser| { + let labeler = |mut parser: Parser| { let mut labels = Vec::new(); - for token in parser.flatten() { - let Token::Error { error, span } = token else { - continue; - }; - + for _ in parser.by_ref() {} + for (error, span) in parser.emitter_mut().drain_errors() { labels.push((span, error.code())); } labels |