From 826907487e2b593f1c54e98b59fe2f6eb8cb6937 Mon Sep 17 00:00:00 2001 From: Martin Fischer Date: Tue, 29 Aug 2023 13:09:44 +0200 Subject: break!: remove Token::Error An error isn't a token (in general and also according to the spec). You shouldn't have to filter out errors when you're just interested in tokens but most importantly having errors in the Token enum is annoying when implementing tree construction (since the spec conditions exhaustively cover all Token variants except Token::Error). --- src/default_emitter.rs | 11 ++++++++++- src/naive_parser.rs | 8 ++++++-- src/token.rs | 11 ----------- src/tokenizer.rs | 5 +++++ 4 files changed, 21 insertions(+), 14 deletions(-) (limited to 'src') diff --git a/src/default_emitter.rs b/src/default_emitter.rs index a4c5a63..e89fa5e 100644 --- a/src/default_emitter.rs +++ b/src/default_emitter.rs @@ -17,6 +17,7 @@ pub struct DefaultEmitter { current_attribute: Option<(String, crate::token::AttrInternal)>, seen_attributes: BTreeSet, emitted_tokens: VecDeque>, + errors: VecDeque<(Error, Range)>, attr_in_end_tag_span: Option>, } @@ -28,11 +29,19 @@ impl Default for DefaultEmitter { current_attribute: None, seen_attributes: BTreeSet::new(), emitted_tokens: VecDeque::new(), + errors: VecDeque::new(), attr_in_end_tag_span: None, } } } +impl DefaultEmitter { + /// Removes all encountered tokenizer errors and returns them as an iterator. + pub fn drain_errors(&mut self) -> impl Iterator)> + '_ { + self.errors.drain(0..) 
+ } +} + impl Iterator for DefaultEmitter { type Item = Token; @@ -43,7 +52,7 @@ impl Iterator for DefaultEmitter { impl Emitter for DefaultEmitter { fn report_error(&mut self, error: Error, span: Range) { - self.emitted_tokens.push_front(Token::Error { error, span }); + self.errors.push_back((error, span)); } fn emit_eof(&mut self) { diff --git a/src/naive_parser.rs b/src/naive_parser.rs index 10eb98d..5bf002b 100644 --- a/src/naive_parser.rs +++ b/src/naive_parser.rs @@ -14,11 +14,10 @@ use crate::{Emitter, Event, State, Tokenizer}; /// * it naively emits any CDATA sections as bogus comments, for example: /// /// ``` -/// # use html5tokenizer::{Error, NaiveParser, Tokenizer, Token}; +/// # use html5tokenizer::{NaiveParser, Token}; /// let html = "I love SVG"; /// let mut tokens = NaiveParser::new(html).flatten(); /// assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "svg")); -/// assert!(matches!(tokens.next().unwrap(), Token::Error {error: Error::CdataInHtmlContent, ..})); /// assert!(matches!(tokens.next().unwrap(), Token::Comment(_bogus_comment))); /// ``` /// @@ -59,6 +58,11 @@ impl, O: Offset, E: Emitter> NaiveParser { tokenizer.naively_switch_state = true; NaiveParser { tokenizer } } + + /// Returns a mutable reference to the emitter. + pub fn emitter_mut(&mut self) -> &mut E { + self.tokenizer.emitter_mut() + } } impl Iterator for NaiveParser diff --git a/src/token.rs b/src/token.rs index 48c90f7..c599cd5 100644 --- a/src/token.rs +++ b/src/token.rs @@ -5,7 +5,6 @@ use std::iter::FromIterator; use std::ops::{Index, Range}; use crate::offset::Offset; -use crate::Error; /// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer. #[derive(Debug, Eq, PartialEq)] @@ -20,16 +19,6 @@ pub enum Token { Comment(Comment), /// An HTML doctype declaration. Doctype(Doctype), - /// An HTML parsing error. 
- /// - /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with - /// more tokens afterward. - Error { - /// What kind of error occurred. - error: Error, - /// The source code span of the error. - span: Range, - }, } /// An HTML start tag, such as `<p>` or `<a>`. diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 7e1e85f..270d3d0 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -111,6 +111,11 @@ impl, O: Offset, E: Emitter> Tokenizer { pub fn handle_cdata_open(&mut self, action: CdataAction) { machine::handle_cdata_open(self, action); } + + /// Returns a mutable reference to the emitter. + pub fn emitter_mut(&mut self) -> &mut E { + &mut self.emitter + } } /// Used by [`Tokenizer::handle_cdata_open`] to determine how to process `