diff options
author | Martin Fischer <martin@push-f.com> | 2023-08-29 13:09:44 +0200 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2023-09-28 10:36:01 +0200 |
commit | 826907487e2b593f1c54e98b59fe2f6eb8cb6937 (patch) | |
tree | de48a91090a240033a6f02eb8e984da133b71025 /src | |
parent | 2b4c52758c503b08d3299ad2d1ee369ad5f597f1 (diff) |
break!: remove Token::Error
An error isn't a token (in general and also according to the spec).
You shouldn't have to filter out errors when you're just interested
in tokens, but most importantly, having errors in the Token enum is
annoying when implementing tree construction (since the spec conditions
exhaustively cover all Token variants except Token::Error).
Diffstat (limited to 'src')
-rw-r--r-- | src/default_emitter.rs | 11 | ||||
-rw-r--r-- | src/naive_parser.rs | 8 | ||||
-rw-r--r-- | src/token.rs | 11 | ||||
-rw-r--r-- | src/tokenizer.rs | 5 |
4 files changed, 21 insertions, 14 deletions
diff --git a/src/default_emitter.rs b/src/default_emitter.rs index a4c5a63..e89fa5e 100644 --- a/src/default_emitter.rs +++ b/src/default_emitter.rs @@ -17,6 +17,7 @@ pub struct DefaultEmitter<O = NoopOffset> { current_attribute: Option<(String, crate::token::AttrInternal<O>)>, seen_attributes: BTreeSet<String>, emitted_tokens: VecDeque<Token<O>>, + errors: VecDeque<(Error, Range<O>)>, attr_in_end_tag_span: Option<Range<O>>, } @@ -28,11 +29,19 @@ impl<O> Default for DefaultEmitter<O> { current_attribute: None, seen_attributes: BTreeSet::new(), emitted_tokens: VecDeque::new(), + errors: VecDeque::new(), attr_in_end_tag_span: None, } } } +impl<O> DefaultEmitter<O> { + /// Removes all encountered tokenizer errors and returns them as an iterator. + pub fn drain_errors(&mut self) -> impl Iterator<Item = (Error, Range<O>)> + '_ { + self.errors.drain(0..) + } +} + impl<O> Iterator for DefaultEmitter<O> { type Item = Token<O>; @@ -43,7 +52,7 @@ impl<O> Iterator for DefaultEmitter<O> { impl<O: Offset> Emitter<O> for DefaultEmitter<O> { fn report_error(&mut self, error: Error, span: Range<O>) { - self.emitted_tokens.push_front(Token::Error { error, span }); + self.errors.push_back((error, span)); } fn emit_eof(&mut self) { diff --git a/src/naive_parser.rs b/src/naive_parser.rs index 10eb98d..5bf002b 100644 --- a/src/naive_parser.rs +++ b/src/naive_parser.rs @@ -14,11 +14,10 @@ use crate::{Emitter, Event, State, Tokenizer}; /// * it naively emits any CDATA sections as bogus comments, for example: /// /// ``` -/// # use html5tokenizer::{Error, NaiveParser, Tokenizer, Token}; +/// # use html5tokenizer::{NaiveParser, Token}; /// let html = "<svg><![CDATA[I love SVG]]>"; /// let mut tokens = NaiveParser::new(html).flatten(); /// assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "svg")); -/// assert!(matches!(tokens.next().unwrap(), Token::Error {error: Error::CdataInHtmlContent, ..})); /// assert!(matches!(tokens.next().unwrap(), 
Token::Comment(_bogus_comment))); /// ``` /// @@ -59,6 +58,11 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> NaiveParser<R, O, E> { tokenizer.naively_switch_state = true; NaiveParser { tokenizer } } + + /// Returns a mutable reference to the emitter. + pub fn emitter_mut(&mut self) -> &mut E { + self.tokenizer.emitter_mut() + } } impl<R, O, E> Iterator for NaiveParser<R, O, E> diff --git a/src/token.rs b/src/token.rs index 48c90f7..c599cd5 100644 --- a/src/token.rs +++ b/src/token.rs @@ -5,7 +5,6 @@ use std::iter::FromIterator; use std::ops::{Index, Range}; use crate::offset::Offset; -use crate::Error; /// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer. #[derive(Debug, Eq, PartialEq)] @@ -20,16 +19,6 @@ pub enum Token<O> { Comment(Comment<O>), /// An HTML doctype declaration. Doctype(Doctype<O>), - /// An HTML parsing error. - /// - /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with - /// more tokens afterward. - Error { - /// What kind of error occurred. - error: Error, - /// The source code span of the error. - span: Range<O>, - }, } /// An HTML start tag, such as `<p>` or `<a>`. diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 7e1e85f..270d3d0 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -111,6 +111,11 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { pub fn handle_cdata_open(&mut self, action: CdataAction) { machine::handle_cdata_open(self, action); } + + /// Returns a mutable reference to the emitter. + pub fn emitter_mut(&mut self) -> &mut E { + &mut self.emitter + } } /// Used by [`Tokenizer::handle_cdata_open`] to determine how to process `<![CDATA[` |