aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorMartin Fischer <martin@push-f.com>2023-08-29 13:09:44 +0200
committerMartin Fischer <martin@push-f.com>2023-09-28 10:36:01 +0200
commit826907487e2b593f1c54e98b59fe2f6eb8cb6937 (patch)
treede48a91090a240033a6f02eb8e984da133b71025 /src
parent2b4c52758c503b08d3299ad2d1ee369ad5f597f1 (diff)
break!: remove Token::Error
An error isn't a token (in general, and also according to the spec). You shouldn't have to filter out errors when you're just interested in tokens, but most importantly, having errors in the Token enum is annoying when implementing tree construction (since the spec conditions exhaustively cover all Token variants except Token::Error).
Diffstat (limited to 'src')
-rw-r--r--src/default_emitter.rs11
-rw-r--r--src/naive_parser.rs8
-rw-r--r--src/token.rs11
-rw-r--r--src/tokenizer.rs5
4 files changed, 21 insertions, 14 deletions
diff --git a/src/default_emitter.rs b/src/default_emitter.rs
index a4c5a63..e89fa5e 100644
--- a/src/default_emitter.rs
+++ b/src/default_emitter.rs
@@ -17,6 +17,7 @@ pub struct DefaultEmitter<O = NoopOffset> {
current_attribute: Option<(String, crate::token::AttrInternal<O>)>,
seen_attributes: BTreeSet<String>,
emitted_tokens: VecDeque<Token<O>>,
+ errors: VecDeque<(Error, Range<O>)>,
attr_in_end_tag_span: Option<Range<O>>,
}
@@ -28,11 +29,19 @@ impl<O> Default for DefaultEmitter<O> {
current_attribute: None,
seen_attributes: BTreeSet::new(),
emitted_tokens: VecDeque::new(),
+ errors: VecDeque::new(),
attr_in_end_tag_span: None,
}
}
}
+impl<O> DefaultEmitter<O> {
+ /// Removes all encountered tokenizer errors and returns them as an iterator.
+ pub fn drain_errors(&mut self) -> impl Iterator<Item = (Error, Range<O>)> + '_ {
+ self.errors.drain(0..)
+ }
+}
+
impl<O> Iterator for DefaultEmitter<O> {
type Item = Token<O>;
@@ -43,7 +52,7 @@ impl<O> Iterator for DefaultEmitter<O> {
impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
fn report_error(&mut self, error: Error, span: Range<O>) {
- self.emitted_tokens.push_front(Token::Error { error, span });
+ self.errors.push_back((error, span));
}
fn emit_eof(&mut self) {
diff --git a/src/naive_parser.rs b/src/naive_parser.rs
index 10eb98d..5bf002b 100644
--- a/src/naive_parser.rs
+++ b/src/naive_parser.rs
@@ -14,11 +14,10 @@ use crate::{Emitter, Event, State, Tokenizer};
/// * it naively emits any CDATA sections as bogus comments, for example:
///
/// ```
-/// # use html5tokenizer::{Error, NaiveParser, Tokenizer, Token};
+/// # use html5tokenizer::{NaiveParser, Token};
/// let html = "<svg><![CDATA[I love SVG]]>";
/// let mut tokens = NaiveParser::new(html).flatten();
/// assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "svg"));
-/// assert!(matches!(tokens.next().unwrap(), Token::Error {error: Error::CdataInHtmlContent, ..}));
/// assert!(matches!(tokens.next().unwrap(), Token::Comment(_bogus_comment)));
/// ```
///
@@ -59,6 +58,11 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> NaiveParser<R, O, E> {
tokenizer.naively_switch_state = true;
NaiveParser { tokenizer }
}
+
+ /// Returns a mutable reference to the emitter.
+ pub fn emitter_mut(&mut self) -> &mut E {
+ self.tokenizer.emitter_mut()
+ }
}
impl<R, O, E> Iterator for NaiveParser<R, O, E>
diff --git a/src/token.rs b/src/token.rs
index 48c90f7..c599cd5 100644
--- a/src/token.rs
+++ b/src/token.rs
@@ -5,7 +5,6 @@ use std::iter::FromIterator;
use std::ops::{Index, Range};
use crate::offset::Offset;
-use crate::Error;
/// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer.
#[derive(Debug, Eq, PartialEq)]
@@ -20,16 +19,6 @@ pub enum Token<O> {
Comment(Comment<O>),
/// An HTML doctype declaration.
Doctype(Doctype<O>),
- /// An HTML parsing error.
- ///
- /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with
- /// more tokens afterward.
- Error {
- /// What kind of error occurred.
- error: Error,
- /// The source code span of the error.
- span: Range<O>,
- },
}
/// An HTML start tag, such as `<p>` or `<a>`.
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 7e1e85f..270d3d0 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -111,6 +111,11 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
pub fn handle_cdata_open(&mut self, action: CdataAction) {
machine::handle_cdata_open(self, action);
}
+
+ /// Returns a mutable reference to the emitter.
+ pub fn emitter_mut(&mut self) -> &mut E {
+ &mut self.emitter
+ }
}
/// Used by [`Tokenizer::handle_cdata_open`] to determine how to process `<![CDATA[`