From 14f1a85d994ad97dae3d9de735fc51adb25d390a Mon Sep 17 00:00:00 2001
From: Martin Fischer <martin@push-f.com>
Date: Tue, 30 Nov 2021 07:28:21 +0100
Subject: introduce Error enum

---
 README.md                     |  4 +--
 src/macros.rs                 | 20 -----------
 src/tokenizer/char_ref/mod.rs | 39 +++++++---------------
 src/tokenizer/error.rs        | 78 +++++++++++++++++++++++++++++++++++++++++++
 src/tokenizer/interface.rs    |  4 +--
 src/tokenizer/mod.rs          | 40 ++++++++++------------
 6 files changed, 112 insertions(+), 73 deletions(-)
 create mode 100644 src/tokenizer/error.rs

diff --git a/README.md b/README.md
index 3adff5a..95a0a05 100644
--- a/README.md
+++ b/README.md
@@ -19,8 +19,8 @@ changes:
   source code spans for tag names, attribute names and attribute values.
   The feature is disabled by default.
 
-* The API has been cleaned up a bit (e.g. the internal tokenizer state enums
-  are no longer public).
+* The API has been polished, e.g. the internal tokenizer state enums are no
+  longer public and errors are no longer stringly typed.
 
 If you want to parse HTML into a tree (DOM) you should by all means use
 html5ever, this crate is merely for those who only want an HTML5 tokenizer and
diff --git a/src/macros.rs b/src/macros.rs
index d87ea98..558a4a9 100644
--- a/src/macros.rs
+++ b/src/macros.rs
@@ -31,23 +31,3 @@ macro_rules! time {
         (result, dt)
     }};
 }
-
-/// Conditionally perform string formatting.
-///
-/// If `$enabled` is true, then do the formatting and return a `Cow::Owned`.
-///
-/// Otherwise, just return the borrowed (often `'static`) string
-/// `$borrowed`.
-///
-/// When `$enabled` is false, this avoids the overhead of allocating
-/// and writing to a buffer, as well as any overhead or side effects
-/// of the format arguments.
-macro_rules! format_if {
-    ($enabled:expr, $borrowed:expr, $fmt:expr, $($args:expr),*) => {
-        if $enabled {
-            ::std::borrow::Cow::Owned(format!($fmt, $($args),*)) as ::std::borrow::Cow<str>
-        } else {
-            ::std::borrow::Cow::Borrowed($borrowed)
-        }
-    }
-}
diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs
index 9c01bdf..4f94c88 100644
--- a/src/tokenizer/char_ref/mod.rs
+++ b/src/tokenizer/char_ref/mod.rs
@@ -8,10 +8,10 @@
 // except according to those terms.
 
 use super::{TokenSink, Tokenizer};
+use crate::error::{CharRefError, Error};
 use crate::util::buffer_queue::BufferQueue;
 use crate::util::str::is_ascii_alnum;
 
-use std::borrow::Cow::Borrowed;
 use std::char::from_u32;
 
 use self::State::*;
@@ -227,9 +227,7 @@ impl CharRefTokenizer {
     ) -> Status {
         match unwrap_or_return!(tokenizer.peek(input), Stuck) {
             ';' => tokenizer.discard_char(input),
-            _ => tokenizer.emit_error(Borrowed(
-                "Semicolon missing after numeric character reference",
-            )),
+            _ => tokenizer.emit_error(Error::CharRef(CharRefError::MissingSemicolon)),
         };
         self.finish_numeric(tokenizer)
     }
@@ -246,7 +244,7 @@ impl CharRefTokenizer {
         }
 
         input.push_front(unconsume);
-        tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
+        tokenizer.emit_error(Error::CharRef(CharRefError::NumericCharRefWithoutDigits));
         self.finish_none()
     }
 
@@ -272,13 +270,9 @@ impl CharRefTokenizer {
         };
 
         if error {
-            let msg = format_if!(
-                tokenizer.opts.exact_errors,
-                "Invalid numeric character reference",
-                "Invalid numeric character reference value 0x{:06X}",
-                self.num
-            );
-            tokenizer.emit_error(msg);
+            tokenizer.emit_error(Error::CharRef(CharRefError::NumericCharRefInvalid(
+                self.num,
+            )));
         }
 
         self.finish_one(c)
@@ -311,13 +305,7 @@ impl CharRefTokenizer {
 
     #[cfg(feature = "named-entities")]
     fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) {
-        let msg = format_if!(
-            tokenizer.opts.exact_errors,
-            "Invalid character reference",
-            "Invalid character reference &{}",
-            self.name_buf()
-        );
-        tokenizer.emit_error(msg);
+        tokenizer.emit_error(Error::CharRef(CharRefError::InvalidNamedCharRef));
     }
 
     fn unconsume_name(&mut self, input: &mut BufferQueue) {
@@ -384,16 +372,13 @@ impl CharRefTokenizer {
                 let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
                     (_, ';', _) => false,
                     (Some(_), _, Some('=')) => {
-                        tokenizer.emit_error(Borrowed(
-                            "Equals sign after character reference in attribute",
-                        ));
+                        tokenizer
+                            .emit_error(Error::CharRef(CharRefError::EqualsSignAfterCharRefInAttr));
                         true
                     }
                     (Some(_), _, Some(c)) if is_ascii_alnum(c) => true,
                     _ => {
-                        tokenizer.emit_error(Borrowed(
-                            "Character reference does not end with semicolon",
-                        ));
+                        tokenizer.emit_error(Error::CharRef(CharRefError::MissingSemicolon));
                         false
                     }
                 };
@@ -444,7 +429,7 @@ impl CharRefTokenizer {
                 Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),
 
                 Numeric(_) | NumericSemicolon => {
-                    tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
+                    tokenizer.emit_error(Error::CharRef(CharRefError::EofInNumericCharRef));
                     self.finish_numeric(tokenizer);
                 }
 
@@ -458,7 +443,7 @@ impl CharRefTokenizer {
 
                 Octothorpe => {
                     input.push_front(String::from("#"));
-                    tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
+                    tokenizer.emit_error(Error::CharRef(CharRefError::EofAfterNumberSign));
                     self.finish_none();
                 }
             }
diff --git a/src/tokenizer/error.rs b/src/tokenizer/error.rs
new file mode 100644
index 0000000..89eed2a
--- /dev/null
+++ b/src/tokenizer/error.rs
@@ -0,0 +1,102 @@
+//! Types to represent the parser errors that can occur.
+use std::fmt::Display;
+
+/// A parse error reported by the tokenizer.
+///
+/// The [`Display`] implementation renders a short human-readable description.
+#[derive(PartialEq, Eq, Debug)]
+#[non_exhaustive]
+pub enum Error {
+    /// An end tag carried attributes.
+    AttributesOnEndTag,
+    /// An end tag was marked as self-closing.
+    SelfClosingEndTag,
+    /// A duplicate attribute was encountered.
+    DuplicateAttribute,
+    /// A bad character was encountered in the input.
+    BadCharacter(char),
+    /// The given character was not expected in the given tokenizer state.
+    UnexpectedCharacter(char, InternalState),
+    /// The input ended while the tokenizer was in the given state.
+    UnexpectedEOF(InternalState),
+    /// An error in a character reference.
+    CharRef(CharRefError),
+}
+
+/// Allows Error variants to include the internal tokenizer state without making it public.
+#[derive(PartialEq, Eq, Debug)]
+pub struct InternalState(pub(crate) crate::tokenizer::states::State);
+
+impl Display for Error {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Error::AttributesOnEndTag => write!(f, "attributes on an end tag"),
+            Error::SelfClosingEndTag => write!(f, "self-closing end tag"),
+            Error::DuplicateAttribute => write!(f, "duplicate attribute"),
+            Error::BadCharacter(c) => write!(f, "bad character {:?}", c),
+            Error::UnexpectedCharacter(c, state) => {
+                write!(
+                    f,
+                    "unexpected character: saw {:?} in state {:?}",
+                    c, state.0
+                )
+            }
+            Error::UnexpectedEOF(state) => write!(f, "unexpected EOF in state {:?}", state.0),
+            // Name the trait explicitly so the call cannot become ambiguous
+            // with `Debug::fmt` if that trait is ever brought into scope.
+            Error::CharRef(error) => Display::fmt(error, f),
+        }
+    }
+}
+
+impl std::error::Error for Error {}
+
+/// A parse error specific to character references.
+#[derive(PartialEq, Eq, Debug)]
+#[non_exhaustive]
+pub enum CharRefError {
+    /// A character reference was not terminated by a semicolon.
+    MissingSemicolon,
+    /// A numeric character reference contained no digits.
+    NumericCharRefWithoutDigits,
+    /// A numeric character reference had the given invalid value.
+    NumericCharRefInvalid(u32),
+    /// The input ended inside a numeric character reference.
+    EofInNumericCharRef,
+    /// The input ended right after the `#` of a character reference.
+    EofAfterNumberSign,
+    /// A character reference in an attribute was followed by an equals sign.
+    EqualsSignAfterCharRefInAttr,
+    /// A named character reference was invalid.
+    InvalidNamedCharRef,
+}
+
+impl Display for CharRefError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            CharRefError::NumericCharRefWithoutDigits => {
+                write!(f, "numeric character reference without digits")
+            }
+            CharRefError::MissingSemicolon => {
+                write!(f, "semicolon missing after character reference")
+            }
+            CharRefError::NumericCharRefInvalid(num) => {
+                write!(f, "invalid numeric character reference value 0x{:06X}", num)
+            }
+            CharRefError::EofInNumericCharRef => {
+                write!(f, "EOF in numeric character reference")
+            }
+            CharRefError::EofAfterNumberSign => {
+                write!(f, "EOF after '#' in character reference")
+            }
+            CharRefError::EqualsSignAfterCharRefInAttr => {
+                write!(f, "equals sign after character reference in attribute")
+            }
+            CharRefError::InvalidNamedCharRef => {
+                write!(f, "invalid named character reference")
+            }
+        }
+    }
+}
+
+impl std::error::Error for CharRefError {}
diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs
index f12fb16..715f9bc 100644
--- a/src/tokenizer/interface.rs
+++ b/src/tokenizer/interface.rs
@@ -7,8 +7,8 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
+use crate::error::Error;
 use crate::tokenizer::states;
-use std::borrow::Cow;
 #[cfg(feature = "spans")]
 use std::ops::Range;
 
@@ -112,7 +112,7 @@ pub enum Token {
     CharacterTokens(String),
     NullCharacterToken,
     EOFToken,
-    ParseError(Cow<'static, str>),
+    ParseError(Error),
 }
 
 #[derive(Debug, PartialEq)]
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 4511cf8..78101f6 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -9,6 +9,7 @@
 
 //! The HTML5 tokenizer.
 
+use self::error::InternalState;
 pub use self::interface::{Attribute, Doctype, Tag, TagKind, Token};
 use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
 use self::interface::{CommentToken, DoctypeToken, EndTag, StartTag, TagToken};
@@ -21,9 +22,9 @@ use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
 
 use self::char_ref::{CharRef, CharRefTokenizer};
 
+use crate::error::Error;
 use crate::util::{smallcharset::SmallCharSet, str::lower_ascii_letter};
 
-use std::borrow::Cow::{self, Borrowed};
 use std::collections::BTreeMap;
 use std::default::Default;
 use std::mem::replace;
@@ -34,6 +35,7 @@ use crate::util::buffer_queue::{FromSet, NotFromSet, SetResult};
 pub use states::RawKind;
 
 mod char_ref;
+pub mod error;
 mod interface;
 mod states;
 
@@ -292,8 +294,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 _ => false,
             }
         {
-            let msg = format!("Bad character {}", c);
-            self.emit_error(Cow::Owned(msg));
+            self.emit_error(Error::BadCharacter(c));
         }
 
         #[cfg(feature = "spans")]
@@ -400,24 +401,14 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
     }
 
     fn bad_char_error(&mut self) {
-        let msg = format_if!(
-            self.opts.exact_errors,
-            "Bad character",
-            "Saw {} in state {:?}",
+        self.emit_error(Error::UnexpectedCharacter(
             self.current_char,
-            self.state
-        );
-        self.emit_error(msg);
+            InternalState(self.state),
+        ));
     }
 
     fn bad_eof_error(&mut self) {
-        let msg = format_if!(
-            self.opts.exact_errors,
-            "Unexpected EOF",
-            "Saw EOF in state {:?}",
-            self.state
-        );
-        self.emit_error(msg);
+        self.emit_error(Error::UnexpectedEOF(InternalState(self.state)));
     }
 
     fn emit_char(&mut self, c: char) {
@@ -444,10 +435,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             }
             EndTag => {
                 if !self.current_tag_attrs.is_empty() {
-                    self.emit_error(Borrowed("Attributes on an end tag"));
+                    self.emit_error(Error::AttributesOnEndTag);
                 }
                 if self.current_tag_self_closing {
-                    self.emit_error(Borrowed("Self-closing end tag"));
+                    self.emit_error(Error::SelfClosingEndTag);
                 }
             }
         }
@@ -547,7 +538,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
         };
 
         if dup {
-            self.emit_error(Borrowed("Duplicate attribute"));
+            self.emit_error(Error::DuplicateAttribute);
             self.current_attr_name.clear();
             self.current_attr_value.clear();
         } else {
@@ -606,7 +597,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
         self.get_char(input);
     }
 
-    fn emit_error(&mut self, error: Cow<'static, str>) {
+    fn emit_error(&mut self, error: Error) {
         self.process_token_and_continue(ParseError(error));
     }
 }
@@ -2451,11 +2442,16 @@ mod test {
     #[test]
     #[cfg(feature = "named-entities")]
     fn named_entities() {
+        use crate::error::{CharRefError, Error};
+
         let opts = opts();
         let vector = vec![String::from("&amp;\r\n"), String::from("&aamp;\r\n")];
         let expected = vec![
             (3, CharacterTokens("&\n".into())),
-            (3, ParseError("Invalid character reference".into())),
+            (
+                3,
+                ParseError(Error::CharRef(CharRefError::InvalidNamedCharRef)),
+            ),
             (4, CharacterTokens("&aamp;\n".into())),
         ];
         let results = tokenize(vector, opts);
-- 
cgit v1.2.3