about summary refs log tree commit diff
diff options
context:
space:
mode:
author Martin Fischer <martin@push-f.com> 2021-11-30 07:28:21 +0100
committer Martin Fischer <martin@push-f.com> 2021-11-30 11:22:35 +0100
commit 14f1a85d994ad97dae3d9de735fc51adb25d390a (patch)
tree 0fa0d7c173a19dcb7117132325a801808302bcf8
parent baf1477c587fe22d27e94408cf2505d588ba007e (diff)
introduce Error enum
-rw-r--r-- README.md 4
-rw-r--r-- src/macros.rs 20
-rw-r--r-- src/tokenizer/char_ref/mod.rs 39
-rw-r--r-- src/tokenizer/error.rs 78
-rw-r--r-- src/tokenizer/interface.rs 4
-rw-r--r-- src/tokenizer/mod.rs 40
6 files changed, 112 insertions, 73 deletions
diff --git a/README.md b/README.md
index 3adff5a..95a0a05 100644
--- a/README.md
+++ b/README.md
@@ -19,8 +19,8 @@ changes:
source code spans for tag names, attribute names and attribute values.
The feature is disabled by default.
-* The API has been cleaned up a bit (e.g. the internal tokenizer state enums
- are no longer public).
+* The API has been polished, e.g. the internal tokenizer state enums are no
+ longer public and errors are no longer stringly typed.
If you want to parse HTML into a tree (DOM) you should by all means use
html5ever, this crate is merely for those who only want an HTML5 tokenizer and
diff --git a/src/macros.rs b/src/macros.rs
index d87ea98..558a4a9 100644
--- a/src/macros.rs
+++ b/src/macros.rs
@@ -31,23 +31,3 @@ macro_rules! time {
(result, dt)
}};
}
-
-/// Conditionally perform string formatting.
-///
-/// If `$enabled` is true, then do the formatting and return a `Cow::Owned`.
-///
-/// Otherwise, just return the borrowed (often `'static`) string
-/// `$borrowed`.
-///
-/// When `$enabled` is false, this avoids the overhead of allocating
-/// and writing to a buffer, as well as any overhead or side effects
-/// of the format arguments.
-macro_rules! format_if {
- ($enabled:expr, $borrowed:expr, $fmt:expr, $($args:expr),*) => {
- if $enabled {
- ::std::borrow::Cow::Owned(format!($fmt, $($args),*)) as ::std::borrow::Cow<str>
- } else {
- ::std::borrow::Cow::Borrowed($borrowed)
- }
- }
-}
diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs
index 9c01bdf..4f94c88 100644
--- a/src/tokenizer/char_ref/mod.rs
+++ b/src/tokenizer/char_ref/mod.rs
@@ -8,10 +8,10 @@
// except according to those terms.
use super::{TokenSink, Tokenizer};
+use crate::error::{CharRefError, Error};
use crate::util::buffer_queue::BufferQueue;
use crate::util::str::is_ascii_alnum;
-use std::borrow::Cow::Borrowed;
use std::char::from_u32;
use self::State::*;
@@ -227,9 +227,7 @@ impl CharRefTokenizer {
) -> Status {
match unwrap_or_return!(tokenizer.peek(input), Stuck) {
';' => tokenizer.discard_char(input),
- _ => tokenizer.emit_error(Borrowed(
- "Semicolon missing after numeric character reference",
- )),
+ _ => tokenizer.emit_error(Error::CharRef(CharRefError::MissingSemicolon)),
};
self.finish_numeric(tokenizer)
}
@@ -246,7 +244,7 @@ impl CharRefTokenizer {
}
input.push_front(unconsume);
- tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
+ tokenizer.emit_error(Error::CharRef(CharRefError::NumericCharRefWithoutDigits));
self.finish_none()
}
@@ -272,13 +270,9 @@ impl CharRefTokenizer {
};
if error {
- let msg = format_if!(
- tokenizer.opts.exact_errors,
- "Invalid numeric character reference",
- "Invalid numeric character reference value 0x{:06X}",
- self.num
- );
- tokenizer.emit_error(msg);
+ tokenizer.emit_error(Error::CharRef(CharRefError::NumericCharRefInvalid(
+ self.num,
+ )));
}
self.finish_one(c)
@@ -311,13 +305,7 @@ impl CharRefTokenizer {
#[cfg(feature = "named-entities")]
fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) {
- let msg = format_if!(
- tokenizer.opts.exact_errors,
- "Invalid character reference",
- "Invalid character reference &{}",
- self.name_buf()
- );
- tokenizer.emit_error(msg);
+ tokenizer.emit_error(Error::CharRef(CharRefError::InvalidNamedCharRef));
}
fn unconsume_name(&mut self, input: &mut BufferQueue) {
@@ -384,16 +372,13 @@ impl CharRefTokenizer {
let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
(_, ';', _) => false,
(Some(_), _, Some('=')) => {
- tokenizer.emit_error(Borrowed(
- "Equals sign after character reference in attribute",
- ));
+ tokenizer
+ .emit_error(Error::CharRef(CharRefError::EqualsSignAfterCharRefInAttr));
true
}
(Some(_), _, Some(c)) if is_ascii_alnum(c) => true,
_ => {
- tokenizer.emit_error(Borrowed(
- "Character reference does not end with semicolon",
- ));
+ tokenizer.emit_error(Error::CharRef(CharRefError::MissingSemicolon));
false
}
};
@@ -444,7 +429,7 @@ impl CharRefTokenizer {
Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),
Numeric(_) | NumericSemicolon => {
- tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
+ tokenizer.emit_error(Error::CharRef(CharRefError::EofInNumericCharRef));
self.finish_numeric(tokenizer);
}
@@ -458,7 +443,7 @@ impl CharRefTokenizer {
Octothorpe => {
input.push_front(String::from("#"));
- tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
+ tokenizer.emit_error(Error::CharRef(CharRefError::EofAfterNumberSign));
self.finish_none();
}
}
diff --git a/src/tokenizer/error.rs b/src/tokenizer/error.rs
new file mode 100644
index 0000000..89eed2a
--- /dev/null
+++ b/src/tokenizer/error.rs
@@ -0,0 +1,78 @@
+//! Types to represent the parser errors that can occur.
+use std::fmt::Display;
+
+#[derive(PartialEq, Eq, Debug)]
+#[non_exhaustive] // variants may be added later; matches outside this crate need a wildcard arm
+pub enum Error { // payload of `Token::ParseError`, replacing the former `Cow<'static, str>` message
+ AttributesOnEndTag, // raised for an end tag whose collected attribute list is not empty
+ SelfClosingEndTag, // raised for an end tag that has the self-closing flag set
+ DuplicateAttribute, // raised when the attribute just read is a duplicate
+ BadCharacter(char), // carries the offending character
+ UnexpectedCharacter(char, InternalState), // the current character and the tokenizer state it was seen in
+ UnexpectedEOF(InternalState), // the tokenizer state in which the end of input was hit
+ CharRef(CharRefError), // wraps an error reported by the character reference tokenizer
+}
+
+/// Opaque wrapper that lets `Error` variants carry the internal tokenizer state while the state type itself stays non-public.
+#[derive(PartialEq, Eq, Debug)]
+pub struct InternalState(pub(crate) crate::tokenizer::states::State); // field is `pub(crate)`: only this crate can construct or unwrap it
+
+impl Display for Error { // renders each error as a short lower-case message
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ Error::AttributesOnEndTag => write!(f, "attributes on an end tag"),
+ Error::SelfClosingEndTag => write!(f, "self-closing end tag"),
+ Error::DuplicateAttribute => write!(f, "duplicate attribute"),
+ Error::BadCharacter(char) => write!(f, "bad character {:?}", char), // `{:?}` prints the char quoted and escaped
+ Error::UnexpectedCharacter(char, state) => {
+ write!(
+ f,
+ "unexpected character: saw {:?} in state {:?}",
+ char, state.0 // `state.0` is the wrapped internal state, shown through its Debug form
+ )
+ }
+ Error::UnexpectedEOF(state) => write!(f, "unexpected EOF in state {:?}", state.0),
+ Error::CharRef(error) => error.fmt(f), // delegates to the Display impl of `CharRefError`
+ }
+ }
+}
+
+#[derive(PartialEq, Eq, Debug)]
+#[non_exhaustive] // variants may be added later; matches outside this crate need a wildcard arm
+pub enum CharRefError { // errors reported by the character reference tokenizer, wrapped in `Error::CharRef`
+ MissingSemicolon, // used for both numeric and named references that lack the closing ';'
+ NumericCharRefWithoutDigits, // a numeric reference was started but no digit was seen
+ NumericCharRefInvalid(u32), // carries the rejected numeric value
+ EofInNumericCharRef, // input ended while a numeric reference was being read
+ EofAfterNumberSign, // input ended right after the '#'
+ EqualsSignAfterCharRefInAttr, // a reference inside an attribute was followed by '='
+ InvalidNamedCharRef, // only emitted when the "named-entities" feature is enabled
+}
+
+impl Display for CharRefError { // renders each character reference error as a short lower-case message
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ CharRefError::NumericCharRefWithoutDigits => {
+ write!(f, "numeric character reference without digits")
+ }
+ CharRefError::MissingSemicolon => { // one message shared by numeric and named references
+ write!(f, "semicolon missing after character reference")
+ }
+ CharRefError::NumericCharRefInvalid(num) => { // `{:06X}`: upper-case hex, zero-padded to at least six digits
+ write!(f, "invalid numeric character reference value 0x{:06X}", num)
+ }
+ CharRefError::EofInNumericCharRef => {
+ write!(f, "EOF in numeric character reference")
+ }
+ CharRefError::EofAfterNumberSign => {
+ write!(f, "EOF after '#' in character reference")
+ }
+ CharRefError::EqualsSignAfterCharRefInAttr => {
+ write!(f, "equals sign after character reference in attribute")
+ }
+ CharRefError::InvalidNamedCharRef => { // the reference name itself is not included in the message
+ write!(f, "invalid named character reference")
+ }
+ }
+ }
+}
diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs
index f12fb16..715f9bc 100644
--- a/src/tokenizer/interface.rs
+++ b/src/tokenizer/interface.rs
@@ -7,8 +7,8 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.
+use crate::error::Error;
use crate::tokenizer::states;
-use std::borrow::Cow;
#[cfg(feature = "spans")]
use std::ops::Range;
@@ -112,7 +112,7 @@ pub enum Token {
CharacterTokens(String),
NullCharacterToken,
EOFToken,
- ParseError(Cow<'static, str>),
+ ParseError(Error),
}
#[derive(Debug, PartialEq)]
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 4511cf8..78101f6 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -9,6 +9,7 @@
//! The HTML5 tokenizer.
+use self::error::InternalState;
pub use self::interface::{Attribute, Doctype, Tag, TagKind, Token};
use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
use self::interface::{CommentToken, DoctypeToken, EndTag, StartTag, TagToken};
@@ -21,9 +22,9 @@ use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
use self::char_ref::{CharRef, CharRefTokenizer};
+use crate::error::Error;
use crate::util::{smallcharset::SmallCharSet, str::lower_ascii_letter};
-use std::borrow::Cow::{self, Borrowed};
use std::collections::BTreeMap;
use std::default::Default;
use std::mem::replace;
@@ -34,6 +35,7 @@ use crate::util::buffer_queue::{FromSet, NotFromSet, SetResult};
pub use states::RawKind;
mod char_ref;
+pub mod error;
mod interface;
mod states;
@@ -292,8 +294,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
_ => false,
}
{
- let msg = format!("Bad character {}", c);
- self.emit_error(Cow::Owned(msg));
+ self.emit_error(Error::BadCharacter(c));
}
#[cfg(feature = "spans")]
@@ -400,24 +401,14 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
}
fn bad_char_error(&mut self) {
- let msg = format_if!(
- self.opts.exact_errors,
- "Bad character",
- "Saw {} in state {:?}",
+ self.emit_error(Error::UnexpectedCharacter(
self.current_char,
- self.state
- );
- self.emit_error(msg);
+ InternalState(self.state),
+ ));
}
fn bad_eof_error(&mut self) {
- let msg = format_if!(
- self.opts.exact_errors,
- "Unexpected EOF",
- "Saw EOF in state {:?}",
- self.state
- );
- self.emit_error(msg);
+ self.emit_error(Error::UnexpectedEOF(InternalState(self.state)));
}
fn emit_char(&mut self, c: char) {
@@ -444,10 +435,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
}
EndTag => {
if !self.current_tag_attrs.is_empty() {
- self.emit_error(Borrowed("Attributes on an end tag"));
+ self.emit_error(Error::AttributesOnEndTag);
}
if self.current_tag_self_closing {
- self.emit_error(Borrowed("Self-closing end tag"));
+ self.emit_error(Error::SelfClosingEndTag);
}
}
}
@@ -547,7 +538,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
};
if dup {
- self.emit_error(Borrowed("Duplicate attribute"));
+ self.emit_error(Error::DuplicateAttribute);
self.current_attr_name.clear();
self.current_attr_value.clear();
} else {
@@ -606,7 +597,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
self.get_char(input);
}
- fn emit_error(&mut self, error: Cow<'static, str>) {
+ fn emit_error(&mut self, error: Error) {
self.process_token_and_continue(ParseError(error));
}
}
@@ -2451,11 +2442,16 @@ mod test {
#[test]
#[cfg(feature = "named-entities")]
fn named_entities() {
+ use crate::error::{CharRefError, Error};
+
let opts = opts();
let vector = vec![String::from("&amp;\r\n"), String::from("&aamp;\r\n")];
let expected = vec![
(3, CharacterTokens("&\n".into())),
- (3, ParseError("Invalid character reference".into())),
+ (
+ 3,
+ ParseError(Error::CharRef(CharRefError::InvalidNamedCharRef)),
+ ),
(4, CharacterTokens("&aamp;\n".into())),
];
let results = tokenize(vector, opts);