diff options
Diffstat (limited to 'src/tokenizer')
-rw-r--r-- | src/tokenizer/char_ref/mod.rs | 39 | ||||
-rw-r--r-- | src/tokenizer/error.rs | 78 | ||||
-rw-r--r-- | src/tokenizer/interface.rs | 4 | ||||
-rw-r--r-- | src/tokenizer/mod.rs | 40 |
4 files changed, 110 insertions, 51 deletions
diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs index 9c01bdf..4f94c88 100644 --- a/src/tokenizer/char_ref/mod.rs +++ b/src/tokenizer/char_ref/mod.rs @@ -8,10 +8,10 @@ // except according to those terms. use super::{TokenSink, Tokenizer}; +use crate::error::{CharRefError, Error}; use crate::util::buffer_queue::BufferQueue; use crate::util::str::is_ascii_alnum; -use std::borrow::Cow::Borrowed; use std::char::from_u32; use self::State::*; @@ -227,9 +227,7 @@ impl CharRefTokenizer { ) -> Status { match unwrap_or_return!(tokenizer.peek(input), Stuck) { ';' => tokenizer.discard_char(input), - _ => tokenizer.emit_error(Borrowed( - "Semicolon missing after numeric character reference", - )), + _ => tokenizer.emit_error(Error::CharRef(CharRefError::MissingSemicolon)), }; self.finish_numeric(tokenizer) } @@ -246,7 +244,7 @@ impl CharRefTokenizer { } input.push_front(unconsume); - tokenizer.emit_error(Borrowed("Numeric character reference without digits")); + tokenizer.emit_error(Error::CharRef(CharRefError::NumericCharRefWithoutDigits)); self.finish_none() } @@ -272,13 +270,9 @@ impl CharRefTokenizer { }; if error { - let msg = format_if!( - tokenizer.opts.exact_errors, - "Invalid numeric character reference", - "Invalid numeric character reference value 0x{:06X}", - self.num - ); - tokenizer.emit_error(msg); + tokenizer.emit_error(Error::CharRef(CharRefError::NumericCharRefInvalid( + self.num, + ))); } self.finish_one(c) @@ -311,13 +305,7 @@ impl CharRefTokenizer { #[cfg(feature = "named-entities")] fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) { - let msg = format_if!( - tokenizer.opts.exact_errors, - "Invalid character reference", - "Invalid character reference &{}", - self.name_buf() - ); - tokenizer.emit_error(msg); + tokenizer.emit_error(Error::CharRef(CharRefError::InvalidNamedCharRef)); } fn unconsume_name(&mut self, input: &mut BufferQueue) { @@ -384,16 +372,13 @@ impl CharRefTokenizer { let unconsume_all = match (self.addnl_allowed, last_matched, next_after) { (_, ';', _) => false, (Some(_), _, Some('=')) => { - tokenizer.emit_error(Borrowed( - "Equals sign after character reference in attribute", - )); + tokenizer + .emit_error(Error::CharRef(CharRefError::EqualsSignAfterCharRefInAttr)); true } (Some(_), _, Some(c)) if is_ascii_alnum(c) => true, _ => { - tokenizer.emit_error(Borrowed( - "Character reference does not end with semicolon", - )); + tokenizer.emit_error(Error::CharRef(CharRefError::MissingSemicolon)); false } }; @@ -444,7 +429,7 @@ impl CharRefTokenizer { Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)), Numeric(_) | NumericSemicolon => { - tokenizer.emit_error(Borrowed("EOF in numeric character reference")); + tokenizer.emit_error(Error::CharRef(CharRefError::EofInNumericCharRef)); self.finish_numeric(tokenizer); } @@ -458,7 +443,7 @@ impl CharRefTokenizer { Octothorpe => { input.push_front(String::from("#")); - tokenizer.emit_error(Borrowed("EOF after '#' in character reference")); + tokenizer.emit_error(Error::CharRef(CharRefError::EofAfterNumberSign)); self.finish_none(); } } diff --git a/src/tokenizer/error.rs b/src/tokenizer/error.rs new file mode 100644 index 0000000..89eed2a --- /dev/null +++ b/src/tokenizer/error.rs @@ -0,0 +1,78 @@ +//! Types to represent the parser errors that can occur. +use std::fmt::Display; + +#[derive(PartialEq, Eq, Debug)] +#[non_exhaustive] +pub enum Error { + AttributesOnEndTag, + SelfClosingEndTag, + DuplicateAttribute, + BadCharacter(char), + UnexpectedCharacter(char, InternalState), + UnexpectedEOF(InternalState), + CharRef(CharRefError), +} + +/// Allows Error variants to include the internal tokenizer state without making it public. +#[derive(PartialEq, Eq, Debug)] +pub struct InternalState(pub(crate) crate::tokenizer::states::State); + +impl Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Error::AttributesOnEndTag => write!(f, "attributes on an end tag"), + Error::SelfClosingEndTag => write!(f, "self-closing end tag"), + Error::DuplicateAttribute => write!(f, "duplicate attribute"), + Error::BadCharacter(char) => write!(f, "bad character {:?}", char), + Error::UnexpectedCharacter(char, state) => { + write!( + f, + "unexpected character: saw {:?} in state {:?}", + char, state.0 + ) + } + Error::UnexpectedEOF(state) => write!(f, "unexpected EOF in state {:?}", state.0), + Error::CharRef(error) => error.fmt(f), + } + } +} + +#[derive(PartialEq, Eq, Debug)] +#[non_exhaustive] +pub enum CharRefError { + MissingSemicolon, + NumericCharRefWithoutDigits, + NumericCharRefInvalid(u32), + EofInNumericCharRef, + EofAfterNumberSign, + EqualsSignAfterCharRefInAttr, + InvalidNamedCharRef, +} + +impl Display for CharRefError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + CharRefError::NumericCharRefWithoutDigits => { + write!(f, "numeric character reference without digits") + } + CharRefError::MissingSemicolon => { + write!(f, "semicolon missing after character reference") + } + CharRefError::NumericCharRefInvalid(num) => { + write!(f, "invalid numeric character reference value 0x{:06X}", num) + } + CharRefError::EofInNumericCharRef => { + write!(f, "EOF in numeric character reference") + } + CharRefError::EofAfterNumberSign => { + write!(f, "EOF after '#' in character reference") + } + CharRefError::EqualsSignAfterCharRefInAttr => { + write!(f, "equals sign after character reference in attribute") + } + CharRefError::InvalidNamedCharRef => { + write!(f, "invalid named character reference") + } + } + } +} diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs index f12fb16..715f9bc 100644 --- a/src/tokenizer/interface.rs +++ b/src/tokenizer/interface.rs @@ -7,8 +7,8 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. +use crate::error::Error; use crate::tokenizer::states; -use std::borrow::Cow; #[cfg(feature = "spans")] use std::ops::Range; @@ -112,7 +112,7 @@ pub enum Token { CharacterTokens(String), NullCharacterToken, EOFToken, - ParseError(Cow<'static, str>), + ParseError(Error), } #[derive(Debug, PartialEq)] diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 4511cf8..78101f6 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -9,6 +9,7 @@ //! The HTML5 tokenizer. +use self::error::InternalState; pub use self::interface::{Attribute, Doctype, Tag, TagKind, Token}; use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError}; use self::interface::{CommentToken, DoctypeToken, EndTag, StartTag, TagToken}; @@ -21,9 +22,9 @@ use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped}; use self::char_ref::{CharRef, CharRefTokenizer}; +use crate::error::Error; use crate::util::{smallcharset::SmallCharSet, str::lower_ascii_letter}; -use std::borrow::Cow::{self, Borrowed}; use std::collections::BTreeMap; use std::default::Default; use std::mem::replace; @@ -34,6 +35,7 @@ use crate::util::buffer_queue::{FromSet, NotFromSet, SetResult}; pub use states::RawKind; mod char_ref; +pub mod error; mod interface; mod states; @@ -292,8 +294,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { _ => false, } { - let msg = format!("Bad character {}", c); - self.emit_error(Cow::Owned(msg)); + self.emit_error(Error::BadCharacter(c)); } #[cfg(feature = "spans")] @@ -400,24 +401,14 @@ impl<Sink: TokenSink> Tokenizer<Sink> { } fn bad_char_error(&mut self) { - let msg = format_if!( - self.opts.exact_errors, - "Bad character", - "Saw {} in state {:?}", + self.emit_error(Error::UnexpectedCharacter( self.current_char, - self.state - ); - self.emit_error(msg); + InternalState(self.state), + )); } fn bad_eof_error(&mut self) { - let msg = format_if!( - self.opts.exact_errors, - "Unexpected EOF", - "Saw EOF in state {:?}", - self.state - ); - self.emit_error(msg); + self.emit_error(Error::UnexpectedEOF(InternalState(self.state))); } fn emit_char(&mut self, c: char) { @@ -444,10 +435,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> { } EndTag => { if !self.current_tag_attrs.is_empty() { - self.emit_error(Borrowed("Attributes on an end tag")); + self.emit_error(Error::AttributesOnEndTag); } if self.current_tag_self_closing { - self.emit_error(Borrowed("Self-closing end tag")); + self.emit_error(Error::SelfClosingEndTag); } } } @@ -547,7 +538,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }; if dup { - self.emit_error(Borrowed("Duplicate attribute")); + self.emit_error(Error::DuplicateAttribute); self.current_attr_name.clear(); self.current_attr_value.clear(); } else { @@ -606,7 +597,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> { self.get_char(input); } - fn emit_error(&mut self, error: Cow<'static, str>) { + fn emit_error(&mut self, error: Error) { self.process_token_and_continue(ParseError(error)); } } @@ -2451,11 +2442,16 @@ mod test { #[test] #[cfg(feature = "named-entities")] fn named_entities() { + use crate::error::{CharRefError, Error}; + let opts = opts(); let vector = vec![String::from("&\r\n"), String::from("&aamp;\r\n")]; let expected = vec![ (3, CharacterTokens("&\n".into())), - (3, ParseError("Invalid character reference".into())), + ( + 3, + ParseError(Error::CharRef(CharRefError::InvalidNamedCharRef)), + ), (4, CharacterTokens("&aamp;\n".into())), ]; let results = tokenize(vector, opts); |