diff options
| author | Martin Fischer <martin@push-f.com> | 2021-11-30 07:28:21 +0100 | 
|---|---|---|
| committer | Martin Fischer <martin@push-f.com> | 2021-11-30 11:22:35 +0100 | 
| commit | 14f1a85d994ad97dae3d9de735fc51adb25d390a (patch) | |
| tree | 0fa0d7c173a19dcb7117132325a801808302bcf8 | |
| parent | baf1477c587fe22d27e94408cf2505d588ba007e (diff) | |
introduce Error enum
| -rw-r--r-- | README.md | 4 | ||||
| -rw-r--r-- | src/macros.rs | 20 | ||||
| -rw-r--r-- | src/tokenizer/char_ref/mod.rs | 39 | ||||
| -rw-r--r-- | src/tokenizer/error.rs | 78 | ||||
| -rw-r--r-- | src/tokenizer/interface.rs | 4 | ||||
| -rw-r--r-- | src/tokenizer/mod.rs | 40 | 
6 files changed, 112 insertions, 73 deletions
| @@ -19,8 +19,8 @@ changes:    source code spans for tag names, attribute names and attribute values.    The feature is disabled by default. -* The API has been cleaned up a bit (e.g. the internal tokenizer state enums -  are no longer public). +* The API has been polished, e.g. the internal tokenizer state enums are no +  longer public and errors are no longer stringly typed.  If you want to parse HTML into a tree (DOM) you should by all means use  html5ever, this crate is merely for those who only want an HTML5 tokenizer and diff --git a/src/macros.rs b/src/macros.rs index d87ea98..558a4a9 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -31,23 +31,3 @@ macro_rules! time {          (result, dt)      }};  } - -/// Conditionally perform string formatting. -/// -/// If `$enabled` is true, then do the formatting and return a `Cow::Owned`. -/// -/// Otherwise, just return the borrowed (often `'static`) string -/// `$borrowed`. -/// -/// When `$enabled` is false, this avoids the overhead of allocating -/// and writing to a buffer, as well as any overhead or side effects -/// of the format arguments. -macro_rules! format_if { -    ($enabled:expr, $borrowed:expr, $fmt:expr, $($args:expr),*) => { -        if $enabled { -            ::std::borrow::Cow::Owned(format!($fmt, $($args),*)) as ::std::borrow::Cow<str> -        } else { -            ::std::borrow::Cow::Borrowed($borrowed) -        } -    } -} diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs index 9c01bdf..4f94c88 100644 --- a/src/tokenizer/char_ref/mod.rs +++ b/src/tokenizer/char_ref/mod.rs @@ -8,10 +8,10 @@  // except according to those terms.  use super::{TokenSink, Tokenizer}; +use crate::error::{CharRefError, Error};  use crate::util::buffer_queue::BufferQueue;  use crate::util::str::is_ascii_alnum; -use std::borrow::Cow::Borrowed;  use std::char::from_u32;  use self::State::*; @@ -227,9 +227,7 @@ impl CharRefTokenizer {      ) -> Status {          match unwrap_or_return!(tokenizer.peek(input), Stuck) {              ';' => tokenizer.discard_char(input), -            _ => tokenizer.emit_error(Borrowed( -                "Semicolon missing after numeric character reference", -            )), +            _ => tokenizer.emit_error(Error::CharRef(CharRefError::MissingSemicolon)),          };          self.finish_numeric(tokenizer)      } @@ -246,7 +244,7 @@ impl CharRefTokenizer {          }          input.push_front(unconsume); -        tokenizer.emit_error(Borrowed("Numeric character reference without digits")); +        tokenizer.emit_error(Error::CharRef(CharRefError::NumericCharRefWithoutDigits));          self.finish_none()      } @@ -272,13 +270,9 @@ impl CharRefTokenizer {          };          if error { -            let msg = format_if!( -                tokenizer.opts.exact_errors, -                "Invalid numeric character reference", -                "Invalid numeric character reference value 0x{:06X}", -                self.num -            ); -            tokenizer.emit_error(msg); +            tokenizer.emit_error(Error::CharRef(CharRefError::NumericCharRefInvalid( +                self.num, +            )));          }          self.finish_one(c) @@ -311,13 +305,7 @@ impl CharRefTokenizer {      #[cfg(feature = "named-entities")]      fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) { -        let msg = format_if!( -            tokenizer.opts.exact_errors, -            "Invalid character reference", -            "Invalid character reference &{}", -            self.name_buf() -        ); -        tokenizer.emit_error(msg); +        tokenizer.emit_error(Error::CharRef(CharRefError::InvalidNamedCharRef));      }      fn unconsume_name(&mut self, input: &mut BufferQueue) { @@ -384,16 +372,13 @@ impl CharRefTokenizer {                  let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {                      (_, ';', _) => false,                      (Some(_), _, Some('=')) => { -                        tokenizer.emit_error(Borrowed( -                            "Equals sign after character reference in attribute", -                        )); +                        tokenizer +                            .emit_error(Error::CharRef(CharRefError::EqualsSignAfterCharRefInAttr));                          true                      }                      (Some(_), _, Some(c)) if is_ascii_alnum(c) => true,                      _ => { -                        tokenizer.emit_error(Borrowed( -                            "Character reference does not end with semicolon", -                        )); +                        tokenizer.emit_error(Error::CharRef(CharRefError::MissingSemicolon));                          false                      }                  }; @@ -444,7 +429,7 @@ impl CharRefTokenizer {                  Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),                  Numeric(_) | NumericSemicolon => { -                    tokenizer.emit_error(Borrowed("EOF in numeric character reference")); +                    tokenizer.emit_error(Error::CharRef(CharRefError::EofInNumericCharRef));                      self.finish_numeric(tokenizer);                  } @@ -458,7 +443,7 @@ impl CharRefTokenizer {                  Octothorpe => {                      input.push_front(String::from("#")); -                    tokenizer.emit_error(Borrowed("EOF after '#' in character reference")); +                    tokenizer.emit_error(Error::CharRef(CharRefError::EofAfterNumberSign));                      self.finish_none();                  }              } diff --git a/src/tokenizer/error.rs b/src/tokenizer/error.rs new file mode 100644 index 0000000..89eed2a --- /dev/null +++ b/src/tokenizer/error.rs @@ -0,0 +1,78 @@ +//! Types to represent the parser errors that can occur. +use std::fmt::Display; + +#[derive(PartialEq, Eq, Debug)] +#[non_exhaustive] +pub enum Error { +    AttributesOnEndTag, +    SelfClosingEndTag, +    DuplicateAttribute, +    BadCharacter(char), +    UnexpectedCharacter(char, InternalState), +    UnexpectedEOF(InternalState), +    CharRef(CharRefError), +} + +/// Allows Error variants to include the internal tokenizer state without making it public. +#[derive(PartialEq, Eq, Debug)] +pub struct InternalState(pub(crate) crate::tokenizer::states::State); + +impl Display for Error { +    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +        match self { +            Error::AttributesOnEndTag => write!(f, "attributes on an end tag"), +            Error::SelfClosingEndTag => write!(f, "self-closing end tag"), +            Error::DuplicateAttribute => write!(f, "duplicate attribute"), +            Error::BadCharacter(char) => write!(f, "bad character {:?}", char), +            Error::UnexpectedCharacter(char, state) => { +                write!( +                    f, +                    "unexpected character: saw {:?} in state {:?}", +                    char, state.0 +                ) +            } +            Error::UnexpectedEOF(state) => write!(f, "unexpected EOF in state {:?}", state.0), +            Error::CharRef(error) => error.fmt(f), +        } +    } +} + +#[derive(PartialEq, Eq, Debug)] +#[non_exhaustive] +pub enum CharRefError { +    MissingSemicolon, +    NumericCharRefWithoutDigits, +    NumericCharRefInvalid(u32), +    EofInNumericCharRef, +    EofAfterNumberSign, +    EqualsSignAfterCharRefInAttr, +    InvalidNamedCharRef, +} + +impl Display for CharRefError { +    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +        match self { +            CharRefError::NumericCharRefWithoutDigits => { +                write!(f, "numeric character reference without digits") +            } +            CharRefError::MissingSemicolon => { +                write!(f, "semicolon missing after character reference") +            } +            CharRefError::NumericCharRefInvalid(num) => { +                write!(f, "invalid numeric character reference value 0x{:06X}", num) +            } +            CharRefError::EofInNumericCharRef => { +                write!(f, "EOF in numeric character reference") +            } +            CharRefError::EofAfterNumberSign => { +                write!(f, "EOF after '#' in character reference") +            } +            CharRefError::EqualsSignAfterCharRefInAttr => { +                write!(f, "equals sign after character reference in attribute") +            } +            CharRefError::InvalidNamedCharRef => { +                write!(f, "invalid named character reference") +            } +        } +    } +} diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs index f12fb16..715f9bc 100644 --- a/src/tokenizer/interface.rs +++ b/src/tokenizer/interface.rs @@ -7,8 +7,8 @@  // option. This file may not be copied, modified, or distributed  // except according to those terms. +use crate::error::Error;  use crate::tokenizer::states; -use std::borrow::Cow;  #[cfg(feature = "spans")]  use std::ops::Range; @@ -112,7 +112,7 @@ pub enum Token {      CharacterTokens(String),      NullCharacterToken,      EOFToken, -    ParseError(Cow<'static, str>), +    ParseError(Error),  }  #[derive(Debug, PartialEq)] diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 4511cf8..78101f6 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -9,6 +9,7 @@  //! The HTML5 tokenizer. +use self::error::InternalState;  pub use self::interface::{Attribute, Doctype, Tag, TagKind, Token};  use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};  use self::interface::{CommentToken, DoctypeToken, EndTag, StartTag, TagToken}; @@ -21,9 +22,9 @@ use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};  use self::char_ref::{CharRef, CharRefTokenizer}; +use crate::error::Error;  use crate::util::{smallcharset::SmallCharSet, str::lower_ascii_letter}; -use std::borrow::Cow::{self, Borrowed};  use std::collections::BTreeMap;  use std::default::Default;  use std::mem::replace; @@ -34,6 +35,7 @@ use crate::util::buffer_queue::{FromSet, NotFromSet, SetResult};  pub use states::RawKind;  mod char_ref; +pub mod error;  mod interface;  mod states; @@ -292,8 +294,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {                  _ => false,              }          { -            let msg = format!("Bad character {}", c); -            self.emit_error(Cow::Owned(msg)); +            self.emit_error(Error::BadCharacter(c));          }          #[cfg(feature = "spans")] @@ -400,24 +401,14 @@ impl<Sink: TokenSink> Tokenizer<Sink> {      }      fn bad_char_error(&mut self) { -        let msg = format_if!( -            self.opts.exact_errors, -            "Bad character", -            "Saw {} in state {:?}", +        self.emit_error(Error::UnexpectedCharacter(              self.current_char, -            self.state -        ); -        self.emit_error(msg); +            InternalState(self.state), +        ));      }      fn bad_eof_error(&mut self) { -        let msg = format_if!( -            self.opts.exact_errors, -            "Unexpected EOF", -            "Saw EOF in state {:?}", -            self.state -        ); -        self.emit_error(msg); +        self.emit_error(Error::UnexpectedEOF(InternalState(self.state)));      }      fn emit_char(&mut self, c: char) { @@ -444,10 +435,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {              }              EndTag => {                  if !self.current_tag_attrs.is_empty() { -                    self.emit_error(Borrowed("Attributes on an end tag")); +                    self.emit_error(Error::AttributesOnEndTag);                  }                  if self.current_tag_self_closing { -                    self.emit_error(Borrowed("Self-closing end tag")); +                    self.emit_error(Error::SelfClosingEndTag);                  }              }          } @@ -547,7 +538,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {          };          if dup { -            self.emit_error(Borrowed("Duplicate attribute")); +            self.emit_error(Error::DuplicateAttribute);              self.current_attr_name.clear();              self.current_attr_value.clear();          } else { @@ -606,7 +597,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {          self.get_char(input);      } -    fn emit_error(&mut self, error: Cow<'static, str>) { +    fn emit_error(&mut self, error: Error) {          self.process_token_and_continue(ParseError(error));      }  } @@ -2451,11 +2442,16 @@ mod test {      #[test]      #[cfg(feature = "named-entities")]      fn named_entities() { +        use crate::error::{CharRefError, Error}; +          let opts = opts();          let vector = vec![String::from("&\r\n"), String::from("&aamp;\r\n")];          let expected = vec![              (3, CharacterTokens("&\n".into())), -            (3, ParseError("Invalid character reference".into())), +            ( +                3, +                ParseError(Error::CharRef(CharRefError::InvalidNamedCharRef)), +            ),              (4, CharacterTokens("&aamp;\n".into())),          ];          let results = tokenize(vector, opts); | 
