-rw-r--r--  CHANGELOG.md                              6
-rw-r--r--  examples/tokenize.rs                     11
-rw-r--r--  integration_tests/tests/test_html5lib.rs 12
-rw-r--r--  src/default_emitter.rs                   11
-rw-r--r--  src/naive_parser.rs                       8
-rw-r--r--  src/token.rs                             11
-rw-r--r--  src/tokenizer.rs                          5
-rw-r--r--  tests/test_spans.rs                      46
8 files changed, 53 insertions, 57 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 961665c..c4acbb2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,12 @@
 
 #### Breaking changes
 
+* `Token` enum
+
+  * Removed the `Error` variant.
+    (Errors now have to be queried separately with
+    `DefaultEmitter::drain_errors`.)
+
 * `Emitter` trait
 
   * Removed `pop_token` method and `Token` associated type.
diff --git a/examples/tokenize.rs b/examples/tokenize.rs
index da99dd3..f8859e4 100644
--- a/examples/tokenize.rs
+++ b/examples/tokenize.rs
@@ -5,12 +5,15 @@ use html5tokenizer::{DefaultEmitter, Tokenizer};
 use std::io::BufReader;
 
 fn main() {
-    for token in Tokenizer::new(
+    let mut tokenizer = Tokenizer::new(
         BufReader::new(std::io::stdin().lock()),
         DefaultEmitter::default(),
-    )
-    .flatten()
-    {
+    );
+    while let Some(token) = tokenizer.next() {
+        for (error, _) in tokenizer.emitter_mut().drain_errors() {
+            eprintln!("error: {:?}", error);
+        }
+        let token = token.unwrap();
         println!("{:?}", token);
     }
 }
diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs
index 0cf5868..2e404c5 100644
--- a/integration_tests/tests/test_html5lib.rs
+++ b/integration_tests/tests/test_html5lib.rs
@@ -115,7 +115,6 @@ fn run_test_inner<R: Reader>(
         tokenizer.set_last_start_tag(last_start_tag);
     }
 
-    let mut actual_errors = Vec::new();
     let mut actual_tokens = Vec::new();
 
     while let Some(event) = tokenizer.next() {
@@ -128,9 +127,6 @@
         };
 
         match token {
-            Token::Error { error, .. } => actual_errors.push(TestError {
-                code: error.code().to_string(),
-            }),
             Token::StartTag(tag) => actual_tokens.push(TestToken::StartTag {
                 name: tag.name,
                 attributes: tag
@@ -154,7 +150,13 @@
 
     assert_eq!(
         Output {
-            errors: actual_errors,
+            errors: tokenizer
+                .emitter_mut()
+                .drain_errors()
+                .map(|(e, _)| TestError {
+                    code: e.code().to_string()
+                })
+                .collect(),
             tokens: actual_tokens,
         },
         test.output,
diff --git a/src/default_emitter.rs b/src/default_emitter.rs
index a4c5a63..e89fa5e 100644
--- a/src/default_emitter.rs
+++ b/src/default_emitter.rs
@@ -17,6 +17,7 @@ pub struct DefaultEmitter<O = NoopOffset> {
     current_attribute: Option<(String, crate::token::AttrInternal<O>)>,
     seen_attributes: BTreeSet<String>,
     emitted_tokens: VecDeque<Token<O>>,
+    errors: VecDeque<(Error, Range<O>)>,
     attr_in_end_tag_span: Option<Range<O>>,
 }
 
@@ -28,11 +29,19 @@ impl<O> Default for DefaultEmitter<O> {
             current_attribute: None,
             seen_attributes: BTreeSet::new(),
             emitted_tokens: VecDeque::new(),
+            errors: VecDeque::new(),
             attr_in_end_tag_span: None,
         }
     }
 }
 
+impl<O> DefaultEmitter<O> {
+    /// Removes all encountered tokenizer errors and returns them as an iterator.
+    pub fn drain_errors(&mut self) -> impl Iterator<Item = (Error, Range<O>)> + '_ {
+        self.errors.drain(0..)
+    }
+}
+
 impl<O> Iterator for DefaultEmitter<O> {
     type Item = Token<O>;
 
@@ -43,7 +52,7 @@
 
 impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
     fn report_error(&mut self, error: Error, span: Range<O>) {
-        self.emitted_tokens.push_front(Token::Error { error, span });
+        self.errors.push_back((error, span));
     }
 
     fn emit_eof(&mut self) {
diff --git a/src/naive_parser.rs b/src/naive_parser.rs
index 10eb98d..5bf002b 100644
--- a/src/naive_parser.rs
+++ b/src/naive_parser.rs
@@ -14,11 +14,10 @@ use crate::{Emitter, Event, State, Tokenizer};
 /// * it naively emits any CDATA sections as bogus comments, for example:
 ///
 ///   ```
-///   # use html5tokenizer::{Error, NaiveParser, Tokenizer, Token};
+///   # use html5tokenizer::{NaiveParser, Token};
 ///   let html = "<svg><![CDATA[I love SVG]]>";
 ///   let mut tokens = NaiveParser::new(html).flatten();
 ///   assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "svg"));
-///   assert!(matches!(tokens.next().unwrap(), Token::Error {error: Error::CdataInHtmlContent, ..}));
 ///   assert!(matches!(tokens.next().unwrap(), Token::Comment(_bogus_comment)));
 ///   ```
 ///
@@ -59,6 +58,11 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> NaiveParser<R, O, E> {
         tokenizer.naively_switch_state = true;
         NaiveParser { tokenizer }
     }
+
+    /// Returns a mutable reference to the emitter.
+    pub fn emitter_mut(&mut self) -> &mut E {
+        self.tokenizer.emitter_mut()
+    }
 }
 
 impl<R, O, E> Iterator for NaiveParser<R, O, E>
diff --git a/src/token.rs b/src/token.rs
index 48c90f7..c599cd5 100644
--- a/src/token.rs
+++ b/src/token.rs
@@ -5,7 +5,6 @@ use std::iter::FromIterator;
 use std::ops::{Index, Range};
 
 use crate::offset::Offset;
-use crate::Error;
 
 /// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer.
 #[derive(Debug, Eq, PartialEq)]
@@ -20,16 +19,6 @@ pub enum Token<O> {
     Comment(Comment<O>),
     /// An HTML doctype declaration.
     Doctype(Doctype<O>),
-    /// An HTML parsing error.
-    ///
-    /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with
-    /// more tokens afterward.
-    Error {
-        /// What kind of error occurred.
-        error: Error,
-        /// The source code span of the error.
-        span: Range<O>,
-    },
 }
 
 /// An HTML start tag, such as `<p>` or `<a>`.
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 7e1e85f..270d3d0 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -111,6 +111,11 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
     pub fn handle_cdata_open(&mut self, action: CdataAction) {
         machine::handle_cdata_open(self, action);
     }
+
+    /// Returns a mutable reference to the emitter.
+    pub fn emitter_mut(&mut self) -> &mut E {
+        &mut self.emitter
+    }
 }
 
 /// Used by [`Tokenizer::handle_cdata_open`] to determine how to process `<![CDATA[`
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
index f2cdc5f..64cc250 100644
--- a/tests/test_spans.rs
+++ b/tests/test_spans.rs
@@ -221,12 +221,7 @@ fn comment_data_span() {
     let mut annotated = String::new();
     for case in cases {
         let labeler = |parser: Parser| {
-            let Token::Comment(comment) = parser
-                .flatten()
-                .filter(|t| !matches!(t, Token::Error { .. }))
-                .next()
-                .unwrap()
-            else {
+            let Token::Comment(comment) = parser.flatten().next().unwrap() else {
                 panic!("expected comment");
             };
             vec![(comment.data_span(), "")]
@@ -265,12 +260,7 @@
     "###);
 
     for (idx, case) in cases.iter().enumerate() {
-        let Token::Comment(comment) = parser(*case)
-            .flatten()
-            .filter(|t| !matches!(t, Token::Error { .. }))
-            .next()
-            .unwrap()
-        else {
+        let Token::Comment(comment) = parser(*case).flatten().next().unwrap() else {
             panic!("expected comment");
         };
         assert_eq!(case[comment.data_span()], comment.data, "case {idx}");
@@ -287,12 +277,7 @@ fn doctype_span() {
     let mut annotated = String::new();
     for case in cases {
         let labeler = |parser: Parser| {
-            let Token::Doctype(doctype) = parser
-                .flatten()
-                .filter(|t| !matches!(t, Token::Error { .. }))
-                .next()
-                .unwrap()
-            else {
+            let Token::Doctype(doctype) = parser.flatten().next().unwrap() else {
                 panic!("expected doctype");
             };
             vec![(doctype.span, "")]
@@ -316,12 +301,7 @@ fn doctype_id_spans() {
     let mut annotated = String::new();
     for case in cases {
         let labeler = |parser: Parser| {
-            let Token::Doctype(doctype) = parser
-                .flatten()
-                .filter(|t| !matches!(t, Token::Error { .. }))
-                .next()
-                .unwrap()
-            else {
+            let Token::Doctype(doctype) = parser.flatten().next().unwrap() else {
                 panic!("expected doctype");
             };
 
@@ -351,10 +331,11 @@
 }
 
 fn annotate_errors(html: &'static str) -> String {
-    for token in parser(html).flatten() {
-        let Token::Error { span, .. } = token else {
-            continue;
-        };
+    let mut parser = parser(html);
+    for _ in parser.by_ref() {}
+    let errors: Vec<_> = parser.emitter_mut().drain_errors().collect();
+
+    for (_, span) in errors {
         if span.start == span.end {
             if span.start != html.len() {
                 panic!("empty error spans are only allowed at the very end of the source (for eof errors)");
@@ -365,13 +346,10 @@
         }
     }
 
-    let labeler = |parser: Parser| {
+    let labeler = |mut parser: Parser| {
         let mut labels = Vec::new();
-        for token in parser.flatten() {
-            let Token::Error { error, span } = token else {
-                continue;
-            };
-
+        for _ in parser.by_ref() {}
+        for (error, span) in parser.emitter_mut().drain_errors() {
            labels.push((span, error.code()));
         }
         labels
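Taken together, the migration pattern for callers looks like the following minimal sketch. It assumes `NaiveParser::new` uses the `DefaultEmitter` (as `tests/test_spans.rs` does above); the printed output is purely illustrative:

    use html5tokenizer::NaiveParser;

    fn main() {
        let html = "<svg><![CDATA[I love SVG]]>";
        let mut parser = NaiveParser::new(html);

        // Drive the parser to completion first. Errors no longer appear
        // as Token::Error in the token stream; they accumulate inside
        // the emitter instead.
        for token in parser.by_ref().flatten() {
            println!("{:?}", token);
        }

        // Afterwards, drain the buffered (Error, Range<O>) pairs separately.
        for (error, _span) in parser.emitter_mut().drain_errors() {
            eprintln!("error: {}", error.code());
        }
    }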
