diff options
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | examples/tokenize.rs | 4 | ||||
-rw-r--r-- | src/tokenizer/error.rs | 9 | ||||
-rw-r--r-- | src/tokenizer/interface.rs | 7 | ||||
-rw-r--r-- | src/tokenizer/mod.rs | 15 | ||||
-rw-r--r-- | tests/files/test.html | 4 | ||||
-rw-r--r-- | tests/files/test.out | 38 | ||||
-rw-r--r-- | tests/spans.rs | 10 |
8 files changed, 61 insertions, 28 deletions
@@ -16,7 +16,7 @@ changes: you had to do this yourself. * An optional `spans` feature has been added to make the tokenizer report the - source code spans for tag names, attribute names and attribute values. + source code spans for parser errors, tag names and attributes. The feature is disabled by default. * The API has been polished, e.g. the internal tokenizer state enums are no diff --git a/examples/tokenize.rs b/examples/tokenize.rs index 8728a18..1c9ea6a 100644 --- a/examples/tokenize.rs +++ b/examples/tokenize.rs @@ -66,9 +66,9 @@ impl TokenSink for TokenPrinter { } println!(">"); } - ParseError(err) => { + ParseError { error, .. } => { self.is_char(false); - println!("ERROR: {}", err); + println!("ERROR: {}", error); } _ => { self.is_char(false); diff --git a/src/tokenizer/error.rs b/src/tokenizer/error.rs index 0acc88f..dad3fd2 100644 --- a/src/tokenizer/error.rs +++ b/src/tokenizer/error.rs @@ -6,7 +6,12 @@ use std::fmt::Display; pub enum Error { AttributesOnEndTag, SelfClosingEndTag, - DuplicateAttribute, + DuplicateAttribute { + #[cfg(feature = "spans")] + #[cfg_attr(docsrs, doc(cfg(feature = "spans")))] + /// Span of the duplicate attribute name. + span: std::ops::Range<usize>, + }, BadCharacter(char), UnexpectedCharacter(char, InternalState), UnexpectedEOF(InternalState), @@ -22,7 +27,7 @@ impl Display for Error { match self { Error::AttributesOnEndTag => write!(f, "attributes on an end tag"), Error::SelfClosingEndTag => write!(f, "self-closing end tag"), - Error::DuplicateAttribute => write!(f, "duplicate attribute"), + Error::DuplicateAttribute { .. } => write!(f, "duplicate attribute"), Error::BadCharacter(char) => write!(f, "bad character {:?}", char), Error::UnexpectedCharacter(char, state) => { write!( diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs index 715f9bc..128807e 100644 --- a/src/tokenizer/interface.rs +++ b/src/tokenizer/interface.rs @@ -112,7 +112,12 @@ pub enum Token { CharacterTokens(String), NullCharacterToken, EOFToken, - ParseError(Error), + ParseError { + error: Error, + #[cfg(feature = "spans")] + #[cfg_attr(docsrs, doc(cfg(feature = "spans")))] + span: std::ops::Range<usize>, + }, } #[derive(Debug, PartialEq)] diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 6793eb2..1809275 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -538,7 +538,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }; if dup { - self.emit_error(Error::DuplicateAttribute); + self.emit_error(Error::DuplicateAttribute { + #[cfg(feature = "spans")] + span: self.spans.current_attr_name.clone(), + }); self.current_attr_name.clear(); self.current_attr_value.clear(); } else { @@ -598,7 +601,11 @@ impl<Sink: TokenSink> Tokenizer<Sink> { } fn emit_error(&mut self, error: Error) { - self.process_token_and_continue(ParseError(error)); + self.process_token_and_continue(ParseError { + error, + #[cfg(feature = "spans")] + span: self.spans.current_pos - 1..self.spans.current_pos - 1, + }); } } //§ END @@ -2293,7 +2300,7 @@ mod test { self.current_str.push('\0'); } - token @ ParseError(_) => { + token @ ParseError { .. } => { self.push(token, line_number); } @@ -2453,7 +2460,7 @@ mod test { (3, CharacterTokens(c1)), ( 3, - ParseError(Error::CharRef(CharRefError::InvalidNamedCharRef)), + ParseError{error: Error::CharRef(CharRefError::InvalidNamedCharRef), ..}, ), (4, CharacterTokens(c2)), ] if c1 == "&\n" && c2 == "&aamp;\n" diff --git a/tests/files/test.html b/tests/files/test.html index 0dcbdbf..14493b7 100644 --- a/tests/files/test.html +++ b/tests/files/test.html @@ -5,3 +5,7 @@ Here is a tag: <strong >very cool</strong> Tags can have attributes: <div id = foo >...</div> Attribute values can be quoted: <input name = 'age' type = "number"> + +This is malformed < test + +Characters can be escaped but don't forget the semicolon: ¶ diff --git a/tests/files/test.out b/tests/files/test.out index 7127ebc..f5acb3e 100644 --- a/tests/files/test.out +++ b/tests/files/test.out @@ -1,17 +1,23 @@ note: - ┌─ test.html:3:17 - │ -3 │ Here is a tag: <strong >very cool</strong> - │ ^^^^^^ ^^^^^^ EndTag - │ │ - │ StartTag -4 │ -5 │ Tags can have attributes: <div id = foo >...</div> - │ ^^ ^^^ attribute value - │ │ - │ attribute name -6 │ -7 │ Attribute values can be quoted: <input name = 'age' type = "number"> - │ ^^^ ^^^^^^ in double quotes - │ │ - │ in single quotes + ┌─ test.html:3:17 + │ + 3 │ Here is a tag: <strong >very cool</strong> + │ ^^^^^^ ^^^^^^ EndTag + │ │ + │ StartTag + 4 │ + 5 │ Tags can have attributes: <div id = foo >...</div> + │ ^^ ^^^ attribute value + │ │ + │ attribute name + 6 │ + 7 │ Attribute values can be quoted: <input name = 'age' type = "number"> + │ ^^^ ^^^^^^ in double quotes + │ │ + │ in single quotes + 8 │ + 9 │ This is malformed < test + │ ^ unexpected character: saw ' ' in state TagOpen +10 │ +11 │ Characters can be escaped but don't forget the semicolon: ¶ + │ ^ semicolon missing after character reference diff --git a/tests/spans.rs b/tests/spans.rs index bfa42f6..5615853 100644 --- a/tests/spans.rs +++ b/tests/spans.rs @@ -1,5 +1,5 @@ #![cfg(feature = "spans")] -use std::include_str; +use std::{include_str, ops::Range}; use codespan_reporting::{ self, @@ -8,18 +8,21 @@ use codespan_reporting::{ term::{self, termcolor::Buffer}, }; use html5tokenizer::{ - BufferQueue, Tag, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts, + error::Error, BufferQueue, Tag, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts, }; #[derive(Default)] struct TagSink { tags: Vec<Tag>, + errors: Vec<(Error, Range<usize>)>, } impl TokenSink for TagSink { fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult { if let Token::TagToken(tag) = token { self.tags.push(tag); + } else if let Token::ParseError { error, span } = token { + self.errors.push((error, span)); } TokenSinkResult::Continue } @@ -61,6 +64,9 @@ fn test() { Label::primary(file_id, tags[4].attrs[1].value_span.clone()) .with_message("in double quotes"), ); + for (error, span) in tok.sink.errors { + labels.push(Label::primary(file_id, span).with_message(format!("{}", error))); + } let diagnostic = Diagnostic::note().with_labels(labels); let mut writer = Buffer::no_color(); |