author    Martin Fischer <martin@push-f.com>    2021-11-30 10:56:59 +0100
committer Martin Fischer <martin@push-f.com>    2021-11-30 11:22:35 +0100
commit    915530c02029f8bd4444930ed949e14f09afab03 (patch)
tree      6f58b9728386dc5c1709137bc0a250640a7ce572
parent    414e5838618123cb00216a7426b898aab88ee45a (diff)
report spans for errors
-rw-r--r--  README.md                    2
-rw-r--r--  examples/tokenize.rs         4
-rw-r--r--  src/tokenizer/error.rs       9
-rw-r--r--  src/tokenizer/interface.rs   7
-rw-r--r--  src/tokenizer/mod.rs        15
-rw-r--r--  tests/files/test.html        4
-rw-r--r--  tests/files/test.out        38
-rw-r--r--  tests/spans.rs              10
8 files changed, 61 insertions, 28 deletions
diff --git a/README.md b/README.md
index 95a0a05..0846542 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ changes:
you had to do this yourself.
* An optional `spans` feature has been added to make the tokenizer report the
- source code spans for tag names, attribute names and attribute values.
+ source code spans for parser errors, tag names and attributes.
The feature is disabled by default.
* The API has been polished, e.g. the internal tokenizer state enums are no
diff --git a/examples/tokenize.rs b/examples/tokenize.rs
index 8728a18..1c9ea6a 100644
--- a/examples/tokenize.rs
+++ b/examples/tokenize.rs
@@ -66,9 +66,9 @@ impl TokenSink for TokenPrinter {
}
println!(">");
}
- ParseError(err) => {
+ ParseError { error, .. } => {
self.is_char(false);
- println!("ERROR: {}", err);
+ println!("ERROR: {}", error);
}
_ => {
self.is_char(false);
diff --git a/src/tokenizer/error.rs b/src/tokenizer/error.rs
index 0acc88f..dad3fd2 100644
--- a/src/tokenizer/error.rs
+++ b/src/tokenizer/error.rs
@@ -6,7 +6,12 @@ use std::fmt::Display;
pub enum Error {
AttributesOnEndTag,
SelfClosingEndTag,
- DuplicateAttribute,
+ DuplicateAttribute {
+ #[cfg(feature = "spans")]
+ #[cfg_attr(docsrs, doc(cfg(feature = "spans")))]
+ /// Span of the duplicate attribute name.
+ span: std::ops::Range<usize>,
+ },
BadCharacter(char),
UnexpectedCharacter(char, InternalState),
UnexpectedEOF(InternalState),
@@ -22,7 +27,7 @@ impl Display for Error {
match self {
Error::AttributesOnEndTag => write!(f, "attributes on an end tag"),
Error::SelfClosingEndTag => write!(f, "self-closing end tag"),
- Error::DuplicateAttribute => write!(f, "duplicate attribute"),
+ Error::DuplicateAttribute { .. } => write!(f, "duplicate attribute"),
Error::BadCharacter(char) => write!(f, "bad character {:?}", char),
Error::UnexpectedCharacter(char, state) => {
write!(
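Since `DuplicateAttribute` is now a struct variant carrying the span of the offending attribute name, downstream matches on the old unit variant need a small adjustment. A hedged sketch, assuming the `spans` feature is enabled (with the feature disabled the field is absent and the variant has to be matched as `DuplicateAttribute { .. }`); the helper function is illustrative, not part of the crate:

    use html5tokenizer::error::Error;

    // Illustrative helper: point at the duplicate attribute name by byte range.
    fn describe(error: &Error) -> String {
        match error {
            Error::DuplicateAttribute { span } => {
                format!("duplicate attribute at bytes {}..{}", span.start, span.end)
            }
            // Every other variant already has a reasonable Display impl.
            other => format!("{}", other),
        }
    }
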
diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs
index 715f9bc..128807e 100644
--- a/src/tokenizer/interface.rs
+++ b/src/tokenizer/interface.rs
@@ -112,7 +112,12 @@ pub enum Token {
CharacterTokens(String),
NullCharacterToken,
EOFToken,
- ParseError(Error),
+ ParseError {
+ error: Error,
+ #[cfg(feature = "spans")]
+ #[cfg_attr(docsrs, doc(cfg(feature = "spans")))]
+ span: std::ops::Range<usize>,
+ },
}
#[derive(Debug, PartialEq)]
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 6793eb2..1809275 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -538,7 +538,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
};
if dup {
- self.emit_error(Error::DuplicateAttribute);
+ self.emit_error(Error::DuplicateAttribute {
+ #[cfg(feature = "spans")]
+ span: self.spans.current_attr_name.clone(),
+ });
self.current_attr_name.clear();
self.current_attr_value.clear();
} else {
@@ -598,7 +601,11 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
}
fn emit_error(&mut self, error: Error) {
- self.process_token_and_continue(ParseError(error));
+ self.process_token_and_continue(ParseError {
+ error,
+ #[cfg(feature = "spans")]
+ span: self.spans.current_pos - 1..self.spans.current_pos - 1,
+ });
}
}
//§ END
@@ -2293,7 +2300,7 @@ mod test {
self.current_str.push('\0');
}
- token @ ParseError(_) => {
+ token @ ParseError { .. } => {
self.push(token, line_number);
}
@@ -2453,7 +2460,7 @@ mod test {
(3, CharacterTokens(c1)),
(
3,
- ParseError(Error::CharRef(CharRefError::InvalidNamedCharRef)),
+ ParseError{error: Error::CharRef(CharRefError::InvalidNamedCharRef), ..},
),
(4, CharacterTokens(c2)),
] if c1 == "&\n" && c2 == "&aamp;\n"
diff --git a/tests/files/test.html b/tests/files/test.html
index 0dcbdbf..14493b7 100644
--- a/tests/files/test.html
+++ b/tests/files/test.html
@@ -5,3 +5,7 @@ Here is a tag: <strong >very cool</strong>
Tags can have attributes: <div id = foo >...</div>
Attribute values can be quoted: <input name = 'age' type = "number">
+
+This is malformed < test
+
+Characters can be escaped but don't forget the semicolon: &#182
diff --git a/tests/files/test.out b/tests/files/test.out
index 7127ebc..f5acb3e 100644
--- a/tests/files/test.out
+++ b/tests/files/test.out
@@ -1,17 +1,23 @@
note:
- ┌─ test.html:3:17
- │
-3 │ Here is a tag: <strong >very cool</strong>
- │ ^^^^^^ ^^^^^^ EndTag
- │ │
- │ StartTag
-4 │
-5 │ Tags can have attributes: <div id = foo >...</div>
- │ ^^ ^^^ attribute value
- │ │
- │ attribute name
-6 │
-7 │ Attribute values can be quoted: <input name = 'age' type = "number">
- │ ^^^ ^^^^^^ in double quotes
- │ │
- │ in single quotes
+ ┌─ test.html:3:17
+ │
+ 3 │ Here is a tag: <strong >very cool</strong>
+ │ ^^^^^^ ^^^^^^ EndTag
+ │ │
+ │ StartTag
+ 4 │
+ 5 │ Tags can have attributes: <div id = foo >...</div>
+ │ ^^ ^^^ attribute value
+ │ │
+ │ attribute name
+ 6 │
+ 7 │ Attribute values can be quoted: <input name = 'age' type = "number">
+ │ ^^^ ^^^^^^ in double quotes
+ │ │
+ │ in single quotes
+ 8 │
+ 9 │ This is malformed < test
+ │ ^ unexpected character: saw ' ' in state TagOpen
+10 │
+11 │ Characters can be escaped but don't forget the semicolon: &#182
+ │ ^ semicolon missing after character reference
diff --git a/tests/spans.rs b/tests/spans.rs
index bfa42f6..5615853 100644
--- a/tests/spans.rs
+++ b/tests/spans.rs
@@ -1,5 +1,5 @@
#![cfg(feature = "spans")]
-use std::include_str;
+use std::{include_str, ops::Range};
use codespan_reporting::{
self,
@@ -8,18 +8,21 @@ use codespan_reporting::{
term::{self, termcolor::Buffer},
};
use html5tokenizer::{
- BufferQueue, Tag, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
+ error::Error, BufferQueue, Tag, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
};
#[derive(Default)]
struct TagSink {
tags: Vec<Tag>,
+ errors: Vec<(Error, Range<usize>)>,
}
impl TokenSink for TagSink {
fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult {
if let Token::TagToken(tag) = token {
self.tags.push(tag);
+ } else if let Token::ParseError { error, span } = token {
+ self.errors.push((error, span));
}
TokenSinkResult::Continue
}
@@ -61,6 +64,9 @@ fn test() {
Label::primary(file_id, tags[4].attrs[1].value_span.clone())
.with_message("in double quotes"),
);
+ for (error, span) in tok.sink.errors {
+ labels.push(Label::primary(file_id, span).with_message(format!("{}", error)));
+ }
let diagnostic = Diagnostic::note().with_labels(labels);
let mut writer = Buffer::no_color();
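Putting the pieces of tests/spans.rs together: the collected `(Error, Range<usize>)` pairs can be rendered into output like tests/files/test.out via codespan-reporting. A rough end-to-end sketch (function and variable names are illustrative; constructing and feeding the tokenizer is unchanged by this commit and omitted here):

    use codespan_reporting::diagnostic::{Diagnostic, Label};
    use codespan_reporting::files::SimpleFiles;
    use codespan_reporting::term::{self, termcolor::Buffer};
    use html5tokenizer::error::Error;
    use std::ops::Range;

    // `errors` is the Vec of (error, span) pairs collected by the sink.
    fn render(source: &str, errors: Vec<(Error, Range<usize>)>) -> String {
        let mut files = SimpleFiles::new();
        let file_id = files.add("test.html", source);

        let labels: Vec<_> = errors
            .into_iter()
            .map(|(error, span)| Label::primary(file_id, span).with_message(error.to_string()))
            .collect();

        let diagnostic = Diagnostic::note().with_labels(labels);
        let mut writer = Buffer::no_color();
        term::emit(&mut writer, &term::Config::default(), &files, &diagnostic).unwrap();
        String::from_utf8_lossy(writer.as_slice()).into_owned()
    }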