aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Cargo.toml6
-rw-r--r--README.md4
-rw-r--r--src/lib.rs1
-rw-r--r--src/tokenizer/interface.rs27
-rw-r--r--src/tokenizer/mod.rs267
-rw-r--r--tests/files/test.html7
-rw-r--r--tests/files/test.out17
-rw-r--r--tests/spans.rs89
8 files changed, 334 insertions, 84 deletions
diff --git a/Cargo.toml b/Cargo.toml
index 7b270b6..8aa0df0 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,6 +12,7 @@ edition = "2018"
[features]
default = ["named-entities"]
+spans = []
 # resolve named entities like &amp;
named-entities = ["phf", "phf_codegen"]
@@ -23,8 +24,13 @@ phf = { version = "0.9", optional = true }
phf_codegen = { version = "0.9", optional = true }
[dev-dependencies]
+codespan-reporting = "0.11.1"
criterion = "0.3"
[[bench]]
name = "html5ever"
harness = false
+
+[package.metadata.docs.rs]
+all-features = true
+rustdoc-args = ["--cfg", "docsrs"]
diff --git a/README.md b/README.md
index 91265b0..3adff5a 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,10 @@ changes:
tag names (e.g. for `script` and `styles`) ... with the html5ever tokenizer
you had to do this yourself.
+* An optional `spans` feature has been added to make the tokenizer report the
+ source code spans for tag names, attribute names and attribute values.
+ The feature is disabled by default.
+
* The API has been cleaned up a bit (e.g. the internal tokenizer state enums
are no longer public).
diff --git a/src/lib.rs b/src/lib.rs
index 57b7b05..fe0e713 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -10,6 +10,7 @@
#![doc = include_str!("../README.md")]
#![crate_type = "dylib"]
#![cfg_attr(test, deny(warnings))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
#![allow(unused_parens)]
#[macro_use]
diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs
index 2c6cc38..f12fb16 100644
--- a/src/tokenizer/interface.rs
+++ b/src/tokenizer/interface.rs
@@ -9,6 +9,8 @@
use crate::tokenizer::states;
use std::borrow::Cow;
+#[cfg(feature = "spans")]
+use std::ops::Range;
pub use self::TagKind::{EndTag, StartTag};
pub use self::Token::{CharacterTokens, CommentToken, DoctypeToken, TagToken};
@@ -47,12 +49,30 @@ pub enum TagKind {
/// The tokenizer creates all attributes this way, but the tree
/// builder will adjust certain attribute names inside foreign
/// content (MathML, SVG).
-#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug)]
+#[derive(PartialEq, Eq, Clone, Debug)]
pub struct Attribute {
/// The name of the attribute (e.g. the `class` in `<div class="test">`)
pub name: String,
/// The value of the attribute (e.g. the `"test"` in `<div class="test">`)
pub value: String,
+ #[cfg(feature = "spans")]
+ #[cfg_attr(docsrs, doc(cfg(feature = "spans")))]
+ pub name_span: Range<usize>,
+ #[cfg(feature = "spans")]
+ #[cfg_attr(docsrs, doc(cfg(feature = "spans")))]
+ pub value_span: Range<usize>,
+}
+
+impl Ord for Attribute {
+ fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+ (&self.name, &self.value).cmp(&(&other.name, &other.value))
+ }
+}
+
+impl PartialOrd for Attribute {
+ fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+ (&self.name, &self.value).partial_cmp(&(&other.name, &other.value))
+ }
}
/// A tag token.
@@ -62,11 +82,14 @@ pub struct Tag {
pub name: String,
pub self_closing: bool,
pub attrs: Vec<Attribute>,
+ #[cfg(feature = "spans")]
+ #[cfg_attr(docsrs, doc(cfg(feature = "spans")))]
+ pub name_span: Range<usize>,
}
impl Tag {
/// Are the tags equivalent when we don't care about attribute order?
- /// Also ignores the self-closing flag.
+ /// Also ignores the self-closing flag and spans.
pub fn equiv_modulo_attr_order(&self, other: &Tag) -> bool {
if (self.kind != other.kind) || (self.name != other.name) {
return false;
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 2d5e1ed..4511cf8 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -59,7 +59,8 @@ fn option_push(opt_str: &mut Option<String>, c: char) {
#[derive(Clone)]
pub struct TokenizerOpts {
/// Report all parse errors described in the spec, at some
- /// performance penalty? Default: false
+ /// performance penalty? Defaults to false, except when the
+ /// `spans` feature is enabled in which case it defaults to true.
pub exact_errors: bool,
/// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
@@ -82,7 +83,7 @@ pub struct TokenizerOpts {
impl Default for TokenizerOpts {
fn default() -> TokenizerOpts {
TokenizerOpts {
- exact_errors: false,
+ exact_errors: cfg!(feature = "spans"),
discard_bom: true,
profile: false,
initial_state: None,
@@ -162,6 +163,36 @@ pub struct Tokenizer<Sink> {
/// Track current line
current_line: u64,
+
+ #[cfg(feature = "spans")]
+ spans: Spans,
+}
+
+#[cfg(feature = "spans")]
+#[derive(Default)]
+struct Spans {
+ /// Track current byte position
+ current_pos: usize,
+
+ /// Current tag name span.
+ current_tag_name: core::ops::Range<usize>,
+
+ /// Current attribute name span.
+ current_attr_name: core::ops::Range<usize>,
+
+ /// Current attribute value span.
+ current_attr_value: core::ops::Range<usize>,
+}
+
+#[cfg(feature = "spans")]
+impl Spans {
+ fn end_tag_name(&mut self) {
+ self.current_tag_name.end = self.current_pos - 1;
+ }
+
+ fn end_attr_name(&mut self) {
+ self.current_attr_name.end = self.current_pos - 1;
+ }
}
impl<Sink: TokenSink> Tokenizer<Sink> {
@@ -193,6 +224,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
state_profile: BTreeMap::new(),
time_in_sink: 0,
current_line: 1,
+ #[cfg(feature = "spans")]
+ spans: Spans::default(),
}
}
@@ -263,6 +296,11 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
self.emit_error(Cow::Owned(msg));
}
+ #[cfg(feature = "spans")]
+ {
+ self.spans.current_pos += c.len_utf8();
+ }
+
self.current_char = c;
Some(c)
}
@@ -430,6 +468,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
name,
self_closing: self.current_tag_self_closing,
attrs: replace(&mut self.current_tag_attrs, vec![]),
+ #[cfg(feature = "spans")]
+ name_span: self.spans.current_tag_name.clone(),
});
match self.process_token(token) {
@@ -516,6 +556,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
self.current_tag_attrs.push(Attribute {
name: name,
value: replace(&mut self.current_attr_value, String::new()),
+ #[cfg(feature = "spans")]
+ name_span: self.spans.current_attr_name.clone(),
+ #[cfg(feature = "spans")]
+ value_span: self.spans.current_attr_value.clone(),
});
}
}
@@ -830,6 +874,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
}
c => match lower_ascii_letter(c) {
Some(cl) => {
+ #[cfg(feature = "spans")]
+ {
+ self.spans.current_tag_name.start = self.spans.current_pos - 1;
+ }
go!(self: create_tag StartTag cl);
return go!(self: to TagName);
}
@@ -858,6 +906,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
}
c => match lower_ascii_letter(c) {
Some(cl) => {
+ #[cfg(feature = "spans")]
+ {
+ self.spans.current_tag_name.start = self.spans.current_pos - 1;
+ }
go!(self: create_tag EndTag cl);
return go!(self: to TagName);
}
@@ -875,12 +927,18 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
states::TagName => loop {
match get_char!(self, input)? {
'\t' | '\n' | '\x0C' | ' ' => {
+ #[cfg(feature = "spans")]
+ self.spans.end_tag_name();
return go!(self: to BeforeAttributeName);
}
'/' => {
+ #[cfg(feature = "spans")]
+ self.spans.end_tag_name();
return go!(self: to SelfClosingStartTag);
}
'>' => {
+ #[cfg(feature = "spans")]
+ self.spans.end_tag_name();
return go!(self: emit_tag Data);
}
'\0' => {
@@ -1168,6 +1226,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
c => match lower_ascii_letter(c) {
Some(cl) => {
go!(self: create_attr cl);
+ #[cfg(feature = "spans")]
+ {
+ self.spans.current_attr_name.start = self.spans.current_pos - 1;
+ }
return go!(self: to AttributeName);
}
None => {
@@ -1186,15 +1248,23 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
states::AttributeName => loop {
match get_char!(self, input)? {
'\t' | '\n' | '\x0C' | ' ' => {
+ #[cfg(feature = "spans")]
+ self.spans.end_attr_name();
return go!(self: to AfterAttributeName);
}
'/' => {
+ #[cfg(feature = "spans")]
+ self.spans.end_attr_name();
return go!(self: to SelfClosingStartTag);
}
'=' => {
+ #[cfg(feature = "spans")]
+ self.spans.end_attr_name();
return go!(self: to BeforeAttributeValue);
}
'>' => {
+ #[cfg(feature = "spans")]
+ self.spans.end_attr_name();
return go!(self: emit_tag Data);
}
'\0' => {
@@ -1285,101 +1355,134 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
},
//§ attribute-value-(double-quoted)-state
- states::AttributeValue(DoubleQuoted) => loop {
- match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n'))? {
- FromSet('"') => {
- return go!(self: to AfterAttributeValueQuoted);
- }
- FromSet('&') => {
- return go!(self: consume_char_ref '"');
- }
- FromSet('\0') => {
- go!(self: error);
- go!(self: push_value '\u{fffd}');
- }
- FromSet(c) => {
- go!(self: push_value c);
- }
- NotFromSet(ref b) => {
- go!(self: append_value b);
+ states::AttributeValue(DoubleQuoted) => {
+ #[cfg(feature = "spans")]
+ {
+ self.spans.current_attr_value.start = self.spans.current_pos;
+ }
+ loop {
+ match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n'))? {
+ FromSet('"') => {
+ return go!(self: to AfterAttributeValueQuoted);
+ }
+ FromSet('&') => {
+ return go!(self: consume_char_ref '"');
+ }
+ FromSet('\0') => {
+ go!(self: error);
+ go!(self: push_value '\u{fffd}');
+ }
+ FromSet(c) => {
+ go!(self: push_value c);
+ }
+ NotFromSet(ref b) => {
+ go!(self: append_value b);
+ }
}
}
- },
+ }
//§ attribute-value-(single-quoted)-state
- states::AttributeValue(SingleQuoted) => loop {
- match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n'))? {
- FromSet('\'') => {
- return go!(self: to AfterAttributeValueQuoted);
- }
- FromSet('&') => {
- return go!(self: consume_char_ref '\'');
- }
- FromSet('\0') => {
- go!(self: error);
- go!(self: push_value '\u{fffd}');
- }
- FromSet(c) => {
- go!(self: push_value c);
- }
- NotFromSet(ref b) => {
- go!(self: append_value b);
+ states::AttributeValue(SingleQuoted) => {
+ #[cfg(feature = "spans")]
+ {
+ self.spans.current_attr_value.start = self.spans.current_pos;
+ }
+ loop {
+ match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n'))? {
+ FromSet('\'') => {
+ return go!(self: to AfterAttributeValueQuoted);
+ }
+ FromSet('&') => {
+ return go!(self: consume_char_ref '\'');
+ }
+ FromSet('\0') => {
+ go!(self: error);
+ go!(self: push_value '\u{fffd}');
+ }
+ FromSet(c) => {
+ go!(self: push_value c);
+ }
+ NotFromSet(ref b) => {
+ go!(self: append_value b);
+ }
}
}
- },
+ }
//§ attribute-value-(unquoted)-state
- states::AttributeValue(Unquoted) => loop {
- match pop_except_from!(
- self,
- input,
- small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
- )? {
- FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
- return go!(self: to BeforeAttributeName);
- }
- FromSet('&') => {
- return go!(self: consume_char_ref '>');
- }
- FromSet('>') => {
- return go!(self: emit_tag Data);
- }
- FromSet('\0') => {
- go!(self: error);
- go!(self: push_value '\u{fffd}');
- }
- FromSet(c) => {
- go_match!(self: c,
+ states::AttributeValue(Unquoted) => {
+ #[cfg(feature = "spans")]
+ {
+ self.spans.current_attr_value.start = self.spans.current_pos;
+ }
+ loop {
+ match pop_except_from!(
+ self,
+ input,
+ small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
+ )? {
+ FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
+ #[cfg(feature = "spans")]
+ {
+ self.spans.current_attr_value.end = self.spans.current_pos - 1;
+ }
+ return go!(self: to BeforeAttributeName);
+ }
+ FromSet('&') => {
+ return go!(self: consume_char_ref '>');
+ }
+ FromSet('>') => {
+ #[cfg(feature = "spans")]
+ {
+ self.spans.current_attr_value.end = self.spans.current_pos - 1;
+ }
+ return go!(self: emit_tag Data);
+ }
+ FromSet('\0') => {
+ go!(self: error);
+ go!(self: push_value '\u{fffd}');
+ }
+ FromSet(c) => {
+ go_match!(self: c,
'"' , '\'' , '<' , '=' , '`' => error);
- {
- go!(self: push_value c);
- };
- }
- NotFromSet(ref b) => {
- go!(self: append_value b);
+ {
+ go!(self: push_value c);
+ };
+ }
+ NotFromSet(ref b) => {
+ go!(self: append_value b);
+ }
}
}
- },
+ }
//§ after-attribute-value-(quoted)-state
- states::AfterAttributeValueQuoted => loop {
- match get_char!(self, input)? {
- '\t' | '\n' | '\x0C' | ' ' => {
- return go!(self: to BeforeAttributeName);
- }
- '/' => {
- return go!(self: to SelfClosingStartTag);
- }
- '>' => {
- return go!(self: emit_tag Data);
- }
- _ => {
- go!(self: error);
- self.reconsume = true;
- return go!(self: to BeforeAttributeName);
+ states::AfterAttributeValueQuoted => {
+ #[cfg(feature = "spans")]
+ {
+ self.spans.current_attr_value.end = self.spans.current_pos - 1;
+ }
+
+ loop {
+ match get_char!(self, input)? {
+ '\t' | '\n' | '\x0C' | ' ' => {
+ return go!(self: to BeforeAttributeName);
+ }
+ '/' => {
+ return go!(self: to SelfClosingStartTag);
+ }
+ '>' => {
+ return go!(self: emit_tag Data);
+ }
+ _ => {
+ go!(self: error);
+ self.reconsume = true;
+ return go!(self: to BeforeAttributeName);
+ }
}
}
- },
+ }
//§ self-closing-start-tag-state
states::SelfClosingStartTag => loop {
diff --git a/tests/files/test.html b/tests/files/test.html
new file mode 100644
index 0000000..0dcbdbf
--- /dev/null
+++ b/tests/files/test.html
@@ -0,0 +1,7 @@
+This is a file.
+
+Here is a tag: <strong >very cool</strong>
+
+Tags can have attributes: <div id = foo >...</div>
+
+Attribute values can be quoted: <input name = 'age' type = "number">
diff --git a/tests/files/test.out b/tests/files/test.out
new file mode 100644
index 0000000..7127ebc
--- /dev/null
+++ b/tests/files/test.out
@@ -0,0 +1,17 @@
+note:
+ ┌─ test.html:3:17
+ │
+3 │ Here is a tag: <strong >very cool</strong>
+ │ ^^^^^^ ^^^^^^ EndTag
+ │ │
+ │ StartTag
+4 │
+5 │ Tags can have attributes: <div id = foo >...</div>
+ │ ^^ ^^^ attribute value
+ │ │
+ │ attribute name
+6 │
+7 │ Attribute values can be quoted: <input name = 'age' type = "number">
+ │ ^^^ ^^^^^^ in double quotes
+ │ │
+ │ in single quotes
diff --git a/tests/spans.rs b/tests/spans.rs
new file mode 100644
index 0000000..bfa42f6
--- /dev/null
+++ b/tests/spans.rs
@@ -0,0 +1,89 @@
+#![cfg(feature = "spans")]
+use std::include_str;
+
+use codespan_reporting::{
+ self,
+ diagnostic::{Diagnostic, Label},
+ files::SimpleFiles,
+ term::{self, termcolor::Buffer},
+};
+use html5tokenizer::{
+ BufferQueue, Tag, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
+};
+
+#[derive(Default)]
+struct TagSink {
+ tags: Vec<Tag>,
+}
+
+impl TokenSink for TagSink {
+ fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult {
+ if let Token::TagToken(tag) = token {
+ self.tags.push(tag);
+ }
+ TokenSinkResult::Continue
+ }
+}
+
+#[test]
+fn test() {
+ let sink = TagSink::default();
+
+ let mut input = BufferQueue::new();
+ let text = include_str!("files/test.html");
+ input.push_back(text.to_string());
+
+ let mut tok = Tokenizer::new(sink, TokenizerOpts::default());
+ let _ = tok.feed(&mut input);
+
+ let mut files = SimpleFiles::new();
+ let file_id = files.add("test.html", text);
+ let mut labels = Vec::new();
+
+ let tags = tok.sink.tags;
+ for tag in &tags[..2] {
+ labels.push(
+ Label::primary(file_id, tag.name_span.clone()).with_message(format!("{:?}", tag.kind)),
+ );
+ }
+ labels.push(
+ Label::primary(file_id, tags[2].attrs[0].name_span.clone()).with_message("attribute name"),
+ );
+ labels.push(
+ Label::primary(file_id, tags[2].attrs[0].value_span.clone())
+ .with_message("attribute value"),
+ );
+ labels.push(
+ Label::primary(file_id, tags[4].attrs[0].value_span.clone())
+ .with_message("in single quotes"),
+ );
+ labels.push(
+ Label::primary(file_id, tags[4].attrs[1].value_span.clone())
+ .with_message("in double quotes"),
+ );
+ let diagnostic = Diagnostic::note().with_labels(labels);
+
+ let mut writer = Buffer::no_color();
+ let config = codespan_reporting::term::Config::default();
+ term::emit(&mut writer, &config, &files, &diagnostic).unwrap();
+
+ let actual = remove_trailing_spaces(std::str::from_utf8(writer.as_slice()).unwrap());
+ let expected = include_str!("files/test.out");
+
+ if actual != expected {
+ println!(
+ "EXPECTED:\n{banner}\n{expected}{banner}\n\nACTUAL OUTPUT:\n{banner}\n{actual}{banner}",
+ banner = "-".repeat(30),
+ expected = expected,
+ actual = actual
+ );
+ panic!("failed");
+ }
+}
+
+fn remove_trailing_spaces(text: &str) -> String {
+ text.lines()
+ .map(|l| l.trim_end())
+ .collect::<Vec<_>>()
+ .join("\n")
+}