diff options
author | Martin Fischer <martin@push-f.com> | 2021-11-30 07:28:21 +0100 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2021-11-30 11:22:35 +0100 |
commit | baf1477c587fe22d27e94408cf2505d588ba007e (patch) | |
tree | f3e027e3c149cfeb7187a625756ea4b2de47c82a | |
parent | 25087cce997abc386f881648dfd39c83dfef7667 (diff) |
add spans feature
-rw-r--r-- | Cargo.toml | 6 | ||||
-rw-r--r-- | README.md | 4 | ||||
-rw-r--r-- | src/lib.rs | 1 | ||||
-rw-r--r-- | src/tokenizer/interface.rs | 27 | ||||
-rw-r--r-- | src/tokenizer/mod.rs | 267 | ||||
-rw-r--r-- | tests/files/test.html | 7 | ||||
-rw-r--r-- | tests/files/test.out | 17 | ||||
-rw-r--r-- | tests/spans.rs | 89 |
8 files changed, 334 insertions, 84 deletions
@@ -12,6 +12,7 @@ edition = "2018" [features] default = ["named-entities"] +spans = [] # resolve named entities like &amp; named-entities = ["phf", "phf_codegen"] @@ -23,8 +24,13 @@ phf = { version = "0.9", optional = true } phf_codegen = { version = "0.9", optional = true } [dev-dependencies] +codespan-reporting = "0.11.1" criterion = "0.3" [[bench]] name = "html5ever" harness = false + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] @@ -15,6 +15,10 @@ changes: tag names (e.g. for `script` and `styles`) ... with the html5ever tokenizer you had to do this yourself. +* An optional `spans` feature has been added to make the tokenizer report the + source code spans for tag names, attribute names and attribute values. + The feature is disabled by default. + * The API has been cleaned up a bit (e.g. the internal tokenizer state enums are no longer public). @@ -10,6 +10,7 @@ #![doc = include_str!("../README.md")] #![crate_type = "dylib"] #![cfg_attr(test, deny(warnings))] +#![cfg_attr(docsrs, feature(doc_cfg))] #![allow(unused_parens)] #[macro_use] diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs index 2c6cc38..f12fb16 100644 --- a/src/tokenizer/interface.rs +++ b/src/tokenizer/interface.rs @@ -9,6 +9,8 @@ use crate::tokenizer::states; use std::borrow::Cow; +#[cfg(feature = "spans")] +use std::ops::Range; pub use self::TagKind::{EndTag, StartTag}; pub use self::Token::{CharacterTokens, CommentToken, DoctypeToken, TagToken}; @@ -47,12 +49,30 @@ pub enum TagKind { /// The tokenizer creates all attributes this way, but the tree /// builder will adjust certain attribute names inside foreign /// content (MathML, SVG). -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug)] +#[derive(PartialEq, Eq, Clone, Debug)] pub struct Attribute { /// The name of the attribute (e.g. the `class` in `<div class="test">`) pub name: String, /// The value of the attribute (e.g. 
the `"test"` in `<div class="test">`) pub value: String, + #[cfg(feature = "spans")] + #[cfg_attr(docsrs, doc(cfg(feature = "spans")))] + pub name_span: Range<usize>, + #[cfg(feature = "spans")] + #[cfg_attr(docsrs, doc(cfg(feature = "spans")))] + pub value_span: Range<usize>, +} + +impl Ord for Attribute { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + (&self.name, &self.value).cmp(&(&other.name, &other.value)) + } +} + +impl PartialOrd for Attribute { + fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { + (&self.name, &self.value).partial_cmp(&(&other.name, &other.value)) + } } /// A tag token. @@ -62,11 +82,14 @@ pub struct Tag { pub name: String, pub self_closing: bool, pub attrs: Vec<Attribute>, + #[cfg(feature = "spans")] + #[cfg_attr(docsrs, doc(cfg(feature = "spans")))] + pub name_span: Range<usize>, } impl Tag { /// Are the tags equivalent when we don't care about attribute order? - /// Also ignores the self-closing flag. + /// Also ignores the self-closing flag and spans. pub fn equiv_modulo_attr_order(&self, other: &Tag) -> bool { if (self.kind != other.kind) || (self.name != other.name) { return false; diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 2d5e1ed..4511cf8 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -59,7 +59,8 @@ fn option_push(opt_str: &mut Option<String>, c: char) { #[derive(Clone)] pub struct TokenizerOpts { /// Report all parse errors described in the spec, at some - /// performance penalty? Default: false + /// performance penalty? Defaults to false, except when the + /// `spans` feature is enabled in which case it defaults to true. 
pub exact_errors: bool, /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning @@ -82,7 +83,7 @@ pub struct TokenizerOpts { impl Default for TokenizerOpts { fn default() -> TokenizerOpts { TokenizerOpts { - exact_errors: false, + exact_errors: cfg!(feature = "spans"), discard_bom: true, profile: false, initial_state: None, @@ -162,6 +163,36 @@ pub struct Tokenizer<Sink> { /// Track current line current_line: u64, + + #[cfg(feature = "spans")] + spans: Spans, +} + +#[cfg(feature = "spans")] +#[derive(Default)] +struct Spans { + /// Track current byte position + current_pos: usize, + + /// Current tag name span. + current_tag_name: core::ops::Range<usize>, + + /// Current attribute name span. + current_attr_name: core::ops::Range<usize>, + + /// Current attribute value span. + current_attr_value: core::ops::Range<usize>, +} + +#[cfg(feature = "spans")] +impl Spans { + fn end_tag_name(&mut self) { + self.current_tag_name.end = self.current_pos - 1; + } + + fn end_attr_name(&mut self) { + self.current_attr_name.end = self.current_pos - 1; + } } impl<Sink: TokenSink> Tokenizer<Sink> { @@ -193,6 +224,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> { state_profile: BTreeMap::new(), time_in_sink: 0, current_line: 1, + #[cfg(feature = "spans")] + spans: Spans::default(), } } @@ -263,6 +296,11 @@ impl<Sink: TokenSink> Tokenizer<Sink> { self.emit_error(Cow::Owned(msg)); } + #[cfg(feature = "spans")] + { + self.spans.current_pos += c.len_utf8(); + } + self.current_char = c; Some(c) } @@ -430,6 +468,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> { name, self_closing: self.current_tag_self_closing, attrs: replace(&mut self.current_tag_attrs, vec![]), + #[cfg(feature = "spans")] + name_span: self.spans.current_tag_name.clone(), }); match self.process_token(token) { @@ -516,6 +556,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> { self.current_tag_attrs.push(Attribute { name: name, value: replace(&mut self.current_attr_value, String::new()), + #[cfg(feature = "spans")] + 
name_span: self.spans.current_attr_name.clone(), + #[cfg(feature = "spans")] + value_span: self.spans.current_attr_value.clone(), }); } } @@ -830,6 +874,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> { } c => match lower_ascii_letter(c) { Some(cl) => { + #[cfg(feature = "spans")] + { + self.spans.current_tag_name.start = self.spans.current_pos - 1; + } go!(self: create_tag StartTag cl); return go!(self: to TagName); } @@ -858,6 +906,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> { } c => match lower_ascii_letter(c) { Some(cl) => { + #[cfg(feature = "spans")] + { + self.spans.current_tag_name.start = self.spans.current_pos - 1; + } go!(self: create_tag EndTag cl); return go!(self: to TagName); } @@ -875,12 +927,18 @@ impl<Sink: TokenSink> Tokenizer<Sink> { states::TagName => loop { match get_char!(self, input)? { '\t' | '\n' | '\x0C' | ' ' => { + #[cfg(feature = "spans")] + self.spans.end_tag_name(); return go!(self: to BeforeAttributeName); } '/' => { + #[cfg(feature = "spans")] + self.spans.end_tag_name(); return go!(self: to SelfClosingStartTag); } '>' => { + #[cfg(feature = "spans")] + self.spans.end_tag_name(); return go!(self: emit_tag Data); } '\0' => { @@ -1168,6 +1226,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> { c => match lower_ascii_letter(c) { Some(cl) => { go!(self: create_attr cl); + #[cfg(feature = "spans")] + { + self.spans.current_attr_name.start = self.spans.current_pos - 1; + } return go!(self: to AttributeName); } None => { @@ -1186,15 +1248,23 @@ impl<Sink: TokenSink> Tokenizer<Sink> { states::AttributeName => loop { match get_char!(self, input)? 
{ '\t' | '\n' | '\x0C' | ' ' => { + #[cfg(feature = "spans")] + self.spans.end_attr_name(); return go!(self: to AfterAttributeName); } '/' => { + #[cfg(feature = "spans")] + self.spans.end_attr_name(); return go!(self: to SelfClosingStartTag); } '=' => { + #[cfg(feature = "spans")] + self.spans.end_attr_name(); return go!(self: to BeforeAttributeValue); } '>' => { + #[cfg(feature = "spans")] + self.spans.end_attr_name(); return go!(self: emit_tag Data); } '\0' => { @@ -1285,101 +1355,134 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }, //§ attribute-value-(double-quoted)-state - states::AttributeValue(DoubleQuoted) => loop { - match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n'))? { - FromSet('"') => { - return go!(self: to AfterAttributeValueQuoted); - } - FromSet('&') => { - return go!(self: consume_char_ref '"'); - } - FromSet('\0') => { - go!(self: error); - go!(self: push_value '\u{fffd}'); - } - FromSet(c) => { - go!(self: push_value c); - } - NotFromSet(ref b) => { - go!(self: append_value b); + states::AttributeValue(DoubleQuoted) => { + #[cfg(feature = "spans")] + { + self.spans.current_attr_value.start = self.spans.current_pos; + } + loop { + match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n'))? { + FromSet('"') => { + return go!(self: to AfterAttributeValueQuoted); + } + FromSet('&') => { + return go!(self: consume_char_ref '"'); + } + FromSet('\0') => { + go!(self: error); + go!(self: push_value '\u{fffd}'); + } + FromSet(c) => { + go!(self: push_value c); + } + NotFromSet(ref b) => { + go!(self: append_value b); + } } } - }, + } //§ attribute-value-(single-quoted)-state - states::AttributeValue(SingleQuoted) => loop { - match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n'))? 
{ - FromSet('\'') => { - return go!(self: to AfterAttributeValueQuoted); - } - FromSet('&') => { - return go!(self: consume_char_ref '\''); - } - FromSet('\0') => { - go!(self: error); - go!(self: push_value '\u{fffd}'); - } - FromSet(c) => { - go!(self: push_value c); - } - NotFromSet(ref b) => { - go!(self: append_value b); + states::AttributeValue(SingleQuoted) => { + #[cfg(feature = "spans")] + { + self.spans.current_attr_value.start = self.spans.current_pos; + } + loop { + match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n'))? { + FromSet('\'') => { + return go!(self: to AfterAttributeValueQuoted); + } + FromSet('&') => { + return go!(self: consume_char_ref '\''); + } + FromSet('\0') => { + go!(self: error); + go!(self: push_value '\u{fffd}'); + } + FromSet(c) => { + go!(self: push_value c); + } + NotFromSet(ref b) => { + go!(self: append_value b); + } } } - }, + } //§ attribute-value-(unquoted)-state - states::AttributeValue(Unquoted) => loop { - match pop_except_from!( - self, - input, - small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0') - )? { - FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => { - return go!(self: to BeforeAttributeName); - } - FromSet('&') => { - return go!(self: consume_char_ref '>'); - } - FromSet('>') => { - return go!(self: emit_tag Data); - } - FromSet('\0') => { - go!(self: error); - go!(self: push_value '\u{fffd}'); - } - FromSet(c) => { - go_match!(self: c, + states::AttributeValue(Unquoted) => { + #[cfg(feature = "spans")] + { + self.spans.current_attr_value.start = self.spans.current_pos; + } + loop { + match pop_except_from!( + self, + input, + small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0') + )? 
{ + FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => { + #[cfg(feature = "spans")] + { + self.spans.current_attr_value.end = self.spans.current_pos - 1; + } + return go!(self: to BeforeAttributeName); + } + FromSet('&') => { + return go!(self: consume_char_ref '>'); + } + FromSet('>') => { + #[cfg(feature = "spans")] + { + self.spans.current_attr_value.end = self.spans.current_pos - 1; + } + return go!(self: emit_tag Data); + } + FromSet('\0') => { + go!(self: error); + go!(self: push_value '\u{fffd}'); + } + FromSet(c) => { + go_match!(self: c, '"' , '\'' , '<' , '=' , '`' => error); - { - go!(self: push_value c); - }; - } - NotFromSet(ref b) => { - go!(self: append_value b); + { + go!(self: push_value c); + }; + } + NotFromSet(ref b) => { + go!(self: append_value b); + } } } - }, + } //§ after-attribute-value-(quoted)-state - states::AfterAttributeValueQuoted => loop { - match get_char!(self, input)? { - '\t' | '\n' | '\x0C' | ' ' => { - return go!(self: to BeforeAttributeName); - } - '/' => { - return go!(self: to SelfClosingStartTag); - } - '>' => { - return go!(self: emit_tag Data); - } - _ => { - go!(self: error); - self.reconsume = true; - return go!(self: to BeforeAttributeName); + states::AfterAttributeValueQuoted => { + #[cfg(feature = "spans")] + { + self.spans.current_attr_value.end = self.spans.current_pos - 1; + } + + loop { + match get_char!(self, input)? { + '\t' | '\n' | '\x0C' | ' ' => { + return go!(self: to BeforeAttributeName); + } + '/' => { + return go!(self: to SelfClosingStartTag); + } + '>' => { + return go!(self: emit_tag Data); + } + _ => { + go!(self: error); + self.reconsume = true; + return go!(self: to BeforeAttributeName); + } } } - }, + } //§ self-closing-start-tag-state states::SelfClosingStartTag => loop { diff --git a/tests/files/test.html b/tests/files/test.html new file mode 100644 index 0000000..0dcbdbf --- /dev/null +++ b/tests/files/test.html @@ -0,0 +1,7 @@ +This is a file. 
+ +Here is a tag: <strong >very cool</strong> + +Tags can have attributes: <div id = foo >...</div> + +Attribute values can be quoted: <input name = 'age' type = "number"> diff --git a/tests/files/test.out b/tests/files/test.out new file mode 100644 index 0000000..7127ebc --- /dev/null +++ b/tests/files/test.out @@ -0,0 +1,17 @@ +note: + ┌─ test.html:3:17 + │ +3 │ Here is a tag: <strong >very cool</strong> + │ ^^^^^^ ^^^^^^ EndTag + │ │ + │ StartTag +4 │ +5 │ Tags can have attributes: <div id = foo >...</div> + │ ^^ ^^^ attribute value + │ │ + │ attribute name +6 │ +7 │ Attribute values can be quoted: <input name = 'age' type = "number"> + │ ^^^ ^^^^^^ in double quotes + │ │ + │ in single quotes diff --git a/tests/spans.rs b/tests/spans.rs new file mode 100644 index 0000000..bfa42f6 --- /dev/null +++ b/tests/spans.rs @@ -0,0 +1,89 @@ +#![cfg(feature = "spans")] +use std::include_str; + +use codespan_reporting::{ + self, + diagnostic::{Diagnostic, Label}, + files::SimpleFiles, + term::{self, termcolor::Buffer}, +}; +use html5tokenizer::{ + BufferQueue, Tag, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts, +}; + +#[derive(Default)] +struct TagSink { + tags: Vec<Tag>, +} + +impl TokenSink for TagSink { + fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult { + if let Token::TagToken(tag) = token { + self.tags.push(tag); + } + TokenSinkResult::Continue + } +} + +#[test] +fn test() { + let sink = TagSink::default(); + + let mut input = BufferQueue::new(); + let text = include_str!("files/test.html"); + input.push_back(text.to_string()); + + let mut tok = Tokenizer::new(sink, TokenizerOpts::default()); + let _ = tok.feed(&mut input); + + let mut files = SimpleFiles::new(); + let file_id = files.add("test.html", text); + let mut labels = Vec::new(); + + let tags = tok.sink.tags; + for tag in &tags[..2] { + labels.push( + Label::primary(file_id, tag.name_span.clone()).with_message(format!("{:?}", tag.kind)), + ); + } + labels.push( + 
Label::primary(file_id, tags[2].attrs[0].name_span.clone()).with_message("attribute name"), + ); + labels.push( + Label::primary(file_id, tags[2].attrs[0].value_span.clone()) + .with_message("attribute value"), + ); + labels.push( + Label::primary(file_id, tags[4].attrs[0].value_span.clone()) + .with_message("in single quotes"), + ); + labels.push( + Label::primary(file_id, tags[4].attrs[1].value_span.clone()) + .with_message("in double quotes"), + ); + let diagnostic = Diagnostic::note().with_labels(labels); + + let mut writer = Buffer::no_color(); + let config = codespan_reporting::term::Config::default(); + term::emit(&mut writer, &config, &files, &diagnostic).unwrap(); + + let actual = remove_trailing_spaces(std::str::from_utf8(writer.as_slice()).unwrap()); + let expected = include_str!("files/test.out"); + + if actual != expected { + println!( + "EXPECTED:\n{banner}\n{expected}{banner}\n\nACTUAL OUTPUT:\n{banner}\n{actual}{banner}", + banner = "-".repeat(30), + expected = expected, + actual = actual + ); + panic!("failed"); + } +} + +fn remove_trailing_spaces(text: &str) -> String { + text.lines() + .map(|l| l.trim_end()) + .collect::<Vec<_>>() + .join("\n") +} |