add spans feature

author: Martin Fischer <martin@push-f.com> 2021-11-30 07:28:21 +0100
committer: Martin Fischer <martin@push-f.com> 2021-11-30 11:22:35 +0100
commit: baf1477c587fe22d27e94408cf2505d588ba007e (patch)
tree: f3e027e3c149cfeb7187a625756ea4b2de47c82a
parent: 25087cce997abc386f881648dfd39c83dfef7667 (diff)
8 files changed, 334 insertions, 84 deletions
diff --git a/Cargo.toml b/Cargo.toml
index 7b270b6..8aa0df0 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,6 +12,7 @@ edition = "2018"
 
 [features]
 default = ["named-entities"]
+spans = []
 
 # resolve named entities like &amp;
 named-entities = ["phf", "phf_codegen"]
@@ -23,8 +24,13 @@ phf = { version = "0.9", optional = true  }
 phf_codegen = { version = "0.9", optional = true }
 
 [dev-dependencies]
+codespan-reporting = "0.11.1"
 criterion = "0.3"
 
 [[bench]]
 name = "html5ever"
 harness = false
+
+[package.metadata.docs.rs]
+all-features = true
+rustdoc-args = ["--cfg", "docsrs"]
diff --git a/README.md b/README.md
index 91265b0..3adff5a 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,10 @@ changes:
   tag names (e.g. for `script` and `styles`) ... with the html5ever tokenizer
   you had to do this yourself.
 
+* An optional `spans` feature has been added to make the tokenizer report the
+  source code spans for tag names, attribute names and attribute values.
+  The feature is disabled by default.
+
 * The API has been cleaned up a bit (e.g. the internal tokenizer state enums
   are no longer public).
 
diff --git a/src/lib.rs b/src/lib.rs
index 57b7b05..fe0e713 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -10,6 +10,7 @@
 #![doc = include_str!("../README.md")]
 #![crate_type = "dylib"]
 #![cfg_attr(test, deny(warnings))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 #![allow(unused_parens)]
 
 #[macro_use]
diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs
index 2c6cc38..f12fb16 100644
--- a/src/tokenizer/interface.rs
+++ b/src/tokenizer/interface.rs
@@ -9,6 +9,8 @@
 
 use crate::tokenizer::states;
 use std::borrow::Cow;
+#[cfg(feature = "spans")]
+use std::ops::Range;
 
 pub use self::TagKind::{EndTag, StartTag};
 pub use self::Token::{CharacterTokens, CommentToken, DoctypeToken, TagToken};
@@ -47,12 +49,30 @@ pub enum TagKind {
 /// The tokenizer creates all attributes this way, but the tree
 /// builder will adjust certain attribute names inside foreign
 /// content (MathML, SVG).
-#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug)]
+#[derive(PartialEq, Eq, Clone, Debug)]
 pub struct Attribute {
     /// The name of the attribute (e.g. the `class` in `<div class="test">`)
     pub name: String,
     /// The value of the attribute (e.g. the `"test"` in `<div class="test">`)
     pub value: String,
+    #[cfg(feature = "spans")]
+    #[cfg_attr(docsrs, doc(cfg(feature = "spans")))]
+    pub name_span: Range<usize>,
+    #[cfg(feature = "spans")]
+    #[cfg_attr(docsrs, doc(cfg(feature = "spans")))]
+    pub value_span: Range<usize>,
+}
+
+impl Ord for Attribute {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        (&self.name, &self.value).cmp(&(&other.name, &other.value)) // by name, then value
+    }
+}
+
+impl PartialOrd for Attribute {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other)) // delegate to `Ord` so the two orderings cannot diverge
+    }
 }
 
 /// A tag token.
@@ -62,11 +82,14 @@ pub struct Tag {
     pub name: String,
     pub self_closing: bool,
     pub attrs: Vec<Attribute>,
+    #[cfg(feature = "spans")]
+    #[cfg_attr(docsrs, doc(cfg(feature = "spans")))]
+    pub name_span: Range<usize>,
 }
 
 impl Tag {
     /// Are the tags equivalent when we don't care about attribute order?
-    /// Also ignores the self-closing flag.
+    /// Also ignores the self-closing flag and spans.
     pub fn equiv_modulo_attr_order(&self, other: &Tag) -> bool {
         if (self.kind != other.kind) || (self.name != other.name) {
             return false;
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 2d5e1ed..4511cf8 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -59,7 +59,8 @@ fn option_push(opt_str: &mut Option<String>, c: char) {
 #[derive(Clone)]
 pub struct TokenizerOpts {
     /// Report all parse errors described in the spec, at some
-    /// performance penalty?  Default: false
+    /// performance penalty? Defaults to false, except when the
+    /// `spans` feature is enabled, in which case it defaults to true.
     pub exact_errors: bool,
 
     /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
@@ -82,7 +83,7 @@ pub struct TokenizerOpts {
 impl Default for TokenizerOpts {
     fn default() -> TokenizerOpts {
         TokenizerOpts {
-            exact_errors: false,
+            exact_errors: cfg!(feature = "spans"),
             discard_bom: true,
             profile: false,
             initial_state: None,
@@ -162,6 +163,36 @@ pub struct Tokenizer<Sink> {
 
     /// Track current line
     current_line: u64,
+
+    #[cfg(feature = "spans")]
+    spans: Spans,
+}
+
+#[cfg(feature = "spans")]
+#[derive(Default)]
+struct Spans {
+    /// Current byte position; advanced by each character's UTF-8 length as it is read.
+    current_pos: usize,
+
+    /// Current tag name span.
+    current_tag_name: core::ops::Range<usize>,
+
+    /// Current attribute name span.
+    current_attr_name: core::ops::Range<usize>,
+
+    /// Current attribute value span.
+    current_attr_value: core::ops::Range<usize>,
+}
+
+#[cfg(feature = "spans")]
+impl Spans {
+    fn end_tag_name(&mut self) {
+        self.current_tag_name.end = self.current_pos - 1;
+    }
+
+    fn end_attr_name(&mut self) {
+        self.current_attr_name.end = self.current_pos - 1;
+    }
 }
 
 impl<Sink: TokenSink> Tokenizer<Sink> {
@@ -193,6 +224,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             state_profile: BTreeMap::new(),
             time_in_sink: 0,
             current_line: 1,
+            #[cfg(feature = "spans")]
+            spans: Spans::default(),
         }
     }
 
@@ -263,6 +296,11 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             self.emit_error(Cow::Owned(msg));
         }
 
+        #[cfg(feature = "spans")]
+        {
+            self.spans.current_pos += c.len_utf8();
+        }
+
         self.current_char = c;
         Some(c)
     }
@@ -430,6 +468,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             name,
             self_closing: self.current_tag_self_closing,
             attrs: replace(&mut self.current_tag_attrs, vec![]),
+            #[cfg(feature = "spans")]
+            name_span: self.spans.current_tag_name.clone(),
         });
 
         match self.process_token(token) {
@@ -516,6 +556,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             self.current_tag_attrs.push(Attribute {
                 name: name,
                 value: replace(&mut self.current_attr_value, String::new()),
+                #[cfg(feature = "spans")]
+                name_span: self.spans.current_attr_name.clone(),
+                #[cfg(feature = "spans")]
+                value_span: self.spans.current_attr_value.clone(),
             });
         }
     }
@@ -830,6 +874,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                     }
                     c => match lower_ascii_letter(c) {
                         Some(cl) => {
+                            #[cfg(feature = "spans")]
+                            {
+                                self.spans.current_tag_name.start = self.spans.current_pos - 1;
+                            }
                             go!(self: create_tag StartTag cl);
                             return go!(self: to TagName);
                         }
@@ -858,6 +906,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                     }
                     c => match lower_ascii_letter(c) {
                         Some(cl) => {
+                            #[cfg(feature = "spans")]
+                            {
+                                self.spans.current_tag_name.start = self.spans.current_pos - 1;
+                            }
                             go!(self: create_tag EndTag cl);
                             return go!(self: to TagName);
                         }
@@ -875,12 +927,18 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             states::TagName => loop {
                 match get_char!(self, input)? {
                     '\t' | '\n' | '\x0C' | ' ' => {
+                        #[cfg(feature = "spans")]
+                        self.spans.end_tag_name();
                         return go!(self: to BeforeAttributeName);
                     }
                     '/' => {
+                        #[cfg(feature = "spans")]
+                        self.spans.end_tag_name();
                         return go!(self: to SelfClosingStartTag);
                     }
                     '>' => {
+                        #[cfg(feature = "spans")]
+                        self.spans.end_tag_name();
                         return go!(self: emit_tag Data);
                     }
                     '\0' => {
@@ -1168,6 +1226,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                     c => match lower_ascii_letter(c) {
                         Some(cl) => {
                             go!(self: create_attr cl);
+                            #[cfg(feature = "spans")]
+                            {
+                                self.spans.current_attr_name.start = self.spans.current_pos - 1;
+                            }
                             return go!(self: to AttributeName);
                         }
                         None => {
@@ -1186,15 +1248,23 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             states::AttributeName => loop {
                 match get_char!(self, input)? {
                     '\t' | '\n' | '\x0C' | ' ' => {
+                        #[cfg(feature = "spans")]
+                        self.spans.end_attr_name();
                         return go!(self: to AfterAttributeName);
                     }
                     '/' => {
+                        #[cfg(feature = "spans")]
+                        self.spans.end_attr_name();
                         return go!(self: to SelfClosingStartTag);
                     }
                     '=' => {
+                        #[cfg(feature = "spans")]
+                        self.spans.end_attr_name();
                         return go!(self: to BeforeAttributeValue);
                     }
                     '>' => {
+                        #[cfg(feature = "spans")]
+                        self.spans.end_attr_name();
                         return go!(self: emit_tag Data);
                     }
                     '\0' => {
@@ -1285,101 +1355,134 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             },
 
             //§ attribute-value-(double-quoted)-state
-            states::AttributeValue(DoubleQuoted) => loop {
-                match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n'))? {
-                    FromSet('"') => {
-                        return go!(self: to AfterAttributeValueQuoted);
-                    }
-                    FromSet('&') => {
-                        return go!(self: consume_char_ref '"');
-                    }
-                    FromSet('\0') => {
-                        go!(self: error);
-                        go!(self: push_value '\u{fffd}');
-                    }
-                    FromSet(c) => {
-                        go!(self: push_value c);
-                    }
-                    NotFromSet(ref b) => {
-                        go!(self: append_value b);
+            states::AttributeValue(DoubleQuoted) => {
+                #[cfg(feature = "spans")]
+                {
+                    self.spans.current_attr_value.start = self.spans.current_pos;
+                }
+                loop {
+                    match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n'))? {
+                        FromSet('"') => {
+                            return go!(self: to AfterAttributeValueQuoted);
+                        }
+                        FromSet('&') => {
+                            return go!(self: consume_char_ref '"');
+                        }
+                        FromSet('\0') => {
+                            go!(self: error);
+                            go!(self: push_value '\u{fffd}');
+                        }
+                        FromSet(c) => {
+                            go!(self: push_value c);
+                        }
+                        NotFromSet(ref b) => {
+                            go!(self: append_value b);
+                        }
                     }
                 }
-            },
+            }
 
             //§ attribute-value-(single-quoted)-state
-            states::AttributeValue(SingleQuoted) => loop {
-                match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n'))? {
-                    FromSet('\'') => {
-                        return go!(self: to AfterAttributeValueQuoted);
-                    }
-                    FromSet('&') => {
-                        return go!(self: consume_char_ref '\'');
-                    }
-                    FromSet('\0') => {
-                        go!(self: error);
-                        go!(self: push_value '\u{fffd}');
-                    }
-                    FromSet(c) => {
-                        go!(self: push_value c);
-                    }
-                    NotFromSet(ref b) => {
-                        go!(self: append_value b);
+            states::AttributeValue(SingleQuoted) => {
+                #[cfg(feature = "spans")]
+                {
+                    self.spans.current_attr_value.start = self.spans.current_pos;
+                }
+                loop {
+                    match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n'))? {
+                        FromSet('\'') => {
+                            return go!(self: to AfterAttributeValueQuoted);
+                        }
+                        FromSet('&') => {
+                            return go!(self: consume_char_ref '\'');
+                        }
+                        FromSet('\0') => {
+                            go!(self: error);
+                            go!(self: push_value '\u{fffd}');
+                        }
+                        FromSet(c) => {
+                            go!(self: push_value c);
+                        }
+                        NotFromSet(ref b) => {
+                            go!(self: append_value b);
+                        }
                     }
                 }
-            },
+            }
 
             //§ attribute-value-(unquoted)-state
-            states::AttributeValue(Unquoted) => loop {
-                match pop_except_from!(
-                    self,
-                    input,
-                    small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
-                )? {
-                    FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
-                        return go!(self: to BeforeAttributeName);
-                    }
-                    FromSet('&') => {
-                        return go!(self: consume_char_ref '>');
-                    }
-                    FromSet('>') => {
-                        return go!(self: emit_tag Data);
-                    }
-                    FromSet('\0') => {
-                        go!(self: error);
-                        go!(self: push_value '\u{fffd}');
-                    }
-                    FromSet(c) => {
-                        go_match!(self: c,
+            states::AttributeValue(Unquoted) => {
+                #[cfg(feature = "spans")]
+                {
+                    self.spans.current_attr_value.start = self.spans.current_pos;
+                }
+                loop {
+                    match pop_except_from!(
+                        self,
+                        input,
+                        small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
+                    )? {
+                        FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
+                            #[cfg(feature = "spans")]
+                            {
+                                self.spans.current_attr_value.end = self.spans.current_pos - 1;
+                            }
+                            return go!(self: to BeforeAttributeName);
+                        }
+                        FromSet('&') => {
+                            return go!(self: consume_char_ref '>');
+                        }
+                        FromSet('>') => {
+                            #[cfg(feature = "spans")]
+                            {
+                                self.spans.current_attr_value.end = self.spans.current_pos - 1;
+                            }
+                            return go!(self: emit_tag Data);
+                        }
+                        FromSet('\0') => {
+                            go!(self: error);
+                            go!(self: push_value '\u{fffd}');
+                        }
+                        FromSet(c) => {
+                            go_match!(self: c,
                             '"' , '\'' , '<' , '=' , '`' => error);
-                        {
-                            go!(self: push_value c);
-                        };
-                    }
-                    NotFromSet(ref b) => {
-                        go!(self: append_value b);
+                            {
+                                go!(self: push_value c);
+                            };
+                        }
+                        NotFromSet(ref b) => {
+                            go!(self: append_value b);
+                        }
                     }
                 }
-            },
+            }
 
             //§ after-attribute-value-(quoted)-state
-            states::AfterAttributeValueQuoted => loop {
-                match get_char!(self, input)? {
-                    '\t' | '\n' | '\x0C' | ' ' => {
-                        return go!(self: to BeforeAttributeName);
-                    }
-                    '/' => {
-                        return go!(self: to SelfClosingStartTag);
-                    }
-                    '>' => {
-                        return go!(self: emit_tag Data);
-                    }
-                    _ => {
-                        go!(self: error);
-                        self.reconsume = true;
-                        return go!(self: to BeforeAttributeName);
+            states::AfterAttributeValueQuoted => {
+                #[cfg(feature = "spans")]
+                {
+                    self.spans.current_attr_value.end = self.spans.current_pos - 1;
+                }
+
+                loop {
+                    match get_char!(self, input)? {
+                        '\t' | '\n' | '\x0C' | ' ' => {
+                            return go!(self: to BeforeAttributeName);
+                        }
+                        '/' => {
+                            return go!(self: to SelfClosingStartTag);
+                        }
+                        '>' => {
+                            return go!(self: emit_tag Data);
+                        }
+                        _ => {
+                            go!(self: error);
+                            self.reconsume = true;
+                            return go!(self: to BeforeAttributeName);
+                        }
                     }
                 }
-            },
+            }
 
             //§ self-closing-start-tag-state
             states::SelfClosingStartTag => loop {
diff --git a/tests/files/test.html b/tests/files/test.html
new file mode 100644
index 0000000..0dcbdbf
--- /dev/null
+++ b/tests/files/test.html
@@ -0,0 +1,7 @@
+This is a file.
+
+Here is a tag: <strong >very cool</strong>
+
+Tags can have attributes: <div id = foo >...</div>
+
+Attribute values can be quoted: <input name = 'age' type = "number">
diff --git a/tests/files/test.out b/tests/files/test.out
new file mode 100644
index 0000000..7127ebc
--- /dev/null
+++ b/tests/files/test.out
@@ -0,0 +1,17 @@
+note:
+  ┌─ test.html:3:17
+  │
+3 │ Here is a tag: <strong >very cool</strong>
+  │                 ^^^^^^             ^^^^^^ EndTag
+  │                 │
+  │                 StartTag
+4 │
+5 │ Tags can have attributes: <div id = foo >...</div>
+  │                                ^^   ^^^ attribute value
+  │                                │
+  │                                attribute name
+6 │
+7 │ Attribute values can be quoted: <input name = 'age' type = "number">
+  │                                                ^^^          ^^^^^^ in double quotes
+  │                                                │
+  │                                                in single quotes
diff --git a/tests/spans.rs b/tests/spans.rs
new file mode 100644
index 0000000..bfa42f6
--- /dev/null
+++ b/tests/spans.rs
@@ -0,0 +1,89 @@
+#![cfg(feature = "spans")]
+use std::include_str;
+
+use codespan_reporting::{
+    self,
+    diagnostic::{Diagnostic, Label},
+    files::SimpleFiles,
+    term::{self, termcolor::Buffer},
+};
+use html5tokenizer::{
+    BufferQueue, Tag, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
+};
+
+#[derive(Default)]
+struct TagSink {
+    tags: Vec<Tag>,
+}
+
+impl TokenSink for TagSink {
+    fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult {
+        if let Token::TagToken(tag) = token {
+            self.tags.push(tag);
+        }
+        TokenSinkResult::Continue
+    }
+}
+
+#[test]
+fn test() {
+    let sink = TagSink::default();
+
+    let mut input = BufferQueue::new();
+    let text = include_str!("files/test.html");
+    input.push_back(text.to_string());
+
+    let mut tok = Tokenizer::new(sink, TokenizerOpts::default());
+    let _ = tok.feed(&mut input);
+
+    let mut files = SimpleFiles::new();
+    let file_id = files.add("test.html", text);
+    let mut labels = Vec::new();
+
+    let tags = tok.sink.tags;
+    for tag in &tags[..2] {
+        labels.push(
+            Label::primary(file_id, tag.name_span.clone()).with_message(format!("{:?}", tag.kind)),
+        );
+    }
+    labels.push(
+        Label::primary(file_id, tags[2].attrs[0].name_span.clone()).with_message("attribute name"),
+    );
+    labels.push(
+        Label::primary(file_id, tags[2].attrs[0].value_span.clone())
+            .with_message("attribute value"),
+    );
+    labels.push(
+        Label::primary(file_id, tags[4].attrs[0].value_span.clone())
+            .with_message("in single quotes"),
+    );
+    labels.push(
+        Label::primary(file_id, tags[4].attrs[1].value_span.clone())
+            .with_message("in double quotes"),
+    );
+    let diagnostic = Diagnostic::note().with_labels(labels);
+
+    let mut writer = Buffer::no_color();
+    let config = codespan_reporting::term::Config::default();
+    term::emit(&mut writer, &config, &files, &diagnostic).unwrap();
+
+    let actual = remove_trailing_spaces(std::str::from_utf8(writer.as_slice()).unwrap());
+    let expected = include_str!("files/test.out");
+
+    if actual != expected {
+        println!(
+            "EXPECTED:\n{banner}\n{expected}{banner}\n\nACTUAL OUTPUT:\n{banner}\n{actual}{banner}",
+            banner = "-".repeat(30),
+            expected = expected,
+            actual = actual
+        );
+        panic!("failed");
+    }
+}
+
+fn remove_trailing_spaces(text: &str) -> String {
+    text.lines()
+        .map(|l| l.trim_end())
+        .collect::<Vec<_>>()
+        .join("\n")
+}
author	Martin Fischer <martin@push-f.com>	2021-11-30 07:28:21 +0100
committer	Martin Fischer <martin@push-f.com>	2021-11-30 11:22:35 +0100
commit	baf1477c587fe22d27e94408cf2505d588ba007e (patch)
tree	f3e027e3c149cfeb7187a625756ea4b2de47c82a
parent	25087cce997abc386f881648dfd39c83dfef7667 (diff)