From baf1477c587fe22d27e94408cf2505d588ba007e Mon Sep 17 00:00:00 2001
From: Martin Fischer <martin@push-f.com>
Date: Tue, 30 Nov 2021 07:28:21 +0100
Subject: add spans feature

---
 src/tokenizer/interface.rs |  27 ++++-
 src/tokenizer/mod.rs       | 267 +++++++++++++++++++++++++++++++--------------
 2 files changed, 210 insertions(+), 84 deletions(-)

(limited to 'src/tokenizer')

diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs
index 2c6cc38..f12fb16 100644
--- a/src/tokenizer/interface.rs
+++ b/src/tokenizer/interface.rs
@@ -9,6 +9,8 @@
 
 use crate::tokenizer::states;
 use std::borrow::Cow;
+#[cfg(feature = "spans")]
+use std::ops::Range;
 
 pub use self::TagKind::{EndTag, StartTag};
 pub use self::Token::{CharacterTokens, CommentToken, DoctypeToken, TagToken};
@@ -47,12 +49,30 @@ pub enum TagKind {
 /// The tokenizer creates all attributes this way, but the tree
 /// builder will adjust certain attribute names inside foreign
 /// content (MathML, SVG).
-#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug)]
+#[derive(PartialEq, Eq, Clone, Debug)]
 pub struct Attribute {
     /// The name of the attribute (e.g. the `class` in `<div class="test">`)
     pub name: String,
     /// The value of the attribute (e.g. the `"test"` in `<div class="test">`)
     pub value: String,
+    #[cfg(feature = "spans")]
+    #[cfg_attr(docsrs, doc(cfg(feature = "spans")))]
+    pub name_span: Range<usize>,
+    #[cfg(feature = "spans")]
+    #[cfg_attr(docsrs, doc(cfg(feature = "spans")))]
+    pub value_span: Range<usize>,
+}
+
+impl Ord for Attribute {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        self.name.cmp(&other.name).then_with(|| self.value.cmp(&other.value)) // spans ignored
+    }
+}
+
+impl PartialOrd for Attribute {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other)) // defer to `Ord` so the two orderings cannot diverge
+    }
 }
 
 /// A tag token.
@@ -62,11 +82,14 @@ pub struct Tag {
     pub name: String,
     pub self_closing: bool,
     pub attrs: Vec<Attribute>,
+    #[cfg(feature = "spans")]
+    #[cfg_attr(docsrs, doc(cfg(feature = "spans")))]
+    pub name_span: Range<usize>,
 }
 
 impl Tag {
     /// Are the tags equivalent when we don't care about attribute order?
-    /// Also ignores the self-closing flag.
+    /// Also ignores the self-closing flag and spans.
     pub fn equiv_modulo_attr_order(&self, other: &Tag) -> bool {
         if (self.kind != other.kind) || (self.name != other.name) {
             return false;
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 2d5e1ed..4511cf8 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -59,7 +59,8 @@ fn option_push(opt_str: &mut Option<String>, c: char) {
 #[derive(Clone)]
 pub struct TokenizerOpts {
     /// Report all parse errors described in the spec, at some
-    /// performance penalty?  Default: false
+    /// performance penalty? Defaults to false, except when the
+    /// `spans` feature is enabled in which case it defaults to true.
     pub exact_errors: bool,
 
     /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
@@ -82,7 +83,7 @@ pub struct TokenizerOpts {
 impl Default for TokenizerOpts {
     fn default() -> TokenizerOpts {
         TokenizerOpts {
-            exact_errors: false,
+            exact_errors: cfg!(feature = "spans"),
             discard_bom: true,
             profile: false,
             initial_state: None,
@@ -162,6 +163,36 @@ pub struct Tokenizer<Sink> {
 
     /// Track current line
     current_line: u64,
+
+    #[cfg(feature = "spans")]
+    spans: Spans,
+}
+
+#[cfg(feature = "spans")]
+#[derive(Default)]
+struct Spans {
+    /// Running byte offset into the input, advanced by each character's UTF-8 length.
+    current_pos: usize,
+
+    /// Byte range of the current tag's name; copied into `Tag::name_span` on emit.
+    current_tag_name: core::ops::Range<usize>,
+
+    /// Byte range of the current attribute's name; copied into `Attribute::name_span`.
+    current_attr_name: core::ops::Range<usize>,
+
+    /// Byte range of the current attribute's value; copied into `Attribute::value_span`.
+    current_attr_value: core::ops::Range<usize>,
+}
+
+#[cfg(feature = "spans")]
+impl Spans {
+    fn end_tag_name(&mut self) {
+        self.current_tag_name.end = self.current_pos - 1; // drop the 1-byte delimiter just read
+    }
+
+    fn end_attr_name(&mut self) {
+        self.current_attr_name.end = self.current_pos - 1; // drop the 1-byte delimiter just read
+    }
 }
 
 impl<Sink: TokenSink> Tokenizer<Sink> {
@@ -193,6 +224,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             state_profile: BTreeMap::new(),
             time_in_sink: 0,
             current_line: 1,
+            #[cfg(feature = "spans")]
+            spans: Spans::default(),
         }
     }
 
@@ -263,6 +296,11 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             self.emit_error(Cow::Owned(msg));
         }
 
+        #[cfg(feature = "spans")]
+        {
+            self.spans.current_pos += c.len_utf8();
+        }
+
         self.current_char = c;
         Some(c)
     }
@@ -430,6 +468,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             name,
             self_closing: self.current_tag_self_closing,
             attrs: replace(&mut self.current_tag_attrs, vec![]),
+            #[cfg(feature = "spans")]
+            name_span: self.spans.current_tag_name.clone(),
         });
 
         match self.process_token(token) {
@@ -516,6 +556,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             self.current_tag_attrs.push(Attribute {
                 name: name,
                 value: replace(&mut self.current_attr_value, String::new()),
+                #[cfg(feature = "spans")]
+                name_span: self.spans.current_attr_name.clone(),
+                #[cfg(feature = "spans")]
+                value_span: self.spans.current_attr_value.clone(),
             });
         }
     }
@@ -830,6 +874,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                     }
                     c => match lower_ascii_letter(c) {
                         Some(cl) => {
+                            #[cfg(feature = "spans")]
+                            {
+                                self.spans.current_tag_name.start = self.spans.current_pos - 1;
+                            }
                             go!(self: create_tag StartTag cl);
                             return go!(self: to TagName);
                         }
@@ -858,6 +906,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                     }
                     c => match lower_ascii_letter(c) {
                         Some(cl) => {
+                            #[cfg(feature = "spans")]
+                            {
+                                self.spans.current_tag_name.start = self.spans.current_pos - 1;
+                            }
                             go!(self: create_tag EndTag cl);
                             return go!(self: to TagName);
                         }
@@ -875,12 +927,18 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             states::TagName => loop {
                 match get_char!(self, input)? {
                     '\t' | '\n' | '\x0C' | ' ' => {
+                        #[cfg(feature = "spans")]
+                        self.spans.end_tag_name();
                         return go!(self: to BeforeAttributeName);
                     }
                     '/' => {
+                        #[cfg(feature = "spans")]
+                        self.spans.end_tag_name();
                         return go!(self: to SelfClosingStartTag);
                     }
                     '>' => {
+                        #[cfg(feature = "spans")]
+                        self.spans.end_tag_name();
                         return go!(self: emit_tag Data);
                     }
                     '\0' => {
@@ -1168,6 +1226,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                     c => match lower_ascii_letter(c) {
                         Some(cl) => {
                             go!(self: create_attr cl);
+                            #[cfg(feature = "spans")]
+                            {
+                                self.spans.current_attr_name.start = self.spans.current_pos - 1;
+                            }
                             return go!(self: to AttributeName);
                         }
                         None => {
@@ -1186,15 +1248,23 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             states::AttributeName => loop {
                 match get_char!(self, input)? {
                     '\t' | '\n' | '\x0C' | ' ' => {
+                        #[cfg(feature = "spans")]
+                        self.spans.end_attr_name();
                         return go!(self: to AfterAttributeName);
                     }
                     '/' => {
+                        #[cfg(feature = "spans")]
+                        self.spans.end_attr_name();
                         return go!(self: to SelfClosingStartTag);
                     }
                     '=' => {
+                        #[cfg(feature = "spans")]
+                        self.spans.end_attr_name();
                         return go!(self: to BeforeAttributeValue);
                     }
                     '>' => {
+                        #[cfg(feature = "spans")]
+                        self.spans.end_attr_name();
                         return go!(self: emit_tag Data);
                     }
                     '\0' => {
@@ -1285,101 +1355,134 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             },
 
             //§ attribute-value-(double-quoted)-state
-            states::AttributeValue(DoubleQuoted) => loop {
-                match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n'))? {
-                    FromSet('"') => {
-                        return go!(self: to AfterAttributeValueQuoted);
-                    }
-                    FromSet('&') => {
-                        return go!(self: consume_char_ref '"');
-                    }
-                    FromSet('\0') => {
-                        go!(self: error);
-                        go!(self: push_value '\u{fffd}');
-                    }
-                    FromSet(c) => {
-                        go!(self: push_value c);
-                    }
-                    NotFromSet(ref b) => {
-                        go!(self: append_value b);
+            states::AttributeValue(DoubleQuoted) => {
+                #[cfg(feature = "spans")]
+                {
+                    self.spans.current_attr_value.start = self.spans.current_pos;
+                }
+                loop {
+                    match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n'))? {
+                        FromSet('"') => {
+                            return go!(self: to AfterAttributeValueQuoted);
+                        }
+                        FromSet('&') => {
+                            return go!(self: consume_char_ref '"');
+                        }
+                        FromSet('\0') => {
+                            go!(self: error);
+                            go!(self: push_value '\u{fffd}');
+                        }
+                        FromSet(c) => {
+                            go!(self: push_value c);
+                        }
+                        NotFromSet(ref b) => {
+                            go!(self: append_value b);
+                        }
                     }
                 }
-            },
+            }
 
             //§ attribute-value-(single-quoted)-state
-            states::AttributeValue(SingleQuoted) => loop {
-                match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n'))? {
-                    FromSet('\'') => {
-                        return go!(self: to AfterAttributeValueQuoted);
-                    }
-                    FromSet('&') => {
-                        return go!(self: consume_char_ref '\'');
-                    }
-                    FromSet('\0') => {
-                        go!(self: error);
-                        go!(self: push_value '\u{fffd}');
-                    }
-                    FromSet(c) => {
-                        go!(self: push_value c);
-                    }
-                    NotFromSet(ref b) => {
-                        go!(self: append_value b);
+            states::AttributeValue(SingleQuoted) => {
+                #[cfg(feature = "spans")]
+                {
+                    self.spans.current_attr_value.start = self.spans.current_pos;
+                }
+                loop {
+                    match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n'))? {
+                        FromSet('\'') => {
+                            return go!(self: to AfterAttributeValueQuoted);
+                        }
+                        FromSet('&') => {
+                            return go!(self: consume_char_ref '\'');
+                        }
+                        FromSet('\0') => {
+                            go!(self: error);
+                            go!(self: push_value '\u{fffd}');
+                        }
+                        FromSet(c) => {
+                            go!(self: push_value c);
+                        }
+                        NotFromSet(ref b) => {
+                            go!(self: append_value b);
+                        }
                     }
                 }
-            },
+            }
 
             //§ attribute-value-(unquoted)-state
-            states::AttributeValue(Unquoted) => loop {
-                match pop_except_from!(
-                    self,
-                    input,
-                    small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
-                )? {
-                    FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
-                        return go!(self: to BeforeAttributeName);
-                    }
-                    FromSet('&') => {
-                        return go!(self: consume_char_ref '>');
-                    }
-                    FromSet('>') => {
-                        return go!(self: emit_tag Data);
-                    }
-                    FromSet('\0') => {
-                        go!(self: error);
-                        go!(self: push_value '\u{fffd}');
-                    }
-                    FromSet(c) => {
-                        go_match!(self: c,
+            states::AttributeValue(Unquoted) => {
+                #[cfg(feature = "spans")]
+                {
+                    self.spans.current_attr_value.start = self.spans.current_pos;
+                }
+                loop {
+                    match pop_except_from!(
+                        self,
+                        input,
+                        small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
+                    )? {
+                        FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
+                            #[cfg(feature = "spans")]
+                            {
+                                self.spans.current_attr_value.end = self.spans.current_pos - 1;
+                            }
+                            return go!(self: to BeforeAttributeName);
+                        }
+                        FromSet('&') => {
+                            return go!(self: consume_char_ref '>');
+                        }
+                        FromSet('>') => {
+                            #[cfg(feature = "spans")]
+                            {
+                                self.spans.current_attr_value.end = self.spans.current_pos - 1;
+                            }
+                            return go!(self: emit_tag Data);
+                        }
+                        FromSet('\0') => {
+                            go!(self: error);
+                            go!(self: push_value '\u{fffd}');
+                        }
+                        FromSet(c) => {
+                            go_match!(self: c,
                             '"' , '\'' , '<' , '=' , '`' => error);
-                        {
-                            go!(self: push_value c);
-                        };
-                    }
-                    NotFromSet(ref b) => {
-                        go!(self: append_value b);
+                            {
+                                go!(self: push_value c);
+                            };
+                        }
+                        NotFromSet(ref b) => {
+                            go!(self: append_value b);
+                        }
                     }
                 }
-            },
+            }
 
             //§ after-attribute-value-(quoted)-state
-            states::AfterAttributeValueQuoted => loop {
-                match get_char!(self, input)? {
-                    '\t' | '\n' | '\x0C' | ' ' => {
-                        return go!(self: to BeforeAttributeName);
-                    }
-                    '/' => {
-                        return go!(self: to SelfClosingStartTag);
-                    }
-                    '>' => {
-                        return go!(self: emit_tag Data);
-                    }
-                    _ => {
-                        go!(self: error);
-                        self.reconsume = true;
-                        return go!(self: to BeforeAttributeName);
+            states::AfterAttributeValueQuoted => {
+                #[cfg(feature = "spans")]
+                {
+                    self.spans.current_attr_value.end = self.spans.current_pos - 1;
+                }
+
+                loop {
+                    match get_char!(self, input)? {
+                        '\t' | '\n' | '\x0C' | ' ' => {
+                            return go!(self: to BeforeAttributeName);
+                        }
+                        '/' => {
+                            return go!(self: to SelfClosingStartTag);
+                        }
+                        '>' => {
+                            return go!(self: emit_tag Data);
+                        }
+                        _ => {
+                            go!(self: error);
+                            self.reconsume = true;
+                            return go!(self: to BeforeAttributeName);
+                        }
                     }
                 }
-            },
+            }
 
             //§ self-closing-start-tag-state
             states::SelfClosingStartTag => loop {
-- 
cgit v1.2.3