From d45e09a3157c71e3c95bf618a2b181f322c87a5d Mon Sep 17 00:00:00 2001
From: Martin Fischer <martin@push-f.com>
Date: Thu, 31 Aug 2023 05:18:49 +0200
Subject: fix!: wrong attribute value spans for char refs

---
 CHANGELOG.md        |  6 ++++++
 src/attr.rs         | 23 ++++++++++-------------
 src/emitter.rs      | 15 +++++++++++++--
 src/machine.rs      |  6 ++++++
 tests/test_spans.rs |  2 +-
 5 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c7b0dc2..832f00d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,10 @@
 
   * Removed `adjusted_current_node_present_and_not_in_html_namespace`.
 
+* token types
+
+  * `AttributeOwned`: The `value_offset` field has been replaced with `value_span`.
+
 * Added missing `R: Position<O>` bounds for `Tokenizer`/`NaiveParser` constructors.  
   (If you are able to construct a Tokenizer/NaiveParser,
   you should be able to iterate over it.)
@@ -28,6 +32,8 @@
 
 * Fixed `BufReadReader` skipping the whole line if it contained invalid UTF-8.
 
+* Fixed attribute value spans being wrong for values containing character references.
+
 ### 0.5.0 - 2023-08-19
 
 #### Features
diff --git a/src/attr.rs b/src/attr.rs
index f7161cf..72dfcd9 100644
--- a/src/attr.rs
+++ b/src/attr.rs
@@ -32,10 +32,10 @@ pub(crate) struct AttrInternal<O> {
     pub value: String,
     /// The start offset of the attribute name.
     pub name_offset: O,
-    /// The start offset of the attribute value.
-    /// For the empty attribute syntax this is just `O::default()`.
-    /// We intentionally don't use `Option<O>` here to spare us a byte (and padding) per attribute.
-    pub value_offset: O,
+    /// The span of the attribute value.
+    /// For the empty attribute syntax this is just `O::default()..O::default()`.
+    /// We intentionally don't use `Option<Range<O>>` here to spare us a byte (and padding) per attribute.
+    pub value_span: Range<O>,
     pub value_syntax: Option<AttrValueSyntax>,
 }
 
@@ -67,9 +67,9 @@ pub struct AttributeOwned<O> {
     pub value: String,
     /// The start offset of the attribute name.
     pub name_offset: O,
-    /// The start offset of the attribute value.
+    /// The span of the attribute value.
     /// `None` in case of the empty attribute syntax (e.g. `disabled` in `<input disabled>`).
-    pub value_offset: Option<O>,
+    pub value_span: Option<Range<O>>,
     /// The syntax of the attribute value.
     /// `None` indicates the empty attribute syntax (e.g. `disabled` in `<input disabled>`).
     pub value_syntax: Option<AttrValueSyntax>,
@@ -104,14 +104,14 @@ impl<'a, O: Offset> Attribute<'a, O> {
         self.map_val.name_offset..self.map_val.name_offset + self.name.len()
     }
 
-    /// For explicitly defined values calculates the span of the attribute value and returns it.
+    /// For explicitly defined values returns the span of the attribute value.
     ///
     /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`).
     pub fn value_span(&self) -> Option<Range<O>> {
         if self.map_val.value_syntax.is_none() {
             return None;
         }
-        Some(self.map_val.value_offset..self.map_val.value_offset + self.map_val.value.len())
+        Some(self.map_val.value_span.clone())
     }
 
     /// Returns the attribute value syntax in case the value is explicitly defined.
@@ -158,10 +158,7 @@ impl<O> Iterator for AttrIntoIter<O> {
             name,
             value: map_val.value,
             name_offset: map_val.name_offset,
-            value_offset: map_val
-                .value_syntax
-                .is_some()
-                .then_some(map_val.value_offset),
+            value_span: map_val.value_syntax.is_some().then_some(map_val.value_span),
             value_syntax: map_val.value_syntax,
         })
     }
@@ -200,7 +197,7 @@ impl<O: Default> FromIterator<(String, String)> for AttributeMap<O> {
                         AttrInternal {
                             value,
                             name_offset: O::default(),
-                            value_offset: O::default(),
+                            value_span: O::default()..O::default(),
                             value_syntax: Some(AttrValueSyntax::DoubleQuoted),
                         },
                     )
diff --git a/src/emitter.rs b/src/emitter.rs
index 6d9b60e..e56ddd5 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -78,6 +78,11 @@ pub trait Emitter<O> {
     /// If the current token is not a doctype, this method may panic.
     fn emit_current_doctype(&mut self, offset: O);
 
+    /// Called after the last [`push_attribute_value`] call for an attribute value.
+    ///
+    /// [`push_attribute_value`]: Self::push_attribute_value
+    fn terminate_attribute_value(&mut self, offset: O) {}
+
     /// Assuming the _current token_ is a start tag, set the self-closing flag.
     ///
     /// If the current token is not a start or end tag, this method may panic.
@@ -391,14 +396,14 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
             crate::attr::AttrInternal {
                 name_offset: offset,
                 value: String::new(),
-                value_offset: O::default(),
+                value_span: O::default()..O::default(),
                 value_syntax: None,
             },
         ));
     }
     fn init_attribute_value(&mut self, syntax: AttrValueSyntax, offset: O) {
         let (_, current_attribute) = self.current_attribute.as_mut().unwrap();
-        current_attribute.value_offset = offset;
+        current_attribute.value_span.start = offset;
         current_attribute.value_syntax = Some(syntax);
     }
 
@@ -410,6 +415,12 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
         let current_attr = self.current_attribute.as_mut().unwrap();
         current_attr.1.value.push_str(s);
     }
+
+    fn terminate_attribute_value(&mut self, offset: O) {
+        let current_attr = self.current_attribute.as_mut().unwrap();
+        current_attr.1.value_span.end = offset;
+    }
+
     fn init_doctype_public_id(&mut self, offset: O) {
         let Some(Token::Doctype(doctype)) = &mut self.current_token else {
             debug_assert!(false);
diff --git a/src/machine.rs b/src/machine.rs
index 509dae5..159a8a0 100644
--- a/src/machine.rs
+++ b/src/machine.rs
@@ -790,6 +790,8 @@ where
         },
         State::AttributeValueDoubleQuoted => match slf.read_char()? {
             Some('"') => {
+                slf.emitter
+                    .terminate_attribute_value(slf.reader.position() - 1);
                 slf.state = State::AfterAttributeValueQuoted;
                 Ok(ControlToken::Continue)
             }
@@ -814,6 +816,8 @@ where
         },
         State::AttributeValueSingleQuoted => match slf.read_char()? {
             Some('\'') => {
+                slf.emitter
+                    .terminate_attribute_value(slf.reader.position() - 1);
                 slf.state = State::AfterAttributeValueQuoted;
                 Ok(ControlToken::Continue)
             }
@@ -838,6 +842,8 @@ where
         },
         State::AttributeValueUnquoted => match slf.read_char()? {
             Some(whitespace_pat!()) => {
+                slf.emitter
+                    .terminate_attribute_value(slf.reader.position() - 1);
                 slf.state = State::BeforeAttributeName;
                 Ok(ControlToken::Continue)
             }
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
index bc6aeee..8f3dc8f 100644
--- a/tests/test_spans.rs
+++ b/tests/test_spans.rs
@@ -143,7 +143,7 @@ fn attribute_value_with_char_ref() {
     }
     assert_snapshot!(annotate(html, labels), @r###"
     <test x=&amp; y='&amp;' z="&amp;">
-            ^        ^         ^
+            ^^^^^    ^^^^^     ^^^^^
     "###);
 }
 
-- 
cgit v1.2.3