author | Martin Fischer <martin@push-f.com> | 2023-09-02 12:27:14 +0200
---|---|---
committer | Martin Fischer <martin@push-f.com> | 2023-09-03 23:00:05 +0200
commit | e993f19c2b8ef00b32f17f9ed32306f3ceb21bc3 (patch) |
tree | 4992456f36e6d012b4cc54a69811ec321cc4550c |
parent | c8a8bcb95b725d91a7c4b7bc7623171f2a04fc67 (diff) |
fix!: make comment data spans encoding-independent
-rw-r--r-- | CHANGELOG.md | 3
-rw-r--r-- | src/emitter.rs | 26
-rw-r--r-- | src/machine.rs | 27
-rw-r--r-- | src/tokenizer.rs | 1
-rw-r--r-- | tests/test_spans.rs | 20

5 files changed, 36 insertions(+), 41 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b7ae1d7..6e67319 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -25,7 +25,8 @@
 * Added a `name_offset` parameter to `init_start_tag` and `init_end_tag`.
 * Several provided offsets have been changed to be more sensible.
-  Affected are: `init_start_tag`, `init_end_tag`, `emit_current_tag`
+  Affected are: `init_start_tag`, `init_end_tag`, `emit_current_tag`,
+  `emit_current_comment`
 * token types
diff --git a/src/emitter.rs b/src/emitter.rs
index 9fdf967..db3da78 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -55,7 +55,7 @@ pub trait Emitter<O> {
     fn init_end_tag(&mut self, tag_offset: O, name_offset: O);
 
     /// Set the _current token_ to a comment.
-    fn init_comment(&mut self, data_offset: O);
+    fn init_comment(&mut self, data_start_offset: O);
 
     /// Emit the _current token_, assuming it is a tag.
     ///
@@ -71,7 +71,7 @@
     /// Emit the _current token_, assuming it is a comment.
     ///
     /// If the current token is not a comment, this method may panic.
-    fn emit_current_comment(&mut self, offset: O);
+    fn emit_current_comment(&mut self, data_end_offset: O);
 
     /// Emit the _current token_, assuming it is a doctype.
     ///
@@ -309,10 +309,10 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
         self.seen_attributes.clear();
     }
-    fn init_comment(&mut self, data_offset: O) {
+    fn init_comment(&mut self, data_start_offset: O) {
         self.current_token = Some(Token::Comment(Comment {
             data: String::new(),
-            data_offset,
+            data_span: data_start_offset..O::default(),
         }));
     }
 
     fn emit_current_tag(&mut self, offset: O) {
@@ -334,10 +334,14 @@
         }
         self.emit_token(token);
     }
-    fn emit_current_comment(&mut self, _offset: O) {
-        let comment = self.current_token.take().unwrap();
-        debug_assert!(matches!(comment, Token::Comment(_)));
-        self.emit_token(comment);
+    fn emit_current_comment(&mut self, data_end_offset: O) {
+        let mut token = self.current_token.take().unwrap();
+        if let Token::Comment(comment) = &mut token {
+            comment.data_span.end = data_end_offset;
+        } else {
+            debug_assert!(false);
+        }
+        self.emit_token(token);
     }
 
     fn emit_current_doctype(&mut self, offset: O) {
@@ -572,13 +576,13 @@ pub struct Comment<O> {
     /// The text within the comment.
     pub data: String,
     /// The source offset of the comment data.
-    pub data_offset: O,
+    pub data_span: Range<O>,
 }
 
 impl<O: Offset> Comment<O> {
-    /// Calculates the span for the comment data and returns it.
+    /// Returns the span for the comment data.
     pub fn data_span(&self) -> Range<O> {
-        self.data_offset..self.data_offset + self.data.len()
+        self.data_span.clone()
     }
 }
diff --git a/src/machine.rs b/src/machine.rs
index f00af0a..26e1652 100644
--- a/src/machine.rs
+++ b/src/machine.rs
@@ -950,11 +950,11 @@ where
         State::BogusComment => match slf.read_char()? {
             Some('>') => {
                 slf.state = State::Data;
-                slf.emitter.emit_current_comment(slf.reader.position());
+                slf.emitter.emit_current_comment(slf.position_before_match);
                 Ok(ControlToken::Continue)
             }
             None => {
-                slf.emitter.emit_current_comment(slf.reader.position());
+                slf.emitter.emit_current_comment(slf.position_before_match);
                 Ok(ControlToken::Eof)
             }
             Some('\0') => {
@@ -994,7 +994,7 @@ where
             Some('>') => {
                 slf.emit_error(Error::AbruptClosingOfEmptyComment);
                 slf.state = State::Data;
-                slf.emitter.emit_current_comment(slf.reader.position());
+                slf.emitter.emit_current_comment(slf.position_before_match);
                 Ok(ControlToken::Continue)
             }
             c => {
@@ -1008,15 +1008,19 @@ where
                 slf.state = State::CommentEnd;
                 Ok(ControlToken::Continue)
             }
-            Some('>') => {
+            Some(c @ '>') => {
                 slf.emit_error(Error::AbruptClosingOfEmptyComment);
                 slf.state = State::Data;
-                slf.emitter.emit_current_comment(slf.reader.position());
+                slf.emitter.emit_current_comment(
+                    slf.position_before_match - slf.reader.len_of_char_in_current_encoding(c),
+                );
                 Ok(ControlToken::Continue)
             }
             None => {
                 slf.emit_error(Error::EofInComment);
-                slf.emitter.emit_current_comment(slf.reader.position());
+                slf.emitter.emit_current_comment(
+                    slf.position_before_match - slf.reader.len_of_char_in_current_encoding('-'),
+                );
                 Ok(ControlToken::Eof)
             }
             c @ Some(_) => {
@@ -1033,6 +1037,7 @@ where
                 Ok(ControlToken::Continue)
             }
             Some('-') => {
+                slf.some_offset = slf.position_before_match;
                 slf.state = State::CommentEndDash;
                 Ok(ControlToken::Continue)
             }
@@ -1109,7 +1114,7 @@ where
             }
             None => {
                 slf.emit_error(Error::EofInComment);
-                slf.emitter.emit_current_comment(slf.reader.position());
+                slf.emitter.emit_current_comment(slf.some_offset);
                 Ok(ControlToken::Eof)
             }
             c => {
@@ -1122,7 +1127,7 @@
         State::CommentEnd => match slf.read_char()? {
             Some('>') => {
                 slf.state = State::Data;
-                slf.emitter.emit_current_comment(slf.reader.position());
+                slf.emitter.emit_current_comment(slf.some_offset);
                 Ok(ControlToken::Continue)
             }
             Some('!') => {
@@ -1135,7 +1140,7 @@
             }
             None => {
                 slf.emit_error(Error::EofInComment);
-                slf.emitter.emit_current_comment(slf.reader.position());
+                slf.emitter.emit_current_comment(slf.some_offset);
                 Ok(ControlToken::Eof)
             }
             c @ Some(_) => {
@@ -1157,12 +1162,12 @@
             Some('>') => {
                 slf.emit_error(Error::IncorrectlyClosedComment);
                 slf.state = State::Data;
-                slf.emitter.emit_current_comment(slf.reader.position());
+                slf.emitter.emit_current_comment(slf.some_offset);
                 Ok(ControlToken::Continue)
             }
             None => {
                 slf.emit_error(Error::EofInComment);
-                slf.emitter.emit_current_comment(slf.reader.position());
+                slf.emitter.emit_current_comment(slf.some_offset);
                 Ok(ControlToken::Eof)
             }
             c @ Some(_) => {
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index e0402b9..cfd8eea 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -52,6 +52,7 @@ pub struct Tokenizer<R: Reader, O, E: Emitter<O>> {
     /// The reader position before the match block in [`machine::consume`].
     pub(crate) position_before_match: O,
     /// * Set to the offset of `<` in [`InternalState::Data`].
+    /// * Set to the offset of `-` in [`InternalState::Comment`].
     /// * Set to the offset of `&` in [`InternalState::CharacterReference`].
     pub(crate) some_offset: O,
     /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`]
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
index e856a19..ca23141 100644
--- a/tests/test_spans.rs
+++ b/tests/test_spans.rs
@@ -54,13 +54,6 @@ fn annotate(html: &str, labels: Vec<(Range<usize>, impl AsRef<str>)>) -> String
         .join("")
 }
 
-fn assert_panics_but_should_not(f: impl FnOnce() + std::panic::UnwindSafe) {
-    assert!(
-        std::panic::catch_unwind(f).is_err(),
-        "congrats! you made some span test support UTF-16, please stop calling assert_panics_but_should_not for this test"
-    );
-}
-
 #[test]
 fn start_tag_span() {
     let html = "<x> <xyz> <xyz > <xyz/>";
@@ -219,7 +212,7 @@ fn comment_data_span() {
     ];
 
     let mut annotated = String::new();
-    for (idx, case) in cases.iter().enumerate() {
+    for case in cases {
         let labeler = |tokens: TokenIter| {
             let Token::Comment(comment) = tokens
                 .filter(|t| !matches!(t, Token::Error { .. }))
@@ -231,16 +224,7 @@
             vec![(comment.data_span(), "")]
         };
 
-        println!("{idx}");
-        if [
-            0, 1, 2, 3, 8, 9, 10, 11, // FIXME
-        ]
-        .contains(&idx)
-        {
-            assert_panics_but_should_not(|| assert_char_encoding_independence(case, labeler));
-        } else {
-            assert_char_encoding_independence(case, labeler);
-        }
+        assert_char_encoding_independence(case, labeler);
         annotated.push_str(&test_and_annotate(case, labeler));
     }
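Why the old spans were encoding-dependent: before this commit, `Comment::data_span()` derived its end as `data_offset + self.data.len()`, and `data.len()` is the UTF-8 byte length of the already-decoded comment text, which only lines up with the reader's offsets when those offsets are UTF-8 byte positions. The sketch below is a standalone illustration of that mismatch, not this crate's API (the example string and variable names are made up): the same text occupies a different number of bytes in UTF-8 and UTF-16, so only an end offset reported by the tokenizer, as now stored in `data_span`, is correct for every encoding.

```rust
// Standalone illustration, not part of this crate: the decoded comment text
// has a different byte length depending on the encoding the offsets count in.
fn main() {
    let data = "cömment"; // hypothetical decoded comment data
    let utf8_len = data.len(); // 8 bytes when offsets are UTF-8 byte positions
    let utf16_len = data.encode_utf16().count() * 2; // 14 bytes in UTF-16
    assert_ne!(utf8_len, utf16_len);
    // The old `data_offset..data_offset + data.len()` end was therefore only
    // right for UTF-8 readers; keeping the end offset the tokenizer reports
    // (the new `data_span`) works regardless of the input encoding.
    println!("UTF-8 length: {utf8_len}, UTF-16 length: {utf16_len}");
}
```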