From c040c4fc8091f1b63b63723334b0f1f821e8059f Mon Sep 17 00:00:00 2001
From: Martin Fischer <martin@push-f.com>
Date: Sat, 2 Sep 2023 09:50:43 +0200
Subject: test: verify that span logic incorrectly assumes UTF-8

---
 tests/test_spans.rs | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)

(limited to 'tests')
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
index db17328..c58616d 100644
--- a/tests/test_spans.rs
+++ b/tests/test_spans.rs
@@ -54,6 +54,13 @@ fn annotate(html: &str, labels: Vec<(Range<usize>, impl AsRef<str>)>) -> String
         .join("")
 }
 
+fn assert_panics_but_should_not(f: impl FnOnce() + std::panic::UnwindSafe) {
+    assert!( // inverted expectation: this helper fails when `f` completes without panicking
+        std::panic::catch_unwind(f).is_err(), // `Err` here means `f` panicked
+        "congrats! you made some span test support UTF-16, please stop calling assert_panics_but_should_not for this test"
+    );
+}
+
 #[test]
 fn start_tag_span() {
     let html = "<x> <xyz> <xyz  > <xyz/>";
@@ -66,6 +73,7 @@ fn start_tag_span() {
         }
         labels
     };
+    assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
     assert_snapshot!(test_and_annotate(html, labeler), @r###"
     <x> <xyz> <xyz  > <xyz/>
     ^^^ ^^^^^ ^^^^^^^ ^^^^^^
@@ -84,6 +92,7 @@ fn end_tag_span() {
         }
         labels
     };
+    assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
     assert_snapshot!(test_and_annotate(html, labeler), @r###"
     </x> </xyz> </xyz  > </xyz/>
     ^^^^ ^^^^^^ ^^^^^^^^ ^^^^^^^
@@ -102,6 +111,7 @@ fn start_tag_name_span() {
         }
         labels
     };
+    assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
     assert_snapshot!(test_and_annotate(html, labeler), @r###"
     <x> <xyz> <xyz  > <xyz/>
      ^   ^^^   ^^^     ^^^
@@ -120,6 +130,7 @@ fn end_tag_name_span() {
         }
         labels
     };
+    assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
     assert_snapshot!(test_and_annotate(html, labeler), @r###"
     </x> </xyz> </xyz  > </xyz/>
       ^    ^^^    ^^^      ^^^
@@ -139,6 +150,7 @@ fn attribute_name_span() {
         }
         labels
     };
+    assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
     assert_snapshot!(test_and_annotate(html, labeler), @r###"
     <test x xyz y=VAL xy=VAL z = VAL yzx = VAL>
           ^ ^^^ ^     ^^     ^       ^^^
@@ -158,6 +170,7 @@ fn attribute_value_span() {
         }
         labels
     };
+    assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
     assert_snapshot!(test_and_annotate(html, labeler), @r###"
     <test x=unquoted y = unquoted z='single-quoted' zz="double-quoted" empty=''>
             ^^^^^^^^     ^^^^^^^^    ^^^^^^^^^^^^^      ^^^^^^^^^^^^^         ^
@@ -177,6 +190,7 @@ fn attribute_value_with_char_ref() {
         }
         labels
     };
+    assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
     assert_snapshot!(test_and_annotate(html, labeler), @r###"
     <test x=&amp; y='&amp;' z="&amp;">
             ^^^^^    ^^^^^     ^^^^^
@@ -205,6 +219,7 @@ fn comment_data_span() {
             };
             vec![(comment.data_span(), "")]
         };
+        assert_panics_but_should_not(|| assert_char_encoding_independence(case, labeler)); // FIXME
         annotated.push_str(&test_and_annotate(case, labeler));
     }
 
@@ -246,6 +261,7 @@ fn doctype_span() {
             };
             vec![(doctype.span, "")]
         };
+        assert_panics_but_should_not(|| assert_char_encoding_independence(case, labeler)); // FIXME
         annotated.push_str(&test_and_annotate(case, labeler));
     }
 
@@ -283,6 +299,7 @@ fn doctype_id_spans() {
             labels
         };
 
+        assert_panics_but_should_not(|| assert_char_encoding_independence(case, labeler)); // FIXME
         annotated.push_str(&test_and_annotate(case, labeler));
     }
 
@@ -309,6 +326,8 @@ fn annotate_errors(html: &'static str) -> String {
         }
     }
 
+    let doesnt_support_utf16 = std::sync::Mutex::new(false);
+
     let labeler = |tokens| {
         let mut labels = Vec::new();
         for token in tokens {
@@ -317,10 +336,36 @@ fn annotate_errors(html: &'static str) -> String {
             };
 
             labels.push((span, error.code()));
+
+            use html5tokenizer::Error;
+
+            *doesnt_support_utf16.lock().unwrap() = matches!(
+                error,
+                | Error::AbsenceOfDigitsInNumericCharacterReference // FIXME
+                | Error::CharacterReferenceOutsideUnicodeRange // FIXME
+                | Error::ControlCharacterReference // FIXME
+                | Error::DuplicateAttribute // FIXME
+                | Error::EndTagWithAttributes // FIXME
+                | Error::EndTagWithTrailingSolidus // FIXME
+                | Error::InvalidFirstCharacterOfTagName // FIXME
+                | Error::NoncharacterCharacterReference // FIXME
+                | Error::NullCharacterReference // FIXME
+                | Error::SurrogateCharacterReference // FIXME
+                | Error::UnknownNamedCharacterReference // FIXME
+            );
         }
         labels
     };
 
+    // This will be removed once all tested errors support UTF-16.
+    let _ = labeler(Box::new(tokenizer(html)) as TokenIter);
+    if *doesnt_support_utf16.lock().unwrap() {
+        assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler));
+    } else {
+        // TODO: Move this assertion into test_and_annotate once all tests support it.
+        assert_char_encoding_independence(html, labeler);
+    }
+
     test_and_annotate(html, labeler)
 }
 
@@ -492,3 +537,43 @@ fn error_invalid_first_character_of_tag_name() {
                           ^ invalid-first-character-of-tag-name
     "###);
 }
+
+fn assert_char_encoding_independence<S: AsRef<str> + Clone>(
+    html: &'static str, // tokenized twice: directly and through `Utf16Reader`
+    labeler: impl Fn(TokenIter) -> Vec<(Range<usize>, S)>, // turns tokens into labeled spans
+) {
+    let utf8_tokens = NaiveParser::new(PosTrackingReader::new(html)).flatten();
+    let string_reader = html5tokenizer::reader::IntoReader::into_reader(html);
+    let utf16_tokens =
+        NaiveParser::new(PosTrackingReader::new(Utf16Reader(string_reader))).flatten();
+    let utf8_labels = labeler(Box::new(utf8_tokens));
+    // Expected UTF-16 span: UTF-16 code units before each UTF-8 offset, times 2 bytes per unit.
+    for (idx, (span, _)) in labeler(Box::new(utf16_tokens)).into_iter().enumerate() {
+        let expected_utf16_span = html[..utf8_labels[idx].0.start].encode_utf16().count() * 2
+            ..html[..utf8_labels[idx].0.end].encode_utf16().count() * 2;
+        assert_eq!(
+            span,
+            expected_utf16_span,
+            "UTF-16 span didn't match the UTF-8 span, which looks like:\n{}",
+            annotate(html, vec![utf8_labels[idx].clone()])
+        );
+    } // NOTE(review): only as many labels as the UTF-16 run yields are compared
+}
+
+struct Utf16Reader<'a>(html5tokenizer::reader::StringReader<'a>);
+// Reading is delegated to the wrapped reader; char lengths are reported in UTF-16 bytes.
+impl html5tokenizer::reader::Reader for Utf16Reader<'_> {
+    type Error = std::convert::Infallible;
+
+    fn read_char(&mut self) -> Result<Option<char>, Self::Error> {
+        self.0.read_char()
+    }
+
+    fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result<bool, Self::Error> {
+        self.0.try_read_string(s, case_sensitive)
+    }
+
+    fn len_of_char_in_current_encoding(&self, c: char) -> usize {
+        c.len_utf16() * 2 // UTF-16 code units needed for `c`, two bytes each
+    }
+}
-- 
cgit v1.2.3