From c040c4fc8091f1b63b63723334b0f1f821e8059f Mon Sep 17 00:00:00 2001 From: Martin Fischer Date: Sat, 2 Sep 2023 09:50:43 +0200 Subject: test: verify that span logic incorrectly assumes UTF-8 --- tests/test_spans.rs | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) (limited to 'tests') diff --git a/tests/test_spans.rs b/tests/test_spans.rs index db17328..c58616d 100644 --- a/tests/test_spans.rs +++ b/tests/test_spans.rs @@ -54,6 +54,13 @@ fn annotate(html: &str, labels: Vec<(Range, impl AsRef)>) -> String .join("") } +fn assert_panics_but_should_not(f: impl FnOnce() + std::panic::UnwindSafe) { + assert!( + std::panic::catch_unwind(f).is_err(), + "congrats! you made some span test support UTF-16, please stop calling assert_panics_but_should_not for this test" + ); +} + #[test] fn start_tag_span() { let html = " "; @@ -66,6 +73,7 @@ fn start_tag_span() { } labels }; + assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME assert_snapshot!(test_and_annotate(html, labeler), @r###" ^^^ ^^^^^ ^^^^^^^ ^^^^^^ @@ -84,6 +92,7 @@ fn end_tag_span() { } labels }; + assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME assert_snapshot!(test_and_annotate(html, labeler), @r###" ^^^^ ^^^^^^ ^^^^^^^^ ^^^^^^^ @@ -102,6 +111,7 @@ fn start_tag_name_span() { } labels }; + assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME assert_snapshot!(test_and_annotate(html, labeler), @r###" ^ ^^^ ^^^ ^^^ @@ -120,6 +130,7 @@ fn end_tag_name_span() { } labels }; + assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME assert_snapshot!(test_and_annotate(html, labeler), @r###" ^ ^^^ ^^^ ^^^ @@ -139,6 +150,7 @@ fn attribute_name_span() { } labels }; + assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME assert_snapshot!(test_and_annotate(html, labeler), @r###" ^ ^^^ ^ ^^ ^ ^^^ @@ -158,6 +170,7 @@ fn attribute_value_span() { } labels }; + assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME assert_snapshot!(test_and_annotate(html, labeler), @r###" ^^^^^^^^ ^^^^^^^^ ^^^^^^^^^^^^^ ^^^^^^^^^^^^^ ^ @@ -177,6 +190,7 @@ fn attribute_value_with_char_ref() { } labels }; + assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME assert_snapshot!(test_and_annotate(html, labeler), @r###" ^^^^^ ^^^^^ ^^^^^ @@ -205,6 +219,7 @@ fn comment_data_span() { }; vec![(comment.data_span(), "")] }; + assert_panics_but_should_not(|| assert_char_encoding_independence(case, labeler)); // FIXME annotated.push_str(&test_and_annotate(case, labeler)); } @@ -246,6 +261,7 @@ fn doctype_span() { }; vec![(doctype.span, "")] }; + assert_panics_but_should_not(|| assert_char_encoding_independence(case, labeler)); // FIXME annotated.push_str(&test_and_annotate(case, labeler)); } @@ -283,6 +299,7 @@ fn doctype_id_spans() { labels }; + assert_panics_but_should_not(|| assert_char_encoding_independence(case, labeler)); // FIXME annotated.push_str(&test_and_annotate(case, labeler)); } @@ -309,6 +326,8 @@ fn annotate_errors(html: &'static str) -> String { } } + let doesnt_support_utf16 = std::sync::Mutex::new(false); + let labeler = |tokens| { let mut labels = Vec::new(); for token in tokens { @@ -317,10 +336,36 @@ fn annotate_errors(html: &'static str) -> String { }; labels.push((span, error.code())); + + use html5tokenizer::Error; + + *doesnt_support_utf16.lock().unwrap() = matches!( + error, + | Error::AbsenceOfDigitsInNumericCharacterReference // FIXME + | Error::CharacterReferenceOutsideUnicodeRange // FIXME + | Error::ControlCharacterReference // FIXME + | Error::DuplicateAttribute // FIXME + | Error::EndTagWithAttributes // FIXME + | Error::EndTagWithTrailingSolidus // FIXME + | Error::InvalidFirstCharacterOfTagName // FIXME + | Error::NoncharacterCharacterReference // FIXME + | Error::NullCharacterReference // FIXME + | Error::SurrogateCharacterReference // FIXME + | Error::UnknownNamedCharacterReference // FIXME + ); } labels }; + // This will be removed once all tested errors support UTF-16. + let _ = labeler(Box::new(tokenizer(html)) as TokenIter); + if *doesnt_support_utf16.lock().unwrap() { + assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); + } else { + // TODO: Move this assertion into test_and_annotate once all tests support it. + assert_char_encoding_independence(html, labeler); + } + test_and_annotate(html, labeler) } @@ -492,3 +537,43 @@ fn error_invalid_first_character_of_tag_name() { ^ invalid-first-character-of-tag-name "###); } + +fn assert_char_encoding_independence + Clone>( + html: &'static str, + labeler: impl Fn(TokenIter) -> Vec<(Range, S)>, +) { + let utf8_tokens = NaiveParser::new(PosTrackingReader::new(html)).flatten(); + let string_reader = html5tokenizer::reader::IntoReader::into_reader(html); + let utf16_tokens = + NaiveParser::new(PosTrackingReader::new(Utf16Reader(string_reader))).flatten(); + let utf8_labels = labeler(Box::new(utf8_tokens)); + + for (idx, (span, _)) in labeler(Box::new(utf16_tokens)).into_iter().enumerate() { + let expected_utf16_span = html[..utf8_labels[idx].0.start].encode_utf16().count() * 2 + ..html[..utf8_labels[idx].0.end].encode_utf16().count() * 2; + assert_eq!( + span, + expected_utf16_span, + "UTF-16 span didn't match the UTF-8 span, which looks like:\n{}", + annotate(html, vec![utf8_labels[idx].clone()]) + ); + } +} + +struct Utf16Reader<'a>(html5tokenizer::reader::StringReader<'a>); + +impl html5tokenizer::reader::Reader for Utf16Reader<'_> { + type Error = std::convert::Infallible; + + fn read_char(&mut self) -> Result, Self::Error> { + self.0.read_char() + } + + fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result { + self.0.try_read_string(s, case_sensitive) + } + + fn len_of_char_in_current_encoding(&self, c: char) -> usize { + c.len_utf16() * 2 + } +} -- cgit v1.2.3