diff options
author | Martin Fischer <martin@push-f.com> | 2023-09-02 09:50:43 +0200 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2023-09-03 23:00:05 +0200 |
commit | c040c4fc8091f1b63b63723334b0f1f821e8059f (patch) | |
tree | 118368e18d618f8b1f3d899750802d12151e6cae /tests/test_spans.rs | |
parent | 3eaa8598b5749e5d7554a223ef2079ebdb778730 (diff) |
test: verify that span logic incorrectly assumes UTF-8
Diffstat (limited to 'tests/test_spans.rs')
-rw-r--r-- | tests/test_spans.rs | 85 |
1 files changed, 85 insertions, 0 deletions
diff --git a/tests/test_spans.rs b/tests/test_spans.rs index db17328..c58616d 100644 --- a/tests/test_spans.rs +++ b/tests/test_spans.rs @@ -54,6 +54,13 @@ fn annotate(html: &str, labels: Vec<(Range<usize>, impl AsRef<str>)>) -> String .join("") } +fn assert_panics_but_should_not(f: impl FnOnce() + std::panic::UnwindSafe) { + assert!( + std::panic::catch_unwind(f).is_err(), + "congrats! you made some span test support UTF-16, please stop calling assert_panics_but_should_not for this test" + ); +} + #[test] fn start_tag_span() { let html = "<x> <xyz> <xyz > <xyz/>"; @@ -66,6 +73,7 @@ fn start_tag_span() { } labels }; + assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME assert_snapshot!(test_and_annotate(html, labeler), @r###" <x> <xyz> <xyz > <xyz/> ^^^ ^^^^^ ^^^^^^^ ^^^^^^ @@ -84,6 +92,7 @@ fn end_tag_span() { } labels }; + assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME assert_snapshot!(test_and_annotate(html, labeler), @r###" </x> </xyz> </xyz > </xyz/> ^^^^ ^^^^^^ ^^^^^^^^ ^^^^^^^ @@ -102,6 +111,7 @@ fn start_tag_name_span() { } labels }; + assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME assert_snapshot!(test_and_annotate(html, labeler), @r###" <x> <xyz> <xyz > <xyz/> ^ ^^^ ^^^ ^^^ @@ -120,6 +130,7 @@ fn end_tag_name_span() { } labels }; + assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME assert_snapshot!(test_and_annotate(html, labeler), @r###" </x> </xyz> </xyz > </xyz/> ^ ^^^ ^^^ ^^^ @@ -139,6 +150,7 @@ fn attribute_name_span() { } labels }; + assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME assert_snapshot!(test_and_annotate(html, labeler), @r###" <test x xyz y=VAL xy=VAL z = VAL yzx = VAL> ^ ^^^ ^ ^^ ^ ^^^ @@ -158,6 +170,7 @@ fn attribute_value_span() { } labels }; + assert_panics_but_should_not(|| assert_char_encoding_independence(html, 
labeler)); // FIXME assert_snapshot!(test_and_annotate(html, labeler), @r###" <test x=unquoted y = unquoted z='single-quoted' zz="double-quoted" empty=''> ^^^^^^^^ ^^^^^^^^ ^^^^^^^^^^^^^ ^^^^^^^^^^^^^ ^ @@ -177,6 +190,7 @@ fn attribute_value_with_char_ref() { } labels }; + assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME assert_snapshot!(test_and_annotate(html, labeler), @r###" <test x=&amp; y='&amp;' z="&amp;"> ^^^^^ ^^^^^ ^^^^^ @@ -205,6 +219,7 @@ fn comment_data_span() { }; vec![(comment.data_span(), "")] }; + assert_panics_but_should_not(|| assert_char_encoding_independence(case, labeler)); // FIXME annotated.push_str(&test_and_annotate(case, labeler)); } @@ -246,6 +261,7 @@ fn doctype_span() { }; vec![(doctype.span, "")] }; + assert_panics_but_should_not(|| assert_char_encoding_independence(case, labeler)); // FIXME annotated.push_str(&test_and_annotate(case, labeler)); } @@ -283,6 +299,7 @@ fn doctype_id_spans() { labels }; + assert_panics_but_should_not(|| assert_char_encoding_independence(case, labeler)); // FIXME annotated.push_str(&test_and_annotate(case, labeler)); } @@ -309,6 +326,8 @@ fn annotate_errors(html: &'static str) -> String { } } + let doesnt_support_utf16 = std::sync::Mutex::new(false); + let labeler = |tokens| { let mut labels = Vec::new(); for token in tokens { @@ -317,10 +336,36 @@ fn annotate_errors(html: &'static str) -> String { }; labels.push((span, error.code())); + + use html5tokenizer::Error; + + *doesnt_support_utf16.lock().unwrap() = matches!( + error, + | Error::AbsenceOfDigitsInNumericCharacterReference // FIXME + | Error::CharacterReferenceOutsideUnicodeRange // FIXME + | Error::ControlCharacterReference // FIXME + | Error::DuplicateAttribute // FIXME + | Error::EndTagWithAttributes // FIXME + | Error::EndTagWithTrailingSolidus // FIXME + | Error::InvalidFirstCharacterOfTagName // FIXME + | Error::NoncharacterCharacterReference // FIXME + | Error::NullCharacterReference // FIXME + | 
Error::SurrogateCharacterReference // FIXME + | Error::UnknownNamedCharacterReference // FIXME + ); } labels }; + // This will be removed once all tested errors support UTF-16. + let _ = labeler(Box::new(tokenizer(html)) as TokenIter); + if *doesnt_support_utf16.lock().unwrap() { + assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); + } else { + // TODO: Move this assertion into test_and_annotate once all tests support it. + assert_char_encoding_independence(html, labeler); + } + test_and_annotate(html, labeler) } @@ -492,3 +537,43 @@ fn error_invalid_first_character_of_tag_name() { ^ invalid-first-character-of-tag-name "###); } + +fn assert_char_encoding_independence<S: AsRef<str> + Clone>( + html: &'static str, + labeler: impl Fn(TokenIter) -> Vec<(Range<usize>, S)>, +) { + let utf8_tokens = NaiveParser::new(PosTrackingReader::new(html)).flatten(); + let string_reader = html5tokenizer::reader::IntoReader::into_reader(html); + let utf16_tokens = + NaiveParser::new(PosTrackingReader::new(Utf16Reader(string_reader))).flatten(); + let utf8_labels = labeler(Box::new(utf8_tokens)); + + for (idx, (span, _)) in labeler(Box::new(utf16_tokens)).into_iter().enumerate() { + let expected_utf16_span = html[..utf8_labels[idx].0.start].encode_utf16().count() * 2 + ..html[..utf8_labels[idx].0.end].encode_utf16().count() * 2; + assert_eq!( + span, + expected_utf16_span, + "UTF-16 span didn't match the UTF-8 span, which looks like:\n{}", + annotate(html, vec![utf8_labels[idx].clone()]) + ); + } +} + +struct Utf16Reader<'a>(html5tokenizer::reader::StringReader<'a>); + +impl html5tokenizer::reader::Reader for Utf16Reader<'_> { + type Error = std::convert::Infallible; + + fn read_char(&mut self) -> Result<Option<char>, Self::Error> { + self.0.read_char() + } + + fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result<bool, Self::Error> { + self.0.try_read_string(s, case_sensitive) + } + + fn len_of_char_in_current_encoding(&self, c: char) 
-> usize { + c.len_utf16() * 2 + } +} |