path: root/tests/test_spans.rs
author    Martin Fischer <martin@push-f.com>  2023-09-02 09:50:43 +0200
committer Martin Fischer <martin@push-f.com>  2023-09-03 23:00:05 +0200
commit    c040c4fc8091f1b63b63723334b0f1f821e8059f (patch)
tree      118368e18d618f8b1f3d899750802d12151e6cae /tests/test_spans.rs
parent    3eaa8598b5749e5d7554a223ef2079ebdb778730 (diff)
test: verify that span logic incorrectly assumes UTF-8
Diffstat (limited to 'tests/test_spans.rs')
-rw-r--r--  tests/test_spans.rs  85
1 file changed, 85 insertions, 0 deletions
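
The commit's premise is that the span logic reports UTF-8 byte offsets even when the input is read in another encoding. A minimal sketch of why the offsets diverge (the string and function below are illustrative only, not part of the commit):

// Illustrative only, not part of the commit: UTF-8 and UTF-16 byte offsets
// diverge as soon as the input contains a character whose encodings differ
// in length. '€' is 3 bytes in UTF-8 but a single 16-bit code unit (2 bytes)
// in UTF-16, so any span starting after it is off by one byte between the two.
fn main() {
    let html = "€<x>";
    let utf8_start = html.find("<x>").unwrap(); // 3
    let utf16_start = html[..utf8_start].encode_utf16().count() * 2; // 2
    assert_ne!(utf8_start, utf16_start);
}
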
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
index db17328..c58616d 100644
--- a/tests/test_spans.rs
+++ b/tests/test_spans.rs
@@ -54,6 +54,13 @@ fn annotate(html: &str, labels: Vec<(Range<usize>, impl AsRef<str>)>) -> String
.join("")
}
+fn assert_panics_but_should_not(f: impl FnOnce() + std::panic::UnwindSafe) {
+ assert!(
+ std::panic::catch_unwind(f).is_err(),
+ "congrats! you made some span test support UTF-16, please stop calling assert_panics_but_should_not for this test"
+ );
+}
+
#[test]
fn start_tag_span() {
let html = "<x> <xyz> <xyz > <xyz/>";
@@ -66,6 +73,7 @@ fn start_tag_span() {
}
labels
};
+ assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
assert_snapshot!(test_and_annotate(html, labeler), @r###"
<x> <xyz> <xyz > <xyz/>
^^^ ^^^^^ ^^^^^^^ ^^^^^^
@@ -84,6 +92,7 @@ fn end_tag_span() {
}
labels
};
+ assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
assert_snapshot!(test_and_annotate(html, labeler), @r###"
</x> </xyz> </xyz > </xyz/>
^^^^ ^^^^^^ ^^^^^^^^ ^^^^^^^
@@ -102,6 +111,7 @@ fn start_tag_name_span() {
}
labels
};
+ assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
assert_snapshot!(test_and_annotate(html, labeler), @r###"
<x> <xyz> <xyz > <xyz/>
^ ^^^ ^^^ ^^^
@@ -120,6 +130,7 @@ fn end_tag_name_span() {
}
labels
};
+ assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
assert_snapshot!(test_and_annotate(html, labeler), @r###"
</x> </xyz> </xyz > </xyz/>
^ ^^^ ^^^ ^^^
@@ -139,6 +150,7 @@ fn attribute_name_span() {
}
labels
};
+ assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
assert_snapshot!(test_and_annotate(html, labeler), @r###"
<test x xyz y=VAL xy=VAL z = VAL yzx = VAL>
^ ^^^ ^ ^^ ^ ^^^
@@ -158,6 +170,7 @@ fn attribute_value_span() {
}
labels
};
+ assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
assert_snapshot!(test_and_annotate(html, labeler), @r###"
<test x=unquoted y = unquoted z='single-quoted' zz="double-quoted" empty=''>
^^^^^^^^ ^^^^^^^^ ^^^^^^^^^^^^^ ^^^^^^^^^^^^^ ^
@@ -177,6 +190,7 @@ fn attribute_value_with_char_ref() {
}
labels
};
+ assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
assert_snapshot!(test_and_annotate(html, labeler), @r###"
<test x=&amp; y='&amp;' z="&amp;">
^^^^^ ^^^^^ ^^^^^
@@ -205,6 +219,7 @@ fn comment_data_span() {
};
vec![(comment.data_span(), "")]
};
+ assert_panics_but_should_not(|| assert_char_encoding_independence(case, labeler)); // FIXME
annotated.push_str(&test_and_annotate(case, labeler));
}
@@ -246,6 +261,7 @@ fn doctype_span() {
};
vec![(doctype.span, "")]
};
+ assert_panics_but_should_not(|| assert_char_encoding_independence(case, labeler)); // FIXME
annotated.push_str(&test_and_annotate(case, labeler));
}
@@ -283,6 +299,7 @@ fn doctype_id_spans() {
labels
};
+ assert_panics_but_should_not(|| assert_char_encoding_independence(case, labeler)); // FIXME
annotated.push_str(&test_and_annotate(case, labeler));
}
@@ -309,6 +326,8 @@ fn annotate_errors(html: &'static str) -> String {
}
}
+ let doesnt_support_utf16 = std::sync::Mutex::new(false);
+
let labeler = |tokens| {
let mut labels = Vec::new();
for token in tokens {
@@ -317,10 +336,36 @@ fn annotate_errors(html: &'static str) -> String {
};
labels.push((span, error.code()));
+
+ use html5tokenizer::Error;
+
+ *doesnt_support_utf16.lock().unwrap() = matches!(
+ error,
+ | Error::AbsenceOfDigitsInNumericCharacterReference // FIXME
+ | Error::CharacterReferenceOutsideUnicodeRange // FIXME
+ | Error::ControlCharacterReference // FIXME
+ | Error::DuplicateAttribute // FIXME
+ | Error::EndTagWithAttributes // FIXME
+ | Error::EndTagWithTrailingSolidus // FIXME
+ | Error::InvalidFirstCharacterOfTagName // FIXME
+ | Error::NoncharacterCharacterReference // FIXME
+ | Error::NullCharacterReference // FIXME
+ | Error::SurrogateCharacterReference // FIXME
+ | Error::UnknownNamedCharacterReference // FIXME
+ );
}
labels
};
+ // This will be removed once all tested errors support UTF-16.
+ let _ = labeler(Box::new(tokenizer(html)) as TokenIter);
+ if *doesnt_support_utf16.lock().unwrap() {
+ assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler));
+ } else {
+ // TODO: Move this assertion into test_and_annotate once all tests support it.
+ assert_char_encoding_independence(html, labeler);
+ }
+
test_and_annotate(html, labeler)
}
@@ -492,3 +537,43 @@ fn error_invalid_first_character_of_tag_name() {
^ invalid-first-character-of-tag-name
"###);
}
+
+fn assert_char_encoding_independence<S: AsRef<str> + Clone>(
+ html: &'static str,
+ labeler: impl Fn(TokenIter) -> Vec<(Range<usize>, S)>,
+) {
+ let utf8_tokens = NaiveParser::new(PosTrackingReader::new(html)).flatten();
+ let string_reader = html5tokenizer::reader::IntoReader::into_reader(html);
+ let utf16_tokens =
+ NaiveParser::new(PosTrackingReader::new(Utf16Reader(string_reader))).flatten();
+ let utf8_labels = labeler(Box::new(utf8_tokens));
+
+ for (idx, (span, _)) in labeler(Box::new(utf16_tokens)).into_iter().enumerate() {
+ let expected_utf16_span = html[..utf8_labels[idx].0.start].encode_utf16().count() * 2
+ ..html[..utf8_labels[idx].0.end].encode_utf16().count() * 2;
+ assert_eq!(
+ span,
+ expected_utf16_span,
+ "UTF-16 span didn't match the UTF-8 span, which looks like:\n{}",
+ annotate(html, vec![utf8_labels[idx].clone()])
+ );
+ }
+}
+
+struct Utf16Reader<'a>(html5tokenizer::reader::StringReader<'a>);
+
+impl html5tokenizer::reader::Reader for Utf16Reader<'_> {
+ type Error = std::convert::Infallible;
+
+ fn read_char(&mut self) -> Result<Option<char>, Self::Error> {
+ self.0.read_char()
+ }
+
+ fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result<bool, Self::Error> {
+ self.0.try_read_string(s, case_sensitive)
+ }
+
+ fn len_of_char_in_current_encoding(&self, c: char) -> usize {
+ c.len_utf16() * 2
+ }
+}
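
For reference, the span conversion embedded in assert_char_encoding_independence can be read as a standalone helper. This is only a restatement of the two lines computing expected_utf16_span; the helper name is made up for illustration and does not exist in the crate:

// Sketch of the conversion used above: a UTF-8 byte offset into `html` is
// mapped to a UTF-16 byte offset by counting the 16-bit code units of the
// prefix and doubling (two bytes per code unit).
fn utf8_span_to_utf16_span(
    html: &str,
    utf8_span: std::ops::Range<usize>,
) -> std::ops::Range<usize> {
    let to_utf16 = |utf8_idx: usize| html[..utf8_idx].encode_utf16().count() * 2;
    to_utf16(utf8_span.start)..to_utf16(utf8_span.end)
}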