aboutsummaryrefslogtreecommitdiff
path: root/tests/spans.rs
diff options
context:
space:
mode:
authorMartin Fischer <martin@push-f.com>2023-09-27 09:25:12 +0200
committerMartin Fischer <martin@push-f.com>2023-09-28 11:00:01 +0200
commitb027ecdb397c2e378491f847660f8eeb740e8cf6 (patch)
tree1f910e8974c1f37706b3ab78d4214977b36fe74a /tests/spans.rs
parent635a571ee76bf7fdaaf01c204f30289489b80c1a (diff)
chore: rename integration tests
Diffstat (limited to 'tests/spans.rs')
-rw-r--r--tests/spans.rs626
1 files changed, 626 insertions, 0 deletions
diff --git a/tests/spans.rs b/tests/spans.rs
new file mode 100644
index 0000000..b10808c
--- /dev/null
+++ b/tests/spans.rs
@@ -0,0 +1,626 @@
+use std::convert::Infallible;
+use std::ops::Range;
+
+use codespan_reporting::{
+ self,
+ diagnostic::{Diagnostic, Label},
+ files::SimpleFiles,
+ term::{self, termcolor::Buffer},
+};
+use html5tokenizer::{
+ offset::PosTrackingReader,
+ reader::{IntoReader, Reader},
+ trace::Trace,
+ NaiveParser, Token,
+};
+use insta::assert_snapshot;
+use similar_asserts::assert_eq;
+
+/// The concrete parser type used by every test in this file: a
+/// [`NaiveParser`] over a [`PosTrackingReader`] wrapping a boxed, infallible
+/// [`Reader`], with `usize` as the second type parameter (presumably the
+/// offset type) and the tracing emitter.
+///
+/// Just a convenient type alias for labeler closures since Rust
+/// apparently cannot infer the type (requiring an annotation).
+type Parser = NaiveParser<
+ PosTrackingReader<Box<dyn Reader<Error = Infallible>>>,
+ usize,
+ html5tokenizer::TracingEmitter,
+>;
+
+/// Builds a [`Parser`] for the given input.
+///
+/// The reader obtained from `reader` is boxed as a trait object so that
+/// different reader implementations all yield the same [`Parser`] type,
+/// then wrapped in a [`PosTrackingReader`] and paired with a default
+/// tracing emitter.
+fn parser<R>(reader: impl IntoReader<'static, Reader = R>) -> Parser
+where
+    R: Reader<Error = Infallible> + 'static,
+{
+    let boxed: Box<dyn Reader<Error = Infallible>> = Box::new(reader.into_reader());
+    let emitter = html5tokenizer::TracingEmitter::default();
+    NaiveParser::new_with_emitter(PosTrackingReader::new(boxed), emitter)
+}
+
+/// Runs `labeler` over a parser for `html` and renders the returned labels
+/// as an annotated copy of the source.
+///
+/// Before rendering, the same labeler is also handed to
+/// `assert_char_encoding_independence`, so every call doubles as an
+/// encoding-independence check.
+fn test_and_annotate<S>(
+    html: &'static str,
+    labeler: impl Fn(Parser) -> Vec<(Range<usize>, S)>,
+) -> String
+where
+    S: AsRef<str> + Clone,
+{
+    // Collect the labels first: `labeler` is moved into the check below.
+    let utf8_labels = labeler(parser(html));
+
+    assert_char_encoding_independence(html, labeler);
+
+    annotate(html, utf8_labels)
+}
+
+/// Renders `html` with the given labeled spans underlined, using
+/// `codespan_reporting` with colors disabled.
+///
+/// Each label becomes a primary label of a single note diagnostic. The
+/// rendered output is then reduced to the part after the `│ ` gutter of each
+/// line, with trailing whitespace trimmed and a `\n` appended per line.
+///
+/// # Panics
+///
+/// Panics if emitting the diagnostic fails or if the rendered output is not
+/// valid UTF-8.
+fn annotate(html: &str, labels: Vec<(Range<usize>, impl AsRef<str>)>) -> String {
+    let mut files = SimpleFiles::new();
+    let file_id = files.add("test.html", html);
+
+    let diagnostic = Diagnostic::note().with_labels(
+        labels
+            .into_iter()
+            .map(|(span, text)| Label::primary(file_id, span).with_message(text.as_ref()))
+            .collect(),
+    );
+
+    let mut writer = Buffer::no_color();
+    let config = term::Config::default();
+    term::emit(&mut writer, &config, &files, &diagnostic).unwrap();
+    let msg = std::str::from_utf8(writer.as_slice()).unwrap();
+
+    // strip the filename and the line numbers since we don't need them
+    // (apparently they cannot be disabled in codespan_reporting)
+    let mut annotated = String::with_capacity(msg.len());
+    for (_, content) in msg.lines().skip(3).filter_map(|line| line.split_once("│ ")) {
+        annotated.push_str(content.trim_end());
+        annotated.push('\n');
+    }
+    annotated
+}
+
+/// Labels the span of every non-space character token, including characters
+/// that stem from character references.
+#[test]
+fn char_span() {
+    let html = "X &amp; &doesntexist; &#1123; </";
+    let labeler = |parser: Parser| {
+        let labels: Vec<_> = parser
+            .flatten()
+            .filter_map(|token_trace| match token_trace {
+                (Token::Char(c), Trace::Char(span)) if c != ' ' => Some((span, "")),
+                _ => None,
+            })
+            .collect();
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+ X &amp; &doesntexist; &#1123; </
+ ^ ^^^^^ ^^^^^^^^^^^^^ ^^^^^^^ ^^
+ "###);
+}
+
+/// Labels the full span of every start tag.
+#[test]
+fn start_tag_span() {
+    let html = "<x> <xyz> <xyz > <xyz/>";
+    let labeler = |parser: Parser| {
+        let labels: Vec<_> = parser
+            .flatten()
+            .filter_map(|(_, trace)| match trace {
+                Trace::StartTag(tag_trace) => Some((tag_trace.span, "")),
+                _ => None,
+            })
+            .collect();
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+ <x> <xyz> <xyz > <xyz/>
+ ^^^ ^^^^^ ^^^^^^^ ^^^^^^
+ "###);
+}
+
+/// Labels the full span of every end tag.
+#[test]
+fn end_tag_span() {
+    let html = "</x> </xyz> </xyz > </xyz/>";
+    let labeler = |parser: Parser| {
+        let labels: Vec<_> = parser
+            .flatten()
+            .filter_map(|(_, trace)| match trace {
+                Trace::EndTag(tag_trace) => Some((tag_trace.span, "")),
+                _ => None,
+            })
+            .collect();
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+ </x> </xyz> </xyz > </xyz/>
+ ^^^^ ^^^^^^ ^^^^^^^^ ^^^^^^^
+ "###);
+}
+
+/// Labels only the tag name within every start tag.
+#[test]
+fn start_tag_name_span() {
+    let html = "<x> <xyz> <xyz > <xyz/>";
+    let labeler = |parser: Parser| {
+        let labels: Vec<_> = parser
+            .flatten()
+            .filter_map(|(_, trace)| match trace {
+                Trace::StartTag(tag_trace) => Some((tag_trace.name_span, "")),
+                _ => None,
+            })
+            .collect();
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+ <x> <xyz> <xyz > <xyz/>
+ ^ ^^^ ^^^ ^^^
+ "###);
+}
+
+/// Labels only the tag name within every end tag.
+#[test]
+fn end_tag_name_span() {
+    let html = "</x> </xyz> </xyz > </xyz/>";
+    let labeler = |parser: Parser| {
+        let labels: Vec<_> = parser
+            .flatten()
+            .filter_map(|(_, trace)| match trace {
+                Trace::EndTag(tag_trace) => Some((tag_trace.name_span, "")),
+                _ => None,
+            })
+            .collect();
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+ </x> </xyz> </xyz > </xyz/>
+ ^ ^^^ ^^^ ^^^
+ "###);
+}
+
+/// Labels the name span of every attribute of the first (and only) start tag.
+#[test]
+fn attribute_name_span() {
+    let html = "<test x xyz y=VAL xy=VAL z = VAL yzx = VAL>";
+    let labeler = |parser: Parser| {
+        let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap()
+        else {
+            panic!("expected start tag")
+        };
+        // Each attribute points at its trace through `trace_idx`.
+        let labels: Vec<_> = (&tag.attributes)
+            .into_iter()
+            .map(|attr| {
+                let attr_trace = &trace.attribute_traces[attr.trace_idx().unwrap()];
+                (attr_trace.name_span(), "")
+            })
+            .collect();
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+ <test x xyz y=VAL xy=VAL z = VAL yzx = VAL>
+ ^ ^^^ ^ ^^ ^ ^^^
+ "###);
+}
+
+/// Labels the value span of every attribute, covering unquoted,
+/// single-quoted, double-quoted and empty values.
+#[test]
+fn attribute_value_span() {
+    let html = "<test x=unquoted y = unquoted z='single-quoted' zz=\"double-quoted\" empty=''>";
+    let labeler = |parser: Parser| {
+        let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap()
+        else {
+            panic!("expected start tag")
+        };
+        // Every attribute in this input has a value, hence the unwrap.
+        let labels: Vec<_> = (&tag.attributes)
+            .into_iter()
+            .map(|attr| {
+                let attr_trace = &trace.attribute_traces[attr.trace_idx().unwrap()];
+                (attr_trace.value_span().unwrap(), "")
+            })
+            .collect();
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+ <test x=unquoted y = unquoted z='single-quoted' zz="double-quoted" empty=''>
+ ^^^^^^^^ ^^^^^^^^ ^^^^^^^^^^^^^ ^^^^^^^^^^^^^ ^
+ "###);
+}
+
+/// Labels attribute value spans when the value consists of a character
+/// reference, for unquoted, single-quoted and double-quoted values.
+#[test]
+fn attribute_value_with_char_ref() {
+    let html = "<test x=&amp; y='&amp;' z=\"&amp;\">";
+    let labeler = |parser: Parser| {
+        let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap()
+        else {
+            panic!("expected start tag")
+        };
+        // Every attribute in this input has a value, hence the unwrap.
+        let labels: Vec<_> = (&tag.attributes)
+            .into_iter()
+            .map(|attr| {
+                let attr_trace = &trace.attribute_traces[attr.trace_idx().unwrap()];
+                (attr_trace.value_span().unwrap(), "")
+            })
+            .collect();
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+ <test x=&amp; y='&amp;' z="&amp;">
+ ^^^^^ ^^^^^ ^^^^^
+ "###);
+}
+
+/// Labels the data span of comments in various states of (in)completeness,
+/// including bogus comments, and additionally checks that slicing the source
+/// with the data span yields exactly the data of the comment token.
+#[test]
+fn comment_data_span() {
+    #[rustfmt::skip]
+    let cases = [
+        "<!-- Why are you looking at the source code? -->",
+        "<!-- Why are you looking at the source code? --",
+        "<!-- Why are you looking at the source code? -",
+        "<!-- Why are you looking at the source code?",
+        "<!--",
+        "<!-->",
+        "<!---",
+        "<!--->",
+        "<!-- Why are you looking at the source code? ->",
+        "<!-- Why are you looking at the source code? --!>",
+        "<!-- Why are you looking at the source code? --!",
+
+        // bogus comments
+        "<! Why are you looking at the source code? -->",
+        "<!",
+    ];
+
+    // One annotated rendering per case, concatenated in case order.
+    let annotated: String = cases
+        .iter()
+        .map(|&case| {
+            test_and_annotate(case, |parser: Parser| {
+                let (_, Trace::Comment(comment)) = parser.flatten().next().unwrap() else {
+                    panic!("expected comment");
+                };
+                vec![(comment.data_span, "")]
+            })
+        })
+        .collect();
+
+    assert_snapshot!(annotated, @r###"
+ <!-- Why are you looking at the source code? -->
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ <!-- Why are you looking at the source code? --
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ <!-- Why are you looking at the source code? -
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ <!-- Why are you looking at the source code?
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ <!--
+ ^
+ <!-->
+ ^
+ <!---
+ ^
+ <!--->
+ ^
+ <!-- Why are you looking at the source code? ->
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ <!-- Why are you looking at the source code? --!>
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ <!-- Why are you looking at the source code? --!
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ <! Why are you looking at the source code? -->
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ <!
+ ^
+ "###);
+
+    // The data span must select exactly the comment data from the source.
+    for (idx, &case) in cases.iter().enumerate() {
+        let (Token::Comment(data), Trace::Comment(trace)) = parser(case).flatten().next().unwrap()
+        else {
+            panic!("expected comment");
+        };
+        assert_eq!(case[trace.data_span], data, "case {idx}");
+    }
+}
+
+/// Labels the full span of a doctype declaration.
+#[test]
+fn doctype_span() {
+    #[rustfmt::skip]
+    let cases = [
+        r#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd" >"#,
+    ];
+
+    // One annotated rendering per case, concatenated in case order.
+    let annotated: String = cases
+        .iter()
+        .map(|&case| {
+            test_and_annotate(case, |parser: Parser| {
+                let (_, Trace::Doctype(trace)) = parser.flatten().next().unwrap() else {
+                    panic!("expected doctype");
+                };
+                vec![(trace.span(), "")]
+            })
+        })
+        .collect();
+
+    assert_snapshot!(annotated, @r###"
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd" >
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ "###);
+}
+
+/// Labels the name, public identifier and system identifier spans of a
+/// doctype declaration (each only if present).
+#[test]
+fn doctype_id_spans() {
+    #[rustfmt::skip]
+    let cases = [
+        r#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">"#,
+    ];
+
+    // One annotated rendering per case, concatenated in case order.
+    let annotated: String = cases
+        .iter()
+        .map(|&case| {
+            test_and_annotate(case, |parser: Parser| {
+                let (_, Trace::Doctype(trace)) = parser.flatten().next().unwrap() else {
+                    panic!("expected doctype");
+                };
+
+                // Keep only the spans that are present, in a fixed order.
+                let candidates = vec![
+                    (trace.name_span(), "name"),
+                    (trace.public_id_span(), "public id"),
+                    (trace.system_id_span(), "system id"),
+                ];
+                let labels: Vec<_> = candidates
+                    .into_iter()
+                    .filter_map(|(span, text)| span.map(|span| (span, text)))
+                    .collect();
+                labels
+            })
+        })
+        .collect();
+
+    assert_snapshot!(annotated, @r###"
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+ ^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ system id
+ │ │
+ │ public id
+ name
+ "###);
+}
+
+/// Labels the offset reported for the end of file as an empty span.
+#[test]
+fn eof_offset() {
+    let html = "Where does it end?";
+    let labeler = |parser: Parser| {
+        let labels: Vec<_> = parser
+            .flatten()
+            .filter_map(|(_, trace)| match trace {
+                Trace::EndOfFile(offset) => Some((offset..offset, "here")),
+                _ => None,
+            })
+            .collect();
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+ Where does it end?
+ ^ here
+ "###);
+}
+
+fn annotate_errors(html: &'static str) -> String {
+ let mut parser = parser(html);
+ for _ in parser.by_ref() {}
+ let errors: Vec<_> = parser.emitter_mut().drain_errors().collect();
+
+ for (_, span) in errors {
+ if span.start == span.end {
+ if span.start != html.len() {
+ panic!("empty error spans are only allowed at the very end of the source (for eof errors)");
+ }
+ } else {
+ assert!(span.start < span.end);
+ assert!(span.end <= html.len());
+ }
+ }
+
+ let labeler = |mut parser: Parser| {
+ let mut labels = Vec::new();
+ for _ in parser.by_ref() {}
+ for (error, span) in parser.emitter_mut().drain_errors() {
+ labels.push((span, error.code()));
+ }
+ labels
+ };
+
+ test_and_annotate(html, labeler)
+}
+
+#[test]
+fn tests_for_errors_are_sorted() {
+ let source_of_this_file = std::fs::read_to_string(file!()).unwrap();
+ let mut error_tests: Vec<_> = source_of_this_file
+ .lines()
+ .filter(|l| l.starts_with("fn error_"))
+ .collect();
+ let error_tests_found_order = error_tests.join("\n");
+ error_tests.sort();
+ let error_tests_sorted = error_tests.join("\n");
+ assert_eq!(error_tests_found_order, error_tests_sorted);
+}
+
+#[test]
+fn error_char_ref_absence_of_digits() {
+ let html = "&#qux;";
+ assert_snapshot!(annotate_errors(html), @r###"
+ &#qux;
+ ^^^ absence-of-digits-in-numeric-character-reference
+ "###);
+}
+
+#[test]
+fn error_char_ref_control_char() {
+ let html = "&#127;";
+ assert_snapshot!(annotate_errors(html), @r###"
+ &#127;
+ ^^^^^^ control-character-reference
+ "###);
+}
+
+#[test]
+fn error_char_ref_missing_semicolon() {
+ let html = "&not";
+ assert_snapshot!(annotate_errors(html), @r###"
+ &not
+ ^ missing-semicolon-after-character-reference
+ "###);
+}
+
+#[test]
+fn error_char_ref_noncharacter() {
+ let html = "&#xFDD0;";
+ assert_snapshot!(annotate_errors(html), @r###"
+ &#xFDD0;
+ ^^^^^^^^ noncharacter-character-reference
+ "###);
+}
+
+#[test]
+fn error_char_ref_null_char() {
+ let html = "&#0;";
+ assert_snapshot!(annotate_errors(html), @r###"
+ &#0;
+ ^^^^ null-character-reference
+ "###);
+}
+
+#[test]
+fn error_char_ref_outside_unicode_range() {
+ let html = "&#9999999;";
+ assert_snapshot!(annotate_errors(html), @r###"
+ &#9999999;
+ ^^^^^^^^^^ character-reference-outside-unicode-range
+ "###);
+}
+
+#[test]
+fn error_char_ref_surrogate() {
+ let html = "&#xD800;";
+ assert_snapshot!(annotate_errors(html), @r###"
+ &#xD800;
+ ^^^^^^^^ surrogate-character-reference
+ "###);
+}
+
+#[test]
+fn error_char_ref_unknown_named() {
+ let html = "The pirate says &arrrrr;";
+ assert_snapshot!(annotate_errors(html), @r###"
+ The pirate says &arrrrr;
+ ^^^^^^^^ unknown-named-character-reference
+ "###);
+}
+
+#[test]
+fn error_duplicate_attribute() {
+ let html = "Does this open two pages? <a href=foo.html href=bar.html>";
+ assert_snapshot!(annotate_errors(html), @r###"
+ Does this open two pages? <a href=foo.html href=bar.html>
+ ^^^^ duplicate-attribute
+ "###);
+}
+
+#[test]
+fn error_end_tag_with_attributes() {
+ let html = "</end-tag first second=value>";
+ assert_snapshot!(annotate_errors(html), @r###"
+ </end-tag first second=value>
+ ^^^^^^ end-tag-with-attributes
+ "###);
+}
+
+#[test]
+fn error_end_tag_with_trailing_solidus() {
+ let html = "Do you start or do you end? </yes/>";
+ assert_snapshot!(annotate_errors(html), @r###"
+ Do you start or do you end? </yes/>
+ ^ end-tag-with-trailing-solidus
+ "###);
+}
+
+#[test]
+fn error_eof_before_tag_name() {
+ let html = "<";
+ assert_snapshot!(annotate_errors(html), @r###"
+ <
+ ^ eof-before-tag-name
+ "###);
+}
+
+// TODO: add error_eof_in_cdata test
+// blocked by lack of proper tree constructor (NaiveParser doesn't parse CDATA sections)
+
+#[test]
+fn error_eof_in_comment() {
+ let html = "<!--";
+ assert_snapshot!(annotate_errors(html), @r###"
+ <!--
+ ^ eof-in-comment
+ "###);
+}
+
+#[test]
+fn error_eof_in_doctype() {
+ let html = "<!doctype html";
+ assert_snapshot!(annotate_errors(html), @r###"
+ <!doctype html
+ ^ eof-in-doctype
+ "###);
+}
+
+#[test]
+fn error_eof_in_script_html_comment_like_text() {
+ let html = "<script><!--";
+ assert_snapshot!(annotate_errors(html), @r###"
+ <script><!--
+ ^ eof-in-script-html-comment-like-text
+ "###);
+}
+
+#[test]
+fn error_eof_in_tag() {
+ let html = "</sarcasm";
+ assert_snapshot!(annotate_errors(html), @r###"
+ </sarcasm
+ ^ eof-in-tag
+ "###);
+}
+
+#[test]
+fn error_invalid_first_character_of_tag_name() {
+ let html = "Please mind the gap: < test";
+ assert_snapshot!(annotate_errors(html), @r###"
+ Please mind the gap: < test
+ ^ invalid-first-character-of-tag-name
+ "###);
+}
+
+/// Asserts that the spans produced by `labeler` do not depend on UTF-8 being
+/// the character encoding.
+///
+/// The labeler is run twice: once over a parser reading `html` directly and
+/// once over a parser reading through [`Utf16Reader`], which reports
+/// character lengths in UTF-16 bytes. Every UTF-16 span must equal the
+/// corresponding UTF-8 span converted to UTF-16 byte offsets.
+///
+/// # Panics
+///
+/// Panics if the two runs yield a different number of labels or if any pair
+/// of spans does not match.
+fn assert_char_encoding_independence<S: AsRef<str> + Clone>(
+    html: &'static str,
+    labeler: impl Fn(Parser) -> Vec<(Range<usize>, S)>,
+) {
+    let utf8_labels = labeler(parser(html));
+    let utf16_labels = labeler(parser(Utf16Reader(html.into_reader())));
+
+    // Without this check a surplus UTF-16 label would surface as an opaque
+    // index-out-of-bounds panic and a missing one would go unnoticed.
+    assert_eq!(
+        utf16_labels.len(),
+        utf8_labels.len(),
+        "the labeler produced a different number of labels for UTF-16 than for UTF-8"
+    );
+
+    // Converts a UTF-8 byte offset into `html` to the matching UTF-16 byte offset.
+    let to_utf16_offset = |utf8_offset: usize| html[..utf8_offset].encode_utf16().count() * 2;
+
+    for (utf8_label, (utf16_span, _)) in utf8_labels.iter().zip(utf16_labels) {
+        let utf8_span = &utf8_label.0;
+        let expected_utf16_span =
+            to_utf16_offset(utf8_span.start)..to_utf16_offset(utf8_span.end);
+        assert_eq!(
+            utf16_span,
+            expected_utf16_span,
+            "UTF-16 span didn't match the UTF-8 span, which looks like:\n{}",
+            annotate(html, vec![utf8_label.clone()])
+        );
+    }
+}
+
+/// A reader that forwards to a string reader but reports the length of each
+/// character as its UTF-16 length in bytes (two bytes per UTF-16 code unit).
+struct Utf16Reader<'a>(html5tokenizer::reader::StringReader<'a>);
+
+impl Reader for Utf16Reader<'_> {
+    type Error = Infallible;
+
+    /// Forwards to the wrapped reader.
+    fn read_char(&mut self) -> Result<Option<char>, Self::Error> {
+        self.0.read_char()
+    }
+
+    /// Forwards to the wrapped reader.
+    fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result<bool, Self::Error> {
+        self.0.try_read_string(s, case_sensitive)
+    }
+
+    /// Returns the number of bytes `c` occupies when encoded as UTF-16.
+    fn len_of_char_in_current_encoding(&self, c: char) -> usize {
+        // `len_utf16` counts 16-bit code units; each one is two bytes.
+        2 * c.len_utf16()
+    }
+}