diff options
author | Martin Fischer <martin@push-f.com> | 2023-09-27 09:25:12 +0200 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2023-09-28 11:00:01 +0200 |
commit | b027ecdb397c2e378491f847660f8eeb740e8cf6 (patch) | |
tree | 1f910e8974c1f37706b3ab78d4214977b36fe74a /tests/spans.rs | |
parent | 635a571ee76bf7fdaaf01c204f30289489b80c1a (diff) |
chore: rename integration tests
Diffstat (limited to 'tests/spans.rs')
-rw-r--r-- | tests/spans.rs | 626 |
1 files changed, 626 insertions, 0 deletions
diff --git a/tests/spans.rs b/tests/spans.rs new file mode 100644 index 0000000..b10808c --- /dev/null +++ b/tests/spans.rs @@ -0,0 +1,626 @@ +use std::convert::Infallible; +use std::ops::Range; + +use codespan_reporting::{ + self, + diagnostic::{Diagnostic, Label}, + files::SimpleFiles, + term::{self, termcolor::Buffer}, +}; +use html5tokenizer::{ + offset::PosTrackingReader, + reader::{IntoReader, Reader}, + trace::Trace, + NaiveParser, Token, +}; +use insta::assert_snapshot; +use similar_asserts::assert_eq; + +/// Just a convenient type alias for labeler closures since Rust +/// apparently cannot infer the type (requiring an annotation). +type Parser = NaiveParser< + PosTrackingReader<Box<dyn Reader<Error = Infallible>>>, + usize, + html5tokenizer::TracingEmitter, +>; + +fn parser<R>(reader: impl IntoReader<'static, Reader = R>) -> Parser +where + R: Reader<Error = Infallible> + 'static, +{ + NaiveParser::new_with_emitter( + PosTrackingReader::new( + Box::new(reader.into_reader()) as Box<dyn Reader<Error = Infallible>> + ), + html5tokenizer::TracingEmitter::default(), + ) +} + +fn test_and_annotate<S: AsRef<str> + Clone>( + html: &'static str, + labeler: impl Fn(Parser) -> Vec<(Range<usize>, S)>, +) -> String { + let labels = labeler(parser(html)); + + assert_char_encoding_independence(html, labeler); + + annotate(html, labels) +} + +fn annotate(html: &str, labels: Vec<(Range<usize>, impl AsRef<str>)>) -> String { + let mut files = SimpleFiles::new(); + let file_id = files.add("test.html", html); + + let diagnostic = Diagnostic::note().with_labels( + labels + .into_iter() + .map(|(span, text)| Label::primary(file_id, span).with_message(text.as_ref())) + .collect(), + ); + + let mut writer = Buffer::no_color(); + let config = codespan_reporting::term::Config::default(); + term::emit(&mut writer, &config, &files, &diagnostic).unwrap(); + let msg = std::str::from_utf8(writer.as_slice()).unwrap(); + + // strip the filename and the line numbers since we don't need them + // (apparently they cannot be disabled in codespan_reporting) + msg.lines() + .skip(3) + .flat_map(|l| l.split_once("│ ").map(|s| format!("{}\n", s.1.trim_end()))) + .collect::<Vec<_>>() + .join("") +} + +#[test] +fn char_span() { + let html = "X & &doesntexist; ѣ </"; + let labeler = |parser: Parser| { + let mut labels = Vec::new(); + for token_trace in parser.flatten() { + if let (Token::Char(c), Trace::Char(span)) = token_trace { + if c != ' ' { + labels.push((span, "")); + } + } + } + labels + }; + assert_snapshot!(test_and_annotate(html, labeler), @r###" + X & &doesntexist; ѣ </ + ^ ^^^^^ ^^^^^^^^^^^^^ ^^^^^^^ ^^ + "###); +} + +#[test] +fn start_tag_span() { + let html = "<x> <xyz> <xyz > <xyz/>"; + let labeler = |parser: Parser| { + let mut labels = Vec::new(); + for (_, trace) in parser.flatten() { + if let Trace::StartTag(trace) = trace { + labels.push((trace.span, "")); + } + } + labels + }; + assert_snapshot!(test_and_annotate(html, labeler), @r###" + <x> <xyz> <xyz > <xyz/> + ^^^ ^^^^^ ^^^^^^^ ^^^^^^ + "###); +} + +#[test] +fn end_tag_span() { + let html = "</x> </xyz> </xyz > </xyz/>"; + let labeler = |parser: Parser| { + let mut labels = Vec::new(); + for (_, trace) in parser.flatten() { + if let Trace::EndTag(trace) = trace { + labels.push((trace.span, "")); + } + } + labels + }; + assert_snapshot!(test_and_annotate(html, labeler), @r###" + </x> </xyz> </xyz > </xyz/> + ^^^^ ^^^^^^ ^^^^^^^^ ^^^^^^^ + "###); +} + +#[test] +fn start_tag_name_span() { + let html = "<x> <xyz> <xyz > <xyz/>"; + let labeler = |parser: Parser| { + let mut labels = Vec::new(); + for (_, trace) in parser.flatten() { + if let Trace::StartTag(trace) = trace { + labels.push((trace.name_span, "")); + } + } + labels + }; + assert_snapshot!(test_and_annotate(html, labeler), @r###" + <x> <xyz> <xyz > <xyz/> + ^ ^^^ ^^^ ^^^ + "###); +} + +#[test] +fn end_tag_name_span() { + let html = "</x> </xyz> </xyz > </xyz/>"; + let labeler = |parser: Parser| { + let mut labels = Vec::new(); + for (_, trace) in parser.flatten() { + if let Trace::EndTag(trace) = trace { + labels.push((trace.name_span, "")); + } + } + labels + }; + assert_snapshot!(test_and_annotate(html, labeler), @r###" + </x> </xyz> </xyz > </xyz/> + ^ ^^^ ^^^ ^^^ + "###); +} + +#[test] +fn attribute_name_span() { + let html = "<test x xyz y=VAL xy=VAL z = VAL yzx = VAL>"; + let labeler = |parser: Parser| { + let mut labels = Vec::new(); + let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap() + else { + panic!("expected start tag") + }; + for attr in &tag.attributes { + labels.push(( + trace.attribute_traces[attr.trace_idx().unwrap()].name_span(), + "", + )); + } + labels + }; + assert_snapshot!(test_and_annotate(html, labeler), @r###" + <test x xyz y=VAL xy=VAL z = VAL yzx = VAL> + ^ ^^^ ^ ^^ ^ ^^^ + "###); +} + +#[test] +fn attribute_value_span() { + let html = "<test x=unquoted y = unquoted z='single-quoted' zz=\"double-quoted\" empty=''>"; + let labeler = |parser: Parser| { + let mut labels = Vec::new(); + let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap() + else { + panic!("expected start tag") + }; + for attr in &tag.attributes { + labels.push(( + trace.attribute_traces[attr.trace_idx().unwrap()] + .value_span() + .unwrap(), + "", + )); + } + labels + }; + assert_snapshot!(test_and_annotate(html, labeler), @r###" + <test x=unquoted y = unquoted z='single-quoted' zz="double-quoted" empty=''> + ^^^^^^^^ ^^^^^^^^ ^^^^^^^^^^^^^ ^^^^^^^^^^^^^ ^ + "###); +} + +#[test] +fn attribute_value_with_char_ref() { + let html = "<test x=& y='&' z=\"&\">"; + let labeler = |parser: Parser| { + let mut labels = Vec::new(); + let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap() + else { + panic!("expected start tag") + }; + for attr in &tag.attributes { + labels.push(( + trace.attribute_traces[attr.trace_idx().unwrap()] + .value_span() + .unwrap(), + "", + )); + } + labels + }; + assert_snapshot!(test_and_annotate(html, labeler), @r###" + <test x=& y='&' z="&"> + ^^^^^ ^^^^^ ^^^^^ + "###); +} + +#[test] +fn comment_data_span() { + #[rustfmt::skip] + let cases = [ + "<!-- Why are you looking at the source code? -->", + "<!-- Why are you looking at the source code? --", + "<!-- Why are you looking at the source code? -", + "<!-- Why are you looking at the source code?", + "<!--", + "<!-->", + "<!---", + "<!--->", + "<!-- Why are you looking at the source code? ->", + "<!-- Why are you looking at the source code? --!>", + "<!-- Why are you looking at the source code? --!", + + // bogus comments + "<! Why are you looking at the source code? -->", + "<!", + ]; + + let mut annotated = String::new(); + for case in cases { + let labeler = |parser: Parser| { + let (_, Trace::Comment(comment)) = parser.flatten().next().unwrap() else { + panic!("expected comment"); + }; + vec![(comment.data_span, "")] + }; + + annotated.push_str(&test_and_annotate(case, labeler)); + } + + assert_snapshot!(annotated, @r###" + <!-- Why are you looking at the source code? --> + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + <!-- Why are you looking at the source code? -- + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + <!-- Why are you looking at the source code? - + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + <!-- Why are you looking at the source code? + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + <!-- + ^ + <!--> + ^ + <!--- + ^ + <!---> + ^ + <!-- Why are you looking at the source code? -> + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + <!-- Why are you looking at the source code? --!> + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + <!-- Why are you looking at the source code? --! + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + <! Why are you looking at the source code? --> + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + <! + ^ + "###); + + for (idx, case) in cases.iter().enumerate() { + let (Token::Comment(data), Trace::Comment(trace)) = parser(*case).flatten().next().unwrap() + else { + panic!("expected comment"); + }; + assert_eq!(case[trace.data_span], data, "case {idx}"); + } +} + +#[test] +fn doctype_span() { + #[rustfmt::skip] + let cases = [ + r#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd" >"#, + ]; + + let mut annotated = String::new(); + for case in cases { + let labeler = |parser: Parser| { + let (_, Trace::Doctype(trace)) = parser.flatten().next().unwrap() else { + panic!("expected doctype"); + }; + vec![(trace.span(), "")] + }; + annotated.push_str(&test_and_annotate(case, labeler)); + } + + assert_snapshot!(annotated, @r###" + <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd" > + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + "###); +} + +#[test] +fn doctype_id_spans() { + #[rustfmt::skip] + let cases = [ + r#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">"#, + ]; + + let mut annotated = String::new(); + for case in cases { + let labeler = |parser: Parser| { + let (_, Trace::Doctype(trace)) = parser.flatten().next().unwrap() else { + panic!("expected doctype"); + }; + + let mut labels = Vec::new(); + if let Some(name_span) = trace.name_span() { + labels.push((name_span, "name")); + } + if let Some(public_id_span) = trace.public_id_span() { + labels.push((public_id_span, "public id")); + } + if let Some(system_id_span) = trace.system_id_span() { + labels.push((system_id_span, "system id")); + } + labels + }; + + annotated.push_str(&test_and_annotate(case, labeler)); + } + + assert_snapshot!(annotated, @r###" + <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> + ^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ system id + │ │ + │ public id + name + "###); +} + +#[test] +fn eof_offset() { + let html = "Where does it end?"; + let labeler = |parser: Parser| { + let mut labels = Vec::new(); + for (_, trace) in parser.flatten() { + if let Trace::EndOfFile(offset) = trace { + labels.push((offset..offset, "here")); + } + } + labels + }; + assert_snapshot!(test_and_annotate(html, labeler), @r###" + Where does it end? + ^ here + "###); +} + +fn annotate_errors(html: &'static str) -> String { + let mut parser = parser(html); + for _ in parser.by_ref() {} + let errors: Vec<_> = parser.emitter_mut().drain_errors().collect(); + + for (_, span) in errors { + if span.start == span.end { + if span.start != html.len() { + panic!("empty error spans are only allowed at the very end of the source (for eof errors)"); + } + } else { + assert!(span.start < span.end); + assert!(span.end <= html.len()); + } + } + + let labeler = |mut parser: Parser| { + let mut labels = Vec::new(); + for _ in parser.by_ref() {} + for (error, span) in parser.emitter_mut().drain_errors() { + labels.push((span, error.code())); + } + labels + }; + + test_and_annotate(html, labeler) +} + +#[test] +fn tests_for_errors_are_sorted() { + let source_of_this_file = std::fs::read_to_string(file!()).unwrap(); + let mut error_tests: Vec<_> = source_of_this_file + .lines() + .filter(|l| l.starts_with("fn error_")) + .collect(); + let error_tests_found_order = error_tests.join("\n"); + error_tests.sort(); + let error_tests_sorted = error_tests.join("\n"); + assert_eq!(error_tests_found_order, error_tests_sorted); +} + +#[test] +fn error_char_ref_absence_of_digits() { + let html = "&#qux;"; + assert_snapshot!(annotate_errors(html), @r###" + &#qux; + ^^^ absence-of-digits-in-numeric-character-reference + "###); +} + +#[test] +fn error_char_ref_control_char() { + let html = ""; + assert_snapshot!(annotate_errors(html), @r###" +  + ^^^^^^ control-character-reference + "###); +} + +#[test] +fn error_char_ref_missing_semicolon() { + let html = "¬"; + assert_snapshot!(annotate_errors(html), @r###" + ¬ + ^ missing-semicolon-after-character-reference + "###); +} + +#[test] +fn error_char_ref_noncharacter() { + let html = ""; + assert_snapshot!(annotate_errors(html), @r###" +  + ^^^^^^^^ noncharacter-character-reference + "###); +} + +#[test] +fn error_char_ref_null_char() { + let html = "�"; + assert_snapshot!(annotate_errors(html), @r###" + � + ^^^^ null-character-reference + "###); +} + +#[test] +fn error_char_ref_outside_unicode_range() { + let html = "�"; + assert_snapshot!(annotate_errors(html), @r###" + � + ^^^^^^^^^^ character-reference-outside-unicode-range + "###); +} + +#[test] +fn error_char_ref_surrogate() { + let html = "�"; + assert_snapshot!(annotate_errors(html), @r###" + � + ^^^^^^^^ surrogate-character-reference + "###); +} + +#[test] +fn error_char_ref_unknown_named() { + let html = "The pirate says &arrrrr;"; + assert_snapshot!(annotate_errors(html), @r###" + The pirate says &arrrrr; + ^^^^^^^^ unknown-named-character-reference + "###); +} + +#[test] +fn error_duplicate_attribute() { + let html = "Does this open two pages? <a href=foo.html href=bar.html>"; + assert_snapshot!(annotate_errors(html), @r###" + Does this open two pages? <a href=foo.html href=bar.html> + ^^^^ duplicate-attribute + "###); +} + +#[test] +fn error_end_tag_with_attributes() { + let html = "</end-tag first second=value>"; + assert_snapshot!(annotate_errors(html), @r###" + </end-tag first second=value> + ^^^^^^ end-tag-with-attributes + "###); +} + +#[test] +fn error_end_tag_with_trailing_solidus() { + let html = "Do you start or do you end? </yes/>"; + assert_snapshot!(annotate_errors(html), @r###" + Do you start or do you end? </yes/> + ^ end-tag-with-trailing-solidus + "###); +} + +#[test] +fn error_eof_before_tag_name() { + let html = "<"; + assert_snapshot!(annotate_errors(html), @r###" + < + ^ eof-before-tag-name + "###); +} + +// TODO: add error_eof_in_cdata test +// blocked by lack of proper tree constructor (NaiveParser doesn't parse CDATA sections) + +#[test] +fn error_eof_in_comment() { + let html = "<!--"; + assert_snapshot!(annotate_errors(html), @r###" + <!-- + ^ eof-in-comment + "###); +} + +#[test] +fn error_eof_in_doctype() { + let html = "<!doctype html"; + assert_snapshot!(annotate_errors(html), @r###" + <!doctype html + ^ eof-in-doctype + "###); +} + +#[test] +fn error_eof_in_script_html_comment_like_text() { + let html = "<script><!--"; + assert_snapshot!(annotate_errors(html), @r###" + <script><!-- + ^ eof-in-script-html-comment-like-text + "###); +} + +#[test] +fn error_eof_in_tag() { + let html = "</sarcasm"; + assert_snapshot!(annotate_errors(html), @r###" + </sarcasm + ^ eof-in-tag + "###); +} + +#[test] +fn error_invalid_first_character_of_tag_name() { + let html = "Please mind the gap: < test"; + assert_snapshot!(annotate_errors(html), @r###" + Please mind the gap: < test + ^ invalid-first-character-of-tag-name + "###); +} + +fn assert_char_encoding_independence<S: AsRef<str> + Clone>( + html: &'static str, + labeler: impl Fn(Parser) -> Vec<(Range<usize>, S)>, +) { + let utf8_labels = labeler(parser(html)); + let utf16_labels = labeler(parser(Utf16Reader(html.into_reader()))); + + for (idx, (span, _)) in utf16_labels.into_iter().enumerate() { + let expected_utf16_span = Range { + start: html[..utf8_labels[idx].0.start].encode_utf16().count() * 2, + end: html[..utf8_labels[idx].0.end].encode_utf16().count() * 2, + }; + assert_eq!( + span, + expected_utf16_span, + "UTF-16 span didn't match the UTF-8 span, which looks like:\n{}", + annotate(html, vec![utf8_labels[idx].clone()]) + ); + } +} + +struct Utf16Reader<'a>(html5tokenizer::reader::StringReader<'a>); + +impl html5tokenizer::reader::Reader for Utf16Reader<'_> { + type Error = std::convert::Infallible; + + fn read_char(&mut self) -> Result<Option<char>, Self::Error> { + self.0.read_char() + } + + fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result<bool, Self::Error> { + self.0.try_read_string(s, case_sensitive) + } + + fn len_of_char_in_current_encoding(&self, c: char) -> usize { + c.len_utf16() * 2 + } +} |