use std::ops::Range; use codespan_reporting::{ self, diagnostic::{Diagnostic, Label}, files::SimpleFiles, term::{self, termcolor::Buffer}, }; use html5tokenizer::{offset::PosTrackingReader, NaiveParser, Token}; use insta::assert_snapshot; use similar_asserts::assert_eq; fn tokenizer(html: &'static str) -> impl Iterator> { NaiveParser::new(PosTrackingReader::new(html)).flatten() } /// Just a convenient type alias for labeler closures calling `tokens.next()` /// since Rust apparently cannot infer the type (requiring an annotation). type TokenIter = Box>>; fn test_and_annotate + Clone>( html: &'static str, labeler: impl Fn(TokenIter) -> Vec<(Range, S)>, ) -> String { let labels = labeler(Box::new(tokenizer(html))); // TODO: assert character encoding independence here once all tests support it annotate(html, labels) } fn annotate(html: &str, labels: Vec<(Range, impl AsRef)>) -> String { let mut files = SimpleFiles::new(); let file_id = files.add("test.html", html); let diagnostic = Diagnostic::note().with_labels( labels .into_iter() .map(|(span, text)| Label::primary(file_id, span).with_message(text.as_ref())) .collect(), ); let mut writer = Buffer::no_color(); let config = codespan_reporting::term::Config::default(); term::emit(&mut writer, &config, &files, &diagnostic).unwrap(); let msg = std::str::from_utf8(writer.as_slice()).unwrap(); // strip the filename and the line numbers since we don't need them // (apparently they cannot be disabled in codespan_reporting) msg.lines() .skip(3) .flat_map(|l| l.split_once("│ ").map(|s| format!("{}\n", s.1.trim_end()))) .collect::>() .join("") } fn assert_panics_but_should_not(f: impl FnOnce() + std::panic::UnwindSafe) { assert!( std::panic::catch_unwind(f).is_err(), "congrats! you made some span test support UTF-16, please stop calling assert_panics_but_should_not for this test" ); } #[test] fn start_tag_span() { let html = " "; let labeler = |tokens| { let mut labels = Vec::new(); for token in tokens { if let Token::StartTag(tag) = token { labels.push((tag.span, "")); } } labels }; assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME assert_snapshot!(test_and_annotate(html, labeler), @r###" ^^^ ^^^^^ ^^^^^^^ ^^^^^^ "###); } #[test] fn end_tag_span() { let html = " "; let labeler = |tokens| { let mut labels = Vec::new(); for token in tokens { if let Token::EndTag(tag) = token { labels.push((tag.span, "")); } } labels }; assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME assert_snapshot!(test_and_annotate(html, labeler), @r###" ^^^^ ^^^^^^ ^^^^^^^^ ^^^^^^^ "###); } #[test] fn start_tag_name_span() { let html = " "; let labeler = |tokens| { let mut labels = Vec::new(); for token in tokens { if let Token::StartTag(tag) = token { labels.push((tag.name_span(), "")); } } labels }; assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME assert_snapshot!(test_and_annotate(html, labeler), @r###" ^ ^^^ ^^^ ^^^ "###); } #[test] fn end_tag_name_span() { let html = " "; let labeler = |tokens| { let mut labels = Vec::new(); for token in tokens { if let Token::EndTag(tag) = token { labels.push((tag.name_span(), "")); } } labels }; assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME assert_snapshot!(test_and_annotate(html, labeler), @r###" ^ ^^^ ^^^ ^^^ "###); } #[test] fn attribute_name_span() { let html = ""; let labeler = |mut tokens: TokenIter| { let mut labels = Vec::new(); let Token::StartTag(tag) = tokens.next().unwrap() else { panic!("expected start tag") }; for attr in &tag.attributes { labels.push((attr.name_span(), "")); } labels }; assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME assert_snapshot!(test_and_annotate(html, labeler), @r###" ^ ^^^ ^ ^^ ^ ^^^ "###); } #[test] fn attribute_value_span() { let html = ""; let labeler = |mut tokens: TokenIter| { let mut labels = Vec::new(); let Token::StartTag(tag) = tokens.next().unwrap() else { panic!("expected start tag") }; for attr in &tag.attributes { labels.push((attr.value_span().unwrap(), "")); } labels }; assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME assert_snapshot!(test_and_annotate(html, labeler), @r###" ^^^^^^^^ ^^^^^^^^ ^^^^^^^^^^^^^ ^^^^^^^^^^^^^ ^ "###); } #[test] fn attribute_value_with_char_ref() { let html = ""; let labeler = |mut tokens: TokenIter| { let mut labels = Vec::new(); let Token::StartTag(tag) = tokens.next().unwrap() else { panic!("expected start tag") }; for attr in &tag.attributes { labels.push((attr.value_span().unwrap(), "")); } labels }; assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME assert_snapshot!(test_and_annotate(html, labeler), @r###" ^^^^^ ^^^^^ ^^^^^ "###); } #[test] fn comment_data_span() { #[rustfmt::skip] let cases = [ "", // bogus comments "", ]; let mut annotated = String::new(); for case in cases { let labeler = |tokens: TokenIter| { let Token::Comment(comment) = tokens .filter(|t| !matches!(t, Token::Error { .. })) .next() .unwrap() else { panic!("expected comment"); }; vec![(comment.data_span(), "")] }; assert_panics_but_should_not(|| assert_char_encoding_independence(case, labeler)); // FIXME annotated.push_str(&test_and_annotate(case, labeler)); } assert_snapshot!(annotated, @r###" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ "###); for (idx, case) in cases.iter().enumerate() { let Token::Comment(comment) = tokenizer(case) .filter(|t| !matches!(t, Token::Error { .. })) .next() .unwrap() else { panic!("expected comment"); }; assert_eq!(case[comment.data_span()], comment.data, "case {idx}"); } } #[test] fn doctype_span() { #[rustfmt::skip] let cases = [ r#""#, ]; let mut annotated = String::new(); for case in cases { let labeler = |tokens: TokenIter| { let Token::Doctype(doctype) = tokens .filter(|t| !matches!(t, Token::Error { .. })) .next() .unwrap() else { panic!("expected doctype"); }; vec![(doctype.span, "")] }; assert_panics_but_should_not(|| assert_char_encoding_independence(case, labeler)); // FIXME annotated.push_str(&test_and_annotate(case, labeler)); } assert_snapshot!(annotated, @r###" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ "###); } #[test] fn doctype_id_spans() { #[rustfmt::skip] let cases = [ r#""#, ]; let mut annotated = String::new(); for case in cases { let labeler = |tokens: TokenIter| { let Token::Doctype(doctype) = tokens .filter(|t| !matches!(t, Token::Error { .. })) .next() .unwrap() else { panic!("expected doctype"); }; let mut labels = Vec::new(); if let Some(public_id_span) = doctype.public_id_span() { labels.push((public_id_span, "public id")); } if let Some(system_id_span) = doctype.system_id_span() { labels.push((system_id_span, "system id")); } labels }; assert_panics_but_should_not(|| assert_char_encoding_independence(case, labeler)); // FIXME annotated.push_str(&test_and_annotate(case, labeler)); } assert_snapshot!(annotated, @r###" ^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ system id │ public id "###); } fn annotate_errors(html: &'static str) -> String { for token in tokenizer(html) { let Token::Error { span, .. } = token else { continue; }; if span.start == span.end { if span.start != html.len() { panic!("empty error spans are only allowed at the very end of the source (for eof errors)"); } } else { assert!(span.start < span.end); assert!(span.end <= html.len()); } } let doesnt_support_utf16 = std::sync::Mutex::new(false); let labeler = |tokens| { let mut labels = Vec::new(); for token in tokens { let Token::Error { error, span } = token else { continue; }; labels.push((span, error.code())); use html5tokenizer::Error; *doesnt_support_utf16.lock().unwrap() = matches!( error, | Error::AbsenceOfDigitsInNumericCharacterReference // FIXME | Error::CharacterReferenceOutsideUnicodeRange // FIXME | Error::ControlCharacterReference // FIXME | Error::DuplicateAttribute // FIXME | Error::EndTagWithAttributes // FIXME | Error::EndTagWithTrailingSolidus // FIXME | Error::InvalidFirstCharacterOfTagName // FIXME | Error::NoncharacterCharacterReference // FIXME | Error::NullCharacterReference // FIXME | Error::SurrogateCharacterReference // FIXME | Error::UnknownNamedCharacterReference // FIXME ); } labels }; // This will be removed once all tested errors support UTF-16. let _ = labeler(Box::new(tokenizer(html)) as TokenIter); if *doesnt_support_utf16.lock().unwrap() { assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); } else { // TODO: Move this assertion into test_and_annotate once all tests support it. assert_char_encoding_independence(html, labeler); } test_and_annotate(html, labeler) } #[test] fn tests_for_errors_are_sorted() { let source_of_this_file = std::fs::read_to_string(file!()).unwrap(); let mut error_tests: Vec<_> = source_of_this_file .lines() .filter(|l| l.starts_with("fn error_")) .collect(); let error_tests_found_order = error_tests.join("\n"); error_tests.sort(); let error_tests_sorted = error_tests.join("\n"); assert_eq!(error_tests_found_order, error_tests_sorted); } #[test] fn error_char_ref_absence_of_digits() { let html = "&#qux;"; assert_snapshot!(annotate_errors(html), @r###" &#qux; ^^^ absence-of-digits-in-numeric-character-reference "###); } #[test] fn error_char_ref_control_char() { let html = ""; assert_snapshot!(annotate_errors(html), @r###"  ^^^^^^ control-character-reference "###); } #[test] fn error_char_ref_missing_semicolon() { let html = "¬"; assert_snapshot!(annotate_errors(html), @r###" ¬ ^ missing-semicolon-after-character-reference "###); } #[test] fn error_char_ref_noncharacter() { let html = "﷐"; assert_snapshot!(annotate_errors(html), @r###" ﷐ ^^^^^^^^ noncharacter-character-reference "###); } #[test] fn error_char_ref_null_char() { let html = "�"; assert_snapshot!(annotate_errors(html), @r###" � ^^^^ null-character-reference "###); } #[test] fn error_char_ref_outside_unicode_range() { let html = "�"; assert_snapshot!(annotate_errors(html), @r###" � ^^^^^^^^^^ character-reference-outside-unicode-range "###); } #[test] fn error_char_ref_surrogate() { let html = "�"; assert_snapshot!(annotate_errors(html), @r###" � ^^^^^^^^ surrogate-character-reference "###); } #[test] fn error_char_ref_unknown_named() { let html = "The pirate says &arrrrr;"; assert_snapshot!(annotate_errors(html), @r###" The pirate says &arrrrr; ^^^^^^^^ unknown-named-character-reference "###); } #[test] fn error_duplicate_attribute() { let html = "Does this open two pages? "; assert_snapshot!(annotate_errors(html), @r###" Does this open two pages? ^^^^ duplicate-attribute "###); } #[test] fn error_end_tag_with_attributes() { let html = ""; assert_snapshot!(annotate_errors(html), @r###" ^^^^^^ end-tag-with-attributes "###); } #[test] fn error_end_tag_with_trailing_solidus() { let html = "Do you start or do you end? "; assert_snapshot!(annotate_errors(html), @r###" Do you start or do you end? ^ end-tag-with-trailing-solidus "###); } #[test] fn error_eof_before_tag_name() { let html = "<"; assert_snapshot!(annotate_errors(html), @r###" < ^ eof-before-tag-name "###); } // TODO: add error_eof_in_cdata test // blocked by lack of proper tree constructor (NaiveParser doesn't parse CDATA sections) #[test] fn error_eof_in_comment() { let html = "