use std::ops::Range; use codespan_reporting::{ self, diagnostic::{Diagnostic, Label}, files::SimpleFiles, term::{self, termcolor::Buffer}, }; use html5tokenizer::{offset::PosTrackingReader, NaiveParser, Token}; use insta::assert_snapshot; use similar_asserts::assert_eq; fn tokenizer(html: &'static str) -> impl Iterator<Item = Token<usize>> { NaiveParser::new(PosTrackingReader::new(html)).flatten() } /// Just a convenient type alias for labeler closures calling `tokens.next()` /// since Rust apparently cannot infer the type (requiring an annotation). type TokenIter = Box<dyn Iterator<Item = Token<usize>>>; fn test_and_annotate<S: AsRef<str> + Clone>( html: &'static str, labeler: impl Fn(TokenIter) -> Vec<(Range<usize>, S)>, ) -> String { let labels = labeler(Box::new(tokenizer(html))); assert_char_encoding_independence(html, labeler); annotate(html, labels) } fn annotate(html: &str, labels: Vec<(Range<usize>, impl AsRef<str>)>) -> String { let mut files = SimpleFiles::new(); let file_id = files.add("test.html", html); let diagnostic = Diagnostic::note().with_labels( labels .into_iter() .map(|(span, text)| Label::primary(file_id, span).with_message(text.as_ref())) .collect(), ); let mut writer = Buffer::no_color(); let config = codespan_reporting::term::Config::default(); term::emit(&mut writer, &config, &files, &diagnostic).unwrap(); let msg = std::str::from_utf8(writer.as_slice()).unwrap(); // strip the filename and the line numbers since we don't need them // (apparently they cannot be disabled in codespan_reporting) msg.lines() .skip(3) .flat_map(|l| l.split_once("│ ").map(|s| format!("{}\n", s.1.trim_end()))) .collect::<Vec<_>>() .join("") } #[test] fn start_tag_span() { let html = "<x> <xyz> <xyz > <xyz/>"; let labeler = |tokens| { let mut labels = Vec::new(); for token in tokens { if let Token::StartTag(tag) = token { labels.push((tag.span, "")); } } labels }; assert_snapshot!(test_and_annotate(html, labeler), @r###" <x> <xyz> <xyz > <xyz/> ^^^ ^^^^^ ^^^^^^^ ^^^^^^ "###); } #[test] fn end_tag_span() { let html = "</x> </xyz> </xyz > </xyz/>"; let labeler = |tokens| { let mut labels = Vec::new(); for token in tokens { if let Token::EndTag(tag) = token { labels.push((tag.span, "")); } } labels }; assert_snapshot!(test_and_annotate(html, labeler), @r###" </x> </xyz> </xyz > </xyz/> ^^^^ ^^^^^^ ^^^^^^^^ ^^^^^^^ "###); } #[test] fn start_tag_name_span() { let html = "<x> <xyz> <xyz > <xyz/>"; let labeler = |tokens| { let mut labels = Vec::new(); for token in tokens { if let Token::StartTag(tag) = token { labels.push((tag.name_span, "")); } } labels }; assert_snapshot!(test_and_annotate(html, labeler), @r###" <x> <xyz> <xyz > <xyz/> ^ ^^^ ^^^ ^^^ "###); } #[test] fn end_tag_name_span() { let html = "</x> </xyz> </xyz > </xyz/>"; let labeler = |tokens| { let mut labels = Vec::new(); for token in tokens { if let Token::EndTag(tag) = token { labels.push((tag.name_span, "")); } } labels }; assert_snapshot!(test_and_annotate(html, labeler), @r###" </x> </xyz> </xyz > </xyz/> ^ ^^^ ^^^ ^^^ "###); } #[test] fn attribute_name_span() { let html = "<test x xyz y=VAL xy=VAL z = VAL yzx = VAL>"; let labeler = |mut tokens: TokenIter| { let mut labels = Vec::new(); let Token::StartTag(tag) = tokens.next().unwrap() else { panic!("expected start tag") }; for attr in &tag.attributes { labels.push((attr.name_span(), "")); } labels }; assert_snapshot!(test_and_annotate(html, labeler), @r###" <test x xyz y=VAL xy=VAL z = VAL yzx = VAL> ^ ^^^ ^ ^^ ^ ^^^ "###); } #[test] fn attribute_value_span() { let html = "<test x=unquoted y = unquoted z='single-quoted' zz=\"double-quoted\" empty=''>"; let labeler = |mut tokens: TokenIter| { let mut labels = Vec::new(); let Token::StartTag(tag) = tokens.next().unwrap() else { panic!("expected start tag") }; for attr in &tag.attributes { labels.push((attr.value_span().unwrap(), "")); } labels }; assert_snapshot!(test_and_annotate(html, labeler), @r###" <test x=unquoted y = unquoted z='single-quoted' zz="double-quoted" empty=''> ^^^^^^^^ ^^^^^^^^ ^^^^^^^^^^^^^ ^^^^^^^^^^^^^ ^ "###); } #[test] fn attribute_value_with_char_ref() { let html = "<test x=& y='&' z=\"&\">"; let labeler = |mut tokens: TokenIter| { let mut labels = Vec::new(); let Token::StartTag(tag) = tokens.next().unwrap() else { panic!("expected start tag") }; for attr in &tag.attributes { labels.push((attr.value_span().unwrap(), "")); } labels }; assert_snapshot!(test_and_annotate(html, labeler), @r###" <test x=& y='&' z="&"> ^^^^^ ^^^^^ ^^^^^ "###); } #[test] fn comment_data_span() { #[rustfmt::skip] let cases = [ "<!-- Why are you looking at the source code? -->", "<!-- Why are you looking at the source code? --", "<!-- Why are you looking at the source code? -", "<!-- Why are you looking at the source code?", "<!--", "<!-->", "<!---", "<!--->", "<!-- Why are you looking at the source code? ->", "<!-- Why are you looking at the source code? --!>", "<!-- Why are you looking at the source code? --!", // bogus comments "<! Why are you looking at the source code? -->", "<!", ]; let mut annotated = String::new(); for case in cases { let labeler = |tokens: TokenIter| { let Token::Comment(comment) = tokens .filter(|t| !matches!(t, Token::Error { .. })) .next() .unwrap() else { panic!("expected comment"); }; vec![(comment.data_span(), "")] }; annotated.push_str(&test_and_annotate(case, labeler)); } assert_snapshot!(annotated, @r###" <!-- Why are you looking at the source code? --> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ <!-- Why are you looking at the source code? -- ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ <!-- Why are you looking at the source code? - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ <!-- Why are you looking at the source code? ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ <!-- ^ <!--> ^ <!--- ^ <!---> ^ <!-- Why are you looking at the source code? -> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ <!-- Why are you looking at the source code? --!> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ <!-- Why are you looking at the source code? --! ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ <! Why are you looking at the source code? --> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ <! ^ "###); for (idx, case) in cases.iter().enumerate() { let Token::Comment(comment) = tokenizer(case) .filter(|t| !matches!(t, Token::Error { .. })) .next() .unwrap() else { panic!("expected comment"); }; assert_eq!(case[comment.data_span()], comment.data, "case {idx}"); } } #[test] fn doctype_span() { #[rustfmt::skip] let cases = [ r#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd" >"#, ]; let mut annotated = String::new(); for case in cases { let labeler = |tokens: TokenIter| { let Token::Doctype(doctype) = tokens .filter(|t| !matches!(t, Token::Error { .. })) .next() .unwrap() else { panic!("expected doctype"); }; vec![(doctype.span, "")] }; annotated.push_str(&test_and_annotate(case, labeler)); } assert_snapshot!(annotated, @r###" <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd" > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ "###); } #[test] fn doctype_id_spans() { #[rustfmt::skip] let cases = [ r#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">"#, ]; let mut annotated = String::new(); for case in cases { let labeler = |tokens: TokenIter| { let Token::Doctype(doctype) = tokens .filter(|t| !matches!(t, Token::Error { .. })) .next() .unwrap() else { panic!("expected doctype"); }; let mut labels = Vec::new(); if let Some(public_id_span) = doctype.public_id_span() { labels.push((public_id_span, "public id")); } if let Some(system_id_span) = doctype.system_id_span() { labels.push((system_id_span, "system id")); } labels }; annotated.push_str(&test_and_annotate(case, labeler)); } assert_snapshot!(annotated, @r###" <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> ^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ system id │ public id "###); } fn annotate_errors(html: &'static str) -> String { for token in tokenizer(html) { let Token::Error { span, .. } = token else { continue; }; if span.start == span.end { if span.start != html.len() { panic!("empty error spans are only allowed at the very end of the source (for eof errors)"); } } else { assert!(span.start < span.end); assert!(span.end <= html.len()); } } let labeler = |tokens| { let mut labels = Vec::new(); for token in tokens { let Token::Error { error, span } = token else { continue; }; labels.push((span, error.code())); } labels }; test_and_annotate(html, labeler) } #[test] fn tests_for_errors_are_sorted() { let source_of_this_file = std::fs::read_to_string(file!()).unwrap(); let mut error_tests: Vec<_> = source_of_this_file .lines() .filter(|l| l.starts_with("fn error_")) .collect(); let error_tests_found_order = error_tests.join("\n"); error_tests.sort(); let error_tests_sorted = error_tests.join("\n"); assert_eq!(error_tests_found_order, error_tests_sorted); } #[test] fn error_char_ref_absence_of_digits() { let html = "&#qux;"; assert_snapshot!(annotate_errors(html), @r###" &#qux; ^^^ absence-of-digits-in-numeric-character-reference "###); } #[test] fn error_char_ref_control_char() { let html = ""; assert_snapshot!(annotate_errors(html), @r###"  ^^^^^^ control-character-reference "###); } #[test] fn error_char_ref_missing_semicolon() { let html = "¬"; assert_snapshot!(annotate_errors(html), @r###" ¬ ^ missing-semicolon-after-character-reference "###); } #[test] fn error_char_ref_noncharacter() { let html = ""; assert_snapshot!(annotate_errors(html), @r###"  ^^^^^^^^ noncharacter-character-reference "###); } #[test] fn error_char_ref_null_char() { let html = "�"; assert_snapshot!(annotate_errors(html), @r###" � ^^^^ null-character-reference "###); } #[test] fn error_char_ref_outside_unicode_range() { let html = "�"; assert_snapshot!(annotate_errors(html), @r###" � ^^^^^^^^^^ character-reference-outside-unicode-range "###); } #[test] fn error_char_ref_surrogate() { let html = "�"; assert_snapshot!(annotate_errors(html), @r###" � ^^^^^^^^ surrogate-character-reference "###); } #[test] fn error_char_ref_unknown_named() { let html = "The pirate says &arrrrr;"; assert_snapshot!(annotate_errors(html), @r###" The pirate says &arrrrr; ^^^^^^^^ unknown-named-character-reference "###); } #[test] fn error_duplicate_attribute() { let html = "Does this open two pages? <a href=foo.html href=bar.html>"; assert_snapshot!(annotate_errors(html), @r###" Does this open two pages? <a href=foo.html href=bar.html> ^^^^ duplicate-attribute "###); } #[test] fn error_end_tag_with_attributes() { let html = "</end-tag first second=value>"; assert_snapshot!(annotate_errors(html), @r###" </end-tag first second=value> ^^^^^^ end-tag-with-attributes "###); } #[test] fn error_end_tag_with_trailing_solidus() { let html = "Do you start or do you end? </yes/>"; assert_snapshot!(annotate_errors(html), @r###" Do you start or do you end? </yes/> ^ end-tag-with-trailing-solidus "###); } #[test] fn error_eof_before_tag_name() { let html = "<"; assert_snapshot!(annotate_errors(html), @r###" < ^ eof-before-tag-name "###); } // TODO: add error_eof_in_cdata test // blocked by lack of proper tree constructor (NaiveParser doesn't parse CDATA sections) #[test] fn error_eof_in_comment() { let html = "<!--"; assert_snapshot!(annotate_errors(html), @r###" <!-- ^ eof-in-comment "###); } #[test] fn error_eof_in_doctype() { let html = "<!doctype html"; assert_snapshot!(annotate_errors(html), @r###" <!doctype html ^ eof-in-doctype "###); } #[test] fn error_eof_in_script_html_comment_like_text() { let html = "<script><!--"; assert_snapshot!(annotate_errors(html), @r###" <script><!-- ^ eof-in-script-html-comment-like-text "###); } #[test] fn error_eof_in_tag() { let html = "</sarcasm"; assert_snapshot!(annotate_errors(html), @r###" </sarcasm ^ eof-in-tag "###); } #[test] fn error_invalid_first_character_of_tag_name() { let html = "Please mind the gap: < test"; assert_snapshot!(annotate_errors(html), @r###" Please mind the gap: < test ^ invalid-first-character-of-tag-name "###); } fn assert_char_encoding_independence<S: AsRef<str> + Clone>( html: &'static str, labeler: impl Fn(TokenIter) -> Vec<(Range<usize>, S)>, ) { let utf8_tokens = NaiveParser::new(PosTrackingReader::new(html)).flatten(); let string_reader = html5tokenizer::reader::IntoReader::into_reader(html); let utf16_tokens = NaiveParser::new(PosTrackingReader::new(Utf16Reader(string_reader))).flatten(); let utf8_labels = labeler(Box::new(utf8_tokens)); for (idx, (span, _)) in labeler(Box::new(utf16_tokens)).into_iter().enumerate() { let expected_utf16_span = html[..utf8_labels[idx].0.start].encode_utf16().count() * 2 ..html[..utf8_labels[idx].0.end].encode_utf16().count() * 2; assert_eq!( span, expected_utf16_span, "UTF-16 span didn't match the UTF-8 span, which looks like:\n{}", annotate(html, vec![utf8_labels[idx].clone()]) ); } } struct Utf16Reader<'a>(html5tokenizer::reader::StringReader<'a>); impl html5tokenizer::reader::Reader for Utf16Reader<'_> { type Error = std::convert::Infallible; fn read_char(&mut self) -> Result<Option<char>, Self::Error> { self.0.read_char() } fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result<bool, Self::Error> { self.0.try_read_string(s, case_sensitive) } fn len_of_char_in_current_encoding(&self, c: char) -> usize { c.len_utf16() * 2 } }