summaryrefslogtreecommitdiff
path: root/tests/spans.rs
diff options
context:
space:
mode:
Diffstat (limited to 'tests/spans.rs')
-rw-r--r--tests/spans.rs626
1 files changed, 626 insertions, 0 deletions
diff --git a/tests/spans.rs b/tests/spans.rs
new file mode 100644
index 0000000..b10808c
--- /dev/null
+++ b/tests/spans.rs
@@ -0,0 +1,626 @@
+use std::convert::Infallible;
+use std::ops::Range;
+
+use codespan_reporting::{
+ self,
+ diagnostic::{Diagnostic, Label},
+ files::SimpleFiles,
+ term::{self, termcolor::Buffer},
+};
+use html5tokenizer::{
+ offset::PosTrackingReader,
+ reader::{IntoReader, Reader},
+ trace::Trace,
+ NaiveParser, Token,
+};
+use insta::assert_snapshot;
+use similar_asserts::assert_eq;
+
+/// Concrete type of the parser handed to every labeler closure below.
+/// Rust cannot infer a closure's parameter type here, so each labeler
+/// annotates its parameter with this alias. The reader is type-erased
+/// (`Box<dyn Reader>`) so one alias fits all input sources.
+type Parser = NaiveParser<
+ PosTrackingReader<Box<dyn Reader<Error = Infallible>>>,
+ usize,
+ html5tokenizer::TracingEmitter,
+>;
+
+/// Builds a [`Parser`] over `reader`: the reader is boxed (type-erased)
+/// and wrapped in a `PosTrackingReader` so the tracing emitter can
+/// report source offsets.
+fn parser<R>(reader: impl IntoReader<'static, Reader = R>) -> Parser
+where
+ R: Reader<Error = Infallible> + 'static,
+{
+ NaiveParser::new_with_emitter(
+ PosTrackingReader::new(
+ Box::new(reader.into_reader()) as Box<dyn Reader<Error = Infallible>>
+ ),
+ html5tokenizer::TracingEmitter::default(),
+ )
+}
+
+/// Runs `labeler` on a parser over `html` and renders the returned
+/// spans as caret annotations via [`annotate`].
+///
+/// As a side effect, re-runs the labeler with a UTF-16-width reader and
+/// asserts the spans scale consistently across encodings (see
+/// [`assert_char_encoding_independence`]).
+fn test_and_annotate<S: AsRef<str> + Clone>(
+ html: &'static str,
+ labeler: impl Fn(Parser) -> Vec<(Range<usize>, S)>,
+) -> String {
+ let labels = labeler(parser(html));
+
+ assert_char_encoding_independence(html, labeler);
+
+ annotate(html, labels)
+}
+
+/// Renders `html` with each labeled span underlined by carets, using
+/// `codespan_reporting`, then strips the diagnostic header and the
+/// filename/line-number gutter so only source lines and carets remain.
+fn annotate(html: &str, labels: Vec<(Range<usize>, impl AsRef<str>)>) -> String {
+ let mut files = SimpleFiles::new();
+ let file_id = files.add("test.html", html);
+
+ // All labels go on one "note" diagnostic; primary labels render as ^.
+ let diagnostic = Diagnostic::note().with_labels(
+ labels
+ .into_iter()
+ .map(|(span, text)| Label::primary(file_id, span).with_message(text.as_ref()))
+ .collect(),
+ );
+
+ let mut writer = Buffer::no_color();
+ let config = codespan_reporting::term::Config::default();
+ term::emit(&mut writer, &config, &files, &diagnostic).unwrap();
+ let msg = std::str::from_utf8(writer.as_slice()).unwrap();
+
+ // strip the filename and the line numbers since we don't need them
+ // (apparently they cannot be disabled in codespan_reporting);
+ // the first 3 lines are the header, the "│ " separator marks the gutter
+ msg.lines()
+ .skip(3)
+ .flat_map(|l| l.split_once("│ ").map(|s| format!("{}\n", s.1.trim_end())))
+ .collect::<Vec<_>>()
+ .join("")
+}
+
+#[test]
+fn char_span() {
+ // Each non-space Char token is traced with the span of the source
+ // text that produced it: a literal char, a (possibly bogus) char
+ // reference, or the stray "</" at the end.
+ let html = "X &amp; &doesntexist; &#1123; </";
+ let labeler = |parser: Parser| {
+ let mut labels = Vec::new();
+ for token_trace in parser.flatten() {
+ if let (Token::Char(c), Trace::Char(span)) = token_trace {
+ if c != ' ' {
+ labels.push((span, ""));
+ }
+ }
+ }
+ labels
+ };
+ assert_snapshot!(test_and_annotate(html, labeler), @r###"
+ X &amp; &doesntexist; &#1123; </
+ ^ ^^^^^ ^^^^^^^^^^^^^ ^^^^^^^ ^^
+ "###);
+}
+
+#[test]
+fn start_tag_span() {
+ // The start tag span covers the whole tag from "<" through ">",
+ // including whitespace before ">" and a trailing "/".
+ let html = "<x> <xyz> <xyz > <xyz/>";
+ let labeler = |parser: Parser| {
+ let mut labels = Vec::new();
+ for (_, trace) in parser.flatten() {
+ if let Trace::StartTag(trace) = trace {
+ labels.push((trace.span, ""));
+ }
+ }
+ labels
+ };
+ assert_snapshot!(test_and_annotate(html, labeler), @r###"
+ <x> <xyz> <xyz > <xyz/>
+ ^^^ ^^^^^ ^^^^^^^ ^^^^^^
+ "###);
+}
+
+#[test]
+fn end_tag_span() {
+ // The end tag span covers the whole tag from "</" through ">",
+ // including whitespace before ">" and a (bogus) trailing "/".
+ let html = "</x> </xyz> </xyz > </xyz/>";
+ let labeler = |parser: Parser| {
+ let mut labels = Vec::new();
+ for (_, trace) in parser.flatten() {
+ if let Trace::EndTag(trace) = trace {
+ labels.push((trace.span, ""));
+ }
+ }
+ labels
+ };
+ assert_snapshot!(test_and_annotate(html, labeler), @r###"
+ </x> </xyz> </xyz > </xyz/>
+ ^^^^ ^^^^^^ ^^^^^^^^ ^^^^^^^
+ "###);
+}
+
+#[test]
+fn start_tag_name_span() {
+ // name_span covers only the tag name, regardless of how the tag
+ // is closed.
+ let html = "<x> <xyz> <xyz > <xyz/>";
+ let labeler = |parser: Parser| {
+ let mut labels = Vec::new();
+ for (_, trace) in parser.flatten() {
+ if let Trace::StartTag(trace) = trace {
+ labels.push((trace.name_span, ""));
+ }
+ }
+ labels
+ };
+ assert_snapshot!(test_and_annotate(html, labeler), @r###"
+ <x> <xyz> <xyz > <xyz/>
+ ^ ^^^ ^^^ ^^^
+ "###);
+}
+
+#[test]
+fn end_tag_name_span() {
+ // name_span covers only the tag name, regardless of how the tag
+ // is closed.
+ let html = "</x> </xyz> </xyz > </xyz/>";
+ let labeler = |parser: Parser| {
+ let mut labels = Vec::new();
+ for (_, trace) in parser.flatten() {
+ if let Trace::EndTag(trace) = trace {
+ labels.push((trace.name_span, ""));
+ }
+ }
+ labels
+ };
+ assert_snapshot!(test_and_annotate(html, labeler), @r###"
+ </x> </xyz> </xyz > </xyz/>
+ ^ ^^^ ^^^ ^^^
+ "###);
+}
+
+#[test]
+fn attribute_name_span() {
+ // Attribute name spans, with and without values, and with varying
+ // whitespace around "=". Each attribute links to its trace via
+ // trace_idx().
+ let html = "<test x xyz y=VAL xy=VAL z = VAL yzx = VAL>";
+ let labeler = |parser: Parser| {
+ let mut labels = Vec::new();
+ let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap()
+ else {
+ panic!("expected start tag")
+ };
+ for attr in &tag.attributes {
+ labels.push((
+ trace.attribute_traces[attr.trace_idx().unwrap()].name_span(),
+ "",
+ ));
+ }
+ labels
+ };
+ assert_snapshot!(test_and_annotate(html, labeler), @r###"
+ <test x xyz y=VAL xy=VAL z = VAL yzx = VAL>
+ ^ ^^^ ^ ^^ ^ ^^^
+ "###);
+}
+
+#[test]
+fn attribute_value_span() {
+ // Value spans for unquoted, single-quoted and double-quoted values;
+ // the quotes themselves are excluded, and an empty quoted value
+ // yields an empty span (rendered as a single caret).
+ let html = "<test x=unquoted y = unquoted z='single-quoted' zz=\"double-quoted\" empty=''>";
+ let labeler = |parser: Parser| {
+ let mut labels = Vec::new();
+ let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap()
+ else {
+ panic!("expected start tag")
+ };
+ for attr in &tag.attributes {
+ labels.push((
+ trace.attribute_traces[attr.trace_idx().unwrap()]
+ .value_span()
+ .unwrap(),
+ "",
+ ));
+ }
+ labels
+ };
+ assert_snapshot!(test_and_annotate(html, labeler), @r###"
+ <test x=unquoted y = unquoted z='single-quoted' zz="double-quoted" empty=''>
+ ^^^^^^^^ ^^^^^^^^ ^^^^^^^^^^^^^ ^^^^^^^^^^^^^ ^
+ "###);
+}
+
+#[test]
+fn attribute_value_with_char_ref() {
+ // A char reference inside an attribute value: the value span covers
+ // the reference source text (&amp;), not the resolved character.
+ let html = "<test x=&amp; y='&amp;' z=\"&amp;\">";
+ let labeler = |parser: Parser| {
+ let mut labels = Vec::new();
+ let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap()
+ else {
+ panic!("expected start tag")
+ };
+ for attr in &tag.attributes {
+ labels.push((
+ trace.attribute_traces[attr.trace_idx().unwrap()]
+ .value_span()
+ .unwrap(),
+ "",
+ ));
+ }
+ labels
+ };
+ assert_snapshot!(test_and_annotate(html, labeler), @r###"
+ <test x=&amp; y='&amp;' z="&amp;">
+ ^^^^^ ^^^^^ ^^^^^
+ "###);
+}
+
+#[test]
+fn comment_data_span() {
+ // data_span should cover the comment text without the "<!--"/"-->"
+ // delimiters, for well-formed, truncated (EOF) and bogus comments.
+ #[rustfmt::skip]
+ let cases = [
+ "<!-- Why are you looking at the source code? -->",
+ "<!-- Why are you looking at the source code? --",
+ "<!-- Why are you looking at the source code? -",
+ "<!-- Why are you looking at the source code?",
+ "<!--",
+ "<!-->",
+ "<!---",
+ "<!--->",
+ "<!-- Why are you looking at the source code? ->",
+ "<!-- Why are you looking at the source code? --!>",
+ "<!-- Why are you looking at the source code? --!",
+
+ // bogus comments
+ "<! Why are you looking at the source code? -->",
+ "<!",
+ ];
+
+ // Concatenate the annotated rendering of every case into one
+ // snapshot (a single caret means an empty span).
+ let mut annotated = String::new();
+ for case in cases {
+ let labeler = |parser: Parser| {
+ let (_, Trace::Comment(comment)) = parser.flatten().next().unwrap() else {
+ panic!("expected comment");
+ };
+ vec![(comment.data_span, "")]
+ };
+
+ annotated.push_str(&test_and_annotate(case, labeler));
+ }
+
+ assert_snapshot!(annotated, @r###"
+ <!-- Why are you looking at the source code? -->
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ <!-- Why are you looking at the source code? --
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ <!-- Why are you looking at the source code? -
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ <!-- Why are you looking at the source code?
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ <!--
+ ^
+ <!-->
+ ^
+ <!---
+ ^
+ <!--->
+ ^
+ <!-- Why are you looking at the source code? ->
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ <!-- Why are you looking at the source code? --!>
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ <!-- Why are you looking at the source code? --!
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ <! Why are you looking at the source code? -->
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ <!
+ ^
+ "###);
+
+ // Cross-check: slicing the source with data_span must reproduce the
+ // comment data carried by the token itself.
+ for (idx, case) in cases.iter().enumerate() {
+ let (Token::Comment(data), Trace::Comment(trace)) = parser(*case).flatten().next().unwrap()
+ else {
+ panic!("expected comment");
+ };
+ assert_eq!(case[trace.data_span], data, "case {idx}");
+ }
+}
+
+#[test]
+fn doctype_span() {
+ // trace.span() should cover the whole doctype declaration.
+ // NOTE(review): in the recorded snapshot the caret line is longer
+ // than the source line — the span appears to extend past the end of
+ // the input; verify whether this is intended or a span-computation
+ // quirk being pinned here.
+ #[rustfmt::skip]
+ let cases = [
+ r#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd" >"#,
+ ];
+
+ let mut annotated = String::new();
+ for case in cases {
+ let labeler = |parser: Parser| {
+ let (_, Trace::Doctype(trace)) = parser.flatten().next().unwrap() else {
+ panic!("expected doctype");
+ };
+ vec![(trace.span(), "")]
+ };
+ annotated.push_str(&test_and_annotate(case, labeler));
+ }
+
+ assert_snapshot!(annotated, @r###"
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd" >
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ "###);
+}
+
+#[test]
+fn doctype_id_spans() {
+ // Labels the doctype name, public id and system id spans; the ids
+ // exclude their surrounding quotes.
+ #[rustfmt::skip]
+ let cases = [
+ r#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">"#,
+ ];
+
+ let mut annotated = String::new();
+ for case in cases {
+ let labeler = |parser: Parser| {
+ let (_, Trace::Doctype(trace)) = parser.flatten().next().unwrap() else {
+ panic!("expected doctype");
+ };
+
+ // Each span accessor returns Option since a doctype may omit
+ // the name or either id.
+ let mut labels = Vec::new();
+ if let Some(name_span) = trace.name_span() {
+ labels.push((name_span, "name"));
+ }
+ if let Some(public_id_span) = trace.public_id_span() {
+ labels.push((public_id_span, "public id"));
+ }
+ if let Some(system_id_span) = trace.system_id_span() {
+ labels.push((system_id_span, "system id"));
+ }
+ labels
+ };
+
+ annotated.push_str(&test_and_annotate(case, labeler));
+ }
+
+ assert_snapshot!(annotated, @r###"
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+ ^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ system id
+ │ │
+ │ public id
+ name
+ "###);
+}
+
+#[test]
+fn eof_offset() {
+ // The EndOfFile trace carries a single offset; an empty span at that
+ // offset should point just past the last character of the input.
+ let html = "Where does it end?";
+ let labeler = |parser: Parser| {
+ let mut labels = Vec::new();
+ for (_, trace) in parser.flatten() {
+ if let Trace::EndOfFile(offset) = trace {
+ labels.push((offset..offset, "here"));
+ }
+ }
+ labels
+ };
+ assert_snapshot!(test_and_annotate(html, labeler), @r###"
+ Where does it end?
+ ^ here
+ "###);
+}
+
+/// Parses `html` to completion, drains all tokenizer errors from the
+/// emitter and renders each error span annotated with its error code.
+///
+/// Before rendering, sanity-checks every span: non-empty spans must lie
+/// within the source, and empty spans are only allowed at the very end
+/// (EOF errors).
+fn annotate_errors(html: &'static str) -> String {
+ let mut parser = parser(html);
+ // Exhaust the token stream first; errors accumulate in the emitter.
+ for _ in parser.by_ref() {}
+ let errors: Vec<_> = parser.emitter_mut().drain_errors().collect();
+
+ for (_, span) in errors {
+ if span.start == span.end {
+ if span.start != html.len() {
+ panic!("empty error spans are only allowed at the very end of the source (for eof errors)");
+ }
+ } else {
+ assert!(span.start < span.end);
+ assert!(span.end <= html.len());
+ }
+ }
+
+ // Re-parse inside the labeler so test_and_annotate can also run the
+ // char-encoding-independence check on the error spans.
+ let labeler = |mut parser: Parser| {
+ let mut labels = Vec::new();
+ for _ in parser.by_ref() {}
+ for (error, span) in parser.emitter_mut().drain_errors() {
+ labels.push((span, error.code()));
+ }
+ labels
+ };
+
+ test_and_annotate(html, labeler)
+}
+
+#[test]
+fn tests_for_errors_are_sorted() {
+ // Meta-test: reads this very source file and asserts that the
+ // `fn error_*` test functions appear in alphabetical order, so the
+ // error tests below stay easy to scan.
+ let source_of_this_file = std::fs::read_to_string(file!()).unwrap();
+ let mut error_tests: Vec<_> = source_of_this_file
+ .lines()
+ .filter(|l| l.starts_with("fn error_"))
+ .collect();
+ let error_tests_found_order = error_tests.join("\n");
+ error_tests.sort();
+ let error_tests_sorted = error_tests.join("\n");
+ assert_eq!(error_tests_found_order, error_tests_sorted);
+}
+
+#[test]
+fn error_char_ref_absence_of_digits() {
+ // Numeric character reference with no digits after "&#".
+ let html = "&#qux;";
+ assert_snapshot!(annotate_errors(html), @r###"
+ &#qux;
+ ^^^ absence-of-digits-in-numeric-character-reference
+ "###);
+}
+
+#[test]
+fn error_char_ref_control_char() {
+ // &#127; resolves to a control character (DEL).
+ let html = "&#127;";
+ assert_snapshot!(annotate_errors(html), @r###"
+ &#127;
+ ^^^^^^ control-character-reference
+ "###);
+}
+
+#[test]
+fn error_char_ref_missing_semicolon() {
+ // Named reference resolved without a terminating ";"; the error
+ // span points at the position where the ";" was expected.
+ let html = "&not";
+ assert_snapshot!(annotate_errors(html), @r###"
+ &not
+ ^ missing-semicolon-after-character-reference
+ "###);
+}
+
+#[test]
+fn error_char_ref_noncharacter() {
+ // U+FDD0 is a Unicode noncharacter.
+ let html = "&#xFDD0;";
+ assert_snapshot!(annotate_errors(html), @r###"
+ &#xFDD0;
+ ^^^^^^^^ noncharacter-character-reference
+ "###);
+}
+
+#[test]
+fn error_char_ref_null_char() {
+ // Numeric reference to U+0000.
+ let html = "&#0;";
+ assert_snapshot!(annotate_errors(html), @r###"
+ &#0;
+ ^^^^ null-character-reference
+ "###);
+}
+
+#[test]
+fn error_char_ref_outside_unicode_range() {
+ // 9999999 exceeds the maximum code point U+10FFFF.
+ let html = "&#9999999;";
+ assert_snapshot!(annotate_errors(html), @r###"
+ &#9999999;
+ ^^^^^^^^^^ character-reference-outside-unicode-range
+ "###);
+}
+
+#[test]
+fn error_char_ref_surrogate() {
+ // U+D800 is a surrogate code point.
+ let html = "&#xD800;";
+ assert_snapshot!(annotate_errors(html), @r###"
+ &#xD800;
+ ^^^^^^^^ surrogate-character-reference
+ "###);
+}
+
+#[test]
+fn error_char_ref_unknown_named() {
+ // "&arrrrr;" matches no named character reference; the span covers
+ // the whole would-be reference.
+ let html = "The pirate says &arrrrr;";
+ assert_snapshot!(annotate_errors(html), @r###"
+ The pirate says &arrrrr;
+ ^^^^^^^^ unknown-named-character-reference
+ "###);
+}
+
+#[test]
+fn error_duplicate_attribute() {
+ // The span points at the *second* occurrence of the repeated
+ // attribute name.
+ let html = "Does this open two pages? <a href=foo.html href=bar.html>";
+ assert_snapshot!(annotate_errors(html), @r###"
+ Does this open two pages? <a href=foo.html href=bar.html>
+ ^^^^ duplicate-attribute
+ "###);
+}
+
+#[test]
+fn error_end_tag_with_attributes() {
+ // End tags must not carry attributes; the span covers the offending
+ // attribute (name and value).
+ let html = "</end-tag first second=value>";
+ assert_snapshot!(annotate_errors(html), @r###"
+ </end-tag first second=value>
+ ^^^^^^ end-tag-with-attributes
+ "###);
+}
+
+#[test]
+fn error_end_tag_with_trailing_solidus() {
+ // "/>" on an end tag is an error; the span points at the solidus.
+ let html = "Do you start or do you end? </yes/>";
+ assert_snapshot!(annotate_errors(html), @r###"
+ Do you start or do you end? </yes/>
+ ^ end-tag-with-trailing-solidus
+ "###);
+}
+
+#[test]
+fn error_eof_before_tag_name() {
+ // Input ends right after "<"; the empty EOF span renders as one caret.
+ let html = "<";
+ assert_snapshot!(annotate_errors(html), @r###"
+ <
+ ^ eof-before-tag-name
+ "###);
+}
+
+// TODO: add error_eof_in_cdata test
+// blocked by lack of proper tree constructor (NaiveParser doesn't parse CDATA sections)
+
+#[test]
+fn error_eof_in_comment() {
+ // Input ends inside an open comment.
+ let html = "<!--";
+ assert_snapshot!(annotate_errors(html), @r###"
+ <!--
+ ^ eof-in-comment
+ "###);
+}
+
+#[test]
+fn error_eof_in_doctype() {
+ // Input ends inside an unclosed doctype declaration.
+ let html = "<!doctype html";
+ assert_snapshot!(annotate_errors(html), @r###"
+ <!doctype html
+ ^ eof-in-doctype
+ "###);
+}
+
+#[test]
+fn error_eof_in_script_html_comment_like_text() {
+ // Input ends inside "<!--" within script data.
+ let html = "<script><!--";
+ assert_snapshot!(annotate_errors(html), @r###"
+ <script><!--
+ ^ eof-in-script-html-comment-like-text
+ "###);
+}
+
+#[test]
+fn error_eof_in_tag() {
+ // Input ends inside an unclosed tag.
+ let html = "</sarcasm";
+ assert_snapshot!(annotate_errors(html), @r###"
+ </sarcasm
+ ^ eof-in-tag
+ "###);
+}
+
+#[test]
+fn error_invalid_first_character_of_tag_name() {
+ // "< " — the character after "<" cannot start a tag name; the span
+ // points at that character.
+ let html = "Please mind the gap: < test";
+ assert_snapshot!(annotate_errors(html), @r###"
+ Please mind the gap: < test
+ ^ invalid-first-character-of-tag-name
+ "###);
+}
+
+/// Runs `labeler` twice — once over a plain UTF-8 reader and once over
+/// a [`Utf16Reader`] that reports UTF-16 byte widths — and asserts each
+/// UTF-16 span equals the UTF-8 span converted to UTF-16 byte offsets
+/// (number of UTF-16 code units up to the offset, times 2).
+fn assert_char_encoding_independence<S: AsRef<str> + Clone>(
+ html: &'static str,
+ labeler: impl Fn(Parser) -> Vec<(Range<usize>, S)>,
+) {
+ let utf8_labels = labeler(parser(html));
+ let utf16_labels = labeler(parser(Utf16Reader(html.into_reader())));
+
+ for (idx, (span, _)) in utf16_labels.into_iter().enumerate() {
+ let expected_utf16_span = Range {
+ start: html[..utf8_labels[idx].0.start].encode_utf16().count() * 2,
+ end: html[..utf8_labels[idx].0.end].encode_utf16().count() * 2,
+ };
+ assert_eq!(
+ span,
+ expected_utf16_span,
+ "UTF-16 span didn't match the UTF-8 span, which looks like:\n{}",
+ annotate(html, vec![utf8_labels[idx].clone()])
+ );
+ }
+}
+
+/// Reader wrapper that delegates all reading to a `StringReader` but
+/// reports character widths as UTF-16 byte lengths (code units × 2),
+/// used to verify that spans follow the reader's declared encoding.
+struct Utf16Reader<'a>(html5tokenizer::reader::StringReader<'a>);
+
+impl html5tokenizer::reader::Reader for Utf16Reader<'_> {
+ type Error = std::convert::Infallible;
+
+ fn read_char(&mut self) -> Result<Option<char>, Self::Error> {
+ self.0.read_char()
+ }
+
+ fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result<bool, Self::Error> {
+ self.0.try_read_string(s, case_sensitive)
+ }
+
+ // Width of `c` in this "encoding": UTF-16 code units times two bytes.
+ fn len_of_char_in_current_encoding(&self, c: char) -> usize {
+ c.len_utf16() * 2
+ }
+}