From b027ecdb397c2e378491f847660f8eeb740e8cf6 Mon Sep 17 00:00:00 2001
From: Martin Fischer <martin@push-f.com>
Date: Wed, 27 Sep 2023 09:25:12 +0200
Subject: chore: rename integration tests

---
 integration_tests/tests/test_html5lib.rs | 218 -----------
 integration_tests/tests/tokenizer.rs     | 218 +++++++++++
 tests/spans.rs                           | 626 +++++++++++++++++++++++++++++++
 tests/test_spans.rs                      | 626 -------------------------------
 4 files changed, 844 insertions(+), 844 deletions(-)
 delete mode 100644 integration_tests/tests/test_html5lib.rs
 create mode 100644 integration_tests/tests/tokenizer.rs
 create mode 100644 tests/spans.rs
 delete mode 100644 tests/test_spans.rs

diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs
deleted file mode 100644
index 2d3e4cb..0000000
--- a/integration_tests/tests/test_html5lib.rs
+++ /dev/null
@@ -1,218 +0,0 @@
-use std::{fs::File, io::BufReader, ops::Range, path::Path};
-
-use html5lib_tests::{
-    parse_tests, Error as TestError, InitialState, Output, Test, Token as TestToken,
-};
-use html5tokenizer::{
-    offset::{Offset, PosTrackingReader, Position},
-    reader::Reader,
-    BasicEmitter, Emitter, Error, Event, InternalState, Token, Tokenizer, TracingEmitter,
-};
-use similar_asserts::assert_eq;
-
-/// Path to a local checkout of [html5lib-tests], relative to the
-/// directory containing the `Cargo.toml` file of the current crate.
-///
-/// [html5lib-tests]: https://github.com/html5lib/html5lib-tests
-const HTML5LIB_TESTS_PATH: &str = "html5lib-tests";
-
-// FUTURE: it would be nice to assert that HTML5LIB_TESTS_PATH matches the path defined in .gitmodules
-// but this is currently blocked by:
-// * Cargo not setting CARGO_WORKSPACE_DIR (see https://github.com/rust-lang/cargo/issues/3946)
-// * gix-config having more dependencies than I'd want to add for this
-
-#[test]
-fn tokenizer() {
-    // TODO: use a custom test harness with e.g. libtest-mimic
-    let test_dir = format!("{HTML5LIB_TESTS_PATH}/tokenizer");
-
-    let mut test_paths = glob::glob(&format!("{test_dir}/*.test"))
-        .unwrap()
-        .peekable();
-
-    if test_paths.peek().is_none() {
-        panic!(
-            "could not find any .test files in {}, maybe try `git submodule update --init`",
-            test_dir
-        );
-    }
-
-    for test_path in test_paths {
-        let test_path = test_path.unwrap();
-
-        test_tokenizer_file(&test_path);
-    }
-}
-
-fn test_tokenizer_file(path: &Path) {
-    let fname = path.file_name().unwrap().to_str().unwrap();
-
-    if matches!(
-        fname,
-        // We don't implement "Coercing an HTML DOM into an infoset" section
-        "xmlViolation.test" |
-        // Our parser does not operate on bytes, the input isn't valid Rust &str
-        "unicodeCharsProblematic.test"
-    ) {
-        return;
-    }
-
-    let f = File::open(path).unwrap();
-    let bf = BufReader::new(f);
-    let tests = parse_tests(bf).expect(&format!("failed to parse {path:?}"));
-
-    for (i, test) in tests.into_iter().enumerate() {
-        run_test(fname, i, test);
-    }
-}
-
-fn run_test(fname: &str, test_i: usize, test: Test) {
-    for state in &test.initial_states {
-        run_test_inner(
-            fname,
-            test_i,
-            &test,
-            state,
-            Tokenizer::new(&test.input, BasicEmitter::default()),
-            "BasicEmitter string",
-        );
-
-        run_test_inner(
-            fname,
-            test_i,
-            &test,
-            state,
-            Tokenizer::new(
-                BufReader::new(test.input.as_bytes()),
-                BasicEmitter::default(),
-            ),
-            "BasicEmitter bufread",
-        );
-
-        run_test_inner(
-            fname,
-            test_i,
-            &test,
-            state,
-            Tokenizer::new(
-                PosTrackingReader::new(&test.input),
-                TracingEmitter::default(),
-            ),
-            "TracingEmitter string",
-        );
-
-        run_test_inner(
-            fname,
-            test_i,
-            &test,
-            state,
-            Tokenizer::new(
-                PosTrackingReader::new(BufReader::new(test.input.as_bytes())),
-                TracingEmitter::default(),
-            ),
-            "TracingEmitter bufread",
-        );
-    }
-}
-
-fn run_test_inner<T, R, O, E>(
-    fname: &str,
-    test_i: usize,
-    test: &Test,
-    state: &InitialState,
-    mut tokenizer: Tokenizer<R, O, E>,
-    tokenizer_info: &str,
-) where
-    R: Reader + Position<O>,
-    O: Offset,
-    E: Emitter<O> + Iterator<Item = T> + DrainErrors<O>,
-    T: Into<Token>,
-{
-    println!(
-        "==== FILE {}, TEST {}, STATE {:?}, TOKENIZER {} ====",
-        fname, test_i, state, tokenizer_info,
-    );
-    println!("description: {}", test.description);
-    tokenizer.set_internal_state(match state {
-        InitialState::Data => InternalState::Data,
-        InitialState::Plaintext => InternalState::Plaintext,
-        InitialState::Rcdata => InternalState::Rcdata,
-        InitialState::Rawtext => InternalState::Rawtext,
-        InitialState::ScriptData => InternalState::ScriptData,
-        InitialState::CdataSection => InternalState::CdataSection,
-    });
-    if let Some(last_start_tag) = &test.last_start_tag {
-        tokenizer.set_last_start_tag(last_start_tag);
-    }
-
-    let mut actual_tokens = Vec::new();
-
-    while let Some(event) = tokenizer.next() {
-        let token = match event.unwrap() {
-            Event::CdataOpen => {
-                tokenizer.handle_cdata_open(false);
-                continue;
-            }
-            Event::Token(token) => token.into(),
-        };
-
-        match token {
-            Token::StartTag(tag) => actual_tokens.push(TestToken::StartTag {
-                name: tag.name,
-                attributes: tag
-                    .attributes
-                    .into_iter()
-                    .map(|attr| (attr.name, attr.value))
-                    .collect(),
-                self_closing: tag.self_closing,
-            }),
-            Token::EndTag(tag) => actual_tokens.push(TestToken::EndTag { name: tag.name }),
-            Token::Char(c) => {
-                // Coalesce all adjacent character tokens into a single string.
-                if let Some(TestToken::Character(s)) = actual_tokens.last_mut() {
-                    s.push(c);
-                } else {
-                    actual_tokens.push(TestToken::Character(c.into()));
-                }
-            }
-            Token::Comment(comment) => actual_tokens.push(TestToken::Comment(comment)),
-            Token::Doctype(doctype) => actual_tokens.push(TestToken::Doctype {
-                name: doctype.name,
-                public_id: doctype.public_id,
-                system_id: doctype.system_id,
-                force_quirks: doctype.force_quirks,
-            }),
-            Token::EndOfFile => {}
-        };
-    }
-
-    assert_eq!(
-        Output {
-            errors: tokenizer
-                .emitter_mut()
-                .drain_errors()
-                .map(|(e, _)| TestError {
-                    code: e.code().to_string()
-                })
-                .collect(),
-            tokens: actual_tokens,
-        },
-        test.output,
-    );
-}
-
-trait DrainErrors<O> {
-    fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<O>)> + '_>;
-}
-
-impl<O> DrainErrors<O> for BasicEmitter<O> {
-    fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<O>)> + '_> {
-        Box::new(self.drain_errors())
-    }
-}
-
-impl DrainErrors<usize> for TracingEmitter {
-    fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<usize>)> + '_> {
-        Box::new(self.drain_errors())
-    }
-}
diff --git a/integration_tests/tests/tokenizer.rs b/integration_tests/tests/tokenizer.rs
new file mode 100644
index 0000000..2d3e4cb
--- /dev/null
+++ b/integration_tests/tests/tokenizer.rs
@@ -0,0 +1,218 @@
+use std::{fs::File, io::BufReader, ops::Range, path::Path};
+
+use html5lib_tests::{
+    parse_tests, Error as TestError, InitialState, Output, Test, Token as TestToken,
+};
+use html5tokenizer::{
+    offset::{Offset, PosTrackingReader, Position},
+    reader::Reader,
+    BasicEmitter, Emitter, Error, Event, InternalState, Token, Tokenizer, TracingEmitter,
+};
+use similar_asserts::assert_eq;
+
+/// Path to a local checkout of [html5lib-tests], relative to the
+/// directory containing the `Cargo.toml` file of the current crate.
+///
+/// [html5lib-tests]: https://github.com/html5lib/html5lib-tests
+const HTML5LIB_TESTS_PATH: &str = "html5lib-tests";
+
+// FUTURE: it would be nice to assert that HTML5LIB_TESTS_PATH matches the path defined in .gitmodules
+// but this is currently blocked by:
+// * Cargo not setting CARGO_WORKSPACE_DIR (see https://github.com/rust-lang/cargo/issues/3946)
+// * gix-config having more dependencies than I'd want to add for this
+
+#[test]
+fn tokenizer() {
+    // TODO: use a custom test harness with e.g. libtest-mimic
+    let test_dir = format!("{HTML5LIB_TESTS_PATH}/tokenizer");
+
+    let mut test_paths = glob::glob(&format!("{test_dir}/*.test"))
+        .unwrap()
+        .peekable();
+
+    if test_paths.peek().is_none() {
+        panic!(
+            "could not find any .test files in {}, maybe try `git submodule update --init`",
+            test_dir
+        );
+    }
+
+    for test_path in test_paths {
+        let test_path = test_path.unwrap();
+
+        test_tokenizer_file(&test_path);
+    }
+}
+
+fn test_tokenizer_file(path: &Path) {
+    let fname = path.file_name().unwrap().to_str().unwrap();
+
+    if matches!(
+        fname,
+        // We don't implement "Coercing an HTML DOM into an infoset" section
+        "xmlViolation.test" |
+        // Our parser does not operate on bytes, the input isn't valid Rust &str
+        "unicodeCharsProblematic.test"
+    ) {
+        return;
+    }
+
+    let f = File::open(path).unwrap();
+    let bf = BufReader::new(f);
+    let tests = parse_tests(bf).expect(&format!("failed to parse {path:?}"));
+
+    for (i, test) in tests.into_iter().enumerate() {
+        run_test(fname, i, test);
+    }
+}
+
+fn run_test(fname: &str, test_i: usize, test: Test) {
+    for state in &test.initial_states {
+        run_test_inner(
+            fname,
+            test_i,
+            &test,
+            state,
+            Tokenizer::new(&test.input, BasicEmitter::default()),
+            "BasicEmitter string",
+        );
+
+        run_test_inner(
+            fname,
+            test_i,
+            &test,
+            state,
+            Tokenizer::new(
+                BufReader::new(test.input.as_bytes()),
+                BasicEmitter::default(),
+            ),
+            "BasicEmitter bufread",
+        );
+
+        run_test_inner(
+            fname,
+            test_i,
+            &test,
+            state,
+            Tokenizer::new(
+                PosTrackingReader::new(&test.input),
+                TracingEmitter::default(),
+            ),
+            "TracingEmitter string",
+        );
+
+        run_test_inner(
+            fname,
+            test_i,
+            &test,
+            state,
+            Tokenizer::new(
+                PosTrackingReader::new(BufReader::new(test.input.as_bytes())),
+                TracingEmitter::default(),
+            ),
+            "TracingEmitter bufread",
+        );
+    }
+}
+
+fn run_test_inner<T, R, O, E>(
+    fname: &str,
+    test_i: usize,
+    test: &Test,
+    state: &InitialState,
+    mut tokenizer: Tokenizer<R, O, E>,
+    tokenizer_info: &str,
+) where
+    R: Reader + Position<O>,
+    O: Offset,
+    E: Emitter<O> + Iterator<Item = T> + DrainErrors<O>,
+    T: Into<Token>,
+{
+    println!(
+        "==== FILE {}, TEST {}, STATE {:?}, TOKENIZER {} ====",
+        fname, test_i, state, tokenizer_info,
+    );
+    println!("description: {}", test.description);
+    tokenizer.set_internal_state(match state {
+        InitialState::Data => InternalState::Data,
+        InitialState::Plaintext => InternalState::Plaintext,
+        InitialState::Rcdata => InternalState::Rcdata,
+        InitialState::Rawtext => InternalState::Rawtext,
+        InitialState::ScriptData => InternalState::ScriptData,
+        InitialState::CdataSection => InternalState::CdataSection,
+    });
+    if let Some(last_start_tag) = &test.last_start_tag {
+        tokenizer.set_last_start_tag(last_start_tag);
+    }
+
+    let mut actual_tokens = Vec::new();
+
+    while let Some(event) = tokenizer.next() {
+        let token = match event.unwrap() {
+            Event::CdataOpen => {
+                tokenizer.handle_cdata_open(false);
+                continue;
+            }
+            Event::Token(token) => token.into(),
+        };
+
+        match token {
+            Token::StartTag(tag) => actual_tokens.push(TestToken::StartTag {
+                name: tag.name,
+                attributes: tag
+                    .attributes
+                    .into_iter()
+                    .map(|attr| (attr.name, attr.value))
+                    .collect(),
+                self_closing: tag.self_closing,
+            }),
+            Token::EndTag(tag) => actual_tokens.push(TestToken::EndTag { name: tag.name }),
+            Token::Char(c) => {
+                // Coalesce all adjacent character tokens into a single string.
+                if let Some(TestToken::Character(s)) = actual_tokens.last_mut() {
+                    s.push(c);
+                } else {
+                    actual_tokens.push(TestToken::Character(c.into()));
+                }
+            }
+            Token::Comment(comment) => actual_tokens.push(TestToken::Comment(comment)),
+            Token::Doctype(doctype) => actual_tokens.push(TestToken::Doctype {
+                name: doctype.name,
+                public_id: doctype.public_id,
+                system_id: doctype.system_id,
+                force_quirks: doctype.force_quirks,
+            }),
+            Token::EndOfFile => {}
+        };
+    }
+
+    assert_eq!(
+        Output {
+            errors: tokenizer
+                .emitter_mut()
+                .drain_errors()
+                .map(|(e, _)| TestError {
+                    code: e.code().to_string()
+                })
+                .collect(),
+            tokens: actual_tokens,
+        },
+        test.output,
+    );
+}
+
+trait DrainErrors<O> {
+    fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<O>)> + '_>;
+}
+
+impl<O> DrainErrors<O> for BasicEmitter<O> {
+    fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<O>)> + '_> {
+        Box::new(self.drain_errors())
+    }
+}
+
+impl DrainErrors<usize> for TracingEmitter {
+    fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<usize>)> + '_> {
+        Box::new(self.drain_errors())
+    }
+}
diff --git a/tests/spans.rs b/tests/spans.rs
new file mode 100644
index 0000000..b10808c
--- /dev/null
+++ b/tests/spans.rs
@@ -0,0 +1,626 @@
+use std::convert::Infallible;
+use std::ops::Range;
+
+use codespan_reporting::{
+    self,
+    diagnostic::{Diagnostic, Label},
+    files::SimpleFiles,
+    term::{self, termcolor::Buffer},
+};
+use html5tokenizer::{
+    offset::PosTrackingReader,
+    reader::{IntoReader, Reader},
+    trace::Trace,
+    NaiveParser, Token,
+};
+use insta::assert_snapshot;
+use similar_asserts::assert_eq;
+
+/// Just a convenient type alias for labeler closures since Rust
+/// apparently cannot infer the type (requiring an annotation).
+type Parser = NaiveParser<
+    PosTrackingReader<Box<dyn Reader<Error = Infallible>>>,
+    usize,
+    html5tokenizer::TracingEmitter,
+>;
+
+fn parser<R>(reader: impl IntoReader<'static, Reader = R>) -> Parser
+where
+    R: Reader<Error = Infallible> + 'static,
+{
+    NaiveParser::new_with_emitter(
+        PosTrackingReader::new(
+            Box::new(reader.into_reader()) as Box<dyn Reader<Error = Infallible>>
+        ),
+        html5tokenizer::TracingEmitter::default(),
+    )
+}
+
+fn test_and_annotate<S: AsRef<str> + Clone>(
+    html: &'static str,
+    labeler: impl Fn(Parser) -> Vec<(Range<usize>, S)>,
+) -> String {
+    let labels = labeler(parser(html));
+
+    assert_char_encoding_independence(html, labeler);
+
+    annotate(html, labels)
+}
+
+fn annotate(html: &str, labels: Vec<(Range<usize>, impl AsRef<str>)>) -> String {
+    let mut files = SimpleFiles::new();
+    let file_id = files.add("test.html", html);
+
+    let diagnostic = Diagnostic::note().with_labels(
+        labels
+            .into_iter()
+            .map(|(span, text)| Label::primary(file_id, span).with_message(text.as_ref()))
+            .collect(),
+    );
+
+    let mut writer = Buffer::no_color();
+    let config = codespan_reporting::term::Config::default();
+    term::emit(&mut writer, &config, &files, &diagnostic).unwrap();
+    let msg = std::str::from_utf8(writer.as_slice()).unwrap();
+
+    // strip the filename and the line numbers since we don't need them
+    // (apparently they cannot be disabled in codespan_reporting)
+    msg.lines()
+        .skip(3)
+        .flat_map(|l| l.split_once("│ ").map(|s| format!("{}\n", s.1.trim_end())))
+        .collect::<Vec<_>>()
+        .join("")
+}
+
+#[test]
+fn char_span() {
+    let html = "X & &doesntexist; ѣ ";
+    let labeler = |parser: Parser| {
+        let mut labels = Vec::new();
+        for (_, trace) in parser.flatten() {
+            if let Trace::StartTag(trace) = trace {
+                labels.push((trace.span, ""));
+            }
+        }
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+
+    ^^^ ^^^^^ ^^^^^^^ ^^^^^^
+    "###);
+}
+
+#[test]
+fn end_tag_span() {
+    let html = " ";
+    let labeler = |parser: Parser| {
+        let mut labels = Vec::new();
+        for (_, trace) in parser.flatten() {
+            if let Trace::EndTag(trace) = trace {
+                labels.push((trace.span, ""));
+            }
+        }
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+
+    ^^^^ ^^^^^^ ^^^^^^^^ ^^^^^^^
+    "###);
+}
+
+#[test]
+fn start_tag_name_span() {
+    let html = " ";
+    let labeler = |parser: Parser| {
+        let mut labels = Vec::new();
+        for (_, trace) in parser.flatten() {
+            if let Trace::StartTag(trace) = trace {
+                labels.push((trace.name_span, ""));
+            }
+        }
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+
+    ^ ^^^ ^^^ ^^^
+    "###);
+}
+
+#[test]
+fn end_tag_name_span() {
+    let html = " ";
+    let labeler = |parser: Parser| {
+        let mut labels = Vec::new();
+        for (_, trace) in parser.flatten() {
+            if let Trace::EndTag(trace) = trace {
+                labels.push((trace.name_span, ""));
+            }
+        }
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+
+    ^ ^^^ ^^^ ^^^
+    "###);
+}
+
+#[test]
+fn attribute_name_span() {
+    let html = "";
+    let labeler = |parser: Parser| {
+        let mut labels = Vec::new();
+        let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap()
+        else {
+            panic!("expected start tag")
+        };
+        for attr in &tag.attributes {
+            labels.push((
+                trace.attribute_traces[attr.trace_idx().unwrap()].name_span(),
+                "",
+            ));
+        }
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+
+    ^ ^^^ ^ ^^ ^ ^^^
+    "###);
+}
+
+#[test]
+fn attribute_value_span() {
+    let html = "";
+    let labeler = |parser: Parser| {
+        let mut labels = Vec::new();
+        let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap()
+        else {
+            panic!("expected start tag")
+        };
+        for attr in &tag.attributes {
+            labels.push((
+                trace.attribute_traces[attr.trace_idx().unwrap()]
+                    .value_span()
+                    .unwrap(),
+                "",
+            ));
+        }
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+
+    ^^^^^^^^ ^^^^^^^^ ^^^^^^^^^^^^^ ^^^^^^^^^^^^^ ^
+    "###);
+}
+
+#[test]
+fn attribute_value_with_char_ref() {
+    let html = "";
+    let labeler = |parser: Parser| {
+        let mut labels = Vec::new();
+        let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap()
+        else {
+            panic!("expected start tag")
+        };
+        for attr in &tag.attributes {
+            labels.push((
+                trace.attribute_traces[attr.trace_idx().unwrap()]
+                    .value_span()
+                    .unwrap(),
+                "",
+            ));
+        }
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+
+    ^^^^^ ^^^^^ ^^^^^
+    "###);
+}
+
+#[test]
+fn comment_data_span() {
+    #[rustfmt::skip]
+    let cases = [
+        "",
+        "",
+        "",
+        "",
+        "",
+        "
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+    ^
+
+    ^
+
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+    "#,
+    ];
+
+    let mut annotated = String::new();
+    for case in cases {
+        let labeler = |parser: Parser| {
+            let (_, Trace::Doctype(trace)) = parser.flatten().next().unwrap() else {
+                panic!("expected doctype");
+            };
+            vec![(trace.span(), "")]
+        };
+        annotated.push_str(&test_and_annotate(case, labeler));
+    }
+
+    assert_snapshot!(annotated, @r###"
+
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+    "###);
+}
+
+#[test]
+fn doctype_id_spans() {
+    #[rustfmt::skip]
+    let cases = [
+        r#""#,
+    ];
+
+    let mut annotated = String::new();
+    for case in cases {
+        let labeler = |parser: Parser| {
+            let (_, Trace::Doctype(trace)) = parser.flatten().next().unwrap() else {
+                panic!("expected doctype");
+            };
+
+            let mut labels = Vec::new();
+            if let Some(name_span) = trace.name_span() {
+                labels.push((name_span, "name"));
+            }
+            if let Some(public_id_span) = trace.public_id_span() {
+                labels.push((public_id_span, "public id"));
+            }
+            if let Some(system_id_span) = trace.system_id_span() {
+                labels.push((system_id_span, "system id"));
+            }
+            labels
+        };
+
+        annotated.push_str(&test_and_annotate(case, labeler));
+    }
+
+    assert_snapshot!(annotated, @r###"
+
+    ^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ system id
+    │    │
+    │    public id
+    name
+    "###);
+}
+
+#[test]
+fn eof_offset() {
+    let html = "Where does it end?";
+    let labeler = |parser: Parser| {
+        let mut labels = Vec::new();
+        for (_, trace) in parser.flatten() {
+            if let Trace::EndOfFile(offset) = trace {
+                labels.push((offset..offset, "here"));
+            }
+        }
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+    Where does it end?
+                      ^ here
+    "###);
+}
+
+fn annotate_errors(html: &'static str) -> String {
+    let mut parser = parser(html);
+    for _ in parser.by_ref() {}
+    let errors: Vec<_> = parser.emitter_mut().drain_errors().collect();
+
+    for (_, span) in errors {
+        if span.start == span.end {
+            if span.start != html.len() {
+                panic!("empty error spans are only allowed at the very end of the source (for eof errors)");
+            }
+        } else {
+            assert!(span.start < span.end);
+            assert!(span.end <= html.len());
+        }
+    }
+
+    let labeler = |mut parser: Parser| {
+        let mut labels = Vec::new();
+        for _ in parser.by_ref() {}
+        for (error, span) in parser.emitter_mut().drain_errors() {
+            labels.push((span, error.code()));
+        }
+        labels
+    };
+
+    test_and_annotate(html, labeler)
+}
+
+#[test]
+fn tests_for_errors_are_sorted() {
+    let source_of_this_file = std::fs::read_to_string(file!()).unwrap();
+    let mut error_tests: Vec<_> = source_of_this_file
+        .lines()
+        .filter(|l| l.starts_with("fn error_"))
+        .collect();
+    let error_tests_found_order = error_tests.join("\n");
+    error_tests.sort();
+    let error_tests_sorted = error_tests.join("\n");
+    assert_eq!(error_tests_found_order, error_tests_sorted);
+}
+
+#[test]
+fn error_char_ref_absence_of_digits() {
+    let html = "&#qux;";
+    assert_snapshot!(annotate_errors(html), @r###"
+    &#qux;
+    ^^^ absence-of-digits-in-numeric-character-reference
+    "###);
+}
+
+#[test]
+fn error_char_ref_control_char() {
+    let html = "";
+    assert_snapshot!(annotate_errors(html), @r###"
+    
+    ^^^^^^ control-character-reference
+    "###);
+}
+
+#[test]
+fn error_char_ref_missing_semicolon() {
+    let html = "¬";
+    assert_snapshot!(annotate_errors(html), @r###"
+    ¬
+    ^ missing-semicolon-after-character-reference
+    "###);
+}
+
+#[test]
+fn error_char_ref_noncharacter() {
+    let html = "﷐";
+    assert_snapshot!(annotate_errors(html), @r###"
+    ﷐
+    ^^^^^^^^ noncharacter-character-reference
+    "###);
+}
+
+#[test]
+fn error_char_ref_null_char() {
+    let html = "�";
+    assert_snapshot!(annotate_errors(html), @r###"
+    �
+    ^^^^ null-character-reference
+    "###);
+}
+
+#[test]
+fn error_char_ref_outside_unicode_range() {
+    let html = "�";
+    assert_snapshot!(annotate_errors(html), @r###"
+    �
+    ^^^^^^^^^^ character-reference-outside-unicode-range
+    "###);
+}
+
+#[test]
+fn error_char_ref_surrogate() {
+    let html = "�";
+    assert_snapshot!(annotate_errors(html), @r###"
+    �
+    ^^^^^^^^ surrogate-character-reference
+    "###);
+}
+
+#[test]
+fn error_char_ref_unknown_named() {
+    let html = "The pirate says &arrrrr;";
+    assert_snapshot!(annotate_errors(html), @r###"
+    The pirate says &arrrrr;
+                    ^^^^^^^^ unknown-named-character-reference
+    "###);
+}
+
+#[test]
+fn error_duplicate_attribute() {
+    let html = "Does this open two pages? ";
+    assert_snapshot!(annotate_errors(html), @r###"
+    Does this open two pages? 
+    ^^^^ duplicate-attribute
+    "###);
+}
+
+#[test]
+fn error_end_tag_with_attributes() {
+    let html = "";
+    assert_snapshot!(annotate_errors(html), @r###"
+
+    ^^^^^^ end-tag-with-attributes
+    "###);
+}
+
+#[test]
+fn error_end_tag_with_trailing_solidus() {
+    let html = "Do you start or do you end? ";
+    assert_snapshot!(annotate_errors(html), @r###"
+    Do you start or do you end? 
+    ^ end-tag-with-trailing-solidus
+    "###);
+}
+
+#[test]
+fn error_eof_before_tag_name() {
+    let html = "<";
+    assert_snapshot!(annotate_errors(html), @r###"
+    <
+    ^ eof-before-tag-name
+    "###);
+}
+
+// TODO: add error_eof_in_cdata test
+// blocked by lack of proper tree constructor (NaiveParser doesn't parse CDATA sections)
+
+#[test]
+fn error_eof_in_comment() {
+    let html = "
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
deleted file mode 100644
index b10808c..0000000
--- a/tests/test_spans.rs
+++ /dev/null
@@ -1,626 +0,0 @@
-use std::convert::Infallible;
-use std::ops::Range;
-
-use codespan_reporting::{
-    self,
-    diagnostic::{Diagnostic, Label},
-    files::SimpleFiles,
-    term::{self, termcolor::Buffer},
-};
-use html5tokenizer::{
-    offset::PosTrackingReader,
-    reader::{IntoReader, Reader},
-    trace::Trace,
-    NaiveParser, Token,
-};
-use insta::assert_snapshot;
-use similar_asserts::assert_eq;
-
-/// Just a convenient type alias for labeler closures since Rust
-/// apparently cannot infer the type (requiring an annotation).
-type Parser = NaiveParser<
-    PosTrackingReader<Box<dyn Reader<Error = Infallible>>>,
-    usize,
-    html5tokenizer::TracingEmitter,
->;
-
-fn parser<R>(reader: impl IntoReader<'static, Reader = R>) -> Parser
-where
-    R: Reader<Error = Infallible> + 'static,
-{
-    NaiveParser::new_with_emitter(
-        PosTrackingReader::new(
-            Box::new(reader.into_reader()) as Box<dyn Reader<Error = Infallible>>
-        ),
-        html5tokenizer::TracingEmitter::default(),
-    )
-}
-
-fn test_and_annotate<S: AsRef<str> + Clone>(
-    html: &'static str,
-    labeler: impl Fn(Parser) -> Vec<(Range<usize>, S)>,
-) -> String {
-    let labels = labeler(parser(html));
-
-    assert_char_encoding_independence(html, labeler);
-
-    annotate(html, labels)
-}
-
-fn annotate(html: &str, labels: Vec<(Range<usize>, impl AsRef<str>)>) -> String {
-    let mut files = SimpleFiles::new();
-    let file_id = files.add("test.html", html);
-
-    let diagnostic = Diagnostic::note().with_labels(
-        labels
-            .into_iter()
-            .map(|(span, text)| Label::primary(file_id, span).with_message(text.as_ref()))
-            .collect(),
-    );
-
-    let mut writer = Buffer::no_color();
-    let config = codespan_reporting::term::Config::default();
-    term::emit(&mut writer, &config, &files, &diagnostic).unwrap();
-    let msg = std::str::from_utf8(writer.as_slice()).unwrap();
-
-    // strip the filename and the line numbers since we don't need them
-    // (apparently they cannot be disabled in codespan_reporting)
-    msg.lines()
-        .skip(3)
-        .flat_map(|l| l.split_once("│ ").map(|s| format!("{}\n", s.1.trim_end())))
-        .collect::<Vec<_>>()
-        .join("")
-}
-
-#[test]
-fn char_span() {
-    let html = "X & &doesntexist; ѣ ";
-    let labeler = |parser: Parser| {
-        let mut labels = Vec::new();
-        for (_, trace) in parser.flatten() {
-            if let Trace::StartTag(trace) = trace {
-                labels.push((trace.span, ""));
-            }
-        }
-        labels
-    };
-    assert_snapshot!(test_and_annotate(html, labeler), @r###"
-
-    ^^^ ^^^^^ ^^^^^^^ ^^^^^^
-    "###);
-}
-
-#[test]
-fn end_tag_span() {
-    let html = " ";
-    let labeler = |parser: Parser| {
-        let mut labels = Vec::new();
-        for (_, trace) in parser.flatten() {
-            if let Trace::EndTag(trace) = trace {
-                labels.push((trace.span, ""));
-            }
-        }
-        labels
-    };
-    assert_snapshot!(test_and_annotate(html, labeler), @r###"
-
-    ^^^^ ^^^^^^ ^^^^^^^^ ^^^^^^^
-    "###);
-}
-
-#[test]
-fn start_tag_name_span() {
-    let html = " ";
-    let labeler = |parser: Parser| {
-        let mut labels = Vec::new();
-        for (_, trace) in parser.flatten() {
-            if let Trace::StartTag(trace) = trace {
-                labels.push((trace.name_span, ""));
-            }
-        }
-        labels
-    };
-    assert_snapshot!(test_and_annotate(html, labeler), @r###"
-
-    ^ ^^^ ^^^ ^^^
-    "###);
-}
-
-#[test]
-fn end_tag_name_span() {
-    let html = " ";
-    let labeler = |parser: Parser| {
-        let mut labels = Vec::new();
-        for (_, trace) in parser.flatten() {
-            if let Trace::EndTag(trace) = trace {
-                labels.push((trace.name_span, ""));
-            }
-        }
-        labels
-    };
-    assert_snapshot!(test_and_annotate(html, labeler), @r###"
-
-    ^ ^^^ ^^^ ^^^
-    "###);
-}
-
-#[test]
-fn attribute_name_span() {
-    let html = "";
-    let labeler = |parser: Parser| {
-        let mut labels = Vec::new();
-        let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap()
-        else {
-            panic!("expected start tag")
-        };
-        for attr in &tag.attributes {
-            labels.push((
-                trace.attribute_traces[attr.trace_idx().unwrap()].name_span(),
-                "",
-            ));
-        }
-        labels
-    };
-    assert_snapshot!(test_and_annotate(html, labeler), @r###"
-
-    ^ ^^^ ^ ^^ ^ ^^^
-    "###);
-}
-
-#[test]
-fn attribute_value_span() {
-    let html = "";
-    let labeler = |parser: Parser| {
-        let mut labels = Vec::new();
-        let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap()
-        else {
-            panic!("expected start tag")
-        };
-        for attr in &tag.attributes {
-            labels.push((
-                trace.attribute_traces[attr.trace_idx().unwrap()]
-                    .value_span()
-                    .unwrap(),
-                "",
-            ));
-        }
-        labels
-    };
-    assert_snapshot!(test_and_annotate(html, labeler), @r###"
-
-    ^^^^^^^^ ^^^^^^^^ ^^^^^^^^^^^^^ ^^^^^^^^^^^^^ ^
-    "###);
-}
-
-#[test]
-fn attribute_value_with_char_ref() {
-    let html = "";
-    let labeler = |parser: Parser| {
-        let mut labels = Vec::new();
-        let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap()
-        else {
-            panic!("expected start tag")
-        };
-        for attr in &tag.attributes {
-            labels.push((
-                trace.attribute_traces[attr.trace_idx().unwrap()]
-                    .value_span()
-                    .unwrap(),
-                "",
-            ));
-        }
-        labels
-    };
-    assert_snapshot!(test_and_annotate(html, labeler), @r###"
-
-    ^^^^^ ^^^^^ ^^^^^
-    "###);
-}
-
-#[test]
-fn comment_data_span() {
-    #[rustfmt::skip]
-    let cases = [
-        "",
-        "",
-        "",
-        "",
-        "",
-        "
-    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-    ^
-
-    ^
-
-    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-    "#,
-    ];
-
-    let mut annotated = String::new();
-    for case in cases {
-        let labeler = |parser: Parser| {
-            let (_, Trace::Doctype(trace)) = parser.flatten().next().unwrap() else {
-                panic!("expected doctype");
-            };
-            vec![(trace.span(), "")]
-        };
-        annotated.push_str(&test_and_annotate(case, labeler));
-    }
-
-    assert_snapshot!(annotated, @r###"
-
-    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-    "###);
-}
-
-#[test]
-fn doctype_id_spans() {
-    #[rustfmt::skip]
-    let cases = [
-        r#""#,
-    ];
-
-    let mut annotated = String::new();
-    for case in cases {
-        let labeler = |parser: Parser| {
-            let (_, Trace::Doctype(trace)) = parser.flatten().next().unwrap() else {
-                panic!("expected doctype");
-            };
-
-            let mut labels = Vec::new();
-            if let Some(name_span) = trace.name_span() {
-                labels.push((name_span, "name"));
-            }
-            if let Some(public_id_span) = trace.public_id_span() {
-                labels.push((public_id_span, "public id"));
-            }
-            if let Some(system_id_span) = trace.system_id_span() {
-                labels.push((system_id_span, "system id"));
-            }
-            labels
-        };
-
-        annotated.push_str(&test_and_annotate(case, labeler));
-    }
-
-    assert_snapshot!(annotated, @r###"
-
-    ^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ system id
-    │    │
-    │    public id
-    name
-    "###);
-}
-
-#[test]
-fn eof_offset() {
-    let html = "Where does it end?";
-    let labeler = |parser: Parser| {
-        let mut labels = Vec::new();
-        for (_, trace) in parser.flatten() {
-            if let Trace::EndOfFile(offset) = trace {
-                labels.push((offset..offset, "here"));
-            }
-        }
-        labels
-    };
-    assert_snapshot!(test_and_annotate(html, labeler), @r###"
-    Where does it end?
-                      ^ here
-    "###);
-}
-
-fn annotate_errors(html: &'static str) -> String {
-    let mut parser = parser(html);
-    for _ in parser.by_ref() {}
-    let errors: Vec<_> = parser.emitter_mut().drain_errors().collect();
-
-    for (_, span) in errors {
-        if span.start == span.end {
-            if span.start != html.len() {
-                panic!("empty error spans are only allowed at the very end of the source (for eof errors)");
-            }
-        } else {
-            assert!(span.start < span.end);
-            assert!(span.end <= html.len());
-        }
-    }
-
-    let labeler = |mut parser: Parser| {
-        let mut labels = Vec::new();
-        for _ in parser.by_ref() {}
-        for (error, span) in parser.emitter_mut().drain_errors() {
-            labels.push((span, error.code()));
-        }
-        labels
-    };
-
-    test_and_annotate(html, labeler)
-}
-
-#[test]
-fn tests_for_errors_are_sorted() {
-    let source_of_this_file = std::fs::read_to_string(file!()).unwrap();
-    let mut error_tests: Vec<_> = source_of_this_file
-        .lines()
-        .filter(|l| l.starts_with("fn error_"))
-        .collect();
-    let error_tests_found_order = error_tests.join("\n");
-    error_tests.sort();
-    let error_tests_sorted = error_tests.join("\n");
-    assert_eq!(error_tests_found_order, error_tests_sorted);
-}
-
-#[test]
-fn error_char_ref_absence_of_digits() {
-    let html = "&#qux;";
-    assert_snapshot!(annotate_errors(html), @r###"
-    &#qux;
-    ^^^ absence-of-digits-in-numeric-character-reference
-    "###);
-}
-
-#[test]
-fn error_char_ref_control_char() {
-    let html = "";
-    assert_snapshot!(annotate_errors(html), @r###"
-    
-    ^^^^^^ control-character-reference
-    "###);
-}
-
-#[test]
-fn error_char_ref_missing_semicolon() {
-    let html = "¬";
-    assert_snapshot!(annotate_errors(html), @r###"
-    ¬
-    ^ missing-semicolon-after-character-reference
-    "###);
-}
-
-#[test]
-fn error_char_ref_noncharacter() {
-    let html = "﷐";
-    assert_snapshot!(annotate_errors(html), @r###"
-    ﷐
-    ^^^^^^^^ noncharacter-character-reference
-    "###);
-}
-
-#[test]
-fn error_char_ref_null_char() {
-    let html = "�";
-    assert_snapshot!(annotate_errors(html), @r###"
-    �
-    ^^^^ null-character-reference
-    "###);
-}
-
-#[test]
-fn error_char_ref_outside_unicode_range() {
-    let html = "�";
-    assert_snapshot!(annotate_errors(html), @r###"
-    �
-    ^^^^^^^^^^ character-reference-outside-unicode-range
-    "###);
-}
-
-#[test]
-fn error_char_ref_surrogate() {
-    let html = "�";
-    assert_snapshot!(annotate_errors(html), @r###"
-    �
-    ^^^^^^^^ surrogate-character-reference
-    "###);
-}
-
-#[test]
-fn error_char_ref_unknown_named() {
-    let html = "The pirate says &arrrrr;";
-    assert_snapshot!(annotate_errors(html), @r###"
-    The pirate says &arrrrr;
-                    ^^^^^^^^ unknown-named-character-reference
-    "###);
-}
-
-#[test]
-fn error_duplicate_attribute() {
-    let html = "Does this open two pages? ";
-    assert_snapshot!(annotate_errors(html), @r###"
-    Does this open two pages? 
-    ^^^^ duplicate-attribute
-    "###);
-}
-
-#[test]
-fn error_end_tag_with_attributes() {
-    let html = "";
-    assert_snapshot!(annotate_errors(html), @r###"
-
-    ^^^^^^ end-tag-with-attributes
-    "###);
-}
-
-#[test]
-fn error_end_tag_with_trailing_solidus() {
-    let html = "Do you start or do you end? ";
-    assert_snapshot!(annotate_errors(html), @r###"
-    Do you start or do you end? 
-    ^ end-tag-with-trailing-solidus
-    "###);
-}
-
-#[test]
-fn error_eof_before_tag_name() {
-    let html = "<";
-    assert_snapshot!(annotate_errors(html), @r###"
-    <
-    ^ eof-before-tag-name
-    "###);
-}
-
-// TODO: add error_eof_in_cdata test
-// blocked by lack of proper tree constructor (NaiveParser doesn't parse CDATA sections)
-
-#[test]
-fn error_eof_in_comment() {
-    let html = "