From b48e5c3b99fd537d223cb899e8675177d77e650c Mon Sep 17 00:00:00 2001
From: Martin Fischer <martin@push-f.com>
Date: Fri, 11 Aug 2023 19:37:09 +0200
Subject: refactor: move html5lib test to own crate to fix `cargo test`

Previously `cargo test` failed because it ran the test_html5lib
integration test, which depends on the integration-tests feature
(so you always had to run `cargo test` with
`--features integration-tests` or `--all-features`, which was annoying).

This commit moves the integration tests to a separate crate that
depends on html5tokenizer with the integration-tests feature enabled,
so that a plain `cargo test` just works and runs the test.
---
 .gitmodules                              |   4 +-
 Cargo.toml                               |   8 +-
 integration_tests/Cargo.toml             |  22 ++
 integration_tests/html5lib-tests         |   1 +
 integration_tests/tests/test_html5lib.rs | 383 ++++++++++++++++++++++++++++++
 tests/html5lib-tests                     |   1 -
 tests/test_html5lib.rs                   | 388 -------------------------------
 7 files changed, 412 insertions(+), 395 deletions(-)
 create mode 100644 integration_tests/Cargo.toml
 create mode 160000 integration_tests/html5lib-tests
 create mode 100644 integration_tests/tests/test_html5lib.rs
 delete mode 160000 tests/html5lib-tests
 delete mode 100644 tests/test_html5lib.rs

diff --git a/.gitmodules b/.gitmodules
index 2411bd3..36e58a2 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
-[submodule "tests/html5lib-tests"]
-	path = tests/html5lib-tests
+[submodule "html5lib-tests"]
+	path = integration_tests/html5lib-tests
 	url = https://github.com/html5lib/html5lib-tests
diff --git a/Cargo.toml b/Cargo.toml
index 6918f79..64306d8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,3 +1,7 @@
+[workspace]
+members = [".", "integration_tests"]
+default-members = [".", "integration_tests"]
+
 [package]
 name = "html5tokenizer"
 authors = ["Markus Unterwaditzer <markus-honeypot@unterwaditzer.net>", "Martin Fischer <martin@push-f.com>"]
@@ -12,10 +16,6 @@ include = ["src/**/*", "LICENSE", "README.md"]
 
 [dev-dependencies]
 codespan-reporting = "0.11.1"
-glob = "0.3.1"
-pretty_assertions = "1.0.0"
-serde = { version = "1.0.130", features = ["derive"] }
-serde_json = "1.0.71"
 
 [features]
 # Feature used by integration tests in tests/ to get access to library internals.
diff --git a/integration_tests/Cargo.toml b/integration_tests/Cargo.toml
new file mode 100644
index 0000000..1e68a0b
--- /dev/null
+++ b/integration_tests/Cargo.toml
@@ -0,0 +1,22 @@
+# The html5lib integration test lives in a separate crate because
+# we want `cargo test` to run these tests despite their dependency
+# on the `integration-tests` feature from the html5tokenizer crate
+# and cargo doesn't support features to be automatically enabled for
+# integration tests in a single crate. (required-features under [[test]]
+# just results in the test being skipped if the feature isn't enabled).
+# See https://github.com/rust-lang/cargo/issues/2911#issuecomment-524652568.
+
+[package]
+name = "integration_tests"
+publish = false
+version = "0.0.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dev-dependencies]
+glob = "0.3.1"
+html5tokenizer = { path = "..", features = ["integration-tests"] }
+pretty_assertions = "1.0.0"
+serde = { version = "1.0.130", features = ["derive"] }
+serde_json = "1.0.71"
diff --git a/integration_tests/html5lib-tests b/integration_tests/html5lib-tests
new file mode 160000
index 0000000..6030cb6
--- /dev/null
+++ b/integration_tests/html5lib-tests
@@ -0,0 +1 @@
+Subproject commit 6030cb6e40a0cf68ae38bf0001bb85b727b80a26
diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs
new file mode 100644
index 0000000..cf95bb6
--- /dev/null
+++ b/integration_tests/tests/test_html5lib.rs
@@ -0,0 +1,383 @@
+use html5tokenizer::{
+    Attribute, Doctype, EndTag, Error, InternalState as State, Reader, StartTag, Token, Tokenizer,
+};
+use pretty_assertions::assert_eq;
+use serde::{de::Error as _, Deserialize};
+use std::{collections::BTreeMap, fs::File, io::BufReader, path::Path};
+
+struct ExpectedOutputTokens(Vec<Token<()>>);
+
+impl<'de> Deserialize<'de> for ExpectedOutputTokens {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        // this macro is a horrible way to define a type that deserializes only from a particular
+        // string. Together with serde(untagged) this gives us really flexible enum tagging with really
+        // terrible error messages.
+        macro_rules! def_const {
+            ($str:expr, $ty:ident) => {
+                #[derive(Deserialize)]
+                enum $ty {
+                    #[serde(rename = $str)]
+                    $ty,
+                }
+            };
+        }
+
+        def_const!("DOCTYPE", DoctypeConst);
+        def_const!("StartTag", StartTagConst);
+        def_const!("EndTag", EndTagConst);
+        def_const!("Comment", CommentConst);
+        def_const!("Character", CharacterConst);
+
+        type Attributes = BTreeMap<String, String>;
+
+        #[derive(Deserialize)]
+        #[serde(untagged)]
+        enum OutputToken {
+            // "DOCTYPE", name, public_id, system_id, correctness
+            Doctype(
+                DoctypeConst,
+                Option<String>,
+                Option<String>,
+                Option<String>,
+                bool,
+            ),
+            // "StartTag", name, attributes, self_closing
+            StartTag(StartTagConst, String, Attributes),
+            StartTag2(StartTagConst, String, Attributes, bool),
+            // "EndTag", name
+            EndTag(EndTagConst, String),
+            // "Comment", data
+            Comment(CommentConst, String),
+            // "Character", data
+            Character(CharacterConst, String),
+        }
+
+        Ok(ExpectedOutputTokens(
+            Vec::deserialize(deserializer)?
+                .into_iter()
+                .map(|output_token| match output_token {
+                    OutputToken::Doctype(
+                        _,
+                        name,
+                        public_identifier,
+                        system_identifier,
+                        correctness,
+                    ) => Token::Doctype(Doctype {
+                        name: name.unwrap_or_default(),
+                        public_identifier,
+                        system_identifier,
+                        force_quirks: !correctness,
+                    }),
+                    OutputToken::StartTag(_, name, attributes) => Token::StartTag(StartTag {
+                        self_closing: false,
+                        name,
+                        attributes: attributes
+                            .into_iter()
+                            .map(|(k, v)| {
+                                (
+                                    k,
+                                    Attribute {
+                                        value: v,
+                                        ..Default::default()
+                                    },
+                                )
+                            })
+                            .collect(),
+                        name_span: (),
+                    }),
+                    OutputToken::StartTag2(_, name, attributes, self_closing) => {
+                        Token::StartTag(StartTag {
+                            self_closing,
+                            name,
+                            attributes: attributes
+                                .into_iter()
+                                .map(|(k, v)| {
+                                    (
+                                        k,
+                                        Attribute {
+                                            value: v,
+                                            ..Default::default()
+                                        },
+                                    )
+                                })
+                                .collect(),
+                            name_span: (),
+                        })
+                    }
+                    OutputToken::EndTag(_, name) => Token::EndTag(EndTag {
+                        name,
+                        name_span: (),
+                    }),
+                    OutputToken::Comment(_, data) => Token::Comment(data),
+                    OutputToken::Character(_, data) => Token::String(data),
+                })
+                .collect::<Vec<Token<()>>>(),
+        ))
+    }
+}
+
+struct InitialState(State);
+
+impl<'de> Deserialize<'de> for InitialState {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        #[derive(Deserialize)]
+        enum RawInitialState {
+            #[serde(rename = "Data state")]
+            Data,
+            #[serde(rename = "PLAINTEXT state")]
+            PlainText,
+            #[serde(rename = "RCDATA state")]
+            RcData,
+            #[serde(rename = "RAWTEXT state")]
+            RawText,
+            #[serde(rename = "Script data state")]
+            ScriptData,
+            #[serde(rename = "CDATA section state")]
+            CdataSection,
+        }
+
+        Ok(Self(match RawInitialState::deserialize(deserializer)? {
+            RawInitialState::Data => State::Data,
+            RawInitialState::PlainText => State::PlainText,
+            RawInitialState::RcData => State::RcData,
+            RawInitialState::RawText => State::RawText,
+            RawInitialState::ScriptData => State::ScriptData,
+            RawInitialState::CdataSection => State::CdataSection,
+        }))
+    }
+}
+
+fn initial_states_default() -> Vec<InitialState> {
+    vec![InitialState(State::Data)]
+}
+
+#[derive(Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct Test {
+    description: String,
+    input: String,
+    output: ExpectedOutputTokens,
+    #[serde(default = "initial_states_default")]
+    initial_states: Vec<InitialState>,
+    #[serde(default)]
+    double_escaped: bool,
+    #[serde(default)]
+    last_start_tag: Option<String>,
+    #[serde(default)]
+    errors: Vec<ParseError>,
+}
+
+#[derive(Debug, Eq, PartialEq)]
+struct ParseErrorInner(Error);
+
+impl<'de> Deserialize<'de> for ParseErrorInner {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let str_err = String::deserialize(deserializer)?;
+        let err: Error = str_err
+            .parse()
+            .map_err(|_| D::Error::custom(&format!("failed to deserialize error: {}", str_err)))?;
+        Ok(ParseErrorInner(err))
+    }
+}
+
+#[derive(Deserialize, Debug, Eq, PartialEq)]
+#[serde(rename_all = "camelCase")]
+struct ParseError {
+    code: ParseErrorInner,
+    // TODO: lineno and column?
+}
+
+#[derive(Deserialize)]
+struct Tests {
+    tests: Vec<Test>,
+}
+
+/// Path to a local checkout of [html5lib-tests], relative to the
+/// directory containing the `Cargo.toml` file of the current crate.
+///
+/// [html5lib-tests]: https://github.com/html5lib/html5lib-tests
+const HTML5LIB_TESTS_PATH: &str = "html5lib-tests";
+
+// FUTURE: it would be nice to assert that HTML5LIB_TESTS_PATH matches the path defined in .gitmodules
+// but this is currently blocked by:
+// * Cargo not setting CARGO_WORKSPACE_DIR (see https://github.com/rust-lang/cargo/issues/3946)
+// * gix-config having more dependencies than I'd want to add for this
+
+#[test]
+fn tokenizer() {
+    // TODO: use a custom test harness with e.g. libtest-mimic
+    let test_dir = format!("{HTML5LIB_TESTS_PATH}/tokenizer");
+
+    let mut test_paths = glob::glob(&format!("{test_dir}/*.test"))
+        .unwrap()
+        .peekable();
+
+    if test_paths.peek().is_none() {
+        panic!(
+            "could not find any .test files in {}, maybe try `git submodule update --init`",
+            test_dir
+        );
+    }
+
+    for test_path in test_paths {
+        let test_path = test_path.unwrap();
+
+        test_tokenizer_file(&test_path);
+    }
+}
+
+fn test_tokenizer_file(path: &Path) {
+    let fname = path.file_name().unwrap().to_str().unwrap();
+
+    if matches!(
+        fname,
+        // We don't implement "Coercing an HTML DOM into an infoset" section
+        "xmlViolation.test" |
+        // Our parser does not operate on bytes, the input isn't valid Rust &str
+        "unicodeCharsProblematic.test"
+    ) {
+        return;
+    }
+
+    let f = File::open(path).unwrap();
+    let bf = BufReader::new(f);
+    let tests: Tests = serde_json::from_reader(bf).unwrap();
+
+    for (i, test) in tests.tests.into_iter().enumerate() {
+        run_test(fname, i, test);
+    }
+}
+
+fn run_test(fname: &str, test_i: usize, mut test: Test) {
+    test.input = if test.double_escaped {
+        unescape(&test.input)
+    } else {
+        test.input
+    };
+
+    test.output = if test.double_escaped {
+        ExpectedOutputTokens(
+            test.output
+                .0
+                .into_iter()
+                .map(|token| match token {
+                    Token::String(x) => Token::String(unescape(&x)),
+                    Token::Comment(x) => Token::Comment(unescape(&x)),
+                    token => token,
+                })
+                .collect(),
+        )
+    } else {
+        ExpectedOutputTokens(test.output.0)
+    };
+
+    for state in &test.initial_states {
+        run_test_inner(
+            fname,
+            test_i,
+            &test,
+            state.0,
+            Tokenizer::new(&test.input),
+            "string",
+        );
+
+        run_test_inner(
+            fname,
+            test_i,
+            &test,
+            state.0,
+            Tokenizer::new(BufReader::new(test.input.as_bytes())),
+            "bufread",
+        );
+    }
+}
+
+/// Runs one test case from `state` with the given tokenizer and asserts that
+/// the emitted tokens (and, except for one skipped case, the reported errors)
+/// match the expectations in `test`.
+fn run_test_inner<R: Reader>(
+    fname: &str,
+    test_i: usize,
+    test: &Test,
+    state: State,
+    mut tokenizer: Tokenizer<R>,
+    tokenizer_info: &str,
+) {
+    println!("==== FILE {fname}, TEST {test_i}, STATE {state:?}, TOKENIZER {tokenizer_info} ====");
+    println!("description: {}", test.description);
+    tokenizer.set_internal_state(state);
+    tokenizer.set_last_start_tag(test.last_start_tag.as_deref());
+
+    let mut actual_tokens = Vec::new();
+    let mut actual_errors = Vec::new();
+
+    for token in tokenizer {
+        let token = token.unwrap();
+
+        if let Token::Error { error, .. } = token {
+            actual_errors.push(ParseError {
+                code: ParseErrorInner(error),
+            });
+        } else {
+            actual_tokens.push(token);
+        }
+    }
+
+    assert_eq!(test.output.0, actual_tokens);
+
+    if !matches!(
+        (fname, test_i),
+        // TODO: html5lib-tests bug?
+        ("test3.test", 79)
+    ) {
+        assert_eq!(test.errors, actual_errors);
+    }
+}
+
+/// Implements the escape sequences described in the tokenizer tests of html5lib-tests (and nothing
+/// more)
+fn unescape(data: &str) -> String {
+    let mut stream = data.chars();
+    let mut rv = String::new();
+
+    loop {
+        match stream.next() {
+            Some('\\') => (),
+            Some(x) => {
+                rv.push(x);
+                continue;
+            }
+            None => break,
+        }
+
+        match stream.next() {
+            Some('u') => (),
+            x => panic!("unexpected escape: {:?}", x),
+        }
+
+        let orig_len = rv.len();
+
+        for _ in 0..4 {
+            rv.push(match stream.next() {
+                Some(x) => x,
+                None => panic!("unexpected eof after \\u"),
+            });
+        }
+
+        let c = u32::from_str_radix(&rv[orig_len..], 16).expect("failed to parse as hex");
+        let c = char::from_u32(c).expect("bad character");
+        rv.truncate(orig_len);
+        rv.push(c);
+    }
+
+    rv
+}
diff --git a/tests/html5lib-tests b/tests/html5lib-tests
deleted file mode 160000
index 6030cb6..0000000
--- a/tests/html5lib-tests
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 6030cb6e40a0cf68ae38bf0001bb85b727b80a26
diff --git a/tests/test_html5lib.rs b/tests/test_html5lib.rs
deleted file mode 100644
index fc5e89c..0000000
--- a/tests/test_html5lib.rs
+++ /dev/null
@@ -1,388 +0,0 @@
-use html5tokenizer::{
-    Attribute, Doctype, EndTag, Error, InternalState as State, Reader, StartTag, Token, Tokenizer,
-};
-use pretty_assertions::assert_eq;
-use serde::{de::Error as _, Deserialize};
-use std::{collections::BTreeMap, fs::File, io::BufReader, path::Path};
-
-#[cfg(not(feature = "integration-tests"))]
-compile_error!(
-    "integration tests need the integration-tests feature enabled. Run cargo test --all-features"
-);
-
-struct ExpectedOutputTokens(Vec<Token<()>>);
-
-impl<'de> Deserialize<'de> for ExpectedOutputTokens {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        // this macro is a horrible way to define a type that deserializes only from a particular
-        // string. Together with serde(untagged) this gives us really flexible enum tagging with really
-        // terrible error messages.
-        macro_rules! def_const {
-            ($str:expr, $ty:ident) => {
-                #[derive(Deserialize)]
-                enum $ty {
-                    #[serde(rename = $str)]
-                    $ty,
-                }
-            };
-        }
-
-        def_const!("DOCTYPE", DoctypeConst);
-        def_const!("StartTag", StartTagConst);
-        def_const!("EndTag", EndTagConst);
-        def_const!("Comment", CommentConst);
-        def_const!("Character", CharacterConst);
-
-        type Attributes = BTreeMap<String, String>;
-
-        #[derive(Deserialize)]
-        #[serde(untagged)]
-        enum OutputToken {
-            // "DOCTYPE", name, public_id, system_id, correctness
-            Doctype(
-                DoctypeConst,
-                Option<String>,
-                Option<String>,
-                Option<String>,
-                bool,
-            ),
-            // "StartTag", name, attributes, self_closing
-            StartTag(StartTagConst, String, Attributes),
-            StartTag2(StartTagConst, String, Attributes, bool),
-            // "EndTag", name
-            EndTag(EndTagConst, String),
-            // "Comment", data
-            Comment(CommentConst, String),
-            // "Character", data
-            Character(CharacterConst, String),
-        }
-
-        Ok(ExpectedOutputTokens(
-            Vec::deserialize(deserializer)?
-                .into_iter()
-                .map(|output_token| match output_token {
-                    OutputToken::Doctype(
-                        _,
-                        name,
-                        public_identifier,
-                        system_identifier,
-                        correctness,
-                    ) => Token::Doctype(Doctype {
-                        name: name.unwrap_or_default(),
-                        public_identifier,
-                        system_identifier,
-                        force_quirks: !correctness,
-                    }),
-                    OutputToken::StartTag(_, name, attributes) => Token::StartTag(StartTag {
-                        self_closing: false,
-                        name,
-                        attributes: attributes
-                            .into_iter()
-                            .map(|(k, v)| {
-                                (
-                                    k,
-                                    Attribute {
-                                        value: v,
-                                        ..Default::default()
-                                    },
-                                )
-                            })
-                            .collect(),
-                        name_span: (),
-                    }),
-                    OutputToken::StartTag2(_, name, attributes, self_closing) => {
-                        Token::StartTag(StartTag {
-                            self_closing,
-                            name,
-                            attributes: attributes
-                                .into_iter()
-                                .map(|(k, v)| {
-                                    (
-                                        k,
-                                        Attribute {
-                                            value: v,
-                                            ..Default::default()
-                                        },
-                                    )
-                                })
-                                .collect(),
-                            name_span: (),
-                        })
-                    }
-                    OutputToken::EndTag(_, name) => Token::EndTag(EndTag {
-                        name,
-                        name_span: (),
-                    }),
-                    OutputToken::Comment(_, data) => Token::Comment(data),
-                    OutputToken::Character(_, data) => Token::String(data),
-                })
-                .collect::<Vec<Token<()>>>(),
-        ))
-    }
-}
-
-struct InitialState(State);
-
-impl<'de> Deserialize<'de> for InitialState {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        #[derive(Deserialize)]
-        enum RawInitialState {
-            #[serde(rename = "Data state")]
-            Data,
-            #[serde(rename = "PLAINTEXT state")]
-            PlainText,
-            #[serde(rename = "RCDATA state")]
-            RcData,
-            #[serde(rename = "RAWTEXT state")]
-            RawText,
-            #[serde(rename = "Script data state")]
-            ScriptData,
-            #[serde(rename = "CDATA section state")]
-            CdataSection,
-        }
-
-        Ok(Self(match RawInitialState::deserialize(deserializer)? {
-            RawInitialState::Data => State::Data,
-            RawInitialState::PlainText => State::PlainText,
-            RawInitialState::RcData => State::RcData,
-            RawInitialState::RawText => State::RawText,
-            RawInitialState::ScriptData => State::ScriptData,
-            RawInitialState::CdataSection => State::CdataSection,
-        }))
-    }
-}
-
-fn initial_states_default() -> Vec<InitialState> {
-    vec![InitialState(State::Data)]
-}
-
-#[derive(Deserialize)]
-#[serde(rename_all = "camelCase")]
-struct Test {
-    description: String,
-    input: String,
-    output: ExpectedOutputTokens,
-    #[serde(default = "initial_states_default")]
-    initial_states: Vec<InitialState>,
-    #[serde(default)]
-    double_escaped: bool,
-    #[serde(default)]
-    last_start_tag: Option<String>,
-    #[serde(default)]
-    errors: Vec<ParseError>,
-}
-
-#[derive(Debug, Eq, PartialEq)]
-struct ParseErrorInner(Error);
-
-impl<'de> Deserialize<'de> for ParseErrorInner {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        let str_err = String::deserialize(deserializer)?;
-        let err: Error = str_err
-            .parse()
-            .map_err(|_| D::Error::custom(&format!("failed to deserialize error: {}", str_err)))?;
-        Ok(ParseErrorInner(err))
-    }
-}
-
-#[derive(Deserialize, Debug, Eq, PartialEq)]
-#[serde(rename_all = "camelCase")]
-struct ParseError {
-    code: ParseErrorInner,
-    // TODO: lineno and column?
-}
-
-#[derive(Deserialize)]
-struct Tests {
-    tests: Vec<Test>,
-}
-
-/// Path to a local checkout of [html5lib-tests], relative to the
-/// directory containing the `Cargo.toml` file of the current crate.
-///
-/// [html5lib-tests]: https://github.com/html5lib/html5lib-tests
-const HTML5LIB_TESTS_PATH: &str = "tests/html5lib-tests";
-
-// FUTURE: it would be nice to assert that HTML5LIB_TESTS_PATH matches the path defined in .gitmodules
-// but this is currently blocked by:
-// * Cargo not setting CARGO_WORKSPACE_DIR (see https://github.com/rust-lang/cargo/issues/3946)
-// * gix-config having more dependencies than I'd want to add for this
-
-#[test]
-fn tokenizer() {
-    // TODO: use a custom test harness with e.g. libtest-mimic
-    let test_dir = format!("{HTML5LIB_TESTS_PATH}/tokenizer");
-
-    let mut test_paths = glob::glob(&format!("{test_dir}/*.test"))
-        .unwrap()
-        .peekable();
-
-    if test_paths.peek().is_none() {
-        panic!(
-            "could not find any .test files in {}, maybe try `git submodule update --init`",
-            test_dir
-        );
-    }
-
-    for test_path in test_paths {
-        let test_path = test_path.unwrap();
-
-        test_tokenizer_file(&test_path);
-    }
-}
-
-fn test_tokenizer_file(path: &Path) {
-    let fname = path.file_name().unwrap().to_str().unwrap();
-
-    if matches!(
-        fname,
-        // We don't implement "Coercing an HTML DOM into an infoset" section
-        "xmlViolation.test" |
-        // Our parser does not operate on bytes, the input isn't valid Rust &str
-        "unicodeCharsProblematic.test"
-    ) {
-        return;
-    }
-
-    let f = File::open(path).unwrap();
-    let bf = BufReader::new(f);
-    let tests: Tests = serde_json::from_reader(bf).unwrap();
-
-    for (i, test) in tests.tests.into_iter().enumerate() {
-        run_test(fname, i, test);
-    }
-}
-
-fn run_test(fname: &str, test_i: usize, mut test: Test) {
-    test.input = if test.double_escaped {
-        unescape(&test.input)
-    } else {
-        test.input
-    };
-
-    test.output = if test.double_escaped {
-        ExpectedOutputTokens(
-            test.output
-                .0
-                .into_iter()
-                .map(|token| match token {
-                    Token::String(x) => Token::String(unescape(&x)),
-                    Token::Comment(x) => Token::Comment(unescape(&x)),
-                    token => token,
-                })
-                .collect(),
-        )
-    } else {
-        ExpectedOutputTokens(test.output.0)
-    };
-
-    for state in &test.initial_states {
-        run_test_inner(
-            fname,
-            test_i,
-            &test,
-            state.0,
-            Tokenizer::new(&test.input),
-            "string",
-        );
-
-        run_test_inner(
-            fname,
-            test_i,
-            &test,
-            state.0,
-            Tokenizer::new(BufReader::new(test.input.as_bytes())),
-            "bufread",
-        );
-    }
-}
-
-fn run_test_inner<R: Reader>(
-    fname: &str,
-    test_i: usize,
-    test: &Test,
-    state: State,
-    mut tokenizer: Tokenizer<R>,
-    tokenizer_info: &str,
-) {
-    println!(
-        "==== FILE {}, TEST {}, STATE {:?}, TOKENIZER {} ====",
-        fname, test_i, state, tokenizer_info,
-    );
-    println!("description: {}", test.description);
-    tokenizer.set_internal_state(state);
-    tokenizer.set_last_start_tag(test.last_start_tag.as_ref().map(String::as_str));
-
-    let mut actual_tokens = Vec::new();
-    let mut actual_errors = Vec::new();
-
-    for token in tokenizer {
-        let token = token.unwrap();
-
-        if let Token::Error { error, .. } = token {
-            actual_errors.push(ParseError {
-                code: ParseErrorInner(error),
-            });
-        } else {
-            actual_tokens.push(token);
-        }
-    }
-
-    assert_eq!(test.output.0, actual_tokens);
-
-    if !matches!(
-        (fname, test_i),
-        // TODO: html5lib-tests bug?
-        ("test3.test", 79)
-    ) {
-        assert_eq!(test.errors, actual_errors);
-    }
-}
-
-/// Implements the escape sequences described in the tokenizer tests of html5lib-tests (and nothing
-/// more)
-fn unescape(data: &str) -> String {
-    let mut stream = data.chars();
-    let mut rv = String::new();
-
-    loop {
-        match stream.next() {
-            Some('\\') => (),
-            Some(x) => {
-                rv.push(x);
-                continue;
-            }
-            None => break,
-        }
-
-        match stream.next() {
-            Some('u') => (),
-            x => panic!("unexpected escape: {:?}", x),
-        }
-
-        let orig_len = rv.len();
-
-        for _ in 0..4 {
-            rv.push(match stream.next() {
-                Some(x) => x,
-                None => panic!("unexpected eof after \\u"),
-            });
-        }
-
-        let c = u32::from_str_radix(&rv[orig_len..], 16).expect("failed to parse as hex");
-        let c = char::from_u32(c).expect("bad character");
-        rv.truncate(orig_len);
-        rv.push(c);
-    }
-
-    rv
-}
-- 
cgit v1.2.3