Diffstat (limited to 'integration_tests/tests/tokenizer.rs')
-rw-r--r--  integration_tests/tests/tokenizer.rs  218
1 file changed, 218 insertions(+), 0 deletions(-)
diff --git a/integration_tests/tests/tokenizer.rs b/integration_tests/tests/tokenizer.rs
new file mode 100644
index 0000000..2d3e4cb
--- /dev/null
+++ b/integration_tests/tests/tokenizer.rs
@@ -0,0 +1,218 @@
+use std::{fs::File, io::BufReader, ops::Range, path::Path};
+
+use html5lib_tests::{
+ parse_tests, Error as TestError, InitialState, Output, Test, Token as TestToken,
+};
+use html5tokenizer::{
+ offset::{Offset, PosTrackingReader, Position},
+ reader::Reader,
+ BasicEmitter, Emitter, Error, Event, InternalState, Token, Tokenizer, TracingEmitter,
+};
+use similar_asserts::assert_eq;
+
+/// Path to a local checkout of [html5lib-tests], relative to the
+/// directory containing the `Cargo.toml` file of the current crate.
+///
+/// [html5lib-tests]: https://github.com/html5lib/html5lib-tests
+const HTML5LIB_TESTS_PATH: &str = "html5lib-tests";
+
+// FUTURE: it would be nice to assert that HTML5LIB_TESTS_PATH matches the path defined in .gitmodules
+// but this is currently blocked by:
+// * Cargo not setting CARGO_WORKSPACE_DIR (see https://github.com/rust-lang/cargo/issues/3946)
+// * gix-config having more dependencies than I'd want to add for this
+
+#[test]
+fn tokenizer() {
+ // TODO: use a custom test harness with e.g. libtest-mimic
+ let test_dir = format!("{HTML5LIB_TESTS_PATH}/tokenizer");
+
+ let mut test_paths = glob::glob(&format!("{test_dir}/*.test"))
+ .unwrap()
+ .peekable();
+
+ if test_paths.peek().is_none() {
+        panic!(
+            "could not find any .test files in {test_dir}, maybe try `git submodule update --init`"
+        );
+ }
+
+ for test_path in test_paths {
+ let test_path = test_path.unwrap();
+
+ test_tokenizer_file(&test_path);
+ }
+}
+
+fn test_tokenizer_file(path: &Path) {
+ let fname = path.file_name().unwrap().to_str().unwrap();
+
+ if matches!(
+ fname,
+        // We don't implement the "Coercing an HTML DOM into an infoset" section.
+        "xmlViolation.test" |
+        // Our parser does not operate on bytes; the input isn't valid Rust &str.
+        "unicodeCharsProblematic.test"
+ ) {
+ return;
+ }
+
+ let f = File::open(path).unwrap();
+ let bf = BufReader::new(f);
+    let tests = parse_tests(bf).unwrap_or_else(|e| panic!("failed to parse {path:?}: {e:?}"));
+
+ for (i, test) in tests.into_iter().enumerate() {
+ run_test(fname, i, test);
+ }
+}
+
+fn run_test(fname: &str, test_i: usize, test: Test) {
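+    // Each test may declare several initial states; exercise every state with
+    // all four reader/emitter combinations below.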
+ for state in &test.initial_states {
+ run_test_inner(
+ fname,
+ test_i,
+ &test,
+ state,
+ Tokenizer::new(&test.input, BasicEmitter::default()),
+ "BasicEmitter string",
+ );
+
+ run_test_inner(
+ fname,
+ test_i,
+ &test,
+ state,
+ Tokenizer::new(
+ BufReader::new(test.input.as_bytes()),
+ BasicEmitter::default(),
+ ),
+ "BasicEmitter bufread",
+ );
+
+ run_test_inner(
+ fname,
+ test_i,
+ &test,
+ state,
+ Tokenizer::new(
+ PosTrackingReader::new(&test.input),
+ TracingEmitter::default(),
+ ),
+ "TracingEmitter string",
+ );
+
+ run_test_inner(
+ fname,
+ test_i,
+ &test,
+ state,
+ Tokenizer::new(
+ PosTrackingReader::new(BufReader::new(test.input.as_bytes())),
+ TracingEmitter::default(),
+ ),
+ "TracingEmitter bufread",
+ );
+ }
+}
+
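+/// Runs a single test case against one concrete tokenizer configuration.
+///
+/// Generic over the reader and emitter so that the same body serves all four
+/// combinations constructed in [`run_test`].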
+fn run_test_inner<R, O, E, T>(
+ fname: &str,
+ test_i: usize,
+ test: &Test,
+ state: &InitialState,
+ mut tokenizer: Tokenizer<R, O, E>,
+ tokenizer_info: &str,
+) where
+ R: Reader + Position<O>,
+ O: Offset,
+ E: Emitter<O> + Iterator<Item = T> + DrainErrors<O>,
+ T: Into<Token>,
+{
+    println!(
+        "==== FILE {fname}, TEST {test_i}, STATE {state:?}, TOKENIZER {tokenizer_info} ===="
+    );
+ println!("description: {}", test.description);
+ tokenizer.set_internal_state(match state {
+ InitialState::Data => InternalState::Data,
+ InitialState::Plaintext => InternalState::Plaintext,
+ InitialState::Rcdata => InternalState::Rcdata,
+ InitialState::Rawtext => InternalState::Rawtext,
+ InitialState::ScriptData => InternalState::ScriptData,
+ InitialState::CdataSection => InternalState::CdataSection,
+ });
+ if let Some(last_start_tag) = &test.last_start_tag {
+ tokenizer.set_last_start_tag(last_start_tag);
+ }
+
+ let mut actual_tokens = Vec::new();
+
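+    // Pull events until the tokenizer is exhausted. Reading from in-memory
+    // input cannot fail, so unwrapping each event is fine.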
+ while let Some(event) = tokenizer.next() {
+ let token = match event.unwrap() {
+ Event::CdataOpen => {
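+                // Per the HTML spec, "<![CDATA[" only opens a CDATA section in
+                // foreign content; none of these tests are, so resume with
+                // CDATA disallowed.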
+ tokenizer.handle_cdata_open(false);
+ continue;
+ }
+ Event::Token(token) => token.into(),
+ };
+
+ match token {
+ Token::StartTag(tag) => actual_tokens.push(TestToken::StartTag {
+ name: tag.name,
+ attributes: tag
+ .attributes
+ .into_iter()
+ .map(|attr| (attr.name, attr.value))
+ .collect(),
+ self_closing: tag.self_closing,
+ }),
+ Token::EndTag(tag) => actual_tokens.push(TestToken::EndTag { name: tag.name }),
+ Token::Char(c) => {
+ // Coalesce all adjacent character tokens into a single string.
+ if let Some(TestToken::Character(s)) = actual_tokens.last_mut() {
+ s.push(c);
+ } else {
+ actual_tokens.push(TestToken::Character(c.into()));
+ }
+ }
+ Token::Comment(comment) => actual_tokens.push(TestToken::Comment(comment)),
+ Token::Doctype(doctype) => actual_tokens.push(TestToken::Doctype {
+ name: doctype.name,
+ public_id: doctype.public_id,
+ system_id: doctype.system_id,
+ force_quirks: doctype.force_quirks,
+ }),
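+            // Expected outputs in html5lib-tests never list an explicit
+            // end-of-file token, so it is simply dropped.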
+ Token::EndOfFile => {}
+ };
+ }
+
+ assert_eq!(
+ Output {
+ errors: tokenizer
+ .emitter_mut()
+ .drain_errors()
+ .map(|(e, _)| TestError {
+ code: e.code().to_string()
+ })
+ .collect(),
+ tokens: actual_tokens,
+ },
+ test.output,
+ );
+}
+
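+/// Abstracts over the inherent `drain_errors` methods of [`BasicEmitter`] and
+/// [`TracingEmitter`], whose concrete return types differ, so that
+/// [`run_test_inner`] can drain parse errors generically.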
+trait DrainErrors<O> {
+ fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<O>)> + '_>;
+}
+
+impl<O> DrainErrors<O> for BasicEmitter<O> {
+ fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<O>)> + '_> {
+ Box::new(self.drain_errors())
+ }
+}
+
+impl DrainErrors<usize> for TracingEmitter {
+ fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<usize>)> + '_> {
+ Box::new(self.drain_errors())
+ }
+}