author    | Martin Fischer <martin@push-f.com> | 2023-08-11 19:37:09 +0200
committer | Martin Fischer <martin@push-f.com> | 2023-08-19 06:39:08 +0200
commit    | b48e5c3b99fd537d223cb899e8675177d77e650c (patch)
tree      | d614c5c0950f75d677fb5e9e351c19a30d140257 /integration_tests
parent    | 900c12ee92ee9dfff7e2c52770ba17a0c51f837f (diff)
refactor: move html5lib test to own crate to fix `cargo test`
Previously `cargo test` failed because it ran the test_html5lib
integration test, which depends on the integration-tests feature
(so you always had to run `cargo test` with
`--features integration-tests` or `--all-features`, which was annoying).
This commit moves the integration tests into their own crate,
where the dependency on the feature can be declared properly,
so that a plain `cargo test` just works and runs the test.
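For context, the limitation the Cargo.toml comment in the diff below refers to can be illustrated with a hypothetical single-crate manifest (a sketch, not this repository's actual file): with `required-features` on a `[[test]]` target, a plain `cargo test` silently skips the test instead of enabling the feature.

# Hypothetical single-crate manifest, for illustration only: with this setup
# `cargo test` does not enable the feature -- it simply skips the test target.
[features]
integration-tests = []

[[test]]
name = "test_html5lib"
required-features = ["integration-tests"]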
Diffstat (limited to 'integration_tests')
-rw-r--r-- | integration_tests/Cargo.toml             |  22
m--------- | integration_tests/html5lib-tests         |   0
-rw-r--r-- | integration_tests/tests/test_html5lib.rs | 383
3 files changed, 405 insertions, 0 deletions
diff --git a/integration_tests/Cargo.toml b/integration_tests/Cargo.toml
new file mode 100644
index 0000000..1e68a0b
--- /dev/null
+++ b/integration_tests/Cargo.toml
@@ -0,0 +1,22 @@
+# The html5lib integration test lives in a separate crate because
+# we want `cargo test` to run these tests despite their dependency
+# on the `integration-tests` feature from the html5tokenizer crate
+# and cargo doesn't support features to be automatically enabled for
+# integration tests in a single crate. (required-features under [[test]]
+# just results in the test being skipped if the feature isn't enabled).
+# See https://github.com/rust-lang/cargo/issues/2911#issuecomment-524652568.
+
+[package]
+name = "integration_tests"
+publish = false
+version = "0.0.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dev-dependencies]
+glob = "0.3.1"
+html5tokenizer = { path = "..", features = ["integration-tests"] }
+pretty_assertions = "1.0.0"
+serde = { version = "1.0.130", features = ["derive"] }
+serde_json = "1.0.71"
diff --git a/integration_tests/html5lib-tests b/integration_tests/html5lib-tests
new file mode 160000
+Subproject 6030cb6e40a0cf68ae38bf0001bb85b727b80a2
diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs
new file mode 100644
index 0000000..cf95bb6
--- /dev/null
+++ b/integration_tests/tests/test_html5lib.rs
@@ -0,0 +1,383 @@
+use html5tokenizer::{
+    Attribute, Doctype, EndTag, Error, InternalState as State, Reader, StartTag, Token, Tokenizer,
+};
+use pretty_assertions::assert_eq;
+use serde::{de::Error as _, Deserialize};
+use std::{collections::BTreeMap, fs::File, io::BufReader, path::Path};
+
+struct ExpectedOutputTokens(Vec<Token<()>>);
+
+impl<'de> Deserialize<'de> for ExpectedOutputTokens {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        // this macro is a horrible way to define a type that deserializes only from a particular
+        // string. Together with serde(untagged) this gives us really flexible enum tagging with really
+        // terrible error messages.
+        macro_rules! def_const {
+            ($str:expr, $ty:ident) => {
+                #[derive(Deserialize)]
+                enum $ty {
+                    #[serde(rename = $str)]
+                    $ty,
+                }
+            };
+        }
+
+        def_const!("DOCTYPE", DoctypeConst);
+        def_const!("StartTag", StartTagConst);
+        def_const!("EndTag", EndTagConst);
+        def_const!("Comment", CommentConst);
+        def_const!("Character", CharacterConst);
+
+        type Attributes = BTreeMap<String, String>;
+
+        #[derive(Deserialize)]
+        #[serde(untagged)]
+        enum OutputToken {
+            // "DOCTYPE", name, public_id, system_id, correctness
+            Doctype(
+                DoctypeConst,
+                Option<String>,
+                Option<String>,
+                Option<String>,
+                bool,
+            ),
+            // "StartTag", name, attributes, self_closing
+            StartTag(StartTagConst, String, Attributes),
+            StartTag2(StartTagConst, String, Attributes, bool),
+            // "EndTag", name
+            EndTag(EndTagConst, String),
+            // "Comment", data
+            Comment(CommentConst, String),
+            // "Character", data
+            Character(CharacterConst, String),
+        }
+
+        Ok(ExpectedOutputTokens(
+            Vec::deserialize(deserializer)?
+                .into_iter()
+                .map(|output_token| match output_token {
+                    OutputToken::Doctype(
+                        _,
+                        name,
+                        public_identifier,
+                        system_identifier,
+                        correctness,
+                    ) => Token::Doctype(Doctype {
+                        name: name.unwrap_or_default(),
+                        public_identifier,
+                        system_identifier,
+                        force_quirks: !correctness,
+                    }),
+                    OutputToken::StartTag(_, name, attributes) => Token::StartTag(StartTag {
+                        self_closing: false,
+                        name,
+                        attributes: attributes
+                            .into_iter()
+                            .map(|(k, v)| {
+                                (
+                                    k,
+                                    Attribute {
+                                        value: v,
+                                        ..Default::default()
+                                    },
+                                )
+                            })
+                            .collect(),
+                        name_span: (),
+                    }),
+                    OutputToken::StartTag2(_, name, attributes, self_closing) => {
+                        Token::StartTag(StartTag {
+                            self_closing,
+                            name,
+                            attributes: attributes
+                                .into_iter()
+                                .map(|(k, v)| {
+                                    (
+                                        k,
+                                        Attribute {
+                                            value: v,
+                                            ..Default::default()
+                                        },
+                                    )
+                                })
+                                .collect(),
+                            name_span: (),
+                        })
+                    }
+                    OutputToken::EndTag(_, name) => Token::EndTag(EndTag {
+                        name,
+                        name_span: (),
+                    }),
+                    OutputToken::Comment(_, data) => Token::Comment(data),
+                    OutputToken::Character(_, data) => Token::String(data),
+                })
+                .collect::<Vec<Token<()>>>(),
+        ))
+    }
+}
+
+struct InitialState(State);
+
+impl<'de> Deserialize<'de> for InitialState {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        #[derive(Deserialize)]
+        enum RawInitialState {
+            #[serde(rename = "Data state")]
+            Data,
+            #[serde(rename = "PLAINTEXT state")]
+            PlainText,
+            #[serde(rename = "RCDATA state")]
+            RcData,
+            #[serde(rename = "RAWTEXT state")]
+            RawText,
+            #[serde(rename = "Script data state")]
+            ScriptData,
+            #[serde(rename = "CDATA section state")]
+            CdataSection,
+        }
+
+        Ok(Self(match RawInitialState::deserialize(deserializer)? {
+            RawInitialState::Data => State::Data,
+            RawInitialState::PlainText => State::PlainText,
+            RawInitialState::RcData => State::RcData,
+            RawInitialState::RawText => State::RawText,
+            RawInitialState::ScriptData => State::ScriptData,
+            RawInitialState::CdataSection => State::CdataSection,
+        }))
+    }
+}
+
+fn initial_states_default() -> Vec<InitialState> {
+    vec![InitialState(State::Data)]
+}
+
+#[derive(Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct Test {
+    description: String,
+    input: String,
+    output: ExpectedOutputTokens,
+    #[serde(default = "initial_states_default")]
+    initial_states: Vec<InitialState>,
+    #[serde(default)]
+    double_escaped: bool,
+    #[serde(default)]
+    last_start_tag: Option<String>,
+    #[serde(default)]
+    errors: Vec<ParseError>,
+}
+
+#[derive(Debug, Eq, PartialEq)]
+struct ParseErrorInner(Error);
+
+impl<'de> Deserialize<'de> for ParseErrorInner {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let str_err = String::deserialize(deserializer)?;
+        let err: Error = str_err
+            .parse()
+            .map_err(|_| D::Error::custom(&format!("failed to deserialize error: {}", str_err)))?;
+        Ok(ParseErrorInner(err))
+    }
+}
+
+#[derive(Deserialize, Debug, Eq, PartialEq)]
+#[serde(rename_all = "camelCase")]
+struct ParseError {
+    code: ParseErrorInner,
+    // TODO: lineno and column?
+}
+
+#[derive(Deserialize)]
+struct Tests {
+    tests: Vec<Test>,
+}
+
+/// Path to a local checkout of [html5lib-tests], relative to the
+/// directory containing the `Cargo.toml` file of the current crate.
+///
+/// [html5lib-tests]: https://github.com/html5lib/html5lib-tests
+const HTML5LIB_TESTS_PATH: &str = "html5lib-tests";
+
+// FUTURE: it would be nice to assert that HTML5LIB_TESTS_PATH matches the path defined in .gitmodules
+// but this is currently blocked by:
+// * Cargo not setting CARGO_WORKSPACE_DIR (see https://github.com/rust-lang/cargo/issues/3946)
+// * gix-config having more dependencies than I'd want to add for this
+
+#[test]
+fn tokenizer() {
+    // TODO: use a custom test harness with e.g. libtest-mimic
+    let test_dir = format!("{HTML5LIB_TESTS_PATH}/tokenizer");
+
+    let mut test_paths = glob::glob(&format!("{test_dir}/*.test"))
+        .unwrap()
+        .peekable();
+
+    if test_paths.peek().is_none() {
+        panic!(
+            "could not find any .test files in {}, maybe try `git submodule update --init`",
+            test_dir
+        );
+    }
+
+    for test_path in test_paths {
+        let test_path = test_path.unwrap();
+
+        test_tokenizer_file(&test_path);
+    }
+}
+
+fn test_tokenizer_file(path: &Path) {
+    let fname = path.file_name().unwrap().to_str().unwrap();
+
+    if matches!(
+        fname,
+        // We don't implement "Coercing an HTML DOM into an infoset" section
+        "xmlViolation.test" |
+        // Our parser does not operate on bytes, the input isn't valid Rust &str
+        "unicodeCharsProblematic.test"
+    ) {
+        return;
+    }
+
+    let f = File::open(path).unwrap();
+    let bf = BufReader::new(f);
+    let tests: Tests = serde_json::from_reader(bf).unwrap();
+
+    for (i, test) in tests.tests.into_iter().enumerate() {
+        run_test(fname, i, test);
+    }
+}
+
+fn run_test(fname: &str, test_i: usize, mut test: Test) {
+    test.input = if test.double_escaped {
+        unescape(&test.input)
+    } else {
+        test.input
+    };
+
+    test.output = if test.double_escaped {
+        ExpectedOutputTokens(
+            test.output
+                .0
+                .into_iter()
+                .map(|token| match token {
+                    Token::String(x) => Token::String(unescape(&x)),
+                    Token::Comment(x) => Token::Comment(unescape(&x)),
+                    token => token,
+                })
+                .collect(),
+        )
+    } else {
+        ExpectedOutputTokens(test.output.0)
+    };
+
+    for state in &test.initial_states {
+        run_test_inner(
+            fname,
+            test_i,
+            &test,
+            state.0,
+            Tokenizer::new(&test.input),
+            "string",
+        );
+
+        run_test_inner(
+            fname,
+            test_i,
+            &test,
+            state.0,
+            Tokenizer::new(BufReader::new(test.input.as_bytes())),
+            "bufread",
+        );
+    }
+}
+
+fn run_test_inner<R: Reader>(
+    fname: &str,
+    test_i: usize,
+    test: &Test,
+    state: State,
+    mut tokenizer: Tokenizer<R>,
+    tokenizer_info: &str,
+) {
+    println!(
+        "==== FILE {}, TEST {}, STATE {:?}, TOKENIZER {} ====",
+        fname, test_i, state, tokenizer_info,
+    );
+    println!("description: {}", test.description);
+    tokenizer.set_internal_state(state);
+    tokenizer.set_last_start_tag(test.last_start_tag.as_ref().map(String::as_str));
+
+    let mut actual_tokens = Vec::new();
+    let mut actual_errors = Vec::new();
+
+    for token in tokenizer {
+        let token = token.unwrap();
+
+        if let Token::Error { error, .. } = token {
+            actual_errors.push(ParseError {
+                code: ParseErrorInner(error),
+            });
+        } else {
+            actual_tokens.push(token);
+        }
+    }
+
+    assert_eq!(test.output.0, actual_tokens);
+
+    if !matches!(
+        (fname, test_i),
+        // TODO: html5lib-tests bug?
+        ("test3.test", 79)
+    ) {
+        assert_eq!(test.errors, actual_errors);
+    }
+}
+
+/// Implements the escape sequences described in the tokenizer tests of html5lib-tests (and nothing
+/// more)
+fn unescape(data: &str) -> String {
+    let mut stream = data.chars();
+    let mut rv = String::new();
+
+    loop {
+        match stream.next() {
+            Some('\\') => (),
+            Some(x) => {
+                rv.push(x);
+                continue;
+            }
+            None => break,
+        }
+
+        match stream.next() {
+            Some('u') => (),
+            x => panic!("unexpected escape: {:?}", x),
+        }
+
+        let orig_len = rv.len();
+
+        for _ in 0..4 {
+            rv.push(match stream.next() {
+                Some(x) => x,
+                None => panic!("unexpected eof after \\u"),
+            });
+        }
+
+        let c = u32::from_str_radix(&rv[orig_len..], 16).expect("failed to parse as hex");
+        let c = char::from_u32(c).expect("bad character");
+        rv.truncate(orig_len);
+        rv.push(c);
+    }
+
+    rv
+}