summary | refs | log | tree | commit | diff
path: root/tests
diff options
context:
space:
mode:
author: Martin Fischer <martin@push-f.com> 2023-08-11 19:37:09 +0200
committer: Martin Fischer <martin@push-f.com> 2023-08-19 06:39:08 +0200
commit: b48e5c3b99fd537d223cb899e8675177d77e650c (patch)
tree: d614c5c0950f75d677fb5e9e351c19a30d140257 /tests
parent: 900c12ee92ee9dfff7e2c52770ba17a0c51f837f (diff)
refactor: move html5lib test to own crate to fix `cargo test`
Previously, `cargo test` failed because it ran the test_html5lib integration test, which depends on the integration-tests feature. As a result, you always had to run `cargo test` with `--features integration-tests` or `--all-features`, which was annoying. This commit moves the integration tests to another crate, where the dependency on the feature can be defined properly, so that `cargo test` just works and runs the test.
Diffstat (limited to 'tests')
m--------- tests/html5lib-tests 0
-rw-r--r-- tests/test_html5lib.rs 388
2 files changed, 0 insertions, 388 deletions
diff --git a/tests/html5lib-tests b/tests/html5lib-tests
deleted file mode 160000
-Subproject 6030cb6e40a0cf68ae38bf0001bb85b727b80a2
diff --git a/tests/test_html5lib.rs b/tests/test_html5lib.rs
deleted file mode 100644
index fc5e89c..0000000
--- a/tests/test_html5lib.rs
+++ /dev/null
@@ -1,388 +0,0 @@
-use html5tokenizer::{
- Attribute, Doctype, EndTag, Error, InternalState as State, Reader, StartTag, Token, Tokenizer,
-};
-use pretty_assertions::assert_eq;
-use serde::{de::Error as _, Deserialize};
-use std::{collections::BTreeMap, fs::File, io::BufReader, path::Path};
-
-#[cfg(not(feature = "integration-tests"))]
-compile_error!(
- "integration tests need the integration-tests feature enabled. Run cargo test --all-features"
-);
-
-struct ExpectedOutputTokens(Vec<Token<()>>);
-
-impl<'de> Deserialize<'de> for ExpectedOutputTokens {
- fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
- where
- D: serde::Deserializer<'de>,
- {
- // this macro is a horrible way to define a type that deserializes only from a particular
- // string. Together with serde(untagged) this gives us really flexible enum tagging with really
- // terrible error messages.
- macro_rules! def_const {
- ($str:expr, $ty:ident) => {
- #[derive(Deserialize)]
- enum $ty {
- #[serde(rename = $str)]
- $ty,
- }
- };
- }
-
- def_const!("DOCTYPE", DoctypeConst);
- def_const!("StartTag", StartTagConst);
- def_const!("EndTag", EndTagConst);
- def_const!("Comment", CommentConst);
- def_const!("Character", CharacterConst);
-
- type Attributes = BTreeMap<String, String>;
-
- #[derive(Deserialize)]
- #[serde(untagged)]
- enum OutputToken {
- // "DOCTYPE", name, public_id, system_id, correctness
- Doctype(
- DoctypeConst,
- Option<String>,
- Option<String>,
- Option<String>,
- bool,
- ),
- // "StartTag", name, attributes, self_closing
- StartTag(StartTagConst, String, Attributes),
- StartTag2(StartTagConst, String, Attributes, bool),
- // "EndTag", name
- EndTag(EndTagConst, String),
- // "Comment", data
- Comment(CommentConst, String),
- // "Character", data
- Character(CharacterConst, String),
- }
-
- Ok(ExpectedOutputTokens(
- Vec::deserialize(deserializer)?
- .into_iter()
- .map(|output_token| match output_token {
- OutputToken::Doctype(
- _,
- name,
- public_identifier,
- system_identifier,
- correctness,
- ) => Token::Doctype(Doctype {
- name: name.unwrap_or_default(),
- public_identifier,
- system_identifier,
- force_quirks: !correctness,
- }),
- OutputToken::StartTag(_, name, attributes) => Token::StartTag(StartTag {
- self_closing: false,
- name,
- attributes: attributes
- .into_iter()
- .map(|(k, v)| {
- (
- k,
- Attribute {
- value: v,
- ..Default::default()
- },
- )
- })
- .collect(),
- name_span: (),
- }),
- OutputToken::StartTag2(_, name, attributes, self_closing) => {
- Token::StartTag(StartTag {
- self_closing,
- name,
- attributes: attributes
- .into_iter()
- .map(|(k, v)| {
- (
- k,
- Attribute {
- value: v,
- ..Default::default()
- },
- )
- })
- .collect(),
- name_span: (),
- })
- }
- OutputToken::EndTag(_, name) => Token::EndTag(EndTag {
- name,
- name_span: (),
- }),
- OutputToken::Comment(_, data) => Token::Comment(data),
- OutputToken::Character(_, data) => Token::String(data),
- })
- .collect::<Vec<Token<()>>>(),
- ))
- }
-}
-
-struct InitialState(State);
-
-impl<'de> Deserialize<'de> for InitialState {
- fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
- where
- D: serde::Deserializer<'de>,
- {
- #[derive(Deserialize)]
- enum RawInitialState {
- #[serde(rename = "Data state")]
- Data,
- #[serde(rename = "PLAINTEXT state")]
- PlainText,
- #[serde(rename = "RCDATA state")]
- RcData,
- #[serde(rename = "RAWTEXT state")]
- RawText,
- #[serde(rename = "Script data state")]
- ScriptData,
- #[serde(rename = "CDATA section state")]
- CdataSection,
- }
-
- Ok(Self(match RawInitialState::deserialize(deserializer)? {
- RawInitialState::Data => State::Data,
- RawInitialState::PlainText => State::PlainText,
- RawInitialState::RcData => State::RcData,
- RawInitialState::RawText => State::RawText,
- RawInitialState::ScriptData => State::ScriptData,
- RawInitialState::CdataSection => State::CdataSection,
- }))
- }
-}
-
-fn initial_states_default() -> Vec<InitialState> {
- vec![InitialState(State::Data)]
-}
-
-#[derive(Deserialize)]
-#[serde(rename_all = "camelCase")]
-struct Test {
- description: String,
- input: String,
- output: ExpectedOutputTokens,
- #[serde(default = "initial_states_default")]
- initial_states: Vec<InitialState>,
- #[serde(default)]
- double_escaped: bool,
- #[serde(default)]
- last_start_tag: Option<String>,
- #[serde(default)]
- errors: Vec<ParseError>,
-}
-
-#[derive(Debug, Eq, PartialEq)]
-struct ParseErrorInner(Error);
-
-impl<'de> Deserialize<'de> for ParseErrorInner {
- fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
- where
- D: serde::Deserializer<'de>,
- {
- let str_err = String::deserialize(deserializer)?;
- let err: Error = str_err
- .parse()
- .map_err(|_| D::Error::custom(&format!("failed to deserialize error: {}", str_err)))?;
- Ok(ParseErrorInner(err))
- }
-}
-
-#[derive(Deserialize, Debug, Eq, PartialEq)]
-#[serde(rename_all = "camelCase")]
-struct ParseError {
- code: ParseErrorInner,
- // TODO: lineno and column?
-}
-
-#[derive(Deserialize)]
-struct Tests {
- tests: Vec<Test>,
-}
-
-/// Path to a local checkout of [html5lib-tests], relative to the
-/// directory containing the `Cargo.toml` file of the current crate.
-///
-/// [html5lib-tests]: https://github.com/html5lib/html5lib-tests
-const HTML5LIB_TESTS_PATH: &str = "tests/html5lib-tests";
-
-// FUTURE: it would be nice to assert that HTML5LIB_TESTS_PATH matches the path defined in .gitmodules
-// but this is currently blocked by:
-// * Cargo not setting CARGO_WORKSPACE_DIR (see https://github.com/rust-lang/cargo/issues/3946)
-// * gix-config having more dependencies than I'd want to add for this
-
-#[test]
-fn tokenizer() {
- // TODO: use a custom test harness with e.g. libtest-mimic
- let test_dir = format!("{HTML5LIB_TESTS_PATH}/tokenizer");
-
- let mut test_paths = glob::glob(&format!("{test_dir}/*.test"))
- .unwrap()
- .peekable();
-
- if test_paths.peek().is_none() {
- panic!(
- "could not find any .test files in {}, maybe try `git submodule update --init`",
- test_dir
- );
- }
-
- for test_path in test_paths {
- let test_path = test_path.unwrap();
-
- test_tokenizer_file(&test_path);
- }
-}
-
-fn test_tokenizer_file(path: &Path) {
- let fname = path.file_name().unwrap().to_str().unwrap();
-
- if matches!(
- fname,
- // We don't implement "Coercing an HTML DOM into an infoset" section
- "xmlViolation.test" |
- // Our parser does not operate on bytes, the input isn't valid Rust &str
- "unicodeCharsProblematic.test"
- ) {
- return;
- }
-
- let f = File::open(path).unwrap();
- let bf = BufReader::new(f);
- let tests: Tests = serde_json::from_reader(bf).unwrap();
-
- for (i, test) in tests.tests.into_iter().enumerate() {
- run_test(fname, i, test);
- }
-}
-
-fn run_test(fname: &str, test_i: usize, mut test: Test) {
- test.input = if test.double_escaped {
- unescape(&test.input)
- } else {
- test.input
- };
-
- test.output = if test.double_escaped {
- ExpectedOutputTokens(
- test.output
- .0
- .into_iter()
- .map(|token| match token {
- Token::String(x) => Token::String(unescape(&x)),
- Token::Comment(x) => Token::Comment(unescape(&x)),
- token => token,
- })
- .collect(),
- )
- } else {
- ExpectedOutputTokens(test.output.0)
- };
-
- for state in &test.initial_states {
- run_test_inner(
- fname,
- test_i,
- &test,
- state.0,
- Tokenizer::new(&test.input),
- "string",
- );
-
- run_test_inner(
- fname,
- test_i,
- &test,
- state.0,
- Tokenizer::new(BufReader::new(test.input.as_bytes())),
- "bufread",
- );
- }
-}
-
-fn run_test_inner<R: Reader>(
- fname: &str,
- test_i: usize,
- test: &Test,
- state: State,
- mut tokenizer: Tokenizer<R>,
- tokenizer_info: &str,
-) {
- println!(
- "==== FILE {}, TEST {}, STATE {:?}, TOKENIZER {} ====",
- fname, test_i, state, tokenizer_info,
- );
- println!("description: {}", test.description);
- tokenizer.set_internal_state(state);
- tokenizer.set_last_start_tag(test.last_start_tag.as_ref().map(String::as_str));
-
- let mut actual_tokens = Vec::new();
- let mut actual_errors = Vec::new();
-
- for token in tokenizer {
- let token = token.unwrap();
-
- if let Token::Error { error, .. } = token {
- actual_errors.push(ParseError {
- code: ParseErrorInner(error),
- });
- } else {
- actual_tokens.push(token);
- }
- }
-
- assert_eq!(test.output.0, actual_tokens);
-
- if !matches!(
- (fname, test_i),
- // TODO: html5lib-tests bug?
- ("test3.test", 79)
- ) {
- assert_eq!(test.errors, actual_errors);
- }
-}
-
-/// Implements the escape sequences described in the tokenizer tests of html5lib-tests (and nothing
-/// more)
-fn unescape(data: &str) -> String {
- let mut stream = data.chars();
- let mut rv = String::new();
-
- loop {
- match stream.next() {
- Some('\\') => (),
- Some(x) => {
- rv.push(x);
- continue;
- }
- None => break,
- }
-
- match stream.next() {
- Some('u') => (),
- x => panic!("unexpected escape: {:?}", x),
- }
-
- let orig_len = rv.len();
-
- for _ in 0..4 {
- rv.push(match stream.next() {
- Some(x) => x,
- None => panic!("unexpected eof after \\u"),
- });
- }
-
- let c = u32::from_str_radix(&rv[orig_len..], 16).expect("failed to parse as hex");
- let c = char::from_u32(c).expect("bad character");
- rv.truncate(orig_len);
- rv.push(c);
- }
-
- rv
-}