diff options
Diffstat (limited to 'tests')
m--------- | tests/html5lib-tests | 0 | ||||
-rw-r--r-- | tests/test_html5lib.rs | 388 |
2 files changed, 0 insertions, 388 deletions
diff --git a/tests/html5lib-tests b/tests/html5lib-tests deleted file mode 160000 -Subproject 6030cb6e40a0cf68ae38bf0001bb85b727b80a2 diff --git a/tests/test_html5lib.rs b/tests/test_html5lib.rs deleted file mode 100644 index fc5e89c..0000000 --- a/tests/test_html5lib.rs +++ /dev/null @@ -1,388 +0,0 @@ -use html5tokenizer::{ - Attribute, Doctype, EndTag, Error, InternalState as State, Reader, StartTag, Token, Tokenizer, -}; -use pretty_assertions::assert_eq; -use serde::{de::Error as _, Deserialize}; -use std::{collections::BTreeMap, fs::File, io::BufReader, path::Path}; - -#[cfg(not(feature = "integration-tests"))] -compile_error!( - "integration tests need the integration-tests feature enabled. Run cargo test --all-features" -); - -struct ExpectedOutputTokens(Vec<Token<()>>); - -impl<'de> Deserialize<'de> for ExpectedOutputTokens { - fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> - where - D: serde::Deserializer<'de>, - { - // this macro is a horrible way to define a type that deserializes only from a particular - // string. Together with serde(untagged) this gives us really flexible enum tagging with really - // terrible error messages. - macro_rules! def_const { - ($str:expr, $ty:ident) => { - #[derive(Deserialize)] - enum $ty { - #[serde(rename = $str)] - $ty, - } - }; - } - - def_const!("DOCTYPE", DoctypeConst); - def_const!("StartTag", StartTagConst); - def_const!("EndTag", EndTagConst); - def_const!("Comment", CommentConst); - def_const!("Character", CharacterConst); - - type Attributes = BTreeMap<String, String>; - - #[derive(Deserialize)] - #[serde(untagged)] - enum OutputToken { - // "DOCTYPE", name, public_id, system_id, correctness - Doctype( - DoctypeConst, - Option<String>, - Option<String>, - Option<String>, - bool, - ), - // "StartTag", name, attributes, self_closing - StartTag(StartTagConst, String, Attributes), - StartTag2(StartTagConst, String, Attributes, bool), - // "EndTag", name - EndTag(EndTagConst, String), - // "Comment", data - Comment(CommentConst, String), - // "Character", data - Character(CharacterConst, String), - } - - Ok(ExpectedOutputTokens( - Vec::deserialize(deserializer)? - .into_iter() - .map(|output_token| match output_token { - OutputToken::Doctype( - _, - name, - public_identifier, - system_identifier, - correctness, - ) => Token::Doctype(Doctype { - name: name.unwrap_or_default(), - public_identifier, - system_identifier, - force_quirks: !correctness, - }), - OutputToken::StartTag(_, name, attributes) => Token::StartTag(StartTag { - self_closing: false, - name, - attributes: attributes - .into_iter() - .map(|(k, v)| { - ( - k, - Attribute { - value: v, - ..Default::default() - }, - ) - }) - .collect(), - name_span: (), - }), - OutputToken::StartTag2(_, name, attributes, self_closing) => { - Token::StartTag(StartTag { - self_closing, - name, - attributes: attributes - .into_iter() - .map(|(k, v)| { - ( - k, - Attribute { - value: v, - ..Default::default() - }, - ) - }) - .collect(), - name_span: (), - }) - } - OutputToken::EndTag(_, name) => Token::EndTag(EndTag { - name, - name_span: (), - }), - OutputToken::Comment(_, data) => Token::Comment(data), - OutputToken::Character(_, data) => Token::String(data), - }) - .collect::<Vec<Token<()>>>(), - )) - } -} - -struct InitialState(State); - -impl<'de> Deserialize<'de> for InitialState { - fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> - where - D: serde::Deserializer<'de>, - { - #[derive(Deserialize)] - enum RawInitialState { - #[serde(rename = "Data state")] - Data, - #[serde(rename = "PLAINTEXT state")] - PlainText, - #[serde(rename = "RCDATA state")] - RcData, - #[serde(rename = "RAWTEXT state")] - RawText, - #[serde(rename = "Script data state")] - ScriptData, - #[serde(rename = "CDATA section state")] - CdataSection, - } - - Ok(Self(match RawInitialState::deserialize(deserializer)? { - RawInitialState::Data => State::Data, - RawInitialState::PlainText => State::PlainText, - RawInitialState::RcData => State::RcData, - RawInitialState::RawText => State::RawText, - RawInitialState::ScriptData => State::ScriptData, - RawInitialState::CdataSection => State::CdataSection, - })) - } -} - -fn initial_states_default() -> Vec<InitialState> { - vec![InitialState(State::Data)] -} - -#[derive(Deserialize)] -#[serde(rename_all = "camelCase")] -struct Test { - description: String, - input: String, - output: ExpectedOutputTokens, - #[serde(default = "initial_states_default")] - initial_states: Vec<InitialState>, - #[serde(default)] - double_escaped: bool, - #[serde(default)] - last_start_tag: Option<String>, - #[serde(default)] - errors: Vec<ParseError>, -} - -#[derive(Debug, Eq, PartialEq)] -struct ParseErrorInner(Error); - -impl<'de> Deserialize<'de> for ParseErrorInner { - fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> - where - D: serde::Deserializer<'de>, - { - let str_err = String::deserialize(deserializer)?; - let err: Error = str_err - .parse() - .map_err(|_| D::Error::custom(&format!("failed to deserialize error: {}", str_err)))?; - Ok(ParseErrorInner(err)) - } -} - -#[derive(Deserialize, Debug, Eq, PartialEq)] -#[serde(rename_all = "camelCase")] -struct ParseError { - code: ParseErrorInner, - // TODO: lineno and column? -} - -#[derive(Deserialize)] -struct Tests { - tests: Vec<Test>, -} - -/// Path to a local checkout of [html5lib-tests], relative to the -/// directory containing the `Cargo.toml` file of the current crate. -/// -/// [html5lib-tests]: https://github.com/html5lib/html5lib-tests -const HTML5LIB_TESTS_PATH: &str = "tests/html5lib-tests"; - -// FUTURE: it would be nice to assert that HTML5LIB_TESTS_PATH matches the path defined in .gitmodules -// but this is currently blocked by: -// * Cargo not setting CARGO_WORKSPACE_DIR (see https://github.com/rust-lang/cargo/issues/3946) -// * gix-config having more dependencies than I'd want to add for this - -#[test] -fn tokenizer() { - // TODO: use a custom test harness with e.g. libtest-mimic - let test_dir = format!("{HTML5LIB_TESTS_PATH}/tokenizer"); - - let mut test_paths = glob::glob(&format!("{test_dir}/*.test")) - .unwrap() - .peekable(); - - if test_paths.peek().is_none() { - panic!( - "could not find any .test files in {}, maybe try `git submodule update --init`", - test_dir - ); - } - - for test_path in test_paths { - let test_path = test_path.unwrap(); - - test_tokenizer_file(&test_path); - } -} - -fn test_tokenizer_file(path: &Path) { - let fname = path.file_name().unwrap().to_str().unwrap(); - - if matches!( - fname, - // We don't implement "Coercing an HTML DOM into an infoset" section - "xmlViolation.test" | - // Our parser does not operate on bytes, the input isn't valid Rust &str - "unicodeCharsProblematic.test" - ) { - return; - } - - let f = File::open(path).unwrap(); - let bf = BufReader::new(f); - let tests: Tests = serde_json::from_reader(bf).unwrap(); - - for (i, test) in tests.tests.into_iter().enumerate() { - run_test(fname, i, test); - } -} - -fn run_test(fname: &str, test_i: usize, mut test: Test) { - test.input = if test.double_escaped { - unescape(&test.input) - } else { - test.input - }; - - test.output = if test.double_escaped { - ExpectedOutputTokens( - test.output - .0 - .into_iter() - .map(|token| match token { - Token::String(x) => Token::String(unescape(&x)), - Token::Comment(x) => Token::Comment(unescape(&x)), - token => token, - }) - .collect(), - ) - } else { - ExpectedOutputTokens(test.output.0) - }; - - for state in &test.initial_states { - run_test_inner( - fname, - test_i, - &test, - state.0, - Tokenizer::new(&test.input), - "string", - ); - - run_test_inner( - fname, - test_i, - &test, - state.0, - Tokenizer::new(BufReader::new(test.input.as_bytes())), - "bufread", - ); - } -} - -fn run_test_inner<R: Reader>( - fname: &str, - test_i: usize, - test: &Test, - state: State, - mut tokenizer: Tokenizer<R>, - tokenizer_info: &str, -) { - println!( - "==== FILE {}, TEST {}, STATE {:?}, TOKENIZER {} ====", - fname, test_i, state, tokenizer_info, - ); - println!("description: {}", test.description); - tokenizer.set_internal_state(state); - tokenizer.set_last_start_tag(test.last_start_tag.as_ref().map(String::as_str)); - - let mut actual_tokens = Vec::new(); - let mut actual_errors = Vec::new(); - - for token in tokenizer { - let token = token.unwrap(); - - if let Token::Error { error, .. } = token { - actual_errors.push(ParseError { - code: ParseErrorInner(error), - }); - } else { - actual_tokens.push(token); - } - } - - assert_eq!(test.output.0, actual_tokens); - - if !matches!( - (fname, test_i), - // TODO: html5lib-tests bug? - ("test3.test", 79) - ) { - assert_eq!(test.errors, actual_errors); - } -} - -/// Implements the escape sequences described in the tokenizer tests of html5lib-tests (and nothing -/// more) -fn unescape(data: &str) -> String { - let mut stream = data.chars(); - let mut rv = String::new(); - - loop { - match stream.next() { - Some('\\') => (), - Some(x) => { - rv.push(x); - continue; - } - None => break, - } - - match stream.next() { - Some('u') => (), - x => panic!("unexpected escape: {:?}", x), - } - - let orig_len = rv.len(); - - for _ in 0..4 { - rv.push(match stream.next() { - Some(x) => x, - None => panic!("unexpected eof after \\u"), - }); - } - - let c = u32::from_str_radix(&rv[orig_len..], 16).expect("failed to parse as hex"); - let c = char::from_u32(c).expect("bad character"); - rv.truncate(orig_len); - rv.push(c); - } - - rv -} |