diff options
-rw-r--r-- | Cargo.toml | 4 | ||||
-rw-r--r-- | html5lib_tests/Cargo.toml | 14 | ||||
-rw-r--r-- | html5lib_tests/src/lib.rs | 252 | ||||
-rw-r--r-- | integration_tests/Cargo.toml | 3 | ||||
-rw-r--r-- | integration_tests/tests/test_html5lib.rs | 291 |
5 files changed, 284 insertions, 280 deletions
@@ -1,6 +1,6 @@ [workspace] -members = [".", "integration_tests"] -default-members = [".", "integration_tests"] +members = [".", "html5lib_tests", "integration_tests"] +default-members = [".", "html5lib_tests", "integration_tests"] [package] name = "html5tokenizer" diff --git a/html5lib_tests/Cargo.toml b/html5lib_tests/Cargo.toml new file mode 100644 index 0000000..06fc6b8 --- /dev/null +++ b/html5lib_tests/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "html5lib_tests" +authors = ["Martin Fischer <martin@push-f.com>", "Markus Unterwaditzer <markus-honeypot@unterwaditzer.net>"] +description = "Deserializable types for the .test files from html5lib-tests." +version = "0.0.0" +edition = "2021" +publish = false # prevent accidental publishes until it's ready to be published + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +serde = { version = "1.0.130", features = ["derive"] } +serde_json = "1.0.71" +html5tokenizer = { path = ".." } # TODO: get rid of this dependency diff --git a/html5lib_tests/src/lib.rs b/html5lib_tests/src/lib.rs new file mode 100644 index 0000000..5678b0d --- /dev/null +++ b/html5lib_tests/src/lib.rs @@ -0,0 +1,252 @@ +use html5tokenizer::{Attribute, Doctype, EndTag, Error, StartTag, Token}; +use serde::{de::Error as _, Deserialize}; +use std::collections::BTreeMap; + +pub fn parse_tests( + reader: impl std::io::Read, +) -> Result<impl Iterator<Item = Test>, serde_json::Error> { + let Tests { tests } = serde_json::from_reader(reader)?; + Ok(tests.into_iter().map(undo_double_escaping)) +} + +pub struct ExpectedOutputTokens(pub Vec<Token<()>>); + +impl<'de> Deserialize<'de> for ExpectedOutputTokens { + fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> + where + D: serde::Deserializer<'de>, + { + // this macro is a horrible way to define a type that deserializes only from a particular + // string. Together with serde(untagged) this gives us really flexible enum tagging with really + // terrible error messages. + macro_rules! def_const { + ($str:expr, $ty:ident) => { + #[derive(Deserialize)] + enum $ty { + #[serde(rename = $str)] + $ty, + } + }; + } + + def_const!("DOCTYPE", DoctypeConst); + def_const!("StartTag", StartTagConst); + def_const!("EndTag", EndTagConst); + def_const!("Comment", CommentConst); + def_const!("Character", CharacterConst); + + type Attributes = BTreeMap<String, String>; + + #[derive(Deserialize)] + #[serde(untagged)] + enum OutputToken { + // "DOCTYPE", name, public_id, system_id, correctness + Doctype( + DoctypeConst, + Option<String>, + Option<String>, + Option<String>, + bool, + ), + // "StartTag", name, attributes, self_closing + StartTag(StartTagConst, String, Attributes), + StartTag2(StartTagConst, String, Attributes, bool), + // "EndTag", name + EndTag(EndTagConst, String), + // "Comment", data + Comment(CommentConst, String), + // "Character", data + Character(CharacterConst, String), + } + + Ok(ExpectedOutputTokens( + Vec::deserialize(deserializer)? + .into_iter() + .map(|output_token| match output_token { + OutputToken::Doctype( + _, + name, + public_identifier, + system_identifier, + correctness, + ) => Token::Doctype(Doctype { + name: name.unwrap_or_default(), + public_identifier, + system_identifier, + force_quirks: !correctness, + }), + OutputToken::StartTag(_, name, attributes) => Token::StartTag(StartTag { + self_closing: false, + name, + attributes: attributes + .into_iter() + .map(|(k, v)| { + ( + k, + Attribute { + value: v, + ..Default::default() + }, + ) + }) + .collect(), + name_span: (), + }), + OutputToken::StartTag2(_, name, attributes, self_closing) => { + Token::StartTag(StartTag { + self_closing, + name, + attributes: attributes + .into_iter() + .map(|(k, v)| { + ( + k, + Attribute { + value: v, + ..Default::default() + }, + ) + }) + .collect(), + name_span: (), + }) + } + OutputToken::EndTag(_, name) => Token::EndTag(EndTag { + name, + name_span: (), + }), + OutputToken::Comment(_, data) => Token::Comment(data), + OutputToken::Character(_, data) => Token::String(data), + }) + .collect::<Vec<Token<()>>>(), + )) + } +} + +#[derive(Debug, Deserialize)] +pub enum InitialState { + #[serde(rename = "Data state")] + Data, + #[serde(rename = "PLAINTEXT state")] + PlainText, + #[serde(rename = "RCDATA state")] + RcData, + #[serde(rename = "RAWTEXT state")] + RawText, + #[serde(rename = "Script data state")] + ScriptData, + #[serde(rename = "CDATA section state")] + CdataSection, +} + +fn initial_states_default() -> Vec<InitialState> { + vec![InitialState::Data] +} + +#[derive(Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct Test { + pub description: String, + pub input: String, + pub output: ExpectedOutputTokens, + #[serde(default = "initial_states_default")] + pub initial_states: Vec<InitialState>, + #[serde(default)] + double_escaped: bool, + #[serde(default)] + pub last_start_tag: Option<String>, + #[serde(default)] + pub errors: Vec<ParseError>, +} + +#[derive(Debug, Eq, PartialEq)] +pub struct ParseErrorInner(pub Error); + +impl<'de> Deserialize<'de> for ParseErrorInner { + fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> + where + D: serde::Deserializer<'de>, + { + let str_err = String::deserialize(deserializer)?; + let err: Error = str_err + .parse() + .map_err(|_| D::Error::custom(&format!("failed to deserialize error: {}", str_err)))?; + Ok(ParseErrorInner(err)) + } +} + +#[derive(Deserialize, Debug, Eq, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct ParseError { + pub code: ParseErrorInner, + // TODO: lineno and column? +} + +#[derive(Deserialize)] +struct Tests { + tests: Vec<Test>, +} + +fn undo_double_escaping(mut test: Test) -> Test { + test.input = if test.double_escaped { + unescape(&test.input) + } else { + test.input + }; + + test.output = if test.double_escaped { + ExpectedOutputTokens( + test.output + .0 + .into_iter() + .map(|token| match token { + Token::String(x) => Token::String(unescape(&x)), + Token::Comment(x) => Token::Comment(unescape(&x)), + token => token, + }) + .collect(), + ) + } else { + ExpectedOutputTokens(test.output.0) + }; + test +} + +/// Implements the escape sequences described in the tokenizer tests of html5lib-tests (and nothing +/// more) +fn unescape(data: &str) -> String { + let mut stream = data.chars(); + let mut rv = String::new(); + + loop { + match stream.next() { + Some('\\') => (), + Some(x) => { + rv.push(x); + continue; + } + None => break, + } + + match stream.next() { + Some('u') => (), + x => panic!("unexpected escape: {:?}", x), + } + + let orig_len = rv.len(); + + for _ in 0..4 { + rv.push(match stream.next() { + Some(x) => x, + None => panic!("unexpected eof after \\u"), + }); + } + + let c = u32::from_str_radix(&rv[orig_len..], 16).expect("failed to parse as hex"); + let c = char::from_u32(c).expect("bad character"); + rv.truncate(orig_len); + rv.push(c); + } + + rv +} diff --git a/integration_tests/Cargo.toml b/integration_tests/Cargo.toml index 1e68a0b..cc27798 100644 --- a/integration_tests/Cargo.toml +++ b/integration_tests/Cargo.toml @@ -16,7 +16,6 @@ edition = "2021" [dev-dependencies] glob = "0.3.1" +html5lib_tests = { path = "../html5lib_tests" } html5tokenizer = { path = "..", features = ["integration-tests"] } pretty_assertions = "1.0.0" -serde = { version = "1.0.130", features = ["derive"] } -serde_json = "1.0.71" diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs index 61e2133..3236f0f 100644 --- a/integration_tests/tests/test_html5lib.rs +++ b/integration_tests/tests/test_html5lib.rs @@ -1,212 +1,8 @@ -use html5tokenizer::{ - Attribute, Doctype, EndTag, Error, InternalState as State, Reader, StartTag, Token, Tokenizer, -}; -use pretty_assertions::assert_eq; -use serde::{de::Error as _, Deserialize}; -use std::{collections::BTreeMap, fs::File, io::BufReader, path::Path}; - -fn parse_tests( - reader: impl std::io::Read, -) -> Result<impl Iterator<Item = Test>, serde_json::Error> { - let Tests { tests } = serde_json::from_reader(reader)?; - Ok(tests.into_iter().map(undo_double_escaping)) -} - -struct ExpectedOutputTokens(Vec<Token<()>>); - -impl<'de> Deserialize<'de> for ExpectedOutputTokens { - fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> - where - D: serde::Deserializer<'de>, - { - // this macro is a horrible way to define a type that deserializes only from a particular - // string. Together with serde(untagged) this gives us really flexible enum tagging with really - // terrible error messages. - macro_rules! def_const { - ($str:expr, $ty:ident) => { - #[derive(Deserialize)] - enum $ty { - #[serde(rename = $str)] - $ty, - } - }; - } - - def_const!("DOCTYPE", DoctypeConst); - def_const!("StartTag", StartTagConst); - def_const!("EndTag", EndTagConst); - def_const!("Comment", CommentConst); - def_const!("Character", CharacterConst); - - type Attributes = BTreeMap<String, String>; - - #[derive(Deserialize)] - #[serde(untagged)] - enum OutputToken { - // "DOCTYPE", name, public_id, system_id, correctness - Doctype( - DoctypeConst, - Option<String>, - Option<String>, - Option<String>, - bool, - ), - // "StartTag", name, attributes, self_closing - StartTag(StartTagConst, String, Attributes), - StartTag2(StartTagConst, String, Attributes, bool), - // "EndTag", name - EndTag(EndTagConst, String), - // "Comment", data - Comment(CommentConst, String), - // "Character", data - Character(CharacterConst, String), - } - - Ok(ExpectedOutputTokens( - Vec::deserialize(deserializer)? - .into_iter() - .map(|output_token| match output_token { - OutputToken::Doctype( - _, - name, - public_identifier, - system_identifier, - correctness, - ) => Token::Doctype(Doctype { - name: name.unwrap_or_default(), - public_identifier, - system_identifier, - force_quirks: !correctness, - }), - OutputToken::StartTag(_, name, attributes) => Token::StartTag(StartTag { - self_closing: false, - name, - attributes: attributes - .into_iter() - .map(|(k, v)| { - ( - k, - Attribute { - value: v, - ..Default::default() - }, - ) - }) - .collect(), - name_span: (), - }), - OutputToken::StartTag2(_, name, attributes, self_closing) => { - Token::StartTag(StartTag { - self_closing, - name, - attributes: attributes - .into_iter() - .map(|(k, v)| { - ( - k, - Attribute { - value: v, - ..Default::default() - }, - ) - }) - .collect(), - name_span: (), - }) - } - OutputToken::EndTag(_, name) => Token::EndTag(EndTag { - name, - name_span: (), - }), - OutputToken::Comment(_, data) => Token::Comment(data), - OutputToken::Character(_, data) => Token::String(data), - }) - .collect::<Vec<Token<()>>>(), - )) - } -} - -struct InitialState(State); - -impl<'de> Deserialize<'de> for InitialState { - fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> - where - D: serde::Deserializer<'de>, - { - #[derive(Deserialize)] - enum RawInitialState { - #[serde(rename = "Data state")] - Data, - #[serde(rename = "PLAINTEXT state")] - PlainText, - #[serde(rename = "RCDATA state")] - RcData, - #[serde(rename = "RAWTEXT state")] - RawText, - #[serde(rename = "Script data state")] - ScriptData, - #[serde(rename = "CDATA section state")] - CdataSection, - } - - Ok(Self(match RawInitialState::deserialize(deserializer)? { - RawInitialState::Data => State::Data, - RawInitialState::PlainText => State::PlainText, - RawInitialState::RcData => State::RcData, - RawInitialState::RawText => State::RawText, - RawInitialState::ScriptData => State::ScriptData, - RawInitialState::CdataSection => State::CdataSection, - })) - } -} - -fn initial_states_default() -> Vec<InitialState> { - vec![InitialState(State::Data)] -} - -#[derive(Deserialize)] -#[serde(rename_all = "camelCase")] -struct Test { - description: String, - input: String, - output: ExpectedOutputTokens, - #[serde(default = "initial_states_default")] - initial_states: Vec<InitialState>, - #[serde(default)] - double_escaped: bool, - #[serde(default)] - last_start_tag: Option<String>, - #[serde(default)] - errors: Vec<ParseError>, -} - -#[derive(Debug, Eq, PartialEq)] -struct ParseErrorInner(Error); +use std::{fs::File, io::BufReader, path::Path}; -impl<'de> Deserialize<'de> for ParseErrorInner { - fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> - where - D: serde::Deserializer<'de>, - { - let str_err = String::deserialize(deserializer)?; - let err: Error = str_err - .parse() - .map_err(|_| D::Error::custom(&format!("failed to deserialize error: {}", str_err)))?; - Ok(ParseErrorInner(err)) - } -} - -#[derive(Deserialize, Debug, Eq, PartialEq)] -#[serde(rename_all = "camelCase")] -struct ParseError { - code: ParseErrorInner, - // TODO: lineno and column? -} - -#[derive(Deserialize)] -struct Tests { - tests: Vec<Test>, -} +use html5lib_tests::{parse_tests, InitialState, ParseError, ParseErrorInner, Test}; +use html5tokenizer::{InternalState, Reader, Token, Tokenizer}; +use pretty_assertions::assert_eq; /// Path to a local checkout of [html5lib-tests], relative to the /// directory containing the `Cargo.toml` file of the current crate. @@ -264,38 +60,13 @@ fn test_tokenizer_file(path: &Path) { } } -fn undo_double_escaping(mut test: Test) -> Test { - test.input = if test.double_escaped { - unescape(&test.input) - } else { - test.input - }; - - test.output = if test.double_escaped { - ExpectedOutputTokens( - test.output - .0 - .into_iter() - .map(|token| match token { - Token::String(x) => Token::String(unescape(&x)), - Token::Comment(x) => Token::Comment(unescape(&x)), - token => token, - }) - .collect(), - ) - } else { - ExpectedOutputTokens(test.output.0) - }; - test -} - fn run_test(fname: &str, test_i: usize, test: Test) { for state in &test.initial_states { run_test_inner( fname, test_i, &test, - state.0, + state, Tokenizer::new(&test.input), "string", ); @@ -304,7 +75,7 @@ fn run_test(fname: &str, test_i: usize, test: Test) { fname, test_i, &test, - state.0, + state, Tokenizer::new(BufReader::new(test.input.as_bytes())), "bufread", ); @@ -315,7 +86,7 @@ fn run_test_inner<R: Reader>( fname: &str, test_i: usize, test: &Test, - state: State, + state: &InitialState, mut tokenizer: Tokenizer<R>, tokenizer_info: &str, ) { @@ -324,7 +95,14 @@ fn run_test_inner<R: Reader>( fname, test_i, state, tokenizer_info, ); println!("description: {}", test.description); - tokenizer.set_internal_state(state); + tokenizer.set_internal_state(match state { + InitialState::Data => InternalState::Data, + InitialState::PlainText => InternalState::PlainText, + InitialState::RcData => InternalState::RcData, + InitialState::RawText => InternalState::RawText, + InitialState::ScriptData => InternalState::ScriptData, + InitialState::CdataSection => InternalState::CdataSection, + }); if let Some(last_start_tag) = &test.last_start_tag { tokenizer.set_last_start_tag(last_start_tag); } @@ -348,42 +126,3 @@ fn run_test_inner<R: Reader>( assert_eq!(test.errors, actual_errors); } - -/// Implements the escape sequences described in the tokenizer tests of html5lib-tests (and nothing -/// more) -fn unescape(data: &str) -> String { - let mut stream = data.chars(); - let mut rv = String::new(); - - loop { - match stream.next() { - Some('\\') => (), - Some(x) => { - rv.push(x); - continue; - } - None => break, - } - - match stream.next() { - Some('u') => (), - x => panic!("unexpected escape: {:?}", x), - } - - let orig_len = rv.len(); - - for _ in 0..4 { - rv.push(match stream.next() { - Some(x) => x, - None => panic!("unexpected eof after \\u"), - }); - } - - let c = u32::from_str_radix(&rv[orig_len..], 16).expect("failed to parse as hex"); - let c = char::from_u32(c).expect("bad character"); - rv.truncate(orig_len); - rv.push(c); - } - - rv -} |