-rw-r--r--  html5lib_tests/Cargo.toml                |   1
-rw-r--r--  html5lib_tests/src/lib.rs                | 237
-rw-r--r--  integration_tests/tests/test_html5lib.rs |  44
3 files changed, 137 insertions, 145 deletions
diff --git a/html5lib_tests/Cargo.toml b/html5lib_tests/Cargo.toml
index 06fc6b8..66e4624 100644
--- a/html5lib_tests/Cargo.toml
+++ b/html5lib_tests/Cargo.toml
@@ -11,4 +11,3 @@ publish = false # prevent accidental publishes until it's ready to be published
 [dependencies]
 serde = { version = "1.0.130", features = ["derive"] }
 serde_json = "1.0.71"
-html5tokenizer = { path = ".." } # TODO: get rid of this dependency
diff --git a/html5lib_tests/src/lib.rs b/html5lib_tests/src/lib.rs
index c007317..6cf46db 100644
--- a/html5lib_tests/src/lib.rs
+++ b/html5lib_tests/src/lib.rs
@@ -1,5 +1,4 @@
-use html5tokenizer::{Attribute, Doctype, EndTag, Error, StartTag, Token};
-use serde::{de::Error as _, Deserialize};
+use serde::{de, Deserialize};
 use std::collections::BTreeMap;
 
 pub fn parse_tests(
@@ -9,120 +8,6 @@
     Ok(tests.into_iter().map(undo_double_escaping))
 }
 
-pub struct ExpectedOutputTokens(pub Vec<Token<()>>);
-
-impl<'de> Deserialize<'de> for ExpectedOutputTokens {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        // this macro is a horrible way to define a type that deserializes only from a particular
-        // string. Together with serde(untagged) this gives us really flexible enum tagging with really
-        // terrible error messages.
-        macro_rules! def_const {
-            ($str:expr, $ty:ident) => {
-                #[derive(Deserialize)]
-                enum $ty {
-                    #[serde(rename = $str)]
-                    $ty,
-                }
-            };
-        }
-
-        def_const!("DOCTYPE", DoctypeConst);
-        def_const!("StartTag", StartTagConst);
-        def_const!("EndTag", EndTagConst);
-        def_const!("Comment", CommentConst);
-        def_const!("Character", CharacterConst);
-
-        type Attributes = BTreeMap<String, String>;
-
-        #[derive(Deserialize)]
-        #[serde(untagged)]
-        enum OutputToken {
-            // "DOCTYPE", name, public_id, system_id, correctness
-            Doctype(
-                DoctypeConst,
-                Option<String>,
-                Option<String>,
-                Option<String>,
-                bool,
-            ),
-            // "StartTag", name, attributes, self_closing
-            StartTag(StartTagConst, String, Attributes),
-            StartTag2(StartTagConst, String, Attributes, bool),
-            // "EndTag", name
-            EndTag(EndTagConst, String),
-            // "Comment", data
-            Comment(CommentConst, String),
-            // "Character", data
-            Character(CharacterConst, String),
-        }
-
-        Ok(ExpectedOutputTokens(
-            Vec::deserialize(deserializer)?
-                .into_iter()
-                .map(|output_token| match output_token {
-                    OutputToken::Doctype(
-                        _,
-                        name,
-                        public_identifier,
-                        system_identifier,
-                        correctness,
-                    ) => Token::Doctype(Doctype {
-                        name: name.unwrap_or_default(),
-                        public_identifier,
-                        system_identifier,
-                        force_quirks: !correctness,
-                    }),
-                    OutputToken::StartTag(_, name, attributes) => Token::StartTag(StartTag {
-                        self_closing: false,
-                        name,
-                        attributes: attributes
-                            .into_iter()
-                            .map(|(k, v)| {
-                                (
-                                    k,
-                                    Attribute {
-                                        value: v,
-                                        ..Default::default()
-                                    },
-                                )
-                            })
-                            .collect(),
-                        name_span: (),
-                    }),
-                    OutputToken::StartTag2(_, name, attributes, self_closing) => {
-                        Token::StartTag(StartTag {
-                            self_closing,
-                            name,
-                            attributes: attributes
-                                .into_iter()
-                                .map(|(k, v)| {
-                                    (
-                                        k,
-                                        Attribute {
-                                            value: v,
-                                            ..Default::default()
-                                        },
-                                    )
-                                })
-                                .collect(),
-                            name_span: (),
-                        })
-                    }
-                    OutputToken::EndTag(_, name) => Token::EndTag(EndTag {
-                        name,
-                        name_span: (),
-                    }),
-                    OutputToken::Comment(_, data) => Token::Comment(data),
-                    OutputToken::Character(_, data) => Token::String(data),
-                })
-                .collect::<Vec<Token<()>>>(),
-        ))
-    }
-}
-
 #[derive(Debug, Deserialize)]
 pub enum InitialState {
     #[serde(rename = "Data state")]
@@ -148,37 +33,127 @@
 pub struct Test {
     pub description: String,
     pub input: String,
-    pub output: ExpectedOutputTokens,
     #[serde(default = "initial_states_default")]
     pub initial_states: Vec<InitialState>,
+    #[serde(flatten)]
+    pub output: Output,
     #[serde(default)]
     double_escaped: bool,
     #[serde(default)]
     pub last_start_tag: Option<String>,
+}
+
+#[derive(Deserialize, PartialEq, Eq, Debug)]
+pub struct Output {
     #[serde(default)]
-    pub errors: Vec<ParseError>,
+    pub errors: Vec<Error>,
+    #[serde(rename = "output")]
+    pub tokens: Vec<Token>,
 }
 
-#[derive(Debug, Eq, PartialEq)]
-pub struct ParseErrorInner(pub Error);
+#[derive(Debug, PartialEq, Eq)]
+pub enum Token {
+    Doctype {
+        name: Option<String>,
+        public_id: Option<String>,
+        system_id: Option<String>,
+        force_quirks: bool,
+    },
+    StartTag {
+        name: String,
+        attributes: BTreeMap<String, String>,
+        self_closing: bool,
+    },
+    EndTag {
+        name: String,
+    },
+    Comment(String),
+    Character(String),
+}
 
-impl<'de> Deserialize<'de> for ParseErrorInner {
+impl<'de> Deserialize<'de> for Token {
     fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
     where
         D: serde::Deserializer<'de>,
     {
-        let str_err = String::deserialize(deserializer)?;
-        let err: Error = str_err
-            .parse()
-            .map_err(|_| D::Error::custom(&format!("failed to deserialize error: {}", str_err)))?;
-        Ok(ParseErrorInner(err))
+        deserializer.deserialize_seq(TokenVisitor)
+    }
+}
+
+#[derive(Deserialize)]
+enum TokenType {
+    #[serde(rename = "DOCTYPE")]
+    Doctype,
+    StartTag,
+    EndTag,
+    Comment,
+    Character,
+}
+
+struct TokenVisitor;
+
+impl<'de> de::Visitor<'de> for TokenVisitor {
+    type Value = Token;
+
+    fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+        formatter.write_str("an array describing a token")
+    }
+
+    fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
+    where
+        A: serde::de::SeqAccess<'de>,
+    {
+        let typ: TokenType = seq.next_element()?.ok_or(
+            de::Error::custom(
+                r#"expected first array element to be one of "DOCTYPE", "StartTag", "EndTag", "Comment" or "Character""#,
+            )
+        )?;
+
+        Ok(match typ {
+            TokenType::Doctype => Token::Doctype {
+                name: seq
+                    .next_element()?
+                    .ok_or(de::Error::missing_field("name"))?,
+                public_id: seq
+                    .next_element()?
+                    .ok_or(de::Error::missing_field("public_id"))?,
+                system_id: seq
+                    .next_element()?
+                    .ok_or(de::Error::missing_field("system_id"))?,
+                force_quirks: !seq
+                    .next_element()?
+                    .ok_or(de::Error::missing_field("correctness"))?,
+            },
+            TokenType::StartTag => Token::StartTag {
+                name: seq
+                    .next_element()?
+                    .ok_or(de::Error::missing_field("name"))?,
+                attributes: seq
+                    .next_element()?
+                    .ok_or(de::Error::missing_field("attributes"))?,
+                self_closing: seq.next_element()?.unwrap_or_default(),
+            },
+            TokenType::EndTag => Token::EndTag {
+                name: seq
+                    .next_element()?
+                    .ok_or(de::Error::missing_field("name"))?,
+            },
+            TokenType::Comment => Token::Comment(
+                seq.next_element()?
+                    .ok_or(de::Error::missing_field("data"))?,
+            ),
+            TokenType::Character => Token::Character(
+                seq.next_element()?
+                    .ok_or(de::Error::missing_field("data"))?,
+            ),
+        })
     }
 }
 
 #[derive(Deserialize, Debug, Eq, PartialEq)]
 #[serde(rename_all = "camelCase")]
-pub struct ParseError {
-    pub code: ParseErrorInner,
+pub struct Error {
+    pub code: String,
     // TODO: lineno and column?
 }
@@ -191,12 +166,12 @@
 fn undo_double_escaping(mut test: Test) -> Test {
     if test.double_escaped {
         test.input = unescape(&test.input);
-        test.output.0 = test
+        test.output.tokens = test
             .output
-            .0
+            .tokens
            .into_iter()
             .map(|token| match token {
-                Token::String(x) => Token::String(unescape(&x)),
+                Token::Character(x) => Token::Character(unescape(&x)),
                 Token::Comment(x) => Token::Comment(unescape(&x)),
                 token => token,
             })
diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs
index 3236f0f..23adec0 100644
--- a/integration_tests/tests/test_html5lib.rs
+++ b/integration_tests/tests/test_html5lib.rs
@@ -1,6 +1,8 @@
 use std::{fs::File, io::BufReader, path::Path};
 
-use html5lib_tests::{parse_tests, InitialState, ParseError, ParseErrorInner, Test};
+use html5lib_tests::{
+    parse_tests, Error as TestError, InitialState, Output, Test, Token as TestToken,
+};
 use html5tokenizer::{InternalState, Reader, Token, Tokenizer};
 use pretty_assertions::assert_eq;
 
@@ -107,22 +109,38 @@ fn run_test_inner<R: Reader>(
         tokenizer.set_last_start_tag(last_start_tag);
     }
 
-    let mut actual_tokens = Vec::new();
-    let mut actual_errors = Vec::new();
+    let mut actual = Output {
+        errors: Vec::new(),
+        tokens: Vec::new(),
+    };
 
     for token in tokenizer {
         let token = token.unwrap();
 
-        if let Token::Error { error, .. } = token {
-            actual_errors.push(ParseError {
-                code: ParseErrorInner(error),
-            });
-        } else {
-            actual_tokens.push(token);
-        }
+        match token {
+            Token::Error { error, .. } => actual.errors.push(TestError {
+                code: error.to_string(),
+            }),
+            Token::StartTag(tag) => actual.tokens.push(TestToken::StartTag {
+                name: tag.name,
+                attributes: tag
+                    .attributes
+                    .into_iter()
+                    .map(|(name, map_val)| (name, map_val.value))
+                    .collect(),
+                self_closing: tag.self_closing,
+            }),
+            Token::EndTag(tag) => actual.tokens.push(TestToken::EndTag { name: tag.name }),
+            Token::String(data) => actual.tokens.push(TestToken::Character(data)),
+            Token::Comment(data) => actual.tokens.push(TestToken::Comment(data)),
+            Token::Doctype(doctype) => actual.tokens.push(TestToken::Doctype {
+                name: Some(doctype.name).filter(|name| !name.is_empty()),
+                public_id: doctype.public_identifier,
+                system_id: doctype.system_identifier,
+                force_quirks: doctype.force_quirks,
+            }),
+        };
     }
 
-    assert_eq!(test.output.0, actual_tokens);
-
-    assert_eq!(test.errors, actual_errors);
+    assert_eq!(test.output, actual);
 }
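
Aside, not part of the patch: html5lib encodes each expected token as a positionally tagged JSON array, which is why the new Token deserializer goes through deserialize_seq and a hand-written TokenVisitor instead of a derive. A minimal sketch of the resulting behavior, assuming the html5lib_tests crate and serde_json are available as dependencies; the JSON literals are made-up test data:

    use html5lib_tests::Token;

    fn main() {
        // ["StartTag", name, attributes, self_closing?]: the trailing bool is
        // optional, so unwrap_or_default() turns a missing element into false.
        let tag: Token =
            serde_json::from_str(r#"["StartTag", "a", {"href": "x"}, true]"#).unwrap();
        assert_eq!(
            tag,
            Token::StartTag {
                name: "a".into(),
                attributes: [("href".into(), "x".into())].into_iter().collect(),
                self_closing: true,
            }
        );

        // ["DOCTYPE", name, public_id, system_id, correctness]: the visitor
        // inverts the html5lib "correctness" flag into force_quirks.
        let doctype: Token =
            serde_json::from_str(r#"["DOCTYPE", "html", null, null, true]"#).unwrap();
        assert_eq!(
            doctype,
            Token::Doctype {
                name: Some("html".into()),
                public_id: None,
                system_id: None,
                force_quirks: false,
            }
        );
    }

The #[serde(flatten)] on Test::output pairs with this: a test file stores "errors" and "output" as sibling keys of the test object, but the harness can now collect both into a single Output value and compare expected against actual with one assert_eq!.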