use html5tokenizer::{Attribute, Doctype, EndTag, Error, StartTag, Token};
use serde::{de::Error as _, Deserialize};
use std::collections::BTreeMap;

pub fn parse_tests(
    reader: impl std::io::Read,
) -> Result<impl Iterator<Item = Test>, serde_json::Error> {
    let Tests { tests } = serde_json::from_reader(reader)?;
    Ok(tests.into_iter().map(undo_double_escaping))
}

pub struct ExpectedOutputTokens(pub Vec<Token<()>>);

impl<'de> Deserialize<'de> for ExpectedOutputTokens {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        // this macro is a horrible way to define a type that deserializes only from a particular
        // string. Together with serde(untagged) this gives us really flexible enum tagging with
        // really terrible error messages.
        macro_rules! def_const {
            ($str:expr, $ty:ident) => {
                #[derive(Deserialize)]
                enum $ty {
                    #[serde(rename = $str)]
                    $ty,
                }
            };
        }

        def_const!("DOCTYPE", DoctypeConst);
        def_const!("StartTag", StartTagConst);
        def_const!("EndTag", EndTagConst);
        def_const!("Comment", CommentConst);
        def_const!("Character", CharacterConst);

        type Attributes = BTreeMap<String, String>;

        #[derive(Deserialize)]
        #[serde(untagged)]
        enum OutputToken {
            // "DOCTYPE", name, public_id, system_id, correctness
            Doctype(
                DoctypeConst,
                Option<String>,
                Option<String>,
                Option<String>,
                bool,
            ),
            // "StartTag", name, attributes
            StartTag(StartTagConst, String, Attributes),
            // "StartTag", name, attributes, self_closing
            StartTag2(StartTagConst, String, Attributes, bool),
            // "EndTag", name
            EndTag(EndTagConst, String),
            // "Comment", data
            Comment(CommentConst, String),
            // "Character", data
            Character(CharacterConst, String),
        }

        Ok(ExpectedOutputTokens(
            Vec::deserialize(deserializer)?
                .into_iter()
                .map(|output_token| match output_token {
                    OutputToken::Doctype(
                        _,
                        name,
                        public_identifier,
                        system_identifier,
                        correctness,
                    ) => Token::Doctype(Doctype {
                        name: name.unwrap_or_default(),
                        public_identifier,
                        system_identifier,
                        force_quirks: !correctness,
                    }),
                    OutputToken::StartTag(_, name, attributes) => Token::StartTag(StartTag {
                        self_closing: false,
                        name,
                        attributes: attributes
                            .into_iter()
                            .map(|(k, v)| {
                                (
                                    k,
                                    Attribute {
                                        value: v,
                                        ..Default::default()
                                    },
                                )
                            })
                            .collect(),
                        name_span: (),
                    }),
                    OutputToken::StartTag2(_, name, attributes, self_closing) => {
                        Token::StartTag(StartTag {
                            self_closing,
                            name,
                            attributes: attributes
                                .into_iter()
                                .map(|(k, v)| {
                                    (
                                        k,
                                        Attribute {
                                            value: v,
                                            ..Default::default()
                                        },
                                    )
                                })
                                .collect(),
                            name_span: (),
                        })
                    }
                    OutputToken::EndTag(_, name) => Token::EndTag(EndTag {
                        name,
                        name_span: (),
                    }),
                    OutputToken::Comment(_, data) => Token::Comment(data),
                    OutputToken::Character(_, data) => Token::String(data),
                })
                .collect::<Vec<Token<()>>>(),
        ))
    }
}

#[derive(Debug, Deserialize)]
pub enum InitialState {
    #[serde(rename = "Data state")]
    Data,
    #[serde(rename = "PLAINTEXT state")]
    PlainText,
    #[serde(rename = "RCDATA state")]
    RcData,
    #[serde(rename = "RAWTEXT state")]
    RawText,
    #[serde(rename = "Script data state")]
    ScriptData,
    #[serde(rename = "CDATA section state")]
    CdataSection,
}

fn initial_states_default() -> Vec<InitialState> {
    vec![InitialState::Data]
}

#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Test {
    pub description: String,
    pub input: String,
    pub output: ExpectedOutputTokens,
    #[serde(default = "initial_states_default")]
    pub initial_states: Vec<InitialState>,
    #[serde(default)]
    double_escaped: bool,
    #[serde(default)]
    pub last_start_tag: Option<String>,
    #[serde(default)]
    pub errors: Vec<ParseError>,
}

#[derive(Debug, Eq, PartialEq)]
pub struct ParseErrorInner(pub Error);

impl<'de> Deserialize<'de> for ParseErrorInner {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let str_err = String::deserialize(deserializer)?;
        let err: Error = str_err
            .parse()
            .map_err(|_| D::Error::custom(format!("failed to deserialize error: {}", str_err)))?;
        Ok(ParseErrorInner(err))
    }
}

#[derive(Deserialize, Debug, Eq, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct ParseError {
    pub code: ParseErrorInner,
    // TODO: lineno and column?
}

#[derive(Deserialize)]
struct Tests {
    tests: Vec<Test>,
}

fn undo_double_escaping(mut test: Test) -> Test {
    test.input = if test.double_escaped {
        unescape(&test.input)
    } else {
        test.input
    };

    test.output = if test.double_escaped {
        ExpectedOutputTokens(
            test.output
                .0
                .into_iter()
                .map(|token| match token {
                    Token::String(x) => Token::String(unescape(&x)),
                    Token::Comment(x) => Token::Comment(unescape(&x)),
                    token => token,
                })
                .collect(),
        )
    } else {
        ExpectedOutputTokens(test.output.0)
    };
    test
}

/// Implements the escape sequences described in the tokenizer tests of html5lib-tests (and
/// nothing more)
fn unescape(data: &str) -> String {
    let mut stream = data.chars();
    let mut rv = String::new();

    loop {
        // copy characters through verbatim until we hit a backslash
        match stream.next() {
            Some('\\') => (),
            Some(x) => {
                rv.push(x);
                continue;
            }
            None => break,
        }

        // the only escape form the test files use is \uXXXX
        match stream.next() {
            Some('u') => (),
            x => panic!("unexpected escape: {:?}", x),
        }

        let orig_len = rv.len();

        for _ in 0..4 {
            rv.push(match stream.next() {
                Some(x) => x,
                None => panic!("unexpected eof after \\u"),
            });
        }

        let c = u32::from_str_radix(&rv[orig_len..], 16).expect("failed to parse as hex");
        let c = char::from_u32(c).expect("bad character");
        rv.truncate(orig_len);
        rv.push(c);
    }

    rv
}
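
#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sketch of how the helpers above fit together, using a
    // hypothetical inline fixture rather than a real html5lib-tests file:
    // `parse_tests` deserializes the JSON, and `undo_double_escaping`
    // turns the doubly-escaped "\\u000a" sequences back into real
    // newlines in both the input and the expected character tokens.
    #[test]
    fn undoes_double_escaping() {
        let json = br#"{
            "tests": [{
                "description": "double-escaped newline",
                "input": "\\u000a",
                "output": [["Character", "\\u000a"]],
                "doubleEscaped": true
            }]
        }"#;
        let test = parse_tests(&json[..]).unwrap().next().unwrap();
        assert_eq!(test.input, "\n");
        assert!(matches!(
            test.output.0.as_slice(),
            [Token::String(data)] if data == "\n"
        ));
    }
}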