use serde::{de, Deserialize}; use std::collections::BTreeMap; pub fn parse_tests( reader: impl std::io::Read, ) -> Result, serde_json::Error> { let Tests { tests } = serde_json::from_reader(reader)?; Ok(tests.into_iter().map(undo_double_escaping)) } #[derive(Debug, Deserialize)] pub enum InitialState { #[serde(rename = "Data state")] Data, #[serde(rename = "PLAINTEXT state")] PlainText, #[serde(rename = "RCDATA state")] RcData, #[serde(rename = "RAWTEXT state")] RawText, #[serde(rename = "Script data state")] ScriptData, #[serde(rename = "CDATA section state")] CdataSection, } fn initial_states_default() -> Vec { vec![InitialState::Data] } #[derive(Deserialize)] #[serde(rename_all = "camelCase")] pub struct Test { pub description: String, pub input: String, #[serde(default = "initial_states_default")] pub initial_states: Vec, #[serde(flatten)] pub output: Output, #[serde(default)] double_escaped: bool, #[serde(default)] pub last_start_tag: Option, } #[derive(Deserialize, PartialEq, Eq, Debug)] pub struct Output { #[serde(default)] pub errors: Vec, #[serde(rename = "output")] pub tokens: Vec, } #[derive(Debug, PartialEq, Eq)] pub enum Token { Doctype { name: Option, public_id: Option, system_id: Option, force_quirks: bool, }, StartTag { name: String, attributes: BTreeMap, self_closing: bool, }, EndTag { name: String, }, Comment(String), Character(String), } impl<'de> Deserialize<'de> for Token { fn deserialize(deserializer: D) -> Result where D: serde::Deserializer<'de>, { deserializer.deserialize_seq(TokenVisitor) } } #[derive(Deserialize)] enum TokenType { #[serde(rename = "DOCTYPE")] Doctype, StartTag, EndTag, Comment, Character, } struct TokenVisitor; impl<'de> de::Visitor<'de> for TokenVisitor { type Value = Token; fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { formatter.write_str("an array describing a token") } fn visit_seq(self, mut seq: A) -> Result where A: serde::de::SeqAccess<'de>, { let typ: TokenType = seq.next_element()?.ok_or( de::Error::custom( r#"expected first array element to be one of "DOCTYPE", "StartTag", "EndTag", "Comment" or "Character""#, ) )?; Ok(match typ { TokenType::Doctype => Token::Doctype { name: seq .next_element()? .ok_or(de::Error::missing_field("name"))?, public_id: seq .next_element()? .ok_or(de::Error::missing_field("public_id"))?, system_id: seq .next_element()? .ok_or(de::Error::missing_field("system_id"))?, force_quirks: !seq .next_element()? .ok_or(de::Error::missing_field("correctness"))?, }, TokenType::StartTag => Token::StartTag { name: seq .next_element()? .ok_or(de::Error::missing_field("name"))?, attributes: seq .next_element()? .ok_or(de::Error::missing_field("attributes"))?, self_closing: seq.next_element()?.unwrap_or_default(), }, TokenType::EndTag => Token::EndTag { name: seq .next_element()? .ok_or(de::Error::missing_field("name"))?, }, TokenType::Comment => Token::Comment( seq.next_element()? .ok_or(de::Error::missing_field("data"))?, ), TokenType::Character => Token::Character( seq.next_element()? .ok_or(de::Error::missing_field("data"))?, ), }) } } #[derive(Deserialize, Debug, Eq, PartialEq)] #[serde(rename_all = "camelCase")] pub struct Error { pub code: String, // TODO: lineno and column? } #[derive(Deserialize)] struct Tests { tests: Vec, } fn undo_double_escaping(mut test: Test) -> Test { if test.double_escaped { test.input = unescape(&test.input); test.output.tokens = test .output .tokens .into_iter() .map(|token| match token { Token::Character(x) => Token::Character(unescape(&x)), Token::Comment(x) => Token::Comment(unescape(&x)), token => token, }) .collect(); } test } /// Implements the escape sequences described in the tokenizer tests of html5lib-tests (and nothing /// more) fn unescape(data: &str) -> String { let mut stream = data.chars(); let mut rv = String::new(); loop { match stream.next() { Some('\\') => (), Some(x) => { rv.push(x); continue; } None => break, } match stream.next() { Some('u') => (), x => panic!("unexpected escape: {:?}", x), } let orig_len = rv.len(); for _ in 0..4 { rv.push(match stream.next() { Some(x) => x, None => panic!("unexpected eof after \\u"), }); } let c = u32::from_str_radix(&rv[orig_len..], 16).expect("failed to parse as hex"); let c = char::from_u32(c).expect("bad character"); rv.truncate(orig_len); rv.push(c); } rv }