use html5tokenizer::{Attribute, Doctype, EndTag, Error, StartTag, Token};
use serde::{de::Error as _, Deserialize};
use std::collections::BTreeMap;
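
/// Parses a html5lib-tests tokenizer test file (JSON) into an iterator of [`Test`]s, undoing
/// the `doubleEscaped` encoding along the way.
///
/// A minimal usage sketch (marked `ignore` because the import path and the file location
/// depend on the surrounding crate):
///
/// ```ignore
/// let file = std::fs::File::open("html5lib-tests/tokenizer/test1.test")?;
/// for test in parse_tests(file)? {
///     println!("{}: {} expected tokens", test.description, test.output.0.len());
/// }
/// ```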
pub fn parse_tests(
    reader: impl std::io::Read,
) -> Result<impl Iterator<Item = Test>, serde_json::Error> {
    let Tests { tests } = serde_json::from_reader(reader)?;
    Ok(tests.into_iter().map(undo_double_escaping))
}

pub struct ExpectedOutputTokens(pub Vec<Token<()>>);

impl<'de> Deserialize<'de> for ExpectedOutputTokens {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        // This macro is a horrible way to define a type that deserializes only from a
        // particular string. Together with serde(untagged) this gives us really flexible enum
        // tagging with really terrible error messages.
        macro_rules! def_const {
            ($str:expr, $ty:ident) => {
                #[derive(Deserialize)]
                enum $ty {
                    #[serde(rename = $str)]
                    $ty,
                }
            };
        }
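
        // For illustration, def_const!("DOCTYPE", DoctypeConst) expands to roughly:
        //
        //     #[derive(Deserialize)]
        //     enum DoctypeConst {
        //         #[serde(rename = "DOCTYPE")]
        //         DoctypeConst,
        //     }
        //
        // i.e. a single-variant enum that only deserializes from the string "DOCTYPE".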
def_const!("DOCTYPE", DoctypeConst);
def_const!("StartTag", StartTagConst);
def_const!("EndTag", EndTagConst);
def_const!("Comment", CommentConst);
def_const!("Character", CharacterConst);
type Attributes = BTreeMap;
#[derive(Deserialize)]
#[serde(untagged)]
enum OutputToken {
// "DOCTYPE", name, public_id, system_id, correctness
Doctype(
DoctypeConst,
Option,
Option,
Option,
bool,
),
// "StartTag", name, attributes, self_closing
StartTag(StartTagConst, String, Attributes),
StartTag2(StartTagConst, String, Attributes, bool),
// "EndTag", name
EndTag(EndTagConst, String),
// "Comment", data
Comment(CommentConst, String),
// "Character", data
Character(CharacterConst, String),
}
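
        // For example (hand-constructed, not taken from the real suite), the JSON array
        //     ["StartTag", "a", {"href": "x"}]
        // matches OutputToken::StartTag, while
        //     ["StartTag", "br", {}, true]
        // matches OutputToken::StartTag2 with self_closing = true.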

        Ok(ExpectedOutputTokens(
            Vec::deserialize(deserializer)?
                .into_iter()
                .map(|output_token| match output_token {
                    OutputToken::Doctype(
                        _,
                        name,
                        public_identifier,
                        system_identifier,
                        correctness,
                    ) => Token::Doctype(Doctype {
                        name: name.unwrap_or_default(),
                        public_identifier,
                        system_identifier,
                        force_quirks: !correctness,
                    }),
                    OutputToken::StartTag(_, name, attributes) => Token::StartTag(StartTag {
                        self_closing: false,
                        name,
                        attributes: attributes
                            .into_iter()
                            .map(|(k, v)| {
                                (
                                    k,
                                    Attribute {
                                        value: v,
                                        ..Default::default()
                                    },
                                )
                            })
                            .collect(),
                        name_span: (),
                    }),
                    OutputToken::StartTag2(_, name, attributes, self_closing) => {
                        Token::StartTag(StartTag {
                            self_closing,
                            name,
                            attributes: attributes
                                .into_iter()
                                .map(|(k, v)| {
                                    (
                                        k,
                                        Attribute {
                                            value: v,
                                            ..Default::default()
                                        },
                                    )
                                })
                                .collect(),
                            name_span: (),
                        })
                    }
                    OutputToken::EndTag(_, name) => Token::EndTag(EndTag {
                        name,
                        name_span: (),
                    }),
                    OutputToken::Comment(_, data) => Token::Comment(data),
                    OutputToken::Character(_, data) => Token::String(data),
                })
                .collect::<Vec<Token<()>>>(),
        ))
    }
}

#[derive(Debug, Deserialize)]
pub enum InitialState {
    #[serde(rename = "Data state")]
    Data,
    #[serde(rename = "PLAINTEXT state")]
    PlainText,
    #[serde(rename = "RCDATA state")]
    RcData,
    #[serde(rename = "RAWTEXT state")]
    RawText,
    #[serde(rename = "Script data state")]
    ScriptData,
    #[serde(rename = "CDATA section state")]
    CdataSection,
}

fn initial_states_default() -> Vec<InitialState> {
    vec![InitialState::Data]
}
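
// For reference, a test object in the JSON file looks roughly like this (hand-written
// example, not taken from the real suite):
//
//     {
//         "description": "...",
//         "input": "\\u0041",
//         "output": [["Character", "\\u0041"]],
//         "initialStates": ["Data state"],
//         "doubleEscaped": true,
//         "errors": [{"code": "eof-in-tag", "line": 1, "col": 2}]
//     }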
#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Test {
    pub description: String,
    pub input: String,
    pub output: ExpectedOutputTokens,
    #[serde(default = "initial_states_default")]
    pub initial_states: Vec<InitialState>,
    #[serde(default)]
    double_escaped: bool,
    #[serde(default)]
    pub last_start_tag: Option<String>,
    #[serde(default)]
    pub errors: Vec<ParseError>,
}

#[derive(Debug, Eq, PartialEq)]
pub struct ParseErrorInner(pub Error);

impl<'de> Deserialize<'de> for ParseErrorInner {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let str_err = String::deserialize(deserializer)?;
        let err: Error = str_err
            .parse()
            .map_err(|_| D::Error::custom(format!("failed to deserialize error: {}", str_err)))?;
        Ok(ParseErrorInner(err))
    }
}

#[derive(Deserialize, Debug, Eq, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct ParseError {
    pub code: ParseErrorInner,
    // TODO: lineno and column?
}

#[derive(Deserialize)]
struct Tests {
    tests: Vec<Test>,
}

/// Undoes the `doubleEscaped` transport encoding: when it is set, `input` as well as the
/// character and comment data in `output` contain `\uXXXX` escapes that have to be decoded
/// before the test can be run.
fn undo_double_escaping(mut test: Test) -> Test {
    test.input = if test.double_escaped {
        unescape(&test.input)
    } else {
        test.input
    };

    test.output = if test.double_escaped {
        ExpectedOutputTokens(
            test.output
                .0
                .into_iter()
                .map(|token| match token {
                    Token::String(x) => Token::String(unescape(&x)),
                    Token::Comment(x) => Token::Comment(unescape(&x)),
                    token => token,
                })
                .collect(),
        )
    } else {
        ExpectedOutputTokens(test.output.0)
    };
    test
}

/// Implements the escape sequences described in the tokenizer tests of html5lib-tests (and
/// nothing more).
fn unescape(data: &str) -> String {
    let mut stream = data.chars();
    let mut rv = String::new();

    loop {
        // Copy characters verbatim until we hit a backslash (or the end of input).
        match stream.next() {
            Some('\\') => (),
            Some(x) => {
                rv.push(x);
                continue;
            }
            None => break,
        }

        // The only escape the test files use is \uXXXX.
        match stream.next() {
            Some('u') => (),
            x => panic!("unexpected escape: {:?}", x),
        }

        // Collect the four hex digits, then replace them with the decoded character.
        let orig_len = rv.len();
        for _ in 0..4 {
            rv.push(match stream.next() {
                Some(x) => x,
                None => panic!("unexpected eof after \\u"),
            });
        }

        let c = u32::from_str_radix(&rv[orig_len..], 16).expect("failed to parse as hex");
        let c = char::from_u32(c).expect("bad character");
        rv.truncate(orig_len);
        rv.push(c);
    }

    rv
}