diff options
Diffstat (limited to 'html5lib_tests')
| -rw-r--r-- | html5lib_tests/Cargo.toml | 14 | ||||
| -rw-r--r-- | html5lib_tests/src/lib.rs | 252 | ||||
2 files changed, 266 insertions, 0 deletions
| diff --git a/html5lib_tests/Cargo.toml b/html5lib_tests/Cargo.toml new file mode 100644 index 0000000..06fc6b8 --- /dev/null +++ b/html5lib_tests/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "html5lib_tests" +authors = ["Martin Fischer <martin@push-f.com>", "Markus Unterwaditzer <markus-honeypot@unterwaditzer.net>"] +description = "Deserializable types for the .test files from html5lib-tests." +version = "0.0.0" +edition = "2021" +publish = false # prevent accidental publishes until it's ready to be published + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +serde = { version = "1.0.130", features = ["derive"] } +serde_json = "1.0.71" +html5tokenizer = { path = ".." } # TODO: get rid of this dependency diff --git a/html5lib_tests/src/lib.rs b/html5lib_tests/src/lib.rs new file mode 100644 index 0000000..5678b0d --- /dev/null +++ b/html5lib_tests/src/lib.rs @@ -0,0 +1,252 @@ +use html5tokenizer::{Attribute, Doctype, EndTag, Error, StartTag, Token}; +use serde::{de::Error as _, Deserialize}; +use std::collections::BTreeMap; + +pub fn parse_tests( +    reader: impl std::io::Read, +) -> Result<impl Iterator<Item = Test>, serde_json::Error> { +    let Tests { tests } = serde_json::from_reader(reader)?; +    Ok(tests.into_iter().map(undo_double_escaping)) +} + +pub struct ExpectedOutputTokens(pub Vec<Token<()>>); + +impl<'de> Deserialize<'de> for ExpectedOutputTokens { +    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> +    where +        D: serde::Deserializer<'de>, +    { +        // this macro is a horrible way to define a type that deserializes only from a particular +        // string. Together with serde(untagged) this gives us really flexible enum tagging with really +        // terrible error messages. +        macro_rules! 
def_const { +            ($str:expr, $ty:ident) => { +                #[derive(Deserialize)] +                enum $ty { +                    #[serde(rename = $str)] +                    $ty, +                } +            }; +        } + +        def_const!("DOCTYPE", DoctypeConst); +        def_const!("StartTag", StartTagConst); +        def_const!("EndTag", EndTagConst); +        def_const!("Comment", CommentConst); +        def_const!("Character", CharacterConst); + +        type Attributes = BTreeMap<String, String>; + +        #[derive(Deserialize)] +        #[serde(untagged)] +        enum OutputToken { +            // "DOCTYPE", name, public_id, system_id, correctness +            Doctype( +                DoctypeConst, +                Option<String>, +                Option<String>, +                Option<String>, +                bool, +            ), +            // "StartTag", name, attributes, self_closing +            StartTag(StartTagConst, String, Attributes), +            StartTag2(StartTagConst, String, Attributes, bool), +            // "EndTag", name +            EndTag(EndTagConst, String), +            // "Comment", data +            Comment(CommentConst, String), +            // "Character", data +            Character(CharacterConst, String), +        } + +        Ok(ExpectedOutputTokens( +            Vec::deserialize(deserializer)? 
+                .into_iter() +                .map(|output_token| match output_token { +                    OutputToken::Doctype( +                        _, +                        name, +                        public_identifier, +                        system_identifier, +                        correctness, +                    ) => Token::Doctype(Doctype { +                        name: name.unwrap_or_default(), +                        public_identifier, +                        system_identifier, +                        force_quirks: !correctness, +                    }), +                    OutputToken::StartTag(_, name, attributes) => Token::StartTag(StartTag { +                        self_closing: false, +                        name, +                        attributes: attributes +                            .into_iter() +                            .map(|(k, v)| { +                                ( +                                    k, +                                    Attribute { +                                        value: v, +                                        ..Default::default() +                                    }, +                                ) +                            }) +                            .collect(), +                        name_span: (), +                    }), +                    OutputToken::StartTag2(_, name, attributes, self_closing) => { +                        Token::StartTag(StartTag { +                            self_closing, +                            name, +                            attributes: attributes +                                .into_iter() +                                .map(|(k, v)| { +                                    ( +                                        k, +                                        Attribute { +                                            value: v, +                                            ..Default::default() +                                   
     }, +                                    ) +                                }) +                                .collect(), +                            name_span: (), +                        }) +                    } +                    OutputToken::EndTag(_, name) => Token::EndTag(EndTag { +                        name, +                        name_span: (), +                    }), +                    OutputToken::Comment(_, data) => Token::Comment(data), +                    OutputToken::Character(_, data) => Token::String(data), +                }) +                .collect::<Vec<Token<()>>>(), +        )) +    } +} + +#[derive(Debug, Deserialize)] +pub enum InitialState { +    #[serde(rename = "Data state")] +    Data, +    #[serde(rename = "PLAINTEXT state")] +    PlainText, +    #[serde(rename = "RCDATA state")] +    RcData, +    #[serde(rename = "RAWTEXT state")] +    RawText, +    #[serde(rename = "Script data state")] +    ScriptData, +    #[serde(rename = "CDATA section state")] +    CdataSection, +} + +fn initial_states_default() -> Vec<InitialState> { +    vec![InitialState::Data] +} + +#[derive(Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct Test { +    pub description: String, +    pub input: String, +    pub output: ExpectedOutputTokens, +    #[serde(default = "initial_states_default")] +    pub initial_states: Vec<InitialState>, +    #[serde(default)] +    double_escaped: bool, +    #[serde(default)] +    pub last_start_tag: Option<String>, +    #[serde(default)] +    pub errors: Vec<ParseError>, +} + +#[derive(Debug, Eq, PartialEq)] +pub struct ParseErrorInner(pub Error); + +impl<'de> Deserialize<'de> for ParseErrorInner { +    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> +    where +        D: serde::Deserializer<'de>, +    { +        let str_err = String::deserialize(deserializer)?; +        let err: Error = str_err +            .parse() +            .map_err(|_| D::Error::custom(&format!("failed to 
deserialize error: {}", str_err)))?; +        Ok(ParseErrorInner(err)) +    } +} + +#[derive(Deserialize, Debug, Eq, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct ParseError { +    pub code: ParseErrorInner, +    // TODO: lineno and column? +} + +#[derive(Deserialize)] +struct Tests { +    tests: Vec<Test>, +} + +fn undo_double_escaping(mut test: Test) -> Test { +    test.input = if test.double_escaped { +        unescape(&test.input) +    } else { +        test.input +    }; + +    test.output = if test.double_escaped { +        ExpectedOutputTokens( +            test.output +                .0 +                .into_iter() +                .map(|token| match token { +                    Token::String(x) => Token::String(unescape(&x)), +                    Token::Comment(x) => Token::Comment(unescape(&x)), +                    token => token, +                }) +                .collect(), +        ) +    } else { +        ExpectedOutputTokens(test.output.0) +    }; +    test +} + +/// Implements the escape sequences described in the tokenizer tests of html5lib-tests (and nothing +/// more) +fn unescape(data: &str) -> String { +    let mut stream = data.chars(); +    let mut rv = String::new(); + +    loop { +        match stream.next() { +            Some('\\') => (), +            Some(x) => { +                rv.push(x); +                continue; +            } +            None => break, +        } + +        match stream.next() { +            Some('u') => (), +            x => panic!("unexpected escape: {:?}", x), +        } + +        let orig_len = rv.len(); + +        for _ in 0..4 { +            rv.push(match stream.next() { +                Some(x) => x, +                None => panic!("unexpected eof after \\u"), +            }); +        } + +        let c = u32::from_str_radix(&rv[orig_len..], 16).expect("failed to parse as hex"); +        let c = char::from_u32(c).expect("bad character"); +        rv.truncate(orig_len); +        
rv.push(c); +    } + +    rv +} | 
