use html5tokenizer::{Attribute, Doctype, EndTag, Error, StartTag, Token};
use serde::{de::Error as _, Deserialize};
use std::collections::BTreeMap;

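/// Parses a tokenizer test file in the JSON format used by the html5lib-tests
/// suite and yields its tests, with any double-escaping already undone.
///
/// A usage sketch (the crate name and the file path here are assumptions, not
/// fixed by this module):
///
/// ```ignore
/// let file = std::fs::File::open("html5lib-tests/tokenizer/test1.test")?;
/// for test in html5lib_tests::parse_tests(file)? {
///     println!("{}", test.description);
/// }
/// ```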
pub fn parse_tests(
    reader: impl std::io::Read,
) -> Result<impl Iterator<Item = Test>, serde_json::Error> {
    let Tests { tests } = serde_json::from_reader(reader)?;
    Ok(tests.into_iter().map(undo_double_escaping))
}

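/// The token sequence a test expects the tokenizer to emit, converted to this
/// crate's [`Token`] type with `()` in place of spans.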
pub struct ExpectedOutputTokens(pub Vec<Token<()>>);

impl<'de> Deserialize<'de> for ExpectedOutputTokens {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        // This macro is an admittedly horrible way to define a type that deserializes only from
        // one particular string. Combined with #[serde(untagged)] below, it gives us flexible
        // enum tagging, at the cost of really terrible error messages.
        macro_rules! def_const {
            ($str:expr, $ty:ident) => {
                #[derive(Deserialize)]
                enum $ty {
                    #[serde(rename = $str)]
                    $ty,
                }
            };
        }

        def_const!("DOCTYPE", DoctypeConst);
        def_const!("StartTag", StartTagConst);
        def_const!("EndTag", EndTagConst);
        def_const!("Comment", CommentConst);
        def_const!("Character", CharacterConst);

        type Attributes = BTreeMap<String, String>;

        #[derive(Deserialize)]
        #[serde(untagged)]
        enum OutputToken {
            // "DOCTYPE", name, public_id, system_id, correctness
            Doctype(
                DoctypeConst,
                Option<String>,
                Option<String>,
                Option<String>,
                bool,
            ),
            // "StartTag", name, attributes, self_closing
            StartTag(StartTagConst, String, Attributes),
            StartTag2(StartTagConst, String, Attributes, bool),
            // "EndTag", name
            EndTag(EndTagConst, String),
            // "Comment", data
            Comment(CommentConst, String),
            // "Character", data
            Character(CharacterConst, String),
        }

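        // Both start tag variants convert their attribute maps the same way, so the
        // conversion is factored out here.
        let into_attrs = |attributes: Attributes| {
            attributes
                .into_iter()
                .map(|(k, v)| {
                    (
                        k,
                        Attribute {
                            value: v,
                            ..Default::default()
                        },
                    )
                })
                .collect()
        };
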
        Ok(ExpectedOutputTokens(
            Vec::deserialize(deserializer)?
                .into_iter()
                .map(|output_token| match output_token {
                    OutputToken::Doctype(
                        _,
                        name,
                        public_identifier,
                        system_identifier,
                        correctness,
                    ) => Token::Doctype(Doctype {
                        name: name.unwrap_or_default(),
                        public_identifier,
                        system_identifier,
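                        // html5lib-tests reports "correctness"; the Doctype token tracks
                        // the inverse, the force-quirks flag.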
                        force_quirks: !correctness,
                    }),
                    OutputToken::StartTag(_, name, attributes) => Token::StartTag(StartTag {
                        self_closing: false,
                        name,
                        attributes: into_attrs(attributes),
                        name_span: (),
                    }),
                    OutputToken::StartTag2(_, name, attributes, self_closing) => {
                        Token::StartTag(StartTag {
                            self_closing,
                            name,
                            attributes: into_attrs(attributes),
                            name_span: (),
                        })
                    }
                    OutputToken::EndTag(_, name) => Token::EndTag(EndTag {
                        name,
                        name_span: (),
                    }),
                    OutputToken::Comment(_, data) => Token::Comment(data),
                    OutputToken::Character(_, data) => Token::String(data),
                })
                .collect::<Vec<Token<()>>>(),
        ))
    }
}

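/// The tokenizer state a test starts in, as spelled in the `initialStates`
/// field of html5lib-tests.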
#[derive(Debug, Deserialize)]
pub enum InitialState {
    #[serde(rename = "Data state")]
    Data,
    #[serde(rename = "PLAINTEXT state")]
    PlainText,
    #[serde(rename = "RCDATA state")]
    RcData,
    #[serde(rename = "RAWTEXT state")]
    RawText,
    #[serde(rename = "Script data state")]
    ScriptData,
    #[serde(rename = "CDATA section state")]
    CdataSection,
}

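/// Serde default for `Test::initial_states`: tests that don't specify
/// `initialStates` run from the Data state only.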
fn initial_states_default() -> Vec<InitialState> {
    vec![InitialState::Data]
}

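/// A single tokenizer test case as it appears in the html5lib-tests JSON.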
#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Test {
    pub description: String,
    pub input: String,
    pub output: ExpectedOutputTokens,
    #[serde(default = "initial_states_default")]
    pub initial_states: Vec<InitialState>,
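    /// Whether `input` and the expected output are `\uXXXX`-escaped a second
    /// time; `parse_tests` undoes this before yielding the test.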
    #[serde(default)]
    double_escaped: bool,
    #[serde(default)]
    pub last_start_tag: Option<String>,
    #[serde(default)]
    pub errors: Vec<ParseError>,
}

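/// A tokenizer [`Error`] wrapped so it can be deserialized from its string
/// error code (e.g. `"eof-in-tag"`).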
#[derive(Debug, Eq, PartialEq)]
pub struct ParseErrorInner(pub Error);

impl<'de> Deserialize<'de> for ParseErrorInner {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let str_err = String::deserialize(deserializer)?;
        let err: Error = str_err
            .parse()
            .map_err(|_| D::Error::custom(&format!("failed to deserialize error: {}", str_err)))?;
        Ok(ParseErrorInner(err))
    }
}

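/// A parse error a test expects, identified by its spec error code.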
#[derive(Deserialize, Debug, Eq, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct ParseError {
    pub code: ParseErrorInner,
    // TODO: lineno and column?
}

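/// The top-level object of a test file: `{ "tests": [...] }`.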
#[derive(Deserialize)]
struct Tests {
    tests: Vec<Test>,
}

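/// Undoes the `\uXXXX` escaping that tests marked `"doubleEscaped": true`
/// apply to their input and to expected character and comment tokens.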
fn undo_double_escaping(mut test: Test) -> Test {
    if test.double_escaped {
        test.input = unescape(&test.input);
        test.output = ExpectedOutputTokens(
            test.output
                .0
                .into_iter()
                .map(|token| match token {
                    Token::String(x) => Token::String(unescape(&x)),
                    Token::Comment(x) => Token::Comment(unescape(&x)),
                    token => token,
                })
                .collect(),
        );
    }
    test
}

/// Decodes the `\uXXXX` escape sequences described in the tokenizer tests of
/// html5lib-tests (and nothing more).
fn unescape(data: &str) -> String {
    let mut stream = data.chars();
    let mut rv = String::new();

    loop {
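        // Copy characters through verbatim until a backslash starts an escape
        // sequence (or the input ends).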
        match stream.next() {
            Some('\\') => (),
            Some(x) => {
                rv.push(x);
                continue;
            }
            None => break,
        }

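        // The only escape form the tests use is \u followed by four hex digits.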
        match stream.next() {
            Some('u') => (),
            x => panic!("unexpected escape: {:?}", x),
        }

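        // Push the four hex digits onto the output so they can be parsed back
        // out of a single slice, then replace them with the decoded character.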
        let orig_len = rv.len();

        for _ in 0..4 {
            rv.push(match stream.next() {
                Some(x) => x,
                None => panic!("unexpected eof after \\u"),
            });
        }

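        // char::from_u32 returns None for lone surrogates (U+D800..=U+DFFF),
        // so those panic here.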
        let c = u32::from_str_radix(&rv[orig_len..], 16).expect("failed to parse as hex");
        let c = char::from_u32(c).expect("bad character");
        rv.truncate(orig_len);
        rv.push(c);
    }

    rv
}