// Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use super::{TokenSink, Tokenizer};
use crate::util::buffer_queue::BufferQueue;
use crate::util::str::is_ascii_alnum;

use log::debug;
use std::borrow::Cow::Borrowed;
use std::char::from_u32;

use self::State::*;
pub use self::Status::*;

//ยง tokenizing-character-references
pub struct CharRef {
    /// The resulting character(s)
    pub chars: [char; 2],

    /// How many slots in `chars` are valid?
    pub num_chars: u8,
}

pub enum Status {
    Stuck,
    Progress,
    Done,
}

#[derive(Debug)]
enum State {
    Begin,
    Octothorpe,
    Numeric(u32), // base
    NumericSemicolon,
    Named,
    BogusName,
}

pub struct CharRefTokenizer {
    state: State,
    addnl_allowed: Option<char>,
    result: Option<CharRef>,

    num: u32,
    num_too_big: bool,
    seen_digit: bool,
    hex_marker: Option<char>,

    name_buf_opt: Option<String>,
    name_match: Option<(u32, u32)>,
    name_len: usize,
}

impl CharRefTokenizer {
    // NB: We assume that we have an additional allowed character iff we're
    // tokenizing in an attribute value.
    pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
        CharRefTokenizer {
            state: Begin,
            addnl_allowed,
            result: None,
            num: 0,
            num_too_big: false,
            seen_digit: false,
            hex_marker: None,
            name_buf_opt: None,
            name_match: None,
            name_len: 0,
        }
    }

    // A CharRefTokenizer can only tokenize one character reference,
    // so this method consumes the tokenizer.
    pub fn get_result(self) -> CharRef {
        self.result.expect("get_result called before done")
    }

    fn name_buf(&self) -> &str {
        self.name_buf_opt
            .as_ref()
            .expect("name_buf missing in named character reference")
    }

    fn name_buf_mut(&mut self) -> &mut String {
        self.name_buf_opt
            .as_mut()
            .expect("name_buf missing in named character reference")
    }

    fn finish_none(&mut self) -> Status {
        self.result = Some(CharRef {
            chars: ['\0', '\0'],
            num_chars: 0,
        });
        Done
    }

    fn finish_one(&mut self, c: char) -> Status {
        self.result = Some(CharRef {
            chars: [c, '\0'],
            num_chars: 1,
        });
        Done
    }
}

/// The spec replaces most characters in the ISO-2022 C1 control code range
/// (U+0080 through U+009F) with these characters, based on Windows 8-bit
/// codepages.
pub static C1_REPLACEMENTS: [Option<char>; 32] = [
    Some('\u{20ac}'),
    None,
    Some('\u{201a}'),
    Some('\u{0192}'),
    Some('\u{201e}'),
    Some('\u{2026}'),
    Some('\u{2020}'),
    Some('\u{2021}'),
    Some('\u{02c6}'),
    Some('\u{2030}'),
    Some('\u{0160}'),
    Some('\u{2039}'),
    Some('\u{0152}'),
    None,
    Some('\u{017d}'),
    None,
    None,
    Some('\u{2018}'),
    Some('\u{2019}'),
    Some('\u{201c}'),
    Some('\u{201d}'),
    Some('\u{2022}'),
    Some('\u{2013}'),
    Some('\u{2014}'),
    Some('\u{02dc}'),
    Some('\u{2122}'),
    Some('\u{0161}'),
    Some('\u{203a}'),
    Some('\u{0153}'),
    None,
    Some('\u{017e}'),
    Some('\u{0178}'),
];

impl CharRefTokenizer {
    pub fn step<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut Tokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        if self.result.is_some() {
            return Done;
        }

        debug!("char ref tokenizer stepping in state {:?}", self.state);
        match self.state {
            Begin => self.do_begin(tokenizer, input),
            Octothorpe => self.do_octothorpe(tokenizer, input),
            Numeric(base) => self.do_numeric(tokenizer, input, base),
            NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
            Named => self.do_named(tokenizer, input),
            BogusName => self.do_bogus_name(tokenizer, input),
        }
    }

    fn do_begin<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut Tokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        match unwrap_or_return!(tokenizer.peek(input), Stuck) {
            '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(),
            c if Some(c) == self.addnl_allowed => self.finish_none(),

            '#' => {
                tokenizer.discard_char(input);
                self.state = Octothorpe;
                Progress
            },

            _ => {
                self.state = Named;
                self.name_buf_opt = Some(String::new());
                Progress
            },
        }
    }

    fn do_octothorpe<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut Tokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        match c {
            'x' | 'X' => {
                tokenizer.discard_char(input);
                self.hex_marker = Some(c);
                self.state = Numeric(16);
            },

            _ => {
                self.hex_marker = None;
                self.state = Numeric(10);
            },
        }
        Progress
    }

    fn do_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut Tokenizer<Sink>,
        input: &mut BufferQueue,
        base: u32,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        match c.to_digit(base) {
            Some(n) => {
                tokenizer.discard_char(input);
                self.num = self.num.wrapping_mul(base);
                if self.num > 0x10FFFF {
                    // We might overflow, and the character is definitely invalid.
                    // We still parse digits and semicolon, but don't use the result.
                    self.num_too_big = true;
                }
                self.num = self.num.wrapping_add(n);
                self.seen_digit = true;
                Progress
            },

            None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),

            None => {
                self.state = NumericSemicolon;
                Progress
            },
        }
    }

    fn do_numeric_semicolon<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut Tokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        match unwrap_or_return!(tokenizer.peek(input), Stuck) {
            ';' => tokenizer.discard_char(input),
            _ => tokenizer.emit_error(Borrowed(
                "Semicolon missing after numeric character reference",
            )),
        };
        self.finish_numeric(tokenizer)
    }

    fn unconsume_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut Tokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        let mut unconsume = String::from('#');
        match self.hex_marker {
            Some(c) => unconsume.push(c),
            None => (),
        }

        input.push_front(unconsume);
        tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
        self.finish_none()
    }

    fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status {
        fn conv(n: u32) -> char {
            from_u32(n).expect("invalid char missed by error handling cases")
        }

        let (c, error) = match self.num {
            n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
            0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),

            0x80..=0x9F => match C1_REPLACEMENTS[(self.num - 0x80) as usize] {
                Some(c) => (c, true),
                None => (conv(self.num), true),
            },

            0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),

            n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),

            n => (conv(n), false),
        };

        if error {
            let msg = format_if!(
                tokenizer.opts.exact_errors,
                "Invalid numeric character reference",
                "Invalid numeric character reference value 0x{:06X}",
                self.num
            );
            tokenizer.emit_error(msg);
        }

        self.finish_one(c)
    }

    fn do_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut Tokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
        self.name_buf_mut().push(c);
        self.finish_named(tokenizer, input, Some(c))
    }

    fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) {
        let msg = format_if!(
            tokenizer.opts.exact_errors,
            "Invalid character reference",
            "Invalid character reference &{}",
            self.name_buf()
        );
        tokenizer.emit_error(msg);
    }

    fn unconsume_name(&mut self, input: &mut BufferQueue) {
        input.push_front(self.name_buf_opt.take().unwrap());
    }

    fn finish_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut Tokenizer<Sink>,
        input: &mut BufferQueue,
        end_char: Option<char>,
    ) -> Status {
        match self.name_match {
            None => {
                match end_char {
                    Some(c) if is_ascii_alnum(c) => {
                        // Keep looking for a semicolon, to determine whether
                        // we emit a parse error.
                        self.state = BogusName;
                        return Progress;
                    },

                    // Check length because &; is not a parse error.
                    Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),

                    _ => (),
                }
                self.unconsume_name(input);
                self.finish_none()
            },

            Some((c1, c2)) => {
                // We have a complete match, but we may have consumed
                // additional characters into self.name_buf.  Usually
                // at least one, but several in cases like
                //
                //     &not    => match for U+00AC
                //     &noti   => valid prefix for &notin
                //     &notit  => can't continue match

                let name_len = self.name_len;
                assert!(name_len > 0);
                let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();

                // There might not be a next character after the match, if
                // we had a full match and then hit EOF.
                let next_after = if name_len == self.name_buf().len() {
                    None
                } else {
                    Some(self.name_buf()[name_len..].chars().next().unwrap())
                };

                // "If the character reference is being consumed as part of an
                // attribute, and the last character matched is not a U+003B
                // SEMICOLON character (;), and the next character is either a
                // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
                // character, then, for historical reasons, all the characters
                // that were matched after the U+0026 AMPERSAND character (&)
                // must be unconsumed, and nothing is returned. However, if
                // this next character is in fact a U+003D EQUALS SIGN
                // character (=), then this is a parse error"

                let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
                    (_, ';', _) => false,
                    (Some(_), _, Some('=')) => {
                        tokenizer.emit_error(Borrowed(
                            "Equals sign after character reference in attribute",
                        ));
                        true
                    },
                    (Some(_), _, Some(c)) if is_ascii_alnum(c) => true,
                    _ => {
                        tokenizer.emit_error(Borrowed(
                            "Character reference does not end with semicolon",
                        ));
                        false
                    },
                };

                if unconsume_all {
                    self.unconsume_name(input);
                    self.finish_none()
                } else {
                    input.push_front(String::from(&self.name_buf()[name_len..]));
                    self.result = Some(CharRef {
                        chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
                        num_chars: if c2 == 0 { 1 } else { 2 },
                    });
                    Done
                }
            },
        }
    }

    fn do_bogus_name<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut Tokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
        self.name_buf_mut().push(c);
        match c {
            _ if is_ascii_alnum(c) => return Progress,
            ';' => self.emit_name_error(tokenizer),
            _ => (),
        }
        self.unconsume_name(input);
        self.finish_none()
    }

    pub fn end_of_file<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut Tokenizer<Sink>,
        input: &mut BufferQueue,
    ) {
        while self.result.is_none() {
            match self.state {
                Begin => drop(self.finish_none()),

                Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),

                Numeric(_) | NumericSemicolon => {
                    tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
                    self.finish_numeric(tokenizer);
                },

                Named => drop(self.finish_named(tokenizer, input, None)),

                BogusName => {
                    self.unconsume_name(input);
                    self.finish_none();
                },

                Octothorpe => {
                    input.push_front(String::from("#"));
                    tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
                    self.finish_none();
                },
            }
        }
    }
}