use crate::machine;
use crate::naive_parser::naive_next_state;
use crate::offset::{Offset, Position};
use crate::reader::{IntoReader, Reader};
use crate::utils::{
    control_pat, noncharacter_pat, surrogate_pat, ControlToken, State as InternalState,
};
use crate::{Emitter, Error};

/// A fixed-capacity LIFO stack that stores between zero and two values of type `T`.
///
/// The first tuple element is the bottom of the stack, the optional second element is the top.
#[derive(Debug, Default, Clone, Copy)]
struct Stack2<T: Copy>(Option<(T, Option<T>)>);

impl<T: Copy> Stack2<T> {
    /// Puts `c` on top of the stack.
    ///
    /// # Panics
    ///
    /// Panics if the stack already holds two values.
    #[inline]
    fn push(&mut self, c: T) {
        let slots = match self.0 {
            Some((_, Some(_))) => panic!("stack full!"),
            Some((bottom, None)) => (bottom, Some(c)),
            None => (c, None),
        };
        self.0 = Some(slots);
    }

    /// Removes and returns the most recently pushed value, or `None` if the stack is empty.
    #[inline]
    fn pop(&mut self) -> Option<T> {
        // An empty stack stays empty and yields nothing.
        let (bottom, top) = self.0?;
        match top {
            Some(top) => {
                self.0 = Some((bottom, None));
                Some(top)
            }
            None => {
                self.0 = None;
                Some(bottom)
            }
        }
    }
}

/// An HTML tokenizer.
///
/// Note that for proper HTML parsing, you'll have to implement [tree construction]
/// based on this Tokenizer yourself (since this crate currently does not implement it).
///
/// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
pub struct Tokenizer<R: Reader, O, E: Emitter<O>> {
    /// Set once the state machine has reported the end of the input; from then on the
    /// iterator only drains the emitter and no longer runs the state machine.
    eof: bool,
    /// The state the state machine is currently in.
    pub(crate) state: InternalState,
    /// Receives tag data, strings and errors from the tokenizer and hands out finished tokens.
    pub(crate) emitter: E,
    /// Scratch string that the flush methods hand to the emitter and then clear.
    pub(crate) temporary_buffer: String,
    /// The source that characters are read from.
    pub(crate) reader: R,
    /// Characters that were pushed back; they are returned by `read_char` before anything new
    /// is read from the reader. `None` stands for a pushed-back end of input.
    to_reconsume: Stack2<Option<char>>,
    /// Starts out as `0`.
    /// NOTE(review): presumably accumulates the code point of a numeric character reference;
    /// it is only written outside of this file — confirm against the state machine.
    pub(crate) character_reference_code: u32,
    /// Inspected to decide whether a character reference is consumed as part of an attribute.
    /// NOTE(review): presumably the state to return to after a character reference — confirm.
    pub(crate) return_state: Option<InternalState>,
    /// The tokenizer's own copy of the name of the tag that is currently being built.
    current_tag_name: String,
    /// Name of the most recently emitted start tag; end tag names are compared against it.
    last_start_tag_name: String,
    /// Whether the tag currently being built was initialized as a start tag.
    is_start_tag: bool,
    /// Starts out as `O::default()`.
    /// NOTE(review): presumably the offset of the doctype token being built; it is only
    /// written outside of this file — confirm against the state machine.
    pub(crate) doctype_offset: O,
    /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`]
    /// (it cannot call [`Tokenizer::set_state`] using the emitted start tags since they can be of an arbitrary type).
    pub(crate) naively_switch_state: bool,
}

impl<R: Reader, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
    /// Builds a tokenizer that reads its input from `reader` and reports to `emitter`.
    ///
    /// The tokenizer starts in the data state with nothing buffered. Be aware that parsing
    /// HTML properly on top of this tokenizer requires you to implement [tree construction]
    /// and to call [`Tokenizer::set_state`] accordingly.
    ///
    /// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
    pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self {
        let reader = reader.into_reader();

        Self {
            eof: false,
            state: InternalState::Data,
            emitter,
            temporary_buffer: String::new(),
            reader,
            to_reconsume: Stack2::default(),
            character_reference_code: 0,
            return_state: None,
            current_tag_name: String::new(),
            last_start_tag_name: String::new(),
            is_start_tag: false,
            doctype_offset: O::default(),
            naively_switch_state: false,
        }
    }
}

/// The states you can set the tokenizer to via [`Tokenizer::set_state`].
///
/// Each variant converts into the internal state of the same name.
#[derive(Debug)]
#[non_exhaustive]
pub enum State {
    /// The [data state].
    ///
    /// [data state]: https://html.spec.whatwg.org/multipage/parsing.html#data-state
    Data,
    /// The [PLAINTEXT state].
    ///
    /// [PLAINTEXT state]: https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
    PlainText,
    /// The [RCDATA state].
    ///
    /// [RCDATA state]: https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
    RcData,
    /// The [RAWTEXT state].
    ///
    /// [RAWTEXT state]: https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
    RawText,
    /// The [script data state].
    ///
    /// [script data state]: https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
    ScriptData,
    /// The [script data escaped state].
    ///
    /// [script data escaped state]: https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
    ScriptDataEscaped,
    /// The [script data double escaped state].
    ///
    /// [script data double escaped state]: https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
    ScriptDataDoubleEscaped,
}

impl From<State> for InternalState {
    /// Maps every public [`State`] to the internal state carrying the same name.
    fn from(state: State) -> Self {
        match state {
            State::Data => Self::Data,
            State::PlainText => Self::PlainText,
            State::RcData => Self::RcData,
            State::RawText => Self::RawText,
            State::ScriptData => Self::ScriptData,
            State::ScriptDataEscaped => Self::ScriptDataEscaped,
            State::ScriptDataDoubleEscaped => Self::ScriptDataDoubleEscaped,
        }
    }
}

impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
    /// Test-internal function to override internal state.
    ///
    /// Unlike [`Tokenizer::set_state`], this takes the internal state type directly instead of
    /// the public [`State`] enum and stores it without any conversion.
    ///
    /// Only available with the `integration-tests` feature which is not public API.
    #[cfg(feature = "integration-tests")]
    pub fn set_internal_state(&mut self, state: InternalState) {
        self.state = state;
    }

    /// Set the statemachine to start/continue in the given state.
    pub fn set_state(&mut self, state: State) {
        self.state = state.into();
    }

    /// Helper for the state machine: reports `error` to the emitter, located one position
    /// before the reader's current position.
    #[inline]
    pub(crate) fn emit_error(&mut self, error: Error) {
        let offset = self.reader.position() - 1;
        self.emitter.emit_error(error, offset);
    }

    /// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise.
    ///
    /// * the _last start tag_ exists
    /// * the current end tag token's name equals to the last start tag's name.
    ///
    /// See also WHATWG's definition of [appropriate end tag token].
    ///
    /// [appropriate end tag token]: https://html.spec.whatwg.org/multipage/parsing.html#appropriate-end-tag-token
    #[inline]
    pub(crate) fn current_end_tag_is_appropriate(&mut self) -> bool {
        // The existence of a last start tag is not checked separately: both names are plain
        // strings, so an empty `last_start_tag_name` can only match an empty current tag name.
        self.current_tag_name == self.last_start_tag_name
    }

    /// Begins a new start tag: notifies the emitter (passing the position one before the
    /// reader's current position), resets the tracked tag name and marks the current tag as a
    /// start tag.
    #[inline]
    pub(crate) fn init_start_tag(&mut self) {
        let offset = self.reader.position() - 1;
        self.emitter.init_start_tag(offset);
        self.current_tag_name.clear();
        self.is_start_tag = true;
    }

    /// Begins a new end tag: notifies the emitter (passing the position one before the
    /// reader's current position), resets the tracked tag name and marks the current tag as an
    /// end tag.
    #[inline]
    pub(crate) fn init_end_tag(&mut self) {
        let offset = self.reader.position() - 1;
        self.emitter.init_end_tag(offset);
        self.current_tag_name.clear();
        self.is_start_tag = false;
    }

    /// Appends `s` to the name of the tag currently being built.
    ///
    /// The text is forwarded to the emitter and also appended to the tokenizer's own
    /// `current_tag_name`, which the tokenizer uses for end tag matching and for naive state
    /// switching.
    #[inline]
    pub(crate) fn push_tag_name(&mut self, s: &str) {
        self.emitter.push_tag_name(s);
        self.current_tag_name.push_str(s);
    }

    /// Tells the emitter to emit the tag that is currently being built.
    ///
    /// For a start tag, the tag name is additionally remembered as the last start tag name.
    /// If `naively_switch_state` is set, the tokenizer state is first switched to whatever
    /// `naive_next_state` returns for that name.
    #[inline]
    pub(crate) fn emit_current_tag(&mut self) {
        let offset = self.reader.position() - 1;
        self.emitter.emit_current_tag(offset);

        // End tags neither switch the state nor update the last start tag name.
        if !self.is_start_tag {
            return;
        }

        if self.naively_switch_state {
            self.state = naive_next_state(&self.current_tag_name).into();
        }

        // Swapping avoids copying the name; `current_tag_name` is cleared again by the next
        // `init_start_tag`/`init_end_tag`.
        std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name);
    }

    /// Pushes `c` back so that the next call to `read_char` returns it again.
    ///
    /// `None` may be pushed back as well and stands for the end of the input.
    ///
    /// # Panics
    ///
    /// Panics if two characters are already pending, since the reconsume stack holds at most
    /// two entries.
    #[inline]
    pub(crate) fn unread_char(&mut self, c: Option<char>) {
        self.to_reconsume.push(c);
    }

    #[inline]
    fn validate_char(&mut self, c: char) {
        match c as u32 {
            surrogate_pat!() => {
                self.emit_error(Error::SurrogateInInputStream);
            }
            noncharacter_pat!() => {
                self.emit_error(Error::NoncharacterInInputStream);
            }
            // control without whitespace or nul
            x @ control_pat!()
                if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) =>
            {
                self.emit_error(Error::ControlCharacterInInputStream);
            }
            _ => (),
        }
    }

    /// Reads the next character of the input, returning `Ok(None)` at the end of the input.
    ///
    /// Characters pushed back with `unread_char` are returned before anything new is read
    /// from the reader. A `'\r'` is returned as `'\n'`, and a `'\n'` that directly follows it
    /// in the reader is swallowed. Characters freshly read from the reader are passed to
    /// `validate_char`; characters popped from the reconsume stack are not.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the underlying reader.
    pub(crate) fn read_char(&mut self) -> Result<Option<char>, R::Error> {
        // Pushed-back characters take precedence over fresh input.
        let (c_res, reconsumed) = match self.to_reconsume.pop() {
            Some(c) => (Ok(c), true),
            None => (self.reader.read_char(), false),
        };

        // End of input and reader errors are handed to the caller unchanged.
        let mut c = match c_res {
            Ok(Some(c)) => c,
            res => return res,
        };

        // Newline normalization: "\r\n" and a lone "\r" both become a single "\n".
        if c == '\r' {
            c = '\n';
            let c2 = self.reader.read_char()?;
            if c2 != Some('\n') {
                // Not part of a "\r\n" pair, so it has to be delivered by the next read.
                self.unread_char(c2);
            }
        }

        // NOTE(review): a `c2` pushed back by the branch above is popped with
        // `reconsumed == true` on the next call, so it never reaches `validate_char`.
        // Confirm whether skipping validation for the character after a lone '\r' is intended.
        if !reconsumed {
            self.validate_char(c);
        }

        Ok(Some(c))
    }

    /// Tries to consume the string `s` from the input.
    ///
    /// Characters pending on the reconsume stack are matched first; whatever remains of `s`
    /// is then handed to the reader's `try_read_string`. If `case_sensitive` is `false`,
    /// characters are compared ASCII-case-insensitively. `s` must not be empty.
    ///
    /// Returns `Ok(true)` if all of `s` was matched and consumed. Returns `Ok(false)`
    /// otherwise, in which case the reconsume stack is restored to its previous content
    /// (this relies on the reader consuming nothing when it reports a failed match).
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the underlying reader.
    #[inline]
    pub(crate) fn try_read_string(
        &mut self,
        mut s: &str,
        case_sensitive: bool,
    ) -> Result<bool, R::Error> {
        debug_assert!(!s.is_empty());

        let to_reconsume_bak = self.to_reconsume;

        // Match pending characters first. Stop as soon as `s` is exhausted so that surplus
        // pending characters stay on the stack instead of turning a full match into a failure.
        while let Some(expected) = s.chars().next() {
            let pending = match self.to_reconsume.pop() {
                Some(pending) => pending,
                None => break,
            };

            // A pending `None` is a pushed-back end of input and can never match.
            let is_match = pending.map_or(false, |actual| {
                actual == expected
                    || (!case_sensitive
                        && actual.to_ascii_lowercase() == expected.to_ascii_lowercase())
            });
            if !is_match {
                self.to_reconsume = to_reconsume_bak;
                return Ok(false);
            }

            s = &s[expected.len_utf8()..];
        }

        // The pending characters covered all of `s`; there is nothing left for the reader.
        if s.is_empty() {
            return Ok(true);
        }

        let matched = self.reader.try_read_string(s, case_sensitive)?;
        if !matched {
            // Put back the pending characters that matched a prefix of the original string,
            // otherwise they would be lost even though nothing was consumed.
            self.to_reconsume = to_reconsume_bak;
        }
        Ok(matched)
    }

    /// Returns `true` if `return_state` is one of the three attribute value states
    /// (double-quoted, single-quoted or unquoted), and `false` otherwise, including when no
    /// return state is set.
    pub(crate) fn is_consumed_as_part_of_an_attribute(&self) -> bool {
        matches!(
            self.return_state,
            Some(
                InternalState::AttributeValueDoubleQuoted
                    | InternalState::AttributeValueSingleQuoted
                    | InternalState::AttributeValueUnquoted
            )
        )
    }

    /// Hands the contents of the temporary buffer to the emitter and empties the buffer.
    ///
    /// If the character reference is consumed as part of an attribute, the contents are
    /// appended to the current attribute value; otherwise they are emitted as a string.
    pub(crate) fn flush_code_points_consumed_as_character_reference(&mut self) {
        if !self.is_consumed_as_part_of_an_attribute() {
            self.flush_buffer_characters();
            return;
        }

        self.emitter.push_attribute_value(&self.temporary_buffer);
        self.temporary_buffer.clear();
    }

    /// Peeks at the next character: reads it and immediately pushes it back, so that the
    /// following read returns the same character (or the same end of input) again.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the underlying reader; nothing is pushed back then.
    pub(crate) fn next_input_character(&mut self) -> Result<Option<char>, R::Error> {
        self.read_char().map(|peeked| {
            self.unread_char(peeked);
            peeked
        })
    }

    /// Emits the contents of the temporary buffer as a string via the emitter and then
    /// empties the buffer.
    pub(crate) fn flush_buffer_characters(&mut self) {
        self.emitter.emit_string(&self.temporary_buffer);
        self.temporary_buffer.clear();
    }
}

impl<O, R, E> Iterator for Tokenizer<R, O, E>
where
    O: Offset,
    R: Reader + Position<O>,
    E: Emitter<O>,
{
    type Item = Result<E::Token, R::Error>;

    /// Yields the next token, running the state machine until the emitter has one ready.
    ///
    /// A reader error is yielded as `Some(Err(_))`. When the state machine reports the end of
    /// the input, the emitter is told so exactly once; afterwards the remaining tokens are
    /// drained and the iterator ends with `None`.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // Tokens that are already finished always go out first.
            if let Some(token) = self.emitter.pop_token() {
                return Some(Ok(token));
            }

            if self.eof {
                return None;
            }

            match machine::consume(self) {
                Err(e) => return Some(Err(e)),
                Ok(ControlToken::Eof) => {
                    self.eof = true;
                    self.emitter.emit_eof();
                }
                Ok(ControlToken::Continue) => {}
            }
        }
    }
}

impl<R: Reader, O, E: Emitter<O>> Tokenizer<R, O, E> {
    /// Test-internal function to override the name of the last start tag.
    ///
    /// The tokenizer compares end tag names against this name.
    ///
    /// Only available with the `integration-tests` feature which is not public API.
    #[cfg(feature = "integration-tests")]
    pub fn set_last_start_tag(&mut self, last_start_tag: &str) {
        // Clear and append in place rather than assigning a newly allocated `String`.
        self.last_start_tag_name.clear();
        self.last_start_tag_name.push_str(last_start_tag);
    }
}