diff options
Diffstat (limited to 'src/tokenizer/machine')
| -rw-r--r-- | src/tokenizer/machine/utils.rs | 193 | 
1 files changed, 193 insertions, 0 deletions
| diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs index 7d220cf..6e45f4d 100644 --- a/src/tokenizer/machine/utils.rs +++ b/src/tokenizer/machine/utils.rs @@ -1,3 +1,196 @@ +use crate::{ +    naive_parser::naive_next_state, +    offset::{Offset, Position}, +    reader::Reader, +    Emitter, Error, +}; + +use super::Machine; + +impl<R, O, E> Machine<R, O, E> +where +    R: Reader + Position<O>, +    O: Offset, +    E: Emitter<O>, +{ +    #[inline] +    pub(crate) fn emit_error(&mut self, error: Error) { +        let span = match error { +            Error::EofBeforeTagName +            | Error::EofInCdata +            | Error::EofInComment +            | Error::EofInDoctype +            | Error::EofInScriptHtmlCommentLikeText +            | Error::EofInTag +            | Error::MissingSemicolonAfterCharacterReference => { +                self.reader.position()..self.reader.position() +            } +            Error::AbsenceOfDigitsInNumericCharacterReference +            | Error::NullCharacterReference +            | Error::CharacterReferenceOutsideUnicodeRange +            | Error::SurrogateCharacterReference +            | Error::NoncharacterCharacterReference +            | Error::ControlCharacterReference +            | Error::UnknownNamedCharacterReference => self.some_offset..self.reader.position(), + +            _ => self.position_before_match..self.reader.position(), +        }; +        self.emitter.report_error(error, span); +    } + +    /// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise. +    /// +    /// * the _last start tag_ exists +    /// * the current end tag token's name equals to the last start tag's name. +    /// +    /// See also WHATWG's definition of [appropriate end tag token]. +    /// +    /// [appropriate end tag token]: https://html.spec.whatwg.org/multipage/parsing.html#appropriate-end-tag-token +    #[inline] +    pub(super) fn current_end_tag_is_appropriate(&mut self) -> bool { +        self.current_tag_name == self.last_start_tag_name +    } + +    #[inline] +    pub(super) fn init_start_tag(&mut self) { +        self.emitter +            .init_start_tag(self.some_offset, self.position_before_match); +        self.current_tag_name.clear(); +        self.is_start_tag = true; +    } + +    #[inline] +    pub(super) fn init_end_tag(&mut self) { +        self.emitter +            .init_end_tag(self.some_offset, self.position_before_match); +        self.current_tag_name.clear(); +        self.is_start_tag = false; +    } + +    #[inline] +    pub(super) fn init_doctype(&mut self) { +        self.emitter.init_doctype(self.some_offset); +    } + +    #[inline] +    pub(super) fn push_tag_name(&mut self, s: &str) { +        self.emitter.push_tag_name(s); +        self.current_tag_name.push_str(s); +    } + +    #[inline] +    pub(super) fn emit_current_tag(&mut self) { +        self.emitter.emit_current_tag(self.reader.position()); +        if self.is_start_tag { +            if self.naively_switch_state { +                self.state = naive_next_state(&self.current_tag_name).into(); +            } +            std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name); +        } +    } + +    #[inline] +    pub(super) fn unread_char(&mut self, c: Option<char>) { +        self.to_reconsume.push(c); +    } + +    #[inline] +    fn validate_char(&mut self, c: char) { +        match c as u32 { +            surrogate_pat!() => { +                self.emit_error(Error::SurrogateInInputStream); +            } +            noncharacter_pat!() => { +                self.emit_error(Error::NoncharacterInInputStream); +            } +            // control without whitespace or nul +            x @ control_pat!() +                if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) => +            { +                self.emit_error(Error::ControlCharacterInInputStream); +            } +            _ => (), +        } +    } + +    pub(super) fn read_char(&mut self) -> Result<Option<char>, R::Error> { +        let (c_res, reconsumed) = match self.to_reconsume.pop() { +            Some(c) => (Ok(c), true), +            None => (self.reader.read_char(), false), +        }; + +        let mut c = match c_res { +            Ok(Some(c)) => c, +            res => return res, +        }; + +        if c == '\r' { +            c = '\n'; +            let c2 = self.reader.read_char()?; +            if c2 != Some('\n') { +                self.unread_char(c2); +            } +        } + +        if !reconsumed { +            self.validate_char(c); +        } + +        Ok(Some(c)) +    } + +    #[inline] +    pub(super) fn try_read_string( +        &mut self, +        mut s: &str, +        case_sensitive: bool, +    ) -> Result<bool, R::Error> { +        debug_assert!(!s.is_empty()); + +        let to_reconsume_bak = self.to_reconsume; +        let mut chars = s.chars(); +        while let Some(c) = self.to_reconsume.pop() { +            if let (Some(x), Some(x2)) = (c, chars.next()) { +                if x == x2 || (!case_sensitive && x.to_ascii_lowercase() == x2.to_ascii_lowercase()) +                { +                    s = &s[x.len_utf8()..]; +                    continue; +                } +            } + +            self.to_reconsume = to_reconsume_bak; +            return Ok(false); +        } + +        self.reader.try_read_string(s, case_sensitive) +    } + +    pub(super) fn is_consumed_as_part_of_an_attribute(&self) -> bool { +        matches!( +            self.return_state, +            Some( +                State::AttributeValueDoubleQuoted +                    | State::AttributeValueSingleQuoted +                    | State::AttributeValueUnquoted +            ) +        ) +    } + +    pub(super) fn flush_code_points_consumed_as_character_reference(&mut self) { +        if self.is_consumed_as_part_of_an_attribute() { +            self.emitter.push_attribute_value(&self.temporary_buffer); +            self.temporary_buffer.clear(); +        } else { +            self.flush_buffer_characters(); +        } +    } + +    pub(super) fn flush_buffer_characters(&mut self) { +        self.emitter.emit_string(&self.temporary_buffer); +        self.temporary_buffer.clear(); +    } +} +  macro_rules! surrogate_pat {      () => {          0xd800..=0xdfff | 
