use crate::machine; use crate::naive_parser::naive_next_state; use crate::offset::{Offset, Position}; use crate::reader::{IntoReader, Reader}; use crate::utils::{ control_pat, noncharacter_pat, surrogate_pat, ControlToken, State as InternalState, }; use crate::{Emitter, Error}; // this is a stack that can hold 0 to 2 Ts #[derive(Debug, Default, Clone, Copy)] struct Stack2(Option<(T, Option)>); impl Stack2 { #[inline] fn push(&mut self, c: T) { self.0 = match self.0 { None => Some((c, None)), Some((c1, None)) => Some((c1, Some(c))), Some((_c1, Some(_c2))) => panic!("stack full!"), } } #[inline] fn pop(&mut self) -> Option { let (new_self, rv) = match self.0 { Some((c1, Some(c2))) => (Some((c1, None)), Some(c2)), Some((c1, None)) => (None, Some(c1)), None => (None, None), }; self.0 = new_self; rv } } /// An HTML tokenizer. /// /// Note that for proper HTML parsing, you'll have to implement [tree construction] /// based on this Tokenizer yourself (since this crate currently does not implement it). /// /// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction pub struct Tokenizer> { eof: bool, pub(crate) state: InternalState, pub(crate) emitter: E, pub(crate) temporary_buffer: String, pub(crate) reader: R, to_reconsume: Stack2>, pub(crate) character_reference_code: u32, pub(crate) return_state: Option, current_tag_name: String, last_start_tag_name: String, is_start_tag: bool, pub(crate) doctype_offset: O, /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`] /// (it cannot call [`Tokenizer::set_state`] using the emitted start tags since they can be of an arbitrary type). pub(crate) naively_switch_state: bool, } impl, O: Offset, E: Emitter> Tokenizer { /// Creates a new tokenizer from some input and an emitter. /// /// Note that properly parsing HTML with this tokenizer requires you to /// implement [tree construction] and call [`Tokenizer::set_state`] accordingly. /// /// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self { Tokenizer { reader: reader.into_reader(), emitter, state: InternalState::Data, to_reconsume: Stack2::default(), return_state: None, temporary_buffer: String::new(), character_reference_code: 0, eof: false, current_tag_name: String::new(), last_start_tag_name: String::new(), is_start_tag: false, doctype_offset: O::default(), naively_switch_state: false, } } } /// The states you can set the tokenizer to. #[derive(Debug)] #[non_exhaustive] pub enum State { /// The [data state]. /// /// [data state]: https://html.spec.whatwg.org/multipage/parsing.html#data-state Data, /// The [PLAINTEXT state]. /// /// [PLAINTEXT state]: https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state PlainText, /// The [RCDATA state]. /// /// [RCDATA state]: https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state RcData, /// The [RAWTEXT state]. /// /// [RAWTEXT state]: https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state RawText, /// The [script data state]. /// /// [script data state]: https://html.spec.whatwg.org/multipage/parsing.html#script-data-state ScriptData, /// The [script data escaped state]. /// /// [script data escaped state]: https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state ScriptDataEscaped, /// The [script data double escaped state]. /// /// [script data double escaped state]: https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state ScriptDataDoubleEscaped, } impl From for InternalState { fn from(state: State) -> Self { match state { State::Data => InternalState::Data, State::PlainText => InternalState::PlainText, State::RcData => InternalState::RcData, State::RawText => InternalState::RawText, State::ScriptData => InternalState::ScriptData, State::ScriptDataEscaped => InternalState::ScriptDataEscaped, State::ScriptDataDoubleEscaped => InternalState::ScriptDataDoubleEscaped, } } } impl, O: Offset, E: Emitter> Tokenizer { /// Test-internal function to override internal state. /// /// Only available with the `integration-tests` feature which is not public API. #[cfg(feature = "integration-tests")] pub fn set_internal_state(&mut self, state: InternalState) { self.state = state; } /// Set the statemachine to start/continue in the given state. pub fn set_state(&mut self, state: State) { self.state = state.into(); } /// Just a helper method for the machine. #[inline] pub(crate) fn emit_error(&mut self, error: Error) { self.emitter.emit_error(error, self.reader.position() - 1); } /// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise. /// /// * the _last start tag_ exists /// * the current end tag token's name equals to the last start tag's name. /// /// See also WHATWG's definition of [appropriate end tag token]. /// /// [appropriate end tag token]: https://html.spec.whatwg.org/multipage/parsing.html#appropriate-end-tag-token #[inline] pub(crate) fn current_end_tag_is_appropriate(&mut self) -> bool { self.current_tag_name == self.last_start_tag_name } #[inline] pub(crate) fn init_start_tag(&mut self) { self.emitter.init_start_tag(self.reader.position() - 1); self.current_tag_name.clear(); self.is_start_tag = true; } #[inline] pub(crate) fn init_end_tag(&mut self) { self.emitter.init_end_tag(self.reader.position() - 1); self.current_tag_name.clear(); self.is_start_tag = false; } #[inline] pub(crate) fn push_tag_name(&mut self, s: &str) { self.emitter.push_tag_name(s); self.current_tag_name.push_str(s); } #[inline] pub(crate) fn emit_current_tag(&mut self) { self.emitter.emit_current_tag(self.reader.position() - 1); if self.is_start_tag { if self.naively_switch_state { self.state = naive_next_state(&self.current_tag_name).into(); } std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name); } } #[inline] pub(crate) fn unread_char(&mut self, c: Option) { self.to_reconsume.push(c); } #[inline] fn validate_char(&mut self, c: char) { match c as u32 { surrogate_pat!() => { self.emit_error(Error::SurrogateInInputStream); } noncharacter_pat!() => { self.emit_error(Error::NoncharacterInInputStream); } // control without whitespace or nul x @ control_pat!() if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) => { self.emit_error(Error::ControlCharacterInInputStream); } _ => (), } } pub(crate) fn read_char(&mut self) -> Result, R::Error> { let (c_res, reconsumed) = match self.to_reconsume.pop() { Some(c) => (Ok(c), true), None => (self.reader.read_char(), false), }; let mut c = match c_res { Ok(Some(c)) => c, res => return res, }; if c == '\r' { c = '\n'; let c2 = self.reader.read_char()?; if c2 != Some('\n') { self.unread_char(c2); } } if !reconsumed { self.validate_char(c); } Ok(Some(c)) } #[inline] pub(crate) fn try_read_string( &mut self, mut s: &str, case_sensitive: bool, ) -> Result { debug_assert!(!s.is_empty()); let to_reconsume_bak = self.to_reconsume; let mut chars = s.chars(); while let Some(c) = self.to_reconsume.pop() { if let (Some(x), Some(x2)) = (c, chars.next()) { if x == x2 || (!case_sensitive && x.to_ascii_lowercase() == x2.to_ascii_lowercase()) { s = &s[x.len_utf8()..]; continue; } } self.to_reconsume = to_reconsume_bak; return Ok(false); } self.reader.try_read_string(s, case_sensitive) } pub(crate) fn is_consumed_as_part_of_an_attribute(&self) -> bool { matches!( self.return_state, Some( InternalState::AttributeValueDoubleQuoted | InternalState::AttributeValueSingleQuoted | InternalState::AttributeValueUnquoted ) ) } pub(crate) fn flush_code_points_consumed_as_character_reference(&mut self) { if self.is_consumed_as_part_of_an_attribute() { self.emitter.push_attribute_value(&self.temporary_buffer); self.temporary_buffer.clear(); } else { self.flush_buffer_characters(); } } pub(crate) fn next_input_character(&mut self) -> Result, R::Error> { let rv = self.read_char()?; self.unread_char(rv); Ok(rv) } pub(crate) fn flush_buffer_characters(&mut self) { self.emitter.emit_string(&self.temporary_buffer); self.temporary_buffer.clear(); } } impl Iterator for Tokenizer where O: Offset, R: Reader + Position, E: Emitter, { type Item = Result; fn next(&mut self) -> Option { loop { if let Some(token) = self.emitter.pop_token() { break Some(Ok(token)); } else if !self.eof { match machine::consume(self) { Ok(ControlToken::Continue) => (), Ok(ControlToken::Eof) => { self.eof = true; self.emitter.emit_eof(); } Err(e) => break Some(Err(e)), } } else { break None; } } } } impl> Tokenizer { /// Test-internal function to override internal state. /// /// Only available with the `integration-tests` feature which is not public API. #[cfg(feature = "integration-tests")] pub fn set_last_start_tag(&mut self, last_start_tag: &str) { self.last_start_tag_name.clear(); self.last_start_tag_name.push_str(last_start_tag); } }