use crate::machine::{self, ControlToken}; use crate::naive_parser::naive_next_state; use crate::offset::{Offset, Position}; use crate::reader::{IntoReader, Reader}; use crate::utils::{control_pat, noncharacter_pat, surrogate_pat, State as InternalState}; use crate::{Emitter, Error}; // this is a stack that can hold 0 to 2 Ts #[derive(Debug, Default, Clone, Copy)] struct Stack2(Option<(T, Option)>); impl Stack2 { #[inline] fn push(&mut self, c: T) { self.0 = match self.0 { None => Some((c, None)), Some((c1, None)) => Some((c1, Some(c))), Some((_c1, Some(_c2))) => panic!("stack full!"), } } #[inline] fn pop(&mut self) -> Option { let (new_self, rv) = match self.0 { Some((c1, Some(c2))) => (Some((c1, None)), Some(c2)), Some((c1, None)) => (None, Some(c1)), None => (None, None), }; self.0 = new_self; rv } } /// An HTML tokenizer. /// /// Note that for proper HTML parsing, you'll have to implement [tree construction] /// based on this Tokenizer yourself (since this crate currently does not implement it). /// /// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction pub struct Tokenizer> { eof: bool, pub(crate) state: InternalState, pub(crate) emitter: E, pub(crate) temporary_buffer: String, pub(crate) reader: R, to_reconsume: Stack2>, pub(crate) character_reference_code: u32, pub(crate) return_state: Option, current_tag_name: String, last_start_tag_name: String, is_start_tag: bool, /// The reader position before the match block in [`machine::consume`]. pub(crate) position_before_match: O, /// * Set to the offset of `<` in [`InternalState::Data`]. /// * Set to the offset of `&` in [`InternalState::CharacterReference`]. pub(crate) some_offset: O, /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`] /// (it cannot call [`Tokenizer::set_state`] using the emitted start tags since they can be of an arbitrary type). pub(crate) naively_switch_state: bool, } impl, O: Offset, E: Emitter> Tokenizer { /// Creates a new tokenizer from some input and an emitter. /// /// Note that properly parsing HTML with this tokenizer requires you to /// implement [tree construction] and call [`Tokenizer::set_state`] accordingly. /// /// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self { Tokenizer { reader: reader.into_reader(), emitter, state: InternalState::Data, to_reconsume: Stack2::default(), return_state: None, temporary_buffer: String::new(), character_reference_code: 0, eof: false, current_tag_name: String::new(), last_start_tag_name: String::new(), is_start_tag: false, position_before_match: O::default(), some_offset: O::default(), naively_switch_state: false, } } /// To be called when the tokenizer iterator implementation yields [`Event::CdataOpen`]. /// /// For spec-compliant parsing *action* must be [`CdataAction::Cdata`], /// if there is an _adjusted current node_ and it is not an element in /// the HTML namespace, or [`CdataAction::BogusComment`] otherwise /// (as per the third condition under [Markup declaration open state]). /// /// [Markup declaration open state]: https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state pub fn handle_cdata_open(&mut self, action: CdataAction) { machine::handle_cdata_open(self, action); } } /// Used by [`Tokenizer::handle_cdata_open`] to determine how to process ` { /// A token emitted by the [`Emitter`]. Token(T), /// The state machine encountered ` for InternalState { fn from(state: State) -> Self { match state { State::Data => InternalState::Data, State::PlainText => InternalState::PlainText, State::RcData => InternalState::RcData, State::RawText => InternalState::RawText, State::ScriptData => InternalState::ScriptData, State::ScriptDataEscaped => InternalState::ScriptDataEscaped, State::ScriptDataDoubleEscaped => InternalState::ScriptDataDoubleEscaped, } } } impl, O: Offset, E: Emitter> Tokenizer { /// Test-internal function to override internal state. /// /// Only available with the `integration-tests` feature which is not public API. #[cfg(feature = "integration-tests")] pub fn set_internal_state(&mut self, state: InternalState) { self.state = state; } /// Set the statemachine to start/continue in the given state. pub fn set_state(&mut self, state: State) { self.state = state.into(); } /// Just a helper method for the machine. #[inline] pub(crate) fn emit_error(&mut self, error: Error) { let span = match error { Error::EofBeforeTagName | Error::EofInCdata | Error::EofInComment | Error::EofInDoctype | Error::EofInScriptHtmlCommentLikeText | Error::EofInTag | Error::MissingSemicolonAfterCharacterReference => { self.reader.position()..self.reader.position() } Error::AbsenceOfDigitsInNumericCharacterReference | Error::NullCharacterReference | Error::CharacterReferenceOutsideUnicodeRange | Error::SurrogateCharacterReference | Error::NoncharacterCharacterReference | Error::ControlCharacterReference | Error::UnknownNamedCharacterReference => self.some_offset..self.reader.position(), _ => self.position_before_match..self.reader.position(), }; self.emitter.emit_error(error, span); } /// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise. /// /// * the _last start tag_ exists /// * the current end tag token's name equals to the last start tag's name. /// /// See also WHATWG's definition of [appropriate end tag token]. /// /// [appropriate end tag token]: https://html.spec.whatwg.org/multipage/parsing.html#appropriate-end-tag-token #[inline] pub(crate) fn current_end_tag_is_appropriate(&mut self) -> bool { self.current_tag_name == self.last_start_tag_name } #[inline] pub(crate) fn init_start_tag(&mut self) { self.emitter .init_start_tag(self.some_offset, self.position_before_match); self.current_tag_name.clear(); self.is_start_tag = true; } #[inline] pub(crate) fn init_end_tag(&mut self) { self.emitter .init_end_tag(self.some_offset, self.position_before_match); self.current_tag_name.clear(); self.is_start_tag = false; } #[inline] pub(crate) fn push_tag_name(&mut self, s: &str) { self.emitter.push_tag_name(s); self.current_tag_name.push_str(s); } #[inline] pub(crate) fn emit_current_tag(&mut self) { self.emitter.emit_current_tag(self.reader.position()); if self.is_start_tag { if self.naively_switch_state { self.state = naive_next_state(&self.current_tag_name).into(); } std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name); } } #[inline] pub(crate) fn unread_char(&mut self, c: Option) { self.to_reconsume.push(c); } #[inline] fn validate_char(&mut self, c: char) { match c as u32 { surrogate_pat!() => { self.emit_error(Error::SurrogateInInputStream); } noncharacter_pat!() => { self.emit_error(Error::NoncharacterInInputStream); } // control without whitespace or nul x @ control_pat!() if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) => { self.emit_error(Error::ControlCharacterInInputStream); } _ => (), } } pub(crate) fn read_char(&mut self) -> Result, R::Error> { let (c_res, reconsumed) = match self.to_reconsume.pop() { Some(c) => (Ok(c), true), None => (self.reader.read_char(), false), }; let mut c = match c_res { Ok(Some(c)) => c, res => return res, }; if c == '\r' { c = '\n'; let c2 = self.reader.read_char()?; if c2 != Some('\n') { self.unread_char(c2); } } if !reconsumed { self.validate_char(c); } Ok(Some(c)) } #[inline] pub(crate) fn try_read_string( &mut self, mut s: &str, case_sensitive: bool, ) -> Result { debug_assert!(!s.is_empty()); let to_reconsume_bak = self.to_reconsume; let mut chars = s.chars(); while let Some(c) = self.to_reconsume.pop() { if let (Some(x), Some(x2)) = (c, chars.next()) { if x == x2 || (!case_sensitive && x.to_ascii_lowercase() == x2.to_ascii_lowercase()) { s = &s[x.len_utf8()..]; continue; } } self.to_reconsume = to_reconsume_bak; return Ok(false); } self.reader.try_read_string(s, case_sensitive) } pub(crate) fn is_consumed_as_part_of_an_attribute(&self) -> bool { matches!( self.return_state, Some( InternalState::AttributeValueDoubleQuoted | InternalState::AttributeValueSingleQuoted | InternalState::AttributeValueUnquoted ) ) } pub(crate) fn flush_code_points_consumed_as_character_reference(&mut self) { if self.is_consumed_as_part_of_an_attribute() { self.emitter.push_attribute_value(&self.temporary_buffer); self.temporary_buffer.clear(); } else { self.flush_buffer_characters(); } } pub(crate) fn flush_buffer_characters(&mut self) { self.emitter.emit_string(&self.temporary_buffer); self.temporary_buffer.clear(); } } impl Iterator for Tokenizer where O: Offset, R: Reader + Position, E: Emitter, { type Item = Result, R::Error>; fn next(&mut self) -> Option { loop { if let Some(token) = self.emitter.pop_token() { return Some(Ok(Event::Token(token))); } if self.eof { return None; } match machine::consume(self) { Err(e) => return Some(Err(e)), Ok(ControlToken::Continue) => (), Ok(ControlToken::Eof) => { self.eof = true; self.emitter.emit_eof(); } Ok(ControlToken::CdataOpen) => return Some(Ok(Event::CdataOpen)), } } } } impl> Tokenizer { /// Test-internal function to override internal state. /// /// Only available with the `integration-tests` feature which is not public API. #[cfg(feature = "integration-tests")] pub fn set_last_start_tag(&mut self, last_start_tag: &str) { self.last_start_tag_name.clear(); self.last_start_tag_name.push_str(last_start_tag); } }