pub(crate) mod machine; use crate::offset::{Offset, Position}; use crate::reader::{IntoReader, Reader}; use crate::Emitter; use machine::ControlToken; #[cfg(feature = "integration-tests")] pub use machine::State as InternalState; /// An HTML tokenizer. /// /// # Warning /// /// Iterating over the tokenizer directly without calling [`Tokenizer::set_state`] /// results in wrong state transitions: /// /// ``` /// # use html5tokenizer::{BasicEmitter, Event, Tokenizer, Token}; /// let emitter = BasicEmitter::default(); /// let html = "<script><b>"; /// let mut tokens = Tokenizer::new(html, emitter).flatten(); /// assert!(matches!(tokens.next(), Some(Event::Token(Token::StartTag(_))))); /// assert!(matches!(tokens.next(), Some(Event::Token(Token::StartTag(_))))); /// ``` /// /// Instead use the [`NaiveParser`] (in the future this crate will also provide a proper implementation of [tree construction]). /// /// [`NaiveParser`]: crate::NaiveParser /// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction pub struct Tokenizer<R, O, E> { machine: machine::Machine<R, O, E>, eof: bool, } impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { /// Creates a new tokenizer from some input and an emitter. /// /// Note that properly parsing HTML with this tokenizer requires you to /// implement [tree construction] and call [`Tokenizer::set_state`] accordingly. /// /// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self { Tokenizer { machine: machine::Machine::new(reader.into_reader(), emitter), eof: false, } } /// To be called when the tokenizer iterator implementation yields [`Event::CdataOpen`]. /// /// For spec-compliant parsing the supplied boolean must be `true` /// if there is an _adjusted current node_ and it is not an element in /// the HTML namespace, or `false` otherwise (as per the third condition /// under [Markup declaration open state]). /// /// [Markup declaration open state]: https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state pub fn handle_cdata_open( &mut self, adjusted_current_node_present_and_not_in_html_namespace: bool, ) { machine::handle_cdata_open( &mut self.machine, adjusted_current_node_present_and_not_in_html_namespace, ); } /// Returns a mutable reference to the emitter. pub fn emitter_mut(&mut self) -> &mut E { &mut self.machine.emitter } } /// An event yielded by the [`Iterator`] implementation for the [`Tokenizer`]. #[derive(Clone, Debug)] pub enum Event<T> { /// A token emitted by the [`Emitter`]. Token(T), /// The state machine encountered `<![CDATA[`. You must call [`Tokenizer::handle_cdata_open`], /// before advancing the tokenizer iterator again. CdataOpen, } /// The states you can set the tokenizer to. #[derive(Debug)] #[non_exhaustive] pub enum State { /// The [data state]. /// /// [data state]: https://html.spec.whatwg.org/multipage/parsing.html#data-state Data, /// The [PLAINTEXT state]. /// /// [PLAINTEXT state]: https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state Plaintext, /// The [RCDATA state]. /// /// [RCDATA state]: https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state Rcdata, /// The [RAWTEXT state]. /// /// [RAWTEXT state]: https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state Rawtext, /// The [script data state]. /// /// [script data state]: https://html.spec.whatwg.org/multipage/parsing.html#script-data-state ScriptData, /// The [script data escaped state]. /// /// [script data escaped state]: https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state ScriptDataEscaped, /// The [script data double escaped state]. /// /// [script data double escaped state]: https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state ScriptDataDoubleEscaped, } impl From<State> for machine::State { fn from(state: State) -> Self { match state { State::Data => machine::State::Data, State::Plaintext => machine::State::Plaintext, State::Rcdata => machine::State::Rcdata, State::Rawtext => machine::State::Rawtext, State::ScriptData => machine::State::ScriptData, State::ScriptDataEscaped => machine::State::ScriptDataEscaped, State::ScriptDataDoubleEscaped => machine::State::ScriptDataDoubleEscaped, } } } impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { /// Test-internal function to override internal state. /// /// Only available with the `integration-tests` feature which is not public API. #[cfg(feature = "integration-tests")] pub fn set_internal_state(&mut self, state: InternalState) { self.machine.state = state; } /// Set the statemachine to start/continue in the given state. pub fn set_state(&mut self, state: State) { self.machine.state = state.into(); } } impl<O, R, E> Iterator for Tokenizer<R, O, E> where O: Offset, R: Reader + Position<O>, E: Emitter<O> + Iterator, { type Item = Result<Event<E::Item>, R::Error>; fn next(&mut self) -> Option<Self::Item> { loop { if let Some(token) = self.machine.emitter.next() { return Some(Ok(Event::Token(token))); } if self.eof { return None; } match machine::consume(&mut self.machine) { Err(e) => return Some(Err(e)), Ok(ControlToken::Continue) => (), Ok(ControlToken::Eof) => { self.eof = true; self.machine .emitter .emit_eof(self.machine.reader_position()); } Ok(ControlToken::CdataOpen) => return Some(Ok(Event::CdataOpen)), } } } } impl<R: Reader, O, E: Emitter<O>> Tokenizer<R, O, E> { pub(crate) fn enable_naive_state_switching(&mut self) { self.machine.naively_switch_state = true; } /// Test-internal function to override internal state. /// /// Only available with the `integration-tests` feature which is not public API. #[cfg(feature = "integration-tests")] pub fn set_last_start_tag(&mut self, last_start_tag: &str) { self.machine.last_start_tag_name.clear(); self.machine.last_start_tag_name.push_str(last_start_tag); } }