use std::collections::btree_map::Entry;
use std::collections::BTreeSet;
use std::collections::VecDeque;
use std::mem;
use std::ops::Range;

use crate::attr::AttrValueSyntax;
use crate::offset::NoopOffset;
use crate::offset::Offset;
use crate::Error;

/// An emitter is an object providing methods to the tokenizer to produce ("emit") tokens.
///
/// Domain-specific applications of the HTML tokenizer can manually implement this trait to
/// customize per-token allocations, or avoid them altogether.
///
/// The type parameter `O` is the type of the `offset` values that the tokenizer passes to
/// the methods of this trait.
///
/// An emitter is assumed to have these internal states:
///
/// * _current token_: Can be a tag, doctype or comment token. There's only one current token.
/// * _current attribute_: The currently processed HTML attribute, consisting of two strings
///   for name and value.
///
/// The following methods are describing what kind of behavior the WHATWG spec expects, but that
/// doesn't mean you need to follow it. For example:
///
/// * If your usage of the tokenizer will ignore all errors, none of the error handling and
///   validation requirements apply to you. You can implement `emit_error` as noop and omit all
///   checks that would emit errors.
///
/// * If you don't care about attributes at all, you can make all related methods a noop.
#[allow(unused_variables)] // workaround for https://github.com/rust-lang/rust/issues/91074
pub trait Emitter<O> {
    /// The token type emitted by this emitter. This controls what type of values the
    /// [`Tokenizer`](crate::Tokenizer) yields when used as an iterator.
    type Token;

    /// The state machine has reached the end of the file. It will soon call `pop_token` for the
    /// last time.
    fn emit_eof(&mut self);

    /// A (probably recoverable) parsing error has occurred.
    fn emit_error(&mut self, error: Error, offset: O);

    /// After every state change, the tokenizer calls this method to retrieve a new token that can
    /// be returned via the tokenizer's iterator interface.
    fn pop_token(&mut self) -> Option<Self::Token>;

    /// Emit a bunch of plain characters as character tokens.
    fn emit_string(&mut self, c: &str);

    /// Set the _current token_ to a start tag.
    fn init_start_tag(&mut self, offset: O);

    /// Set the _current token_ to an end tag.
    fn init_end_tag(&mut self, offset: O);

    /// Set the _current token_ to a comment.
    fn init_comment(&mut self, data_offset: O);

    /// Emit the _current token_, assuming it is a tag.
    ///
    /// Also get the current attribute and append it to the to-be-emitted tag. See docstring for
    /// [`Emitter::init_attribute_name`] for how duplicates should be handled.
    ///
    /// If an end tag is emitted with attributes, an [`Error::EndTagWithAttributes`]
    /// error should be emitted.
    ///
    /// If the current token is not a start/end tag, this method may panic.
    fn emit_current_tag(&mut self, offset: O);

    /// Emit the _current token_, assuming it is a comment.
    ///
    /// If the current token is not a comment, this method may panic.
    fn emit_current_comment(&mut self, offset: O);

    /// Emit the _current token_, assuming it is a doctype.
    ///
    /// If the current token is not a doctype, this method may panic.
    fn emit_current_doctype(&mut self, offset: O);

    /// Assuming the _current token_ is a start tag, set the self-closing flag.
    ///
    /// If the current token is not a start or end tag, this method may panic.
    ///
    /// If the current token is an end tag, the emitter should emit the
    /// [`Error::EndTagWithTrailingSolidus`] error.
    fn set_self_closing(&mut self, offset: O);

    /// Assuming the _current token_ is a doctype, set its "force quirks" flag to true.
    ///
    /// If the current token is not a doctype, this method may panic.
    fn set_force_quirks(&mut self);

    /// Assuming the _current token_ is a start/end tag, append a string to the current tag's name.
    ///
    /// If the current token is not a start or end tag, this method may panic.
    fn push_tag_name(&mut self, s: &str);

    /// Assuming the _current token_ is a comment, append a string to the comment's contents.
    ///
    /// If the current token is not a comment, this method may panic.
    fn push_comment(&mut self, s: &str);

    /// Assuming the _current token_ is a doctype, append a string to the doctype's name.
    ///
    /// If the current token is not a doctype, this method may panic.
    fn push_doctype_name(&mut self, s: &str);

    /// Set the _current token_ to a new doctype token:
    ///
    /// * the name should be empty
    /// * the "public identifier" should be null (different from empty)
    /// * the "system identifier" should be null (different from empty)
    /// * the "force quirks" flag should be `false`
    fn init_doctype(&mut self, offset: O);

    /// Set the _current attribute_ to a new one, starting with empty name and value strings.
    ///
    /// The old attribute, if any, should be put on the _current token_. If an attribute with that
    /// name already exists, WHATWG says the new one should be ignored and a
    /// [`Error::DuplicateAttribute`] error should be emitted.
    ///
    /// If the current token is no tag at all, this method may panic.
    fn init_attribute_name(&mut self, offset: O);

    /// Called before the first push_attribute_value call.
    ///
    /// The provided implementation does nothing.
    ///
    /// If there is no current attribute, this method may panic.
    fn init_attribute_value(&mut self, syntax: AttrValueSyntax, offset: O) {}

    /// Append a string to the current attribute's name.
    ///
    /// If there is no current attribute, this method may panic.
    fn push_attribute_name(&mut self, s: &str);

    /// Append a string to the current attribute's value.
    ///
    /// If there is no current attribute, this method may panic.
    fn push_attribute_value(&mut self, s: &str);

    /// Assuming the _current token_ is a doctype, set its "public identifier" to the empty string.
    ///
    /// If the current token is not a doctype, this method may panic.
    fn init_doctype_public_id(&mut self, offset: O);

    /// Assuming the _current token_ is a doctype, set its "system identifier" to the empty string.
    ///
    /// If the current token is not a doctype, this method may panic.
    fn init_doctype_system_id(&mut self, offset: O);

    /// Assuming the _current token_ is a doctype, append a string to its "public identifier".
    ///
    /// If the current token is not a doctype, this method may panic.
    fn push_doctype_public_id(&mut self, s: &str);

    /// Assuming the _current token_ is a doctype, append a string to its "system identifier".
    ///
    /// If the current token is not a doctype, this method may panic.
    fn push_doctype_system_id(&mut self, s: &str);
}

/// The default implementation of [`Emitter`], used to produce tokens.
///
/// # Warning
///
/// * Using the DefaultEmitter without calling [`Tokenizer::set_state`]
/// results in wrong state transitions:
///
/// ```
/// # use html5tokenizer::{DefaultEmitter, Event, Tokenizer, Token};
/// let emitter = DefaultEmitter::default();
/// let html = "