aboutsummaryrefslogtreecommitdiff
path: root/src/emitter.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/emitter.rs')
-rw-r--r--src/emitter.rs493
1 files changed, 493 insertions, 0 deletions
diff --git a/src/emitter.rs b/src/emitter.rs
new file mode 100644
index 0000000..5bf1e0a
--- /dev/null
+++ b/src/emitter.rs
@@ -0,0 +1,493 @@
+use std::collections::BTreeMap;
+use std::collections::BTreeSet;
+use std::collections::VecDeque;
+use std::mem;
+
+use crate::Error;
+
+/// An emitter is an object providing methods to the tokenizer to produce tokens.
+///
+/// Domain-specific applications of the HTML tokenizer can manually implement this trait to
+/// customize per-token allocations, or avoid them altogether.
+///
+/// An emitter is assumed to have these internal states:
+///
+/// * _last start tag_: The most recently emitted start tag's name
+/// * _current token_: Can be a tag, doctype or comment token. There's only one current token.
+/// * _current attribute_: The currently processed HTML attribute, consisting of two strings for name and value.
+///
+/// The following methods are describing what kind of behavior the WHATWG spec expects, but that
+/// doesn't mean you need to follow it. For example:
+///
+/// * If your usage of the tokenizer will ignore all errors, none of the error handling and
+/// validation requirements apply to you. You can implement `emit_error` as noop and omit all
+/// checks that would emit errors.
+///
+/// * If you don't care about attributes at all, you can make all related methods a noop.
+///
+/// The state machine needs to have a functional implementation of
+/// `current_is_appropriate_end_tag_token` to do correct transitions, however.
+pub trait Emitter {
+ /// The token type emitted by this emitter. This controls what type of values the [`crate::Tokenizer`]
+ /// yields when used as an iterator.
+ type Token;
+
+ /// Set the name of the _last start tag_.
+ ///
+ /// This is primarily for testing purposes. This is *not* supposed to override the tag name of
+ /// the current tag.
+ fn set_last_start_tag(&mut self, last_start_tag: Option<&str>);
+
+ /// The state machine has reached the end of the file. It will soon call `pop_token` for the
+ /// last time.
+ fn emit_eof(&mut self);
+
+ /// A (probably recoverable) parsing error has occured.
+ fn emit_error(&mut self, error: Error);
+
+ /// After every state change, the tokenizer calls this method to retrieve a new token that can
+ /// be returned via the tokenizer's iterator interface.
+ fn pop_token(&mut self) -> Option<Self::Token>;
+
+ /// Emit a bunch of plain characters as character tokens.
+ fn emit_string(&mut self, c: &str);
+
+ /// Set the _current token_ to a start tag.
+ fn init_start_tag(&mut self);
+
+ /// Set the _current token_ to an end tag.
+ fn init_end_tag(&mut self);
+
+ /// Set the _current token_ to a comment.
+ fn init_comment(&mut self);
+
+ /// Emit the _current token_, assuming it is a tag.
+ ///
+ /// Also get the current attribute and append it to the to-be-emitted tag. See docstring for
+ /// [`Emitter::init_attribute`] for how duplicates should be handled.
+ ///
+ /// If a start tag is emitted, update the _last start tag_.
+ ///
+ /// If the current token is not a start/end tag, this method may panic.
+ fn emit_current_tag(&mut self);
+
+ /// Emit the _current token_, assuming it is a comment.
+ ///
+ /// If the current token is not a comment, this method may panic.
+ fn emit_current_comment(&mut self);
+
+ /// Emit the _current token_, assuming it is a doctype.
+ ///
+ /// If the current token is not a doctype, this method may panic.
+ fn emit_current_doctype(&mut self);
+
+ /// Assuming the _current token_ is a start tag, set the self-closing flag.
+ ///
+ /// If the current token is not a start or end tag, this method may panic.
+ ///
+ /// If the current token is an end tag, the emitter should emit the
+ /// [`crate::Error::EndTagWithTrailingSolidus`] error.
+ fn set_self_closing(&mut self);
+
+ /// Assuming the _current token_ is a doctype, set its "force quirks" flag to true.
+ ///
+ /// If the current token is not a doctype, this method pay panic.
+ fn set_force_quirks(&mut self);
+
+ /// Assuming the _current token_ is a start/end tag, append a string to the current tag's name.
+ ///
+ /// If the current token is not a start or end tag, this method may panic.
+ fn push_tag_name(&mut self, s: &str);
+
+ /// Assuming the _current token_ is a comment, append a string to the comment's contents.
+ ///
+ /// If the current token is not a comment, this method may panic.
+ fn push_comment(&mut self, s: &str);
+
+ /// Assuming the _current token_ is a doctype, append a string to the doctype's name.
+ ///
+ /// If the current token is not a doctype, this method may panic.
+ fn push_doctype_name(&mut self, s: &str);
+
+ /// Set the _current token_ to a new doctype token:
+ ///
+ /// * the name should be empty
+ /// * the "public identifier" should be null (different from empty)
+ /// * the "system identifier" should be null (different from empty)
+ /// * the "force quirks" flag should be `false`
+ fn init_doctype(&mut self);
+
+ /// Set the _current attribute_ to a new one, starting with empty name and value strings.
+ ///
+ /// The old attribute, if any, should be put on the _current token_. If an attribute with that
+ /// name already exists, WHATWG says the new one should be ignored and a
+ /// [`crate::Error::DuplicateAttribute`] error should be emitted.
+ ///
+ /// If the current token is an end tag token, a [`crate::Error::EndTagWithAttributes`] error should be
+ /// emitted.
+ ///
+ /// If the current token is no tag at all, this method may panic.
+ fn init_attribute(&mut self);
+
+ /// Append a string to the current attribute's name.
+ ///
+ /// If there is no current attribute, this method may panic.
+ fn push_attribute_name(&mut self, s: &str);
+
+ /// Append a string to the current attribute's value.
+ ///
+ /// If there is no current attribute, this method may panic.
+ fn push_attribute_value(&mut self, s: &str);
+
+ /// Assuming the _current token_ is a doctype, set its "public identifier" to the given string.
+ ///
+ /// If the current token is not a doctype, this method may panic.
+ fn set_doctype_public_identifier(&mut self, value: &str);
+
+ /// Assuming the _current token_ is a doctype, set its "system identifier" to the given string.
+ ///
+ /// If the current token is not a doctype, this method may panic.
+ fn set_doctype_system_identifier(&mut self, value: &str);
+
+ /// Assuming the _current token_ is a doctype, append a string to its "public identifier" to the given string.
+ ///
+ /// If the current token is not a doctype, this method may panic.
+ fn push_doctype_public_identifier(&mut self, s: &str);
+
+ /// Assuming the _current token_ is a doctype, append a string to its "system identifier" to the given string.
+ ///
+ /// If the current token is not a doctype, this method may panic.
+ fn push_doctype_system_identifier(&mut self, s: &str);
+
+ /// Return true if all of these hold. Return false otherwise.
+ ///
+ /// * the _current token_ is an end tag
+ /// * the _last start tag_ exists
+ /// * the current end tag token's name equals to the last start tag's name.
+ ///
+ /// See also [WHATWG's definition of "appropriate end tag
+ /// token"](https://html.spec.whatwg.org/#appropriate-end-tag-token).
+ fn current_is_appropriate_end_tag_token(&mut self) -> bool;
+}
+
+/// The default implementation of [`crate::Emitter`], used to produce ("emit") tokens.
+pub struct DefaultEmitter {
+ current_characters: String,
+ current_token: Option<Token>,
+ last_start_tag: String,
+ current_attribute: Option<(String, String)>,
+ seen_attributes: BTreeSet<String>,
+ emitted_tokens: VecDeque<Token>,
+}
+
+impl Default for DefaultEmitter {
+ fn default() -> Self {
+ DefaultEmitter {
+ current_characters: String::new(),
+ current_token: None,
+ last_start_tag: String::new(),
+ current_attribute: None,
+ seen_attributes: BTreeSet::new(),
+ emitted_tokens: VecDeque::new(),
+ }
+ }
+}
+
+impl DefaultEmitter {
+ fn emit_token(&mut self, token: Token) {
+ self.flush_current_characters();
+ self.emitted_tokens.push_front(token);
+ }
+
+ fn flush_current_attribute(&mut self) {
+ if let Some((k, v)) = self.current_attribute.take() {
+ match self.current_token {
+ Some(Token::StartTag(ref mut tag)) => {
+ let mut error = None;
+ tag.attributes
+ .entry(k)
+ .and_modify(|_| {
+ error = Some(Error::DuplicateAttribute);
+ })
+ .or_insert(v);
+
+ if let Some(e) = error {
+ self.emit_error(e);
+ }
+ }
+ Some(Token::EndTag(_)) => {
+ if !self.seen_attributes.insert(k) {
+ self.emit_error(Error::DuplicateAttribute);
+ }
+ }
+ _ => {
+ debug_assert!(false);
+ }
+ }
+ }
+ }
+
+ fn flush_current_characters(&mut self) {
+ if self.current_characters.is_empty() {
+ return;
+ }
+
+ let s = mem::take(&mut self.current_characters);
+ self.emit_token(Token::String(s));
+ }
+}
+
+impl Emitter for DefaultEmitter {
+ type Token = Token;
+
+ fn set_last_start_tag(&mut self, last_start_tag: Option<&str>) {
+ self.last_start_tag.clear();
+ self.last_start_tag
+ .push_str(last_start_tag.unwrap_or_default());
+ }
+
+ fn emit_eof(&mut self) {
+ self.flush_current_characters();
+ }
+
+ fn emit_error(&mut self, error: Error) {
+ // bypass character flushing in self.emit_token: we don't need the error location to be
+ // that exact
+ self.emitted_tokens.push_front(Token::Error(error));
+ }
+
+ fn pop_token(&mut self) -> Option<Self::Token> {
+ self.emitted_tokens.pop_back()
+ }
+
+ fn emit_string(&mut self, s: &str) {
+ self.current_characters.push_str(s);
+ }
+
+ fn init_start_tag(&mut self) {
+ self.current_token = Some(Token::StartTag(Default::default()));
+ }
+ fn init_end_tag(&mut self) {
+ self.current_token = Some(Token::EndTag(Default::default()));
+ self.seen_attributes.clear();
+ }
+
+ fn init_comment(&mut self) {
+ self.current_token = Some(Token::Comment(String::new()));
+ }
+ fn emit_current_tag(&mut self) {
+ self.flush_current_attribute();
+ let mut token = self.current_token.take().unwrap();
+ match token {
+ Token::EndTag(_) => {
+ if !self.seen_attributes.is_empty() {
+ self.emit_error(Error::EndTagWithAttributes);
+ }
+ self.seen_attributes.clear();
+ }
+ Token::StartTag(ref mut _tag) => {
+ self.set_last_start_tag(Some(&_tag.name));
+ }
+ _ => debug_assert!(false),
+ }
+ self.emit_token(token);
+ }
+ fn emit_current_comment(&mut self) {
+ let comment = self.current_token.take().unwrap();
+ debug_assert!(matches!(comment, Token::Comment(_)));
+ self.emit_token(comment);
+ }
+
+ fn emit_current_doctype(&mut self) {
+ let doctype = self.current_token.take().unwrap();
+ debug_assert!(matches!(doctype, Token::Doctype(_)));
+ self.emit_token(doctype);
+ }
+
+ fn set_self_closing(&mut self) {
+ let tag = self.current_token.as_mut().unwrap();
+ match tag {
+ Token::StartTag(StartTag {
+ ref mut self_closing,
+ ..
+ }) => {
+ *self_closing = true;
+ }
+ Token::EndTag(_) => {
+ self.emit_error(Error::EndTagWithTrailingSolidus);
+ }
+ _ => {
+ debug_assert!(false);
+ }
+ }
+ }
+ fn set_force_quirks(&mut self) {
+ match self.current_token {
+ Some(Token::Doctype(ref mut doctype)) => doctype.force_quirks = true,
+ _ => debug_assert!(false),
+ }
+ }
+ fn push_tag_name(&mut self, s: &str) {
+ match self.current_token {
+ Some(Token::StartTag(StartTag { ref mut name, .. })) => {
+ name.push_str(s);
+ }
+ Some(Token::EndTag(EndTag { ref mut name, .. })) => {
+ name.push_str(s);
+ }
+ _ => debug_assert!(false),
+ }
+ }
+
+ fn push_comment(&mut self, s: &str) {
+ match self.current_token {
+ Some(Token::Comment(ref mut data)) => data.push_str(s),
+ _ => debug_assert!(false),
+ }
+ }
+
+ fn push_doctype_name(&mut self, s: &str) {
+ match self.current_token {
+ Some(Token::Doctype(ref mut doctype)) => doctype.name.push_str(s),
+ _ => debug_assert!(false),
+ }
+ }
+ fn init_doctype(&mut self) {
+ self.current_token = Some(Token::Doctype(Doctype {
+ name: String::new(),
+ force_quirks: false,
+ public_identifier: None,
+ system_identifier: None,
+ }));
+ }
+
+ fn init_attribute(&mut self) {
+ self.flush_current_attribute();
+ self.current_attribute = Some((String::new(), String::new()));
+ }
+ fn push_attribute_name(&mut self, s: &str) {
+ self.current_attribute.as_mut().unwrap().0.push_str(s);
+ }
+ fn push_attribute_value(&mut self, s: &str) {
+ self.current_attribute.as_mut().unwrap().1.push_str(s);
+ }
+ fn set_doctype_public_identifier(&mut self, value: &str) {
+ if let Some(Token::Doctype(Doctype {
+ ref mut public_identifier,
+ ..
+ })) = self.current_token
+ {
+ *public_identifier = Some(value.to_owned());
+ } else {
+ debug_assert!(false);
+ }
+ }
+ fn set_doctype_system_identifier(&mut self, value: &str) {
+ if let Some(Token::Doctype(Doctype {
+ ref mut system_identifier,
+ ..
+ })) = self.current_token
+ {
+ *system_identifier = Some(value.to_owned());
+ } else {
+ debug_assert!(false);
+ }
+ }
+ fn push_doctype_public_identifier(&mut self, s: &str) {
+ if let Some(Token::Doctype(Doctype {
+ public_identifier: Some(ref mut id),
+ ..
+ })) = self.current_token
+ {
+ id.push_str(s);
+ } else {
+ debug_assert!(false);
+ }
+ }
+ fn push_doctype_system_identifier(&mut self, s: &str) {
+ if let Some(Token::Doctype(Doctype {
+ system_identifier: Some(ref mut id),
+ ..
+ })) = self.current_token
+ {
+ id.push_str(s);
+ } else {
+ debug_assert!(false);
+ }
+ }
+
+ fn current_is_appropriate_end_tag_token(&mut self) -> bool {
+ match self.current_token {
+ Some(Token::EndTag(ref tag)) => {
+ !self.last_start_tag.is_empty() && self.last_start_tag == tag.name
+ }
+ _ => false,
+ }
+ }
+}
+
+/// A HTML end/close tag, such as `<p>` or `<a>`.
+#[derive(Debug, Default, Eq, PartialEq)]
+pub struct StartTag {
+ /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be
+ /// expected.
+ pub self_closing: bool,
+
+ /// The start tag's name, such as `"p"` or `"a"`.
+ pub name: String,
+
+ /// A mapping for any HTML attributes this start tag may have.
+ ///
+ /// Duplicate attributes are ignored after the first one as per WHATWG spec. Implement your own
+ /// [`Emitter`] to tweak this behavior.
+ pub attributes: BTreeMap<String, String>,
+}
+
+/// A HTML end/close tag, such as `</p>` or `</a>`.
+#[derive(Debug, Default, Eq, PartialEq)]
+pub struct EndTag {
+ /// The ending tag's name, such as `"p"` or `"a"`.
+ pub name: String,
+}
+
+/// A doctype. Some examples:
+///
+/// * `<!DOCTYPE {name}>`
+/// * `<!DOCTYPE {name} PUBLIC '{public_identifier}'>`
+/// * `<!DOCTYPE {name} SYSTEM '{system_identifier}'>`
+/// * `<!DOCTYPE {name} PUBLIC '{public_identifier}' '{system_identifier}'>`
+#[derive(Debug, Eq, PartialEq)]
+pub struct Doctype {
+ /// The ["force quirks"](https://html.spec.whatwg.org/#force-quirks-flag) flag.
+ pub force_quirks: bool,
+
+ /// The doctype's name. For HTML documents this is "html".
+ pub name: String,
+
+ /// The doctype's public identifier.
+ pub public_identifier: Option<String>,
+
+ /// The doctype's system identifier.
+ pub system_identifier: Option<String>,
+}
+
+/// The token type used by default. You can define your own token type by implementing the
+/// [`crate::Emitter`] trait and using [`crate::Tokenizer::new_with_emitter`].
+#[derive(Debug, Eq, PartialEq)]
+pub enum Token {
+ /// A HTML start tag.
+ StartTag(StartTag),
+ /// A HTML end tag.
+ EndTag(EndTag),
+ /// A literal string.
+ String(String),
+ /// A HTML comment.
+ Comment(String),
+ /// A HTML doctype declaration.
+ Doctype(Doctype),
+ /// A HTML parsing error.
+ ///
+ /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with
+ /// more tokens afterward.
+ Error(Error),
+}