use std::collections::btree_map::Entry; use std::collections::BTreeSet; use std::collections::VecDeque; use std::mem; use std::ops::Range; use crate::attr::AttrValueSyntax; use crate::offset::NoopOffset; use crate::offset::Offset; use crate::Error; /// An emitter is an object providing methods to the tokenizer to produce ("emit") tokens. /// /// Domain-specific applications of the HTML tokenizer can manually implement this trait to /// customize per-token allocations, or avoid them altogether. /// /// An emitter is assumed to have these internal states: /// /// * _current token_: Can be a tag, doctype or comment token. There's only one current token. /// * _current attribute_: The currently processed HTML attribute, consisting of two strings for name and value. /// /// The following methods are describing what kind of behavior the WHATWG spec expects, but that /// doesn't mean you need to follow it. For example: /// /// * If your usage of the tokenizer will ignore all errors, none of the error handling and /// validation requirements apply to you. You can implement `emit_error` as noop and omit all /// checks that would emit errors. /// /// * If you don't care about attributes at all, you can make all related methods a noop. pub trait Emitter<O> { /// The token type emitted by this emitter. This controls what type of values the [`Tokenizer`](crate::Tokenizer) /// yields when used as an iterator. type Token; /// The state machine has reached the end of the file. It will soon call `pop_token` for the /// last time. fn emit_eof(&mut self); /// A (probably recoverable) parsing error has occured. fn emit_error(&mut self, error: Error, offset: O); /// After every state change, the tokenizer calls this method to retrieve a new token that can /// be returned via the tokenizer's iterator interface. fn pop_token(&mut self) -> Option<Self::Token>; /// Emit a bunch of plain characters as character tokens. fn emit_string(&mut self, c: &str); /// Set the _current token_ to a start tag. fn init_start_tag(&mut self, offset: O); /// Set the _current token_ to an end tag. fn init_end_tag(&mut self, offset: O); /// Set the _current token_ to a comment. fn init_comment(&mut self, data_offset: O); /// Emit the _current token_, assuming it is a tag. /// /// Also get the current attribute and append it to the to-be-emitted tag. See docstring for /// [`Emitter::init_attribute_name`] for how duplicates should be handled. /// /// If an end tag is emitted with attributes, an [`Error::EndTagWithAttributes`] /// error should be emitted. /// /// If the current token is not a start/end tag, this method may panic. fn emit_current_tag(&mut self, offset: O); /// Emit the _current token_, assuming it is a comment. /// /// If the current token is not a comment, this method may panic. fn emit_current_comment(&mut self, offset: O); /// Emit the _current token_, assuming it is a doctype. /// /// If the current token is not a doctype, this method may panic. fn emit_current_doctype(&mut self, offset: O); /// Assuming the _current token_ is a start tag, set the self-closing flag. /// /// If the current token is not a start or end tag, this method may panic. /// /// If the current token is an end tag, the emitter should emit the /// [`Error::EndTagWithTrailingSolidus`] error. fn set_self_closing(&mut self, offset: O); /// Assuming the _current token_ is a doctype, set its "force quirks" flag to true. /// /// If the current token is not a doctype, this method pay panic. fn set_force_quirks(&mut self); /// Assuming the _current token_ is a start/end tag, append a string to the current tag's name. /// /// If the current token is not a start or end tag, this method may panic. fn push_tag_name(&mut self, s: &str); /// Assuming the _current token_ is a comment, append a string to the comment's contents. /// /// If the current token is not a comment, this method may panic. fn push_comment(&mut self, s: &str); /// Assuming the _current token_ is a doctype, append a string to the doctype's name. /// /// If the current token is not a doctype, this method may panic. fn push_doctype_name(&mut self, s: &str); /// Set the _current token_ to a new doctype token: /// /// * the name should be empty /// * the "public identifier" should be null (different from empty) /// * the "system identifier" should be null (different from empty) /// * the "force quirks" flag should be `false` fn init_doctype(&mut self, offset: O); /// Set the _current attribute_ to a new one, starting with empty name and value strings. /// /// The old attribute, if any, should be put on the _current token_. If an attribute with that /// name already exists, WHATWG says the new one should be ignored and a /// [`Error::DuplicateAttribute`] error should be emitted. /// /// If the current token is no tag at all, this method may panic. fn init_attribute_name(&mut self, offset: O); /// Called before the first push_attribute_value call. /// /// If there is no current attribute, this method may panic. #[allow(unused_variables)] fn init_attribute_value(&mut self, syntax: AttrValueSyntax, offset: O) {} /// Append a string to the current attribute's name. /// /// If there is no current attribute, this method may panic. fn push_attribute_name(&mut self, s: &str); /// Append a string to the current attribute's value. /// /// If there is no current attribute, this method may panic. fn push_attribute_value(&mut self, s: &str); /// Assuming the _current token_ is a doctype, set its "public identifier" to the empty string. /// /// If the current token is not a doctype, this method may panic. fn init_doctype_public_id(&mut self, offset: O); /// Assuming the _current token_ is a doctype, set its "system identifier" to the empty string. /// /// If the current token is not a doctype, this method may panic. fn init_doctype_system_id(&mut self, offset: O); /// Assuming the _current token_ is a doctype, append a string to its "public identifier" to the given string. /// /// If the current token is not a doctype, this method may panic. fn push_doctype_public_id(&mut self, s: &str); /// Assuming the _current token_ is a doctype, append a string to its "system identifier" to the given string. /// /// If the current token is not a doctype, this method may panic. fn push_doctype_system_id(&mut self, s: &str); } /// The default implementation of [`Emitter`], used to produce tokens. /// /// # Warning /// /// * Using the DefaultEmitter without calling [`Tokenizer::set_state`] /// results in wrong state transitions: /// /// ``` /// # use html5tokenizer::{DefaultEmitter, Tokenizer, Token}; /// let emitter = DefaultEmitter::default(); /// let html = "<script><b>"; /// let mut tokens = Tokenizer::new(html, emitter).flatten(); /// assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "script")); /// assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "b")); /// ``` /// /// [`Tokenizer::set_state`]: crate::Tokenizer::set_state pub struct DefaultEmitter<O = NoopOffset> { current_characters: String, current_token: Option<Token<O>>, current_attribute: Option<(String, crate::attr::AttrInternal<O>)>, seen_attributes: BTreeSet<String>, emitted_tokens: VecDeque<Token<O>>, attr_in_end_tag_span: Option<Range<O>>, } impl<O> Default for DefaultEmitter<O> { fn default() -> Self { DefaultEmitter { current_characters: String::new(), current_token: None, current_attribute: None, seen_attributes: BTreeSet::new(), emitted_tokens: VecDeque::new(), attr_in_end_tag_span: None, } } } impl<O> DefaultEmitter<O> { fn emit_token(&mut self, token: Token<O>) { self.flush_current_characters(); self.emitted_tokens.push_front(token); } fn flush_current_attribute(&mut self) where O: Offset, { if let Some((name, map_val)) = self.current_attribute.take() { match self.current_token { Some(Token::StartTag(ref mut tag)) => match tag.attributes.inner.entry(name) { Entry::Vacant(vacant) => { vacant.insert(map_val); } Entry::Occupied(entry) => { let name_len = entry.key().len(); self.push_error(Error::DuplicateAttribute, map_val.name_span(name_len)); } }, Some(Token::EndTag(_)) => { let name_span = map_val.name_span(name.len()); self.attr_in_end_tag_span = Some(name_span.clone()); if !self.seen_attributes.insert(name) { self.push_error(Error::DuplicateAttribute, name_span); } } _ => { debug_assert!(false); } } } } fn flush_current_characters(&mut self) { if self.current_characters.is_empty() { return; } let s = mem::take(&mut self.current_characters); self.emit_token(Token::String(s)); } fn push_error(&mut self, error: Error, span: Range<O>) { // bypass character flushing in self.emit_token: we don't need the error location to be // that exact self.emitted_tokens.push_front(Token::Error { error, span }); } } impl<O: Offset> Emitter<O> for DefaultEmitter<O> { type Token = Token<O>; fn emit_eof(&mut self) { self.flush_current_characters(); } fn emit_error(&mut self, error: Error, offset: O) { self.push_error(error, offset..offset); } fn pop_token(&mut self) -> Option<Self::Token> { self.emitted_tokens.pop_back() } fn emit_string(&mut self, s: &str) { self.current_characters.push_str(s); } fn init_start_tag(&mut self, offset: O) { self.current_token = Some(Token::StartTag(StartTag { span: offset - b"<".len()..offset - b"<".len(), self_closing: false, name: String::new(), attributes: Default::default(), })); } fn init_end_tag(&mut self, offset: O) { self.current_token = Some(Token::EndTag(EndTag { span: offset - b"</".len()..offset - b"</".len(), name: String::new(), })); self.seen_attributes.clear(); } fn init_comment(&mut self, data_offset: O) { self.current_token = Some(Token::Comment(Comment { data: String::new(), data_offset, })); } fn emit_current_tag(&mut self, offset: O) { self.flush_current_attribute(); let mut token = self.current_token.take().unwrap(); match &mut token { Token::EndTag(tag) => { if !self.seen_attributes.is_empty() { let span = self.attr_in_end_tag_span.take().unwrap(); self.push_error(Error::EndTagWithAttributes, span); } self.seen_attributes.clear(); tag.span.end = offset + b">".len(); } Token::StartTag(tag) => { tag.span.end = offset + b">".len(); } _ => debug_assert!(false), } self.emit_token(token); } fn emit_current_comment(&mut self, _offset: O) { let comment = self.current_token.take().unwrap(); debug_assert!(matches!(comment, Token::Comment(_))); self.emit_token(comment); } fn emit_current_doctype(&mut self, offset: O) { let Some(Token::Doctype(mut doctype)) = self.current_token.take() else { debug_assert!(false); return; }; doctype.span.end = offset; self.emit_token(Token::Doctype(doctype)); } fn set_self_closing(&mut self, offset: O) { let tag = self.current_token.as_mut().unwrap(); match tag { Token::StartTag(StartTag { ref mut self_closing, .. }) => { *self_closing = true; } Token::EndTag(_) => { self.emit_error(Error::EndTagWithTrailingSolidus, offset - 1); } _ => { debug_assert!(false); } } } fn set_force_quirks(&mut self) { match self.current_token { Some(Token::Doctype(ref mut doctype)) => doctype.force_quirks = true, _ => debug_assert!(false), } } fn push_tag_name(&mut self, s: &str) { match self.current_token { Some(Token::StartTag(StartTag { ref mut name, .. })) => { name.push_str(s); } Some(Token::EndTag(EndTag { ref mut name, .. })) => { name.push_str(s); } _ => debug_assert!(false), } } fn push_comment(&mut self, s: &str) { match self.current_token { Some(Token::Comment(Comment { ref mut data, .. })) => data.push_str(s), _ => debug_assert!(false), } } fn push_doctype_name(&mut self, s: &str) { match self.current_token { Some(Token::Doctype(ref mut doctype)) => doctype.name.push_str(s), _ => debug_assert!(false), } } fn init_doctype(&mut self, offset: O) { self.current_token = Some(Token::Doctype(Doctype { name: String::new(), force_quirks: false, public_id: None, system_id: None, span: offset..O::default(), public_id_offset: O::default(), system_id_offset: O::default(), })); } fn init_attribute_name(&mut self, offset: O) { self.flush_current_attribute(); self.current_attribute = Some(( String::new(), crate::attr::AttrInternal { name_offset: offset, value: String::new(), value_offset: O::default(), value_syntax: None, }, )); } fn init_attribute_value(&mut self, syntax: AttrValueSyntax, offset: O) { let (_, current_attribute) = self.current_attribute.as_mut().unwrap(); current_attribute.value_offset = offset; current_attribute.value_syntax = Some(syntax); } fn push_attribute_name(&mut self, s: &str) { let current_attr = self.current_attribute.as_mut().unwrap(); current_attr.0.push_str(s); } fn push_attribute_value(&mut self, s: &str) { let current_attr = self.current_attribute.as_mut().unwrap(); current_attr.1.value.push_str(s); } fn init_doctype_public_id(&mut self, offset: O) { let Some(Token::Doctype(doctype)) = &mut self.current_token else { debug_assert!(false); return; }; doctype.public_id = Some("".to_owned()); doctype.public_id_offset = offset; } fn init_doctype_system_id(&mut self, offset: O) { let Some(Token::Doctype(doctype)) = &mut self.current_token else { debug_assert!(false); return; }; doctype.system_id = Some("".to_owned()); doctype.system_id_offset = offset; } fn push_doctype_public_id(&mut self, s: &str) { if let Some(Token::Doctype(Doctype { public_id: Some(ref mut id), .. })) = self.current_token { id.push_str(s); } else { debug_assert!(false); } } fn push_doctype_system_id(&mut self, s: &str) { if let Some(Token::Doctype(Doctype { system_id: Some(ref mut id), .. })) = self.current_token { id.push_str(s); } else { debug_assert!(false); } } } /// An HTML start tag, such as `<p>` or `<a>`. #[derive(Debug, Eq, PartialEq)] pub struct StartTag<O> { /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be /// expected. pub self_closing: bool, /// The start tag's name, such as `"p"` or `"a"`. pub name: String, /// A mapping for any HTML attributes this start tag may have. /// /// Duplicate attributes are ignored after the first one as per WHATWG spec. Implement your own /// [`Emitter`] to tweak this behavior. pub attributes: crate::attr::AttributeMap<O>, /// The source code span of the tag. pub span: Range<O>, } impl<O: Offset> StartTag<O> { /// Calculates the span for the tag name and returns it. pub fn name_span(&self) -> Range<O> { let start = self.span.start + b"<".len(); start..start + self.name.len() } } /// A HTML end/close tag, such as `</p>` or `</a>`. #[derive(Debug, Eq, PartialEq)] pub struct EndTag<O> { /// The ending tag's name, such as `"p"` or `"a"`. pub name: String, /// The source code span of the tag. pub span: Range<O>, } impl<O: Offset> EndTag<O> { /// Calculates the span for the tag name and returns it. pub fn name_span(&self) -> Range<O> { let start = self.span.start + b"</".len(); start..start + self.name.len() } } /// An HTML comment. #[derive(PartialEq, Eq, Debug)] pub struct Comment<O> { /// The text within the comment. pub data: String, /// The source offset of the comment data. pub data_offset: O, } impl<O: Offset> Comment<O> { /// Calculates the span for the comment data and returns it. pub fn data_span(&self) -> Range<O> { self.data_offset..self.data_offset + self.data.len() } } /// A doctype. Some examples: /// /// * `<!DOCTYPE {name}>` /// * `<!DOCTYPE {name} PUBLIC '{public_id}'>` /// * `<!DOCTYPE {name} SYSTEM '{system_id}'>` /// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>` #[derive(Debug, Eq, PartialEq)] pub struct Doctype<O> { /// The [force-quirks flag]. /// /// [force-quirks flag]: https://html.spec.whatwg.org/#force-quirks-flag pub force_quirks: bool, /// The doctype's name. For HTML documents this is "html". pub name: String, /// The doctype's public identifier. pub public_id: Option<String>, /// The doctype's system identifier. pub system_id: Option<String>, /// The source code span of the doctype. pub span: Range<O>, /// The source offset of the pulic identifier. public_id_offset: O, /// The source offset of the system identifier. system_id_offset: O, } impl<O: Offset> Doctype<O> { /// Calculates the span of the public identifier and returns it. pub fn public_id_span(&self) -> Option<Range<O>> { let public_id = self.public_id.as_ref()?; Some(self.public_id_offset..self.public_id_offset + public_id.len()) } /// Calculates the span of the system identifier and returns it. pub fn system_id_span(&self) -> Option<Range<O>> { let system_id = self.system_id.as_ref()?; Some(self.system_id_offset..self.system_id_offset + system_id.len()) } } /// The token type used by default. You can define your own token type by implementing the /// [`Emitter`] trait. #[derive(Debug, Eq, PartialEq)] pub enum Token<O> { /// A HTML start tag. StartTag(StartTag<O>), /// A HTML end tag. EndTag(EndTag<O>), /// A literal string. String(String), /// A HTML comment. Comment(Comment<O>), /// A HTML doctype declaration. Doctype(Doctype<O>), /// A HTML parsing error. /// /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with /// more tokens afterward. Error { /// What kind of error occured. error: Error, /// The source code span of the error. span: Range<O>, }, } /// The majority of our testing of the [`DefaultEmitter`] is done against the /// html5lib-tests in the html5lib integration test. This module only tests /// details that aren't present in the html5lib test data. #[cfg(test)] mod tests { use super::{DefaultEmitter, Token}; use crate::{attr::AttrValueSyntax, Tokenizer}; #[test] fn test_attribute_value_syntax() { let mut tokenizer = Tokenizer::new( "<div empty unquoted=foo single-quoted='foo' double-quoted=\"foo\">", DefaultEmitter::default(), ) .flatten(); let Token::StartTag(start_tag) = tokenizer.next().unwrap() else { panic!("expected start tag"); }; assert_eq!( start_tag.attributes.get("empty").unwrap().value_syntax(), None ); assert_eq!( start_tag.attributes.get("unquoted").unwrap().value_syntax(), Some(AttrValueSyntax::Unquoted) ); assert_eq!( start_tag .attributes .get("single-quoted") .unwrap() .value_syntax(), Some(AttrValueSyntax::SingleQuoted) ); assert_eq!( start_tag .attributes .get("double-quoted") .unwrap() .value_syntax(), Some(AttrValueSyntax::DoubleQuoted) ); } }