use std::collections::btree_map::Entry; use std::collections::BTreeSet; use std::collections::VecDeque; use std::mem; use std::ops::Range; use crate::offset::NoopOffset; use crate::offset::Offset; use crate::token::{AttrValueSyntax, Comment, Doctype, EndTag, StartTag, Token}; use crate::Emitter; use crate::Error; /// The default implementation of [`Emitter`], used to produce tokens. pub struct DefaultEmitter { current_characters: String, current_token: Option>, current_attribute: Option<(String, crate::token::AttrInternal)>, seen_attributes: BTreeSet, emitted_tokens: VecDeque>, attr_in_end_tag_span: Option>, } impl Default for DefaultEmitter { fn default() -> Self { DefaultEmitter { current_characters: String::new(), current_token: None, current_attribute: None, seen_attributes: BTreeSet::new(), emitted_tokens: VecDeque::new(), attr_in_end_tag_span: None, } } } impl Iterator for DefaultEmitter { type Item = Token; fn next(&mut self) -> Option { self.emitted_tokens.pop_back() } } impl Emitter for DefaultEmitter { fn emit_eof(&mut self) { self.flush_current_characters(); } fn emit_error(&mut self, error: Error, span: Range) { self.push_error(error, span); } fn emit_string(&mut self, s: &str) { self.current_characters.push_str(s); } fn init_start_tag(&mut self, tag_offset: O, name_offset: O) { self.current_token = Some(Token::StartTag(StartTag { span: tag_offset..O::default(), self_closing: false, name: String::new(), attributes: Default::default(), name_span: name_offset..O::default(), })); } fn init_end_tag(&mut self, tag_offset: O, name_offset: O) { self.current_token = Some(Token::EndTag(EndTag { span: tag_offset..O::default(), name: String::new(), name_span: name_offset..O::default(), })); self.seen_attributes.clear(); } fn init_comment(&mut self, data_start_offset: O) { self.current_token = Some(Token::Comment(Comment { data: String::new(), data_span: data_start_offset..O::default(), })); } fn emit_current_tag(&mut self, offset: O) { self.flush_current_attribute(); let mut token = self.current_token.take().unwrap(); match &mut token { Token::EndTag(tag) => { if !self.seen_attributes.is_empty() { let span = self.attr_in_end_tag_span.take().unwrap(); self.push_error(Error::EndTagWithAttributes, span); } self.seen_attributes.clear(); tag.span.end = offset; } Token::StartTag(tag) => { tag.span.end = offset; } _ => debug_assert!(false), } self.emit_token(token); } fn emit_current_comment(&mut self, data_end_offset: O) { let mut token = self.current_token.take().unwrap(); if let Token::Comment(comment) = &mut token { comment.data_span.end = data_end_offset; } else { debug_assert!(false); } self.emit_token(token); } fn emit_current_doctype(&mut self, offset: O) { let Some(Token::Doctype(mut doctype)) = self.current_token.take() else { debug_assert!(false); return; }; doctype.span.end = offset; self.emit_token(Token::Doctype(doctype)); } fn set_self_closing(&mut self, slash_span: Range) { let tag = self.current_token.as_mut().unwrap(); match tag { Token::StartTag(StartTag { ref mut self_closing, .. }) => { *self_closing = true; } Token::EndTag(_) => { self.emit_error(Error::EndTagWithTrailingSolidus, slash_span); } _ => { debug_assert!(false); } } } fn set_force_quirks(&mut self) { match self.current_token { Some(Token::Doctype(ref mut doctype)) => doctype.force_quirks = true, _ => debug_assert!(false), } } fn push_tag_name(&mut self, s: &str) { match self.current_token { Some(Token::StartTag(StartTag { ref mut name, .. })) => { name.push_str(s); } Some(Token::EndTag(EndTag { ref mut name, .. })) => { name.push_str(s); } _ => debug_assert!(false), } } fn terminate_tag_name(&mut self, offset: O) { match self.current_token { Some(Token::StartTag(StartTag { ref mut name_span, .. })) => { name_span.end = offset; } Some(Token::EndTag(EndTag { ref mut name_span, .. })) => { name_span.end = offset; } _ => debug_assert!(false), } } fn push_comment(&mut self, s: &str) { match self.current_token { Some(Token::Comment(Comment { ref mut data, .. })) => data.push_str(s), _ => debug_assert!(false), } } fn init_doctype_name(&mut self, offset: O) { let Some(Token::Doctype(doctype)) = &mut self.current_token else { debug_assert!(false); return; }; doctype.name = Some("".into()); doctype.name_span.start = offset; } fn push_doctype_name(&mut self, s: &str) { match self.current_token { Some(Token::Doctype(Doctype { name: Some(ref mut name), .. })) => name.push_str(s), _ => debug_assert!(false), } } fn terminate_doctype_name(&mut self, offset: O) { let Some(Token::Doctype(doctype)) = &mut self.current_token else { debug_assert!(false); return; }; doctype.name_span.end = offset; } fn init_doctype(&mut self, offset: O) { self.current_token = Some(Token::Doctype(Doctype { name: None, force_quirks: false, public_id: None, system_id: None, span: offset..O::default(), name_span: O::default()..O::default(), public_id_span: O::default()..O::default(), system_id_span: O::default()..O::default(), })); } fn init_attribute_name(&mut self, offset: O) { self.flush_current_attribute(); self.current_attribute = Some(( String::new(), crate::token::AttrInternal { name_span: offset..O::default(), value: String::new(), value_span: O::default()..O::default(), value_syntax: None, }, )); } fn init_attribute_value(&mut self, syntax: AttrValueSyntax, offset: O) { let (_, current_attribute) = self.current_attribute.as_mut().unwrap(); current_attribute.value_span.start = offset; current_attribute.value_syntax = Some(syntax); } fn push_attribute_name(&mut self, s: &str) { let current_attr = self.current_attribute.as_mut().unwrap(); current_attr.0.push_str(s); } fn terminate_attribute_name(&mut self, offset: O) { let current_attr = self.current_attribute.as_mut().unwrap(); current_attr.1.name_span.end = offset; } fn push_attribute_value(&mut self, s: &str) { let current_attr = self.current_attribute.as_mut().unwrap(); current_attr.1.value.push_str(s); } fn terminate_attribute_value(&mut self, offset: O) { let current_attr = self.current_attribute.as_mut().unwrap(); current_attr.1.value_span.end = offset; } fn init_doctype_public_id(&mut self, offset: O) { let Some(Token::Doctype(doctype)) = &mut self.current_token else { debug_assert!(false); return; }; doctype.public_id = Some("".to_owned()); doctype.public_id_span.start = offset; } fn init_doctype_system_id(&mut self, offset: O) { let Some(Token::Doctype(doctype)) = &mut self.current_token else { debug_assert!(false); return; }; doctype.system_id = Some("".to_owned()); doctype.system_id_span.start = offset; } fn push_doctype_public_id(&mut self, s: &str) { if let Some(Token::Doctype(Doctype { public_id: Some(ref mut id), .. })) = self.current_token { id.push_str(s); } else { debug_assert!(false); } } fn terminate_doctype_public_id(&mut self, offset: O) { if let Some(Token::Doctype(Doctype { ref mut public_id_span, .. })) = self.current_token { public_id_span.end = offset; } else { debug_assert!(false); } } fn push_doctype_system_id(&mut self, s: &str) { if let Some(Token::Doctype(Doctype { system_id: Some(ref mut id), .. })) = self.current_token { id.push_str(s); } else { debug_assert!(false); } } fn terminate_doctype_system_id(&mut self, offset: O) { if let Some(Token::Doctype(Doctype { ref mut system_id_span, .. })) = self.current_token { system_id_span.end = offset; } else { debug_assert!(false); } } } impl DefaultEmitter { fn emit_token(&mut self, token: Token) { self.flush_current_characters(); self.emitted_tokens.push_front(token); } fn flush_current_attribute(&mut self) where O: Offset, { if let Some((name, map_val)) = self.current_attribute.take() { match self.current_token { Some(Token::StartTag(ref mut tag)) => match tag.attributes.inner.entry(name) { Entry::Vacant(vacant) => { vacant.insert(map_val); } Entry::Occupied(_) => { self.push_error(Error::DuplicateAttribute, map_val.name_span); } }, Some(Token::EndTag(_)) => { self.attr_in_end_tag_span = Some(map_val.name_span.clone()); if !self.seen_attributes.insert(name) { self.push_error(Error::DuplicateAttribute, map_val.name_span); } } _ => { debug_assert!(false); } } } } fn flush_current_characters(&mut self) { if self.current_characters.is_empty() { return; } let s = mem::take(&mut self.current_characters); self.emit_token(Token::String(s)); } fn push_error(&mut self, error: Error, span: Range) { // bypass character flushing in self.emit_token: we don't need the error location to be // that exact self.emitted_tokens.push_front(Token::Error { error, span }); } } /// The majority of our testing of the [`DefaultEmitter`] is done against the /// html5lib-tests in the html5lib integration test. This module only tests /// details that aren't present in the html5lib test data. #[cfg(test)] mod tests { use super::DefaultEmitter; use crate::token::{AttrValueSyntax, Token}; use crate::{Event, Tokenizer}; #[test] fn test_attribute_value_syntax() { let mut tokenizer = Tokenizer::new( "
", DefaultEmitter::default(), ) .flatten(); let Event::Token(Token::StartTag(start_tag)) = tokenizer.next().unwrap() else { panic!("expected start tag"); }; assert_eq!( start_tag.attributes.get("empty").unwrap().value_syntax(), None ); assert_eq!( start_tag.attributes.get("unquoted").unwrap().value_syntax(), Some(AttrValueSyntax::Unquoted) ); assert_eq!( start_tag .attributes .get("single-quoted") .unwrap() .value_syntax(), Some(AttrValueSyntax::SingleQuoted) ); assert_eq!( start_tag .attributes .get("double-quoted") .unwrap() .value_syntax(), Some(AttrValueSyntax::DoubleQuoted) ); } }