summary | refs | log | tree | commit | diff
path: root/src/tracing_emitter.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/tracing_emitter.rs')
-rw-r--r-- src/tracing_emitter.rs 344
1 files changed, 344 insertions, 0 deletions
diff --git a/src/tracing_emitter.rs b/src/tracing_emitter.rs
new file mode 100644
index 0000000..408d9b0
--- /dev/null
+++ b/src/tracing_emitter.rs
@@ -0,0 +1,344 @@
+use std::collections::btree_map::Entry;
+use std::collections::BTreeSet;
+use std::collections::VecDeque;
+use std::ops::Range;
+
+use crate::let_else::assume;
+use crate::offset::NoopOffset;
+use crate::offset::Offset;
+use crate::token::{AttrValueSyntax, Comment, Doctype, EndTag, StartTag, Token};
+use crate::Emitter;
+use crate::Error;
+
+/// The default implementation of [`Emitter`], used to produce tokens.
+///
+/// Finished tokens are queued internally and handed out through the
+/// [`Iterator`] implementation, while reported errors are collected
+/// separately and can be retrieved with [`TracingEmitter::drain_errors`].
+pub struct TracingEmitter<O = NoopOffset> {
+    /// The token that is currently being built, if any.
+    current_token: Option<Token<O>>,
+    /// Name of the attribute that is currently being built.
+    /// An empty string means that there is no pending attribute.
+    current_attribute_name: String,
+    /// Value, spans and value syntax of the attribute that is currently being built.
+    current_attr_internal: crate::token::AttrInternal<O>,
+    /// Attribute names encountered so far on the current end tag.
+    seen_attributes: BTreeSet<String>,
+    /// Finished tokens that have not been yielded by the iterator yet.
+    emitted_tokens: VecDeque<Token<O>>,
+    /// Reported errors, each paired with the span it applies to.
+    errors: VecDeque<(Error, Range<O>)>,
+    /// Name span of the attribute most recently recorded for an end tag.
+    attr_in_end_tag_span: Option<Range<O>>,
+}
+
+impl<O: Default> Default for TracingEmitter<O> {
+    /// Creates an emitter with no pending token, no pending attribute and
+    /// empty token and error queues.
+    fn default() -> Self {
+        Self {
+            current_token: None,
+            current_attribute_name: String::default(),
+            current_attr_internal: Default::default(),
+            seen_attributes: BTreeSet::default(),
+            emitted_tokens: VecDeque::default(),
+            errors: VecDeque::default(),
+            attr_in_end_tag_span: None,
+        }
+    }
+}
+
+impl<O> TracingEmitter<O> {
+    /// Empties the internal error queue, yielding every recorded error
+    /// together with its span in the order in which they were reported.
+    pub fn drain_errors(&mut self) -> impl Iterator<Item = (Error, Range<O>)> + '_ {
+        self.errors.drain(..)
+    }
+}
+
+impl<O> Iterator for TracingEmitter<O> {
+    type Item = Token<O>;
+
+    /// Yields the oldest token that has been emitted but not yet consumed.
+    ///
+    /// Tokens are pushed to the front of the queue when they are emitted, so
+    /// taking them from the back returns them in emission order.
+    fn next(&mut self) -> Option<Self::Item> {
+        self.emitted_tokens.pop_back()
+    }
+}
+
+impl<O: Offset> Emitter<O> for TracingEmitter<O> {
+    /// Records `error` together with the `span` it applies to.
+    fn report_error(&mut self, error: Error, span: Range<O>) {
+        let entry = (error, span);
+        self.errors.push_back(entry);
+    }
+
+    /// Queues a character token for `c`.
+    fn emit_char(&mut self, c: char) {
+        let token = Token::Char(c);
+        self.emit_token(token);
+    }
+
+    /// Queues the end-of-file token.
+    fn emit_eof(&mut self) {
+        let token = Token::EndOfFile;
+        self.emit_token(token);
+    }
+
+    /// Makes an empty start tag the current token.
+    ///
+    /// The end offsets of both spans are placeholders until they are set by
+    /// `terminate_tag_name` and `emit_current_tag`.
+    fn init_start_tag(&mut self, tag_offset: O, name_offset: O) {
+        let tag = StartTag {
+            span: tag_offset..O::default(),
+            self_closing: false,
+            name: String::new(),
+            attributes: Default::default(),
+            name_span: name_offset..O::default(),
+        };
+        self.current_token = Some(Token::StartTag(tag));
+    }
+
+    /// Makes an empty end tag the current token and forgets the attribute
+    /// names recorded for any previous end tag.
+    ///
+    /// The end offsets of both spans are placeholders until they are set by
+    /// `terminate_tag_name` and `emit_current_tag`.
+    fn init_end_tag(&mut self, tag_offset: O, name_offset: O) {
+        let tag = EndTag {
+            span: tag_offset..O::default(),
+            name: String::new(),
+            name_span: name_offset..O::default(),
+        };
+        self.current_token = Some(Token::EndTag(tag));
+        self.seen_attributes.clear();
+    }
+
+    /// Appends `s` to the name of the current start or end tag.
+    fn push_tag_name(&mut self, s: &str) {
+        assume!(
+            Some(
+                Token::StartTag(StartTag { name: tag_name, .. })
+                    | Token::EndTag(EndTag { name: tag_name, .. })
+            ),
+            &mut self.current_token
+        );
+        tag_name.push_str(s);
+    }
+
+    /// Sets the end of the name span of the current start or end tag.
+    fn terminate_tag_name(&mut self, offset: O) {
+        assume!(
+            Some(
+                Token::StartTag(StartTag { name_span: span, .. })
+                    | Token::EndTag(EndTag { name_span: span, .. })
+            ),
+            &mut self.current_token
+        );
+        span.end = offset;
+    }
+
+    /// Begins a new attribute whose name starts at `offset`.
+    fn init_attribute_name(&mut self, offset: O) {
+        // Store whatever attribute was being built before starting the next one.
+        self.flush_current_attribute();
+        let attr = &mut self.current_attr_internal;
+        attr.name_span.start = offset;
+    }
+
+    /// Appends `s` to the name of the attribute being built.
+    fn push_attribute_name(&mut self, s: &str) {
+        let attr_name = &mut self.current_attribute_name;
+        attr_name.push_str(s);
+    }
+
+    /// Sets the end of the name span of the attribute being built.
+    fn terminate_attribute_name(&mut self, offset: O) {
+        let attr = &mut self.current_attr_internal;
+        attr.name_span.end = offset;
+    }
+
+    /// Records where the value of the attribute being built starts and which
+    /// syntax was used to write it.
+    fn init_attribute_value(&mut self, syntax: AttrValueSyntax, offset: O) {
+        let attr = &mut self.current_attr_internal;
+        attr.value_span.start = offset;
+        attr.value_syntax = Some(syntax);
+    }
+
+    /// Appends `s` to the value of the attribute being built.
+    fn push_attribute_value(&mut self, s: &str) {
+        let attr = &mut self.current_attr_internal;
+        attr.value.push_str(s);
+    }
+
+    /// Sets the end of the value span of the attribute being built.
+    fn terminate_attribute_value(&mut self, offset: O) {
+        let attr = &mut self.current_attr_internal;
+        attr.value_span.end = offset;
+    }
+
+    /// Marks the current start tag as self-closing.
+    ///
+    /// If the current token is an end tag, `EndTagWithTrailingSolidus` is
+    /// reported for `slash_span` instead.
+    ///
+    /// # Panics
+    ///
+    /// Panics if there is no current token.
+    fn set_self_closing(&mut self, slash_span: Range<O>) {
+        match self.current_token.as_mut().unwrap() {
+            Token::StartTag(start_tag) => {
+                start_tag.self_closing = true;
+            }
+            Token::EndTag(_) => {
+                self.report_error(Error::EndTagWithTrailingSolidus, slash_span);
+            }
+            other => debug_assert!(false, "unexpected current_token: {other:?}"),
+        }
+    }
+
+    /// Finishes the current start or end tag, whose span ends at `offset`,
+    /// and queues it.
+    ///
+    /// For an end tag that had attributes, `EndTagWithAttributes` is reported
+    /// with the name span of the last attribute that was recorded.
+    ///
+    /// # Panics
+    ///
+    /// Panics if there is no current token.
+    fn emit_current_tag(&mut self, offset: O) {
+        // The last attribute of the tag may still be pending.
+        self.flush_current_attribute();
+        let mut token = self.current_token.take().unwrap();
+
+        let span_end = match &mut token {
+            Token::StartTag(start_tag) => &mut start_tag.span.end,
+            Token::EndTag(end_tag) => {
+                let had_attributes = !self.seen_attributes.is_empty();
+                if had_attributes {
+                    let span = self.attr_in_end_tag_span.take().unwrap();
+                    self.report_error(Error::EndTagWithAttributes, span);
+                }
+                self.seen_attributes.clear();
+                &mut end_tag.span.end
+            }
+            other => {
+                debug_assert!(false, "unexpected current_token: {other:?}");
+                return;
+            }
+        };
+        *span_end = offset;
+
+        self.emit_token(token);
+    }
+
+    /// Makes an empty comment the current token; its data starts at
+    /// `data_start_offset`.
+    fn init_comment(&mut self, data_start_offset: O) {
+        let comment = Comment {
+            data: String::new(),
+            data_span: data_start_offset..O::default(),
+        };
+        self.current_token = Some(Token::Comment(comment));
+    }
+
+    /// Appends `s` to the data of the current comment.
+    fn push_comment(&mut self, s: &str) {
+        assume!(Some(Token::Comment(current)), &mut self.current_token);
+        current.data.push_str(s);
+    }
+
+    /// Finishes the current comment, whose data ends at `data_end_offset`,
+    /// and queues it.
+    ///
+    /// # Panics
+    ///
+    /// Panics if there is no current token.
+    fn emit_current_comment(&mut self, data_end_offset: O) {
+        let mut finished = self.current_token.take().unwrap();
+        assume!(Token::Comment(current), &mut finished);
+        current.data_span.end = data_end_offset;
+        self.emit_token(finished);
+    }
+
+    /// Makes a doctype without name and identifiers the current token.
+    ///
+    /// Every span except the start of `span` is a placeholder at this point.
+    fn init_doctype(&mut self, offset: O) {
+        let doctype = Doctype {
+            name: None,
+            force_quirks: false,
+            public_id: None,
+            system_id: None,
+            span: offset..O::default(),
+            name_span: O::default()..O::default(),
+            public_id_span: O::default()..O::default(),
+            system_id_span: O::default()..O::default(),
+        };
+        self.current_token = Some(Token::Doctype(doctype));
+    }
+
+    /// Gives the current doctype an empty name that starts at `offset`.
+    fn init_doctype_name(&mut self, offset: O) {
+        assume!(Some(Token::Doctype(current)), &mut self.current_token);
+        current.name = Some("".into());
+        current.name_span.start = offset;
+    }
+
+    /// Appends `s` to the name of the current doctype.
+    fn push_doctype_name(&mut self, s: &str) {
+        assume!(
+            Some(Token::Doctype(Doctype {
+                name: Some(doctype_name),
+                ..
+            })),
+            &mut self.current_token
+        );
+        doctype_name.push_str(s);
+    }
+
+    /// Sets the end of the name span of the current doctype.
+    fn terminate_doctype_name(&mut self, offset: O) {
+        assume!(Some(Token::Doctype(current)), &mut self.current_token);
+        current.name_span.end = offset;
+    }
+
+    /// Gives the current doctype an empty public identifier that starts at `offset`.
+    fn init_doctype_public_id(&mut self, offset: O) {
+        assume!(Some(Token::Doctype(current)), &mut self.current_token);
+        current.public_id = Some(String::new());
+        current.public_id_span.start = offset;
+    }
+
+    /// Appends `s` to the public identifier of the current doctype.
+    fn push_doctype_public_id(&mut self, s: &str) {
+        assume!(
+            Some(Token::Doctype(Doctype {
+                public_id: Some(id),
+                ..
+            })),
+            &mut self.current_token
+        );
+        id.push_str(s);
+    }
+
+    /// Sets the end of the public identifier span of the current doctype.
+    fn terminate_doctype_public_id(&mut self, offset: O) {
+        assume!(Some(Token::Doctype(current)), &mut self.current_token);
+        current.public_id_span.end = offset;
+    }
+
+    /// Gives the current doctype an empty system identifier that starts at `offset`.
+    fn init_doctype_system_id(&mut self, offset: O) {
+        assume!(Some(Token::Doctype(current)), &mut self.current_token);
+        current.system_id = Some(String::new());
+        current.system_id_span.start = offset;
+    }
+
+    /// Appends `s` to the system identifier of the current doctype.
+    fn push_doctype_system_id(&mut self, s: &str) {
+        assume!(
+            Some(Token::Doctype(Doctype {
+                system_id: Some(system_id),
+                ..
+            })),
+            &mut self.current_token
+        );
+        system_id.push_str(s);
+    }
+
+    /// Sets the end of the system identifier span of the current doctype.
+    fn terminate_doctype_system_id(&mut self, offset: O) {
+        assume!(Some(Token::Doctype(current)), &mut self.current_token);
+        current.system_id_span.end = offset;
+    }
+
+    /// Sets the force-quirks flag of the current doctype.
+    fn set_force_quirks(&mut self) {
+        assume!(Some(Token::Doctype(current)), &mut self.current_token);
+        current.force_quirks = true;
+    }
+
+    /// Finishes the current doctype, whose span ends at `offset`, and queues it.
+    fn emit_current_doctype(&mut self, offset: O) {
+        assume!(Some(Token::Doctype(mut finished)), self.current_token.take());
+        finished.span.end = offset;
+        self.emit_token(Token::Doctype(finished));
+    }
+}
+
+impl<O> TracingEmitter<O> {
+    /// Adds a finished token to the queue served by the iterator.
+    fn emit_token(&mut self, token: Token<O>) {
+        self.emitted_tokens.push_front(token);
+    }
+
+    /// Stores the attribute that is currently being built, if there is one,
+    /// and resets the attribute state.
+    ///
+    /// * On a start tag the attribute is inserted into the tag's attribute
+    ///   map; if the name is already present, the first occurrence is kept
+    ///   and `DuplicateAttribute` is reported.
+    /// * On an end tag only the name and the name span are remembered;
+    ///   repeated names are reported as `DuplicateAttribute` as well.
+    fn flush_current_attribute(&mut self)
+    where
+        O: Offset,
+    {
+        // An empty name means that no attribute is pending.
+        if self.current_attribute_name.is_empty() {
+            return;
+        }
+        let attr_name = std::mem::take(&mut self.current_attribute_name);
+        let attr = std::mem::take(&mut self.current_attr_internal);
+
+        match &mut self.current_token {
+            Some(Token::StartTag(tag)) => match tag.attributes.inner.entry(attr_name) {
+                Entry::Occupied(_) => {
+                    self.report_error(Error::DuplicateAttribute, attr.name_span);
+                }
+                Entry::Vacant(slot) => {
+                    slot.insert(attr);
+                }
+            },
+            Some(Token::EndTag(_)) => {
+                let name_span = attr.name_span;
+                self.attr_in_end_tag_span = Some(name_span.clone());
+                let first_occurrence = self.seen_attributes.insert(attr_name);
+                if !first_occurrence {
+                    self.report_error(Error::DuplicateAttribute, name_span);
+                }
+            }
+            other => debug_assert!(false, "unexpected current_token: {other:?}"),
+        }
+    }
+}
+
+/// Most of the [`TracingEmitter`] test coverage comes from running the
+/// html5lib-tests in the html5lib integration test. This module only covers
+/// details that the html5lib test data does not exercise.
+#[cfg(test)]
+mod tests {
+    use super::TracingEmitter;
+    use crate::token::{AttrValueSyntax, Token};
+    use crate::{Event, Tokenizer};
+
+    /// Checks that each attribute reports the syntax its value was written in.
+    #[test]
+    fn test_attribute_value_syntax() {
+        let html = "<div empty unquoted=foo single-quoted='foo' double-quoted=\"foo\">";
+        let mut events = Tokenizer::new(html, TracingEmitter::default()).flatten();
+
+        let tag = match events.next().unwrap() {
+            Event::Token(Token::StartTag(tag)) => tag,
+            _ => panic!("expected start tag"),
+        };
+
+        let expected = [
+            ("empty", None),
+            ("unquoted", Some(AttrValueSyntax::Unquoted)),
+            ("single-quoted", Some(AttrValueSyntax::SingleQuoted)),
+            ("double-quoted", Some(AttrValueSyntax::DoubleQuoted)),
+        ];
+        for (name, syntax) in expected {
+            let attribute = tag.attributes.get(name).unwrap();
+            assert_eq!(
+                attribute.value_syntax(),
+                syntax,
+                "unexpected value for attribute {name}"
+            );
+        }
+    }
+}