From ad6ac5f0a825775c231e76cdc9016e61e54f4141 Mon Sep 17 00:00:00 2001 From: Martin Fischer <martin@push-f.com> Date: Tue, 12 Sep 2023 08:19:00 +0200 Subject: break!: rename DefaultEmitter to TracingEmitter --- src/default_emitter.rs | 344 ------------------------------------------------- src/lib.rs | 4 +- src/naive_parser.rs | 8 +- src/tokenizer.rs | 2 +- src/tracing_emitter.rs | 344 +++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 351 insertions(+), 351 deletions(-) delete mode 100644 src/default_emitter.rs create mode 100644 src/tracing_emitter.rs (limited to 'src') diff --git a/src/default_emitter.rs b/src/default_emitter.rs deleted file mode 100644 index 7b6c51e..0000000 --- a/src/default_emitter.rs +++ /dev/null @@ -1,344 +0,0 @@ -use std::collections::btree_map::Entry; -use std::collections::BTreeSet; -use std::collections::VecDeque; -use std::ops::Range; - -use crate::let_else::assume; -use crate::offset::NoopOffset; -use crate::offset::Offset; -use crate::token::{AttrValueSyntax, Comment, Doctype, EndTag, StartTag, Token}; -use crate::Emitter; -use crate::Error; - -/// The default implementation of [`Emitter`], used to produce tokens. -pub struct DefaultEmitter<O = NoopOffset> { - current_token: Option<Token<O>>, - current_attribute_name: String, - current_attr_internal: crate::token::AttrInternal<O>, - seen_attributes: BTreeSet<String>, - emitted_tokens: VecDeque<Token<O>>, - errors: VecDeque<(Error, Range<O>)>, - attr_in_end_tag_span: Option<Range<O>>, -} - -impl<O: Default> Default for DefaultEmitter<O> { - fn default() -> Self { - DefaultEmitter { - current_token: None, - current_attribute_name: String::new(), - current_attr_internal: Default::default(), - seen_attributes: BTreeSet::new(), - emitted_tokens: VecDeque::new(), - errors: VecDeque::new(), - attr_in_end_tag_span: None, - } - } -} - -impl<O> DefaultEmitter<O> { - /// Removes all encountered tokenizer errors and returns them as an iterator. - pub fn drain_errors(&mut self) -> impl Iterator<Item = (Error, Range<O>)> + '_ { - self.errors.drain(0..) - } -} - -impl<O> Iterator for DefaultEmitter<O> { - type Item = Token<O>; - - fn next(&mut self) -> Option<Self::Item> { - self.emitted_tokens.pop_back() - } -} - -impl<O: Offset> Emitter<O> for DefaultEmitter<O> { - fn report_error(&mut self, error: Error, span: Range<O>) { - self.errors.push_back((error, span)); - } - - fn emit_char(&mut self, c: char) { - self.emit_token(Token::Char(c)); - } - - fn emit_eof(&mut self) { - self.emit_token(Token::EndOfFile); - } - - fn init_start_tag(&mut self, tag_offset: O, name_offset: O) { - self.current_token = Some(Token::StartTag(StartTag { - span: tag_offset..O::default(), - self_closing: false, - name: String::new(), - attributes: Default::default(), - name_span: name_offset..O::default(), - })); - } - - fn init_end_tag(&mut self, tag_offset: O, name_offset: O) { - self.current_token = Some(Token::EndTag(EndTag { - span: tag_offset..O::default(), - name: String::new(), - name_span: name_offset..O::default(), - })); - self.seen_attributes.clear(); - } - - fn push_tag_name(&mut self, s: &str) { - assume!( - Some(Token::StartTag(StartTag { name, .. }) | Token::EndTag(EndTag { name, .. })), - &mut self.current_token - ); - name.push_str(s); - } - - fn terminate_tag_name(&mut self, offset: O) { - assume!( - Some( - Token::StartTag(StartTag { name_span, .. }) - | Token::EndTag(EndTag { name_span, .. }) - ), - &mut self.current_token - ); - name_span.end = offset; - } - - fn init_attribute_name(&mut self, offset: O) { - self.flush_current_attribute(); - self.current_attr_internal.name_span.start = offset; - } - - fn push_attribute_name(&mut self, s: &str) { - self.current_attribute_name.push_str(s); - } - - fn terminate_attribute_name(&mut self, offset: O) { - self.current_attr_internal.name_span.end = offset; - } - - fn init_attribute_value(&mut self, syntax: AttrValueSyntax, offset: O) { - self.current_attr_internal.value_span.start = offset; - self.current_attr_internal.value_syntax = Some(syntax); - } - - fn push_attribute_value(&mut self, s: &str) { - self.current_attr_internal.value.push_str(s); - } - - fn terminate_attribute_value(&mut self, offset: O) { - self.current_attr_internal.value_span.end = offset; - } - - fn set_self_closing(&mut self, slash_span: Range<O>) { - let token = self.current_token.as_mut().unwrap(); - - match token { - Token::StartTag(tag) => { - tag.self_closing = true; - } - Token::EndTag(_) => { - self.report_error(Error::EndTagWithTrailingSolidus, slash_span); - } - other => debug_assert!(false, "unexpected current_token: {other:?}"), - } - } - - fn emit_current_tag(&mut self, offset: O) { - self.flush_current_attribute(); - let mut token = self.current_token.take().unwrap(); - match &mut token { - Token::EndTag(tag) => { - if !self.seen_attributes.is_empty() { - let span = self.attr_in_end_tag_span.take().unwrap(); - self.report_error(Error::EndTagWithAttributes, span); - } - self.seen_attributes.clear(); - tag.span.end = offset; - } - Token::StartTag(tag) => { - tag.span.end = offset; - } - other => { - debug_assert!(false, "unexpected current_token: {other:?}"); - return; - } - } - self.emit_token(token); - } - - fn init_comment(&mut self, data_start_offset: O) { - self.current_token = Some(Token::Comment(Comment { - data: String::new(), - data_span: data_start_offset..O::default(), - })); - } - - fn push_comment(&mut self, s: &str) { - assume!(Some(Token::Comment(comment)), &mut self.current_token); - comment.data.push_str(s); - } - - fn emit_current_comment(&mut self, data_end_offset: O) { - let mut token = self.current_token.take().unwrap(); - assume!(Token::Comment(comment), &mut token); - comment.data_span.end = data_end_offset; - self.emit_token(token); - } - - fn init_doctype(&mut self, offset: O) { - self.current_token = Some(Token::Doctype(Doctype { - name: None, - force_quirks: false, - public_id: None, - system_id: None, - span: offset..O::default(), - name_span: O::default()..O::default(), - public_id_span: O::default()..O::default(), - system_id_span: O::default()..O::default(), - })); - } - - fn init_doctype_name(&mut self, offset: O) { - assume!(Some(Token::Doctype(doctype)), &mut self.current_token); - doctype.name = Some("".into()); - doctype.name_span.start = offset; - } - - fn push_doctype_name(&mut self, s: &str) { - assume!( - Some(Token::Doctype(Doctype { - name: Some(name), - .. - })), - &mut self.current_token - ); - name.push_str(s); - } - - fn terminate_doctype_name(&mut self, offset: O) { - assume!(Some(Token::Doctype(doctype)), &mut self.current_token); - doctype.name_span.end = offset; - } - - fn init_doctype_public_id(&mut self, offset: O) { - assume!(Some(Token::Doctype(doctype)), &mut self.current_token); - doctype.public_id = Some("".to_owned()); - doctype.public_id_span.start = offset; - } - - fn push_doctype_public_id(&mut self, s: &str) { - assume!( - Some(Token::Doctype(Doctype { - public_id: Some(public_id), - .. - })), - &mut self.current_token - ); - public_id.push_str(s); - } - - fn terminate_doctype_public_id(&mut self, offset: O) { - assume!(Some(Token::Doctype(doctype)), &mut self.current_token); - doctype.public_id_span.end = offset; - } - - fn init_doctype_system_id(&mut self, offset: O) { - assume!(Some(Token::Doctype(doctype)), &mut self.current_token); - doctype.system_id = Some("".to_owned()); - doctype.system_id_span.start = offset; - } - - fn push_doctype_system_id(&mut self, s: &str) { - assume!( - Some(Token::Doctype(Doctype { - system_id: Some(id), - .. - })), - &mut self.current_token - ); - id.push_str(s); - } - - fn terminate_doctype_system_id(&mut self, offset: O) { - assume!(Some(Token::Doctype(doctype)), &mut self.current_token); - doctype.system_id_span.end = offset; - } - - fn set_force_quirks(&mut self) { - assume!(Some(Token::Doctype(doctype)), &mut self.current_token); - doctype.force_quirks = true; - } - - fn emit_current_doctype(&mut self, offset: O) { - assume!(Some(Token::Doctype(mut doctype)), self.current_token.take()); - doctype.span.end = offset; - self.emit_token(Token::Doctype(doctype)); - } -} - -impl<O> DefaultEmitter<O> { - fn emit_token(&mut self, token: Token<O>) { - self.emitted_tokens.push_front(token); - } - - fn flush_current_attribute(&mut self) - where - O: Offset, - { - if self.current_attribute_name.is_empty() { - return; - } - let name = std::mem::take(&mut self.current_attribute_name); - let attr_internal = std::mem::take(&mut self.current_attr_internal); - - match &mut self.current_token { - Some(Token::StartTag(tag)) => match tag.attributes.inner.entry(name) { - Entry::Vacant(vacant) => { - vacant.insert(attr_internal); - } - Entry::Occupied(_) => { - self.report_error(Error::DuplicateAttribute, attr_internal.name_span); - } - }, - Some(Token::EndTag(_)) => { - self.attr_in_end_tag_span = Some(attr_internal.name_span.clone()); - if !self.seen_attributes.insert(name) { - self.report_error(Error::DuplicateAttribute, attr_internal.name_span); - } - } - other => debug_assert!(false, "unexpected current_token: {other:?}"), - } - } -} - -/// The majority of our testing of the [`DefaultEmitter`] is done against the -/// html5lib-tests in the html5lib integration test. This module only tests -/// details that aren't present in the html5lib test data. -#[cfg(test)] -mod tests { - use super::DefaultEmitter; - use crate::token::{AttrValueSyntax, Token}; - use crate::{Event, Tokenizer}; - - #[test] - fn test_attribute_value_syntax() { - let mut tokenizer = Tokenizer::new( - "<div empty unquoted=foo single-quoted='foo' double-quoted=\"foo\">", - DefaultEmitter::default(), - ) - .flatten(); - let Event::Token(Token::StartTag(tag)) = tokenizer.next().unwrap() else { - panic!("expected start tag"); - }; - for (name, syntax) in [ - ("empty", None), - ("unquoted", Some(AttrValueSyntax::Unquoted)), - ("single-quoted", Some(AttrValueSyntax::SingleQuoted)), - ("double-quoted", Some(AttrValueSyntax::DoubleQuoted)), - ] { - assert_eq!( - tag.attributes.get(name).unwrap().value_syntax(), - syntax, - "unexpected value for attribute {name}" - ); - } - } -} diff --git a/src/lib.rs b/src/lib.rs index 40b691a..aecbef3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,13 +7,13 @@ #![doc = concat!("[the LICENSE file]: ", file_url!("LICENSE"))] #![doc = include_str!("../README.md")] -mod default_emitter; mod emitter; mod entities; mod error; mod let_else; mod naive_parser; mod tokenizer; +mod tracing_emitter; /// Types for HTML attributes. pub mod attr { @@ -25,12 +25,12 @@ pub mod offset; pub mod reader; pub mod token; -pub use default_emitter::DefaultEmitter; pub use emitter::Emitter; pub use error::Error; pub use naive_parser::NaiveParser; pub use token::{Comment, Doctype, EndTag, StartTag, Token}; pub use tokenizer::{CdataAction, Event, State, Tokenizer}; +pub use tracing_emitter::TracingEmitter; #[cfg(feature = "integration-tests")] pub use tokenizer::InternalState; diff --git a/src/naive_parser.rs b/src/naive_parser.rs index 4988477..91edbc0 100644 --- a/src/naive_parser.rs +++ b/src/naive_parser.rs @@ -1,7 +1,7 @@ -use crate::default_emitter::DefaultEmitter; use crate::offset::{Offset, Position}; use crate::reader::{IntoReader, Reader}; use crate::tokenizer::CdataAction; +use crate::tracing_emitter::TracingEmitter; use crate::{Emitter, Event, State, Tokenizer}; /// A naive HTML parser (**not** spec-compliant since it doesn't do tree construction). @@ -30,18 +30,18 @@ pub struct NaiveParser<R: Reader, O: Offset, E: Emitter<O>> { tokenizer: Tokenizer<R, O, E>, } -impl<R, O> NaiveParser<R, O, DefaultEmitter<O>> +impl<R, O> NaiveParser<R, O, TracingEmitter<O>> where R: Reader + Position<O>, O: Offset, { /// Constructs a new naive parser. // TODO: add example for NaiveParser::new - pub fn new<'a, IR>(reader: IR) -> NaiveParser<R, O, DefaultEmitter<O>> + pub fn new<'a, IR>(reader: IR) -> NaiveParser<R, O, TracingEmitter<O>> where IR: IntoReader<'a, Reader = R>, { - NaiveParser::new_with_emitter(reader, DefaultEmitter::default()) + NaiveParser::new_with_emitter(reader, TracingEmitter::default()) } } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 7c38e49..d0e2eaf 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -15,7 +15,7 @@ pub use machine::State as InternalState; /// Iterating over the tokenizer directly without calling [`Tokenizer::set_state`] /// results in wrong state transitions: /// -/// ``` +/// ```ignore TODO: unignore once the BasicEmitter has been implemented /// # use html5tokenizer::{DefaultEmitter, Event, Tokenizer, Token}; /// let emitter = DefaultEmitter::default(); /// let html = "<script><b>"; diff --git a/src/tracing_emitter.rs b/src/tracing_emitter.rs new file mode 100644 index 0000000..408d9b0 --- /dev/null +++ b/src/tracing_emitter.rs @@ -0,0 +1,344 @@ +use std::collections::btree_map::Entry; +use std::collections::BTreeSet; +use std::collections::VecDeque; +use std::ops::Range; + +use crate::let_else::assume; +use crate::offset::NoopOffset; +use crate::offset::Offset; +use crate::token::{AttrValueSyntax, Comment, Doctype, EndTag, StartTag, Token}; +use crate::Emitter; +use crate::Error; + +/// The default implementation of [`Emitter`], used to produce tokens. +pub struct TracingEmitter<O = NoopOffset> { + current_token: Option<Token<O>>, + current_attribute_name: String, + current_attr_internal: crate::token::AttrInternal<O>, + seen_attributes: BTreeSet<String>, + emitted_tokens: VecDeque<Token<O>>, + errors: VecDeque<(Error, Range<O>)>, + attr_in_end_tag_span: Option<Range<O>>, +} + +impl<O: Default> Default for TracingEmitter<O> { + fn default() -> Self { + TracingEmitter { + current_token: None, + current_attribute_name: String::new(), + current_attr_internal: Default::default(), + seen_attributes: BTreeSet::new(), + emitted_tokens: VecDeque::new(), + errors: VecDeque::new(), + attr_in_end_tag_span: None, + } + } +} + +impl<O> TracingEmitter<O> { + /// Removes all encountered tokenizer errors and returns them as an iterator. + pub fn drain_errors(&mut self) -> impl Iterator<Item = (Error, Range<O>)> + '_ { + self.errors.drain(0..) + } +} + +impl<O> Iterator for TracingEmitter<O> { + type Item = Token<O>; + + fn next(&mut self) -> Option<Self::Item> { + self.emitted_tokens.pop_back() + } +} + +impl<O: Offset> Emitter<O> for TracingEmitter<O> { + fn report_error(&mut self, error: Error, span: Range<O>) { + self.errors.push_back((error, span)); + } + + fn emit_char(&mut self, c: char) { + self.emit_token(Token::Char(c)); + } + + fn emit_eof(&mut self) { + self.emit_token(Token::EndOfFile); + } + + fn init_start_tag(&mut self, tag_offset: O, name_offset: O) { + self.current_token = Some(Token::StartTag(StartTag { + span: tag_offset..O::default(), + self_closing: false, + name: String::new(), + attributes: Default::default(), + name_span: name_offset..O::default(), + })); + } + + fn init_end_tag(&mut self, tag_offset: O, name_offset: O) { + self.current_token = Some(Token::EndTag(EndTag { + span: tag_offset..O::default(), + name: String::new(), + name_span: name_offset..O::default(), + })); + self.seen_attributes.clear(); + } + + fn push_tag_name(&mut self, s: &str) { + assume!( + Some(Token::StartTag(StartTag { name, .. }) | Token::EndTag(EndTag { name, .. })), + &mut self.current_token + ); + name.push_str(s); + } + + fn terminate_tag_name(&mut self, offset: O) { + assume!( + Some( + Token::StartTag(StartTag { name_span, .. }) + | Token::EndTag(EndTag { name_span, .. }) + ), + &mut self.current_token + ); + name_span.end = offset; + } + + fn init_attribute_name(&mut self, offset: O) { + self.flush_current_attribute(); + self.current_attr_internal.name_span.start = offset; + } + + fn push_attribute_name(&mut self, s: &str) { + self.current_attribute_name.push_str(s); + } + + fn terminate_attribute_name(&mut self, offset: O) { + self.current_attr_internal.name_span.end = offset; + } + + fn init_attribute_value(&mut self, syntax: AttrValueSyntax, offset: O) { + self.current_attr_internal.value_span.start = offset; + self.current_attr_internal.value_syntax = Some(syntax); + } + + fn push_attribute_value(&mut self, s: &str) { + self.current_attr_internal.value.push_str(s); + } + + fn terminate_attribute_value(&mut self, offset: O) { + self.current_attr_internal.value_span.end = offset; + } + + fn set_self_closing(&mut self, slash_span: Range<O>) { + let token = self.current_token.as_mut().unwrap(); + + match token { + Token::StartTag(tag) => { + tag.self_closing = true; + } + Token::EndTag(_) => { + self.report_error(Error::EndTagWithTrailingSolidus, slash_span); + } + other => debug_assert!(false, "unexpected current_token: {other:?}"), + } + } + + fn emit_current_tag(&mut self, offset: O) { + self.flush_current_attribute(); + let mut token = self.current_token.take().unwrap(); + match &mut token { + Token::EndTag(tag) => { + if !self.seen_attributes.is_empty() { + let span = self.attr_in_end_tag_span.take().unwrap(); + self.report_error(Error::EndTagWithAttributes, span); + } + self.seen_attributes.clear(); + tag.span.end = offset; + } + Token::StartTag(tag) => { + tag.span.end = offset; + } + other => { + debug_assert!(false, "unexpected current_token: {other:?}"); + return; + } + } + self.emit_token(token); + } + + fn init_comment(&mut self, data_start_offset: O) { + self.current_token = Some(Token::Comment(Comment { + data: String::new(), + data_span: data_start_offset..O::default(), + })); + } + + fn push_comment(&mut self, s: &str) { + assume!(Some(Token::Comment(comment)), &mut self.current_token); + comment.data.push_str(s); + } + + fn emit_current_comment(&mut self, data_end_offset: O) { + let mut token = self.current_token.take().unwrap(); + assume!(Token::Comment(comment), &mut token); + comment.data_span.end = data_end_offset; + self.emit_token(token); + } + + fn init_doctype(&mut self, offset: O) { + self.current_token = Some(Token::Doctype(Doctype { + name: None, + force_quirks: false, + public_id: None, + system_id: None, + span: offset..O::default(), + name_span: O::default()..O::default(), + public_id_span: O::default()..O::default(), + system_id_span: O::default()..O::default(), + })); + } + + fn init_doctype_name(&mut self, offset: O) { + assume!(Some(Token::Doctype(doctype)), &mut self.current_token); + doctype.name = Some("".into()); + doctype.name_span.start = offset; + } + + fn push_doctype_name(&mut self, s: &str) { + assume!( + Some(Token::Doctype(Doctype { + name: Some(name), + .. + })), + &mut self.current_token + ); + name.push_str(s); + } + + fn terminate_doctype_name(&mut self, offset: O) { + assume!(Some(Token::Doctype(doctype)), &mut self.current_token); + doctype.name_span.end = offset; + } + + fn init_doctype_public_id(&mut self, offset: O) { + assume!(Some(Token::Doctype(doctype)), &mut self.current_token); + doctype.public_id = Some("".to_owned()); + doctype.public_id_span.start = offset; + } + + fn push_doctype_public_id(&mut self, s: &str) { + assume!( + Some(Token::Doctype(Doctype { + public_id: Some(public_id), + .. + })), + &mut self.current_token + ); + public_id.push_str(s); + } + + fn terminate_doctype_public_id(&mut self, offset: O) { + assume!(Some(Token::Doctype(doctype)), &mut self.current_token); + doctype.public_id_span.end = offset; + } + + fn init_doctype_system_id(&mut self, offset: O) { + assume!(Some(Token::Doctype(doctype)), &mut self.current_token); + doctype.system_id = Some("".to_owned()); + doctype.system_id_span.start = offset; + } + + fn push_doctype_system_id(&mut self, s: &str) { + assume!( + Some(Token::Doctype(Doctype { + system_id: Some(id), + .. + })), + &mut self.current_token + ); + id.push_str(s); + } + + fn terminate_doctype_system_id(&mut self, offset: O) { + assume!(Some(Token::Doctype(doctype)), &mut self.current_token); + doctype.system_id_span.end = offset; + } + + fn set_force_quirks(&mut self) { + assume!(Some(Token::Doctype(doctype)), &mut self.current_token); + doctype.force_quirks = true; + } + + fn emit_current_doctype(&mut self, offset: O) { + assume!(Some(Token::Doctype(mut doctype)), self.current_token.take()); + doctype.span.end = offset; + self.emit_token(Token::Doctype(doctype)); + } +} + +impl<O> TracingEmitter<O> { + fn emit_token(&mut self, token: Token<O>) { + self.emitted_tokens.push_front(token); + } + + fn flush_current_attribute(&mut self) + where + O: Offset, + { + if self.current_attribute_name.is_empty() { + return; + } + let name = std::mem::take(&mut self.current_attribute_name); + let attr_internal = std::mem::take(&mut self.current_attr_internal); + + match &mut self.current_token { + Some(Token::StartTag(tag)) => match tag.attributes.inner.entry(name) { + Entry::Vacant(vacant) => { + vacant.insert(attr_internal); + } + Entry::Occupied(_) => { + self.report_error(Error::DuplicateAttribute, attr_internal.name_span); + } + }, + Some(Token::EndTag(_)) => { + self.attr_in_end_tag_span = Some(attr_internal.name_span.clone()); + if !self.seen_attributes.insert(name) { + self.report_error(Error::DuplicateAttribute, attr_internal.name_span); + } + } + other => debug_assert!(false, "unexpected current_token: {other:?}"), + } + } +} + +/// The majority of our testing of the [`TracingEmitter`] is done against the +/// html5lib-tests in the html5lib integration test. This module only tests +/// details that aren't present in the html5lib test data. +#[cfg(test)] +mod tests { + use super::TracingEmitter; + use crate::token::{AttrValueSyntax, Token}; + use crate::{Event, Tokenizer}; + + #[test] + fn test_attribute_value_syntax() { + let mut tokenizer = Tokenizer::new( + "<div empty unquoted=foo single-quoted='foo' double-quoted=\"foo\">", + TracingEmitter::default(), + ) + .flatten(); + let Event::Token(Token::StartTag(tag)) = tokenizer.next().unwrap() else { + panic!("expected start tag"); + }; + for (name, syntax) in [ + ("empty", None), + ("unquoted", Some(AttrValueSyntax::Unquoted)), + ("single-quoted", Some(AttrValueSyntax::SingleQuoted)), + ("double-quoted", Some(AttrValueSyntax::DoubleQuoted)), + ] { + assert_eq!( + tag.attributes.get(name).unwrap().value_syntax(), + syntax, + "unexpected value for attribute {name}" + ); + } + } +} -- cgit v1.2.3