summary refs log tree commit diff
diff options
context:
space:
mode:
author Martin Fischer <martin@push-f.com> 2023-09-12 08:23:52 +0200
committer Martin Fischer <martin@push-f.com> 2023-09-28 10:36:08 +0200
commit d913e6e91e43241b0105afbbad7db5c5bcda0255 (patch)
tree 35258fc2df6e788315c4572f99e45c9830487738
parent 852d5c6f2e65a5ab466662ae1c649a0ed25c70a9 (diff)
feat: implement BasicEmitter
-rw-r--r-- CHANGELOG.md 8
-rw-r--r-- README.md 2
-rw-r--r-- integration_tests/tests/test_html5lib.rs 30
-rw-r--r-- src/basic_emitter.rs 178
-rw-r--r-- src/naive_parser.rs 2
-rw-r--r-- src/tokenizer.rs 7
6 files changed, 189 insertions, 38 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f0c1ed6..52de087 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,12 +21,10 @@
* Added the `EndOfFile` variant.
-* The `DefaultEmitter` has been renamed to `TracingEmitter`.
+* The `DefaultEmitter` has been removed, there now is:
-* The `DefaultEmitter` now yields `(Token, Trace)` instead of just `Token`.
-
-* The `DefaultEmitter` now emits `Token::EndOfFile` on the end-of-file.
- (Previously it did not emit any token symbolizing the end-of-file.)
+ * the `BasicEmitter` which yields just `Token`
+ * the `TracingEmitter` which yields `(Token, Trace)`
* `Emitter` trait
diff --git a/README.md b/README.md
index a02a06c..6513b61 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ Spec-compliant HTML parsing [requires both tokenization and tree-construction][p
While this crate implements a spec-compliant HTML tokenizer it does not implement any
tree-construction. Instead it just provides a `NaiveParser` that may be used as follows:
-```rust no_run TODO: run again once BasicEmitter has been implemented
+```rust
use std::fmt::Write;
use html5tokenizer::{NaiveParser, Token};
diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs
index 42d93f1..3e07531 100644
--- a/integration_tests/tests/test_html5lib.rs
+++ b/integration_tests/tests/test_html5lib.rs
@@ -6,7 +6,8 @@ use html5lib_tests::{
use html5tokenizer::{
offset::{Offset, PosTrackingReader, Position},
reader::Reader,
- CdataAction, Emitter, Error, Event, InternalState, Token, Tokenizer, TracingEmitter,
+ BasicEmitter, CdataAction, Emitter, Error, Event, InternalState, Token, Tokenizer,
+ TracingEmitter,
};
use similar_asserts::assert_eq;
@@ -68,7 +69,26 @@ fn test_tokenizer_file(path: &Path) {
fn run_test(fname: &str, test_i: usize, test: Test) {
for state in &test.initial_states {
- // TODO: test BasicEmitter here once it's implemented
+ run_test_inner(
+ fname,
+ test_i,
+ &test,
+ state,
+ Tokenizer::new(&test.input, BasicEmitter::default()),
+ "BasicEmitter string",
+ );
+
+ run_test_inner(
+ fname,
+ test_i,
+ &test,
+ state,
+ Tokenizer::new(
+ BufReader::new(test.input.as_bytes()),
+ BasicEmitter::default(),
+ ),
+ "BasicEmitter bufread",
+ );
run_test_inner(
fname,
@@ -186,6 +206,12 @@ trait DrainErrors<O> {
fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<O>)> + '_>;
}
+impl<O> DrainErrors<O> for BasicEmitter<O> {
+ fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<O>)> + '_> {
+ Box::new(self.drain_errors())
+ }
+}
+
impl DrainErrors<usize> for TracingEmitter {
fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<usize>)> + '_> {
Box::new(self.drain_errors())
diff --git a/src/basic_emitter.rs b/src/basic_emitter.rs
index bcb3f41..e67447b 100644
--- a/src/basic_emitter.rs
+++ b/src/basic_emitter.rs
@@ -1,20 +1,36 @@
+use std::collections::btree_map::Entry;
+use std::collections::BTreeSet;
use std::collections::VecDeque;
use std::ops::Range;
+use crate::let_else::assume;
+use crate::offset::NoopOffset;
use crate::offset::Offset;
+use crate::token::{Doctype, EndTag, StartTag, Token};
use crate::Emitter;
use crate::Error;
-use crate::Token;
/// An [`Emitter`] implementation that yields [`Token`].
-pub struct BasicEmitter<O> {
+pub struct BasicEmitter<O = NoopOffset> {
+ current_token: Option<Token>,
+ current_attribute_name: String,
+ current_attr_internal: crate::token::AttrInternal,
+ seen_attributes: BTreeSet<String>,
+ emitted_tokens: VecDeque<Token>,
errors: VecDeque<(Error, Range<O>)>,
+ attr_name_span: Range<O>,
}
impl<O: Default> Default for BasicEmitter<O> {
fn default() -> Self {
BasicEmitter {
+ current_token: None,
+ current_attribute_name: String::new(),
+ current_attr_internal: Default::default(),
+ seen_attributes: BTreeSet::new(),
+ emitted_tokens: VecDeque::new(),
errors: VecDeque::new(),
+ attr_name_span: Default::default(),
}
}
}
@@ -30,97 +46,209 @@ impl<O> Iterator for BasicEmitter<O> {
type Item = Token;
fn next(&mut self) -> Option<Self::Item> {
- todo!()
+ self.emitted_tokens.pop_back()
}
}
#[allow(unused_variables)]
impl<O: Offset> Emitter<O> for BasicEmitter<O> {
fn report_error(&mut self, error: Error, span: Range<O>) {
- todo!()
+ self.errors.push_back((error, span));
}
fn emit_char(&mut self, c: char) {
- todo!()
+ self.emit_token(Token::Char(c));
}
fn emit_eof(&mut self) {
- todo!()
+ self.emit_token(Token::EndOfFile);
}
fn init_start_tag(&mut self, tag_offset: O, name_offset: O) {
- todo!()
+ self.current_token = Some(Token::StartTag(StartTag {
+ self_closing: false,
+ name: String::new(),
+ attributes: Default::default(),
+ }));
}
fn init_end_tag(&mut self, tag_offset: O, name_offset: O) {
- todo!()
+ self.current_token = Some(Token::EndTag(EndTag {
+ name: String::new(),
+ }));
+ self.seen_attributes.clear();
}
fn push_tag_name(&mut self, s: &str) {
- todo!()
+ assume!(
+ Some(Token::StartTag(StartTag { name, .. }) | Token::EndTag(EndTag { name, .. })),
+ &mut self.current_token
+ );
+ name.push_str(s);
}
fn init_attribute_name(&mut self, offset: O) {
- todo!()
+ self.flush_current_attribute();
+ self.attr_name_span.start = offset;
}
fn push_attribute_name(&mut self, s: &str) {
- todo!()
+ self.current_attribute_name.push_str(s);
+ }
+
+ fn terminate_attribute_name(&mut self, offset: O) {
+ self.attr_name_span.end = offset;
}
fn push_attribute_value(&mut self, s: &str) {
- todo!()
+ self.current_attr_internal.value.push_str(s);
}
fn set_self_closing(&mut self, slash_span: Range<O>) {
- todo!()
+ let token = self.current_token.as_mut().unwrap();
+
+ match token {
+ Token::StartTag(tag) => {
+ tag.self_closing = true;
+ }
+ Token::EndTag(_) => {
+ self.report_error(Error::EndTagWithTrailingSolidus, slash_span);
+ }
+ other => debug_assert!(false, "unexpected current_token: {other:?}"),
+ }
}
fn emit_current_tag(&mut self, offset: O) {
- todo!()
+ self.flush_current_attribute();
+ let mut token = self.current_token.take().unwrap();
+ match &mut token {
+ Token::EndTag(_) => {
+ if !self.seen_attributes.is_empty() {
+ self.report_error(Error::EndTagWithAttributes, self.attr_name_span.clone());
+ }
+ self.seen_attributes.clear();
+ }
+ Token::StartTag(_) => {}
+ other => {
+ debug_assert!(false, "unexpected current_token: {other:?}");
+ return;
+ }
+ }
+ self.emit_token(token);
}
fn init_comment(&mut self, data_start_offset: O) {
- todo!()
+ self.current_token = Some(Token::Comment(String::new()));
}
fn push_comment(&mut self, s: &str) {
- todo!()
+ assume!(Some(Token::Comment(data)), &mut self.current_token);
+ data.push_str(s);
}
fn emit_current_comment(&mut self, data_end_offset: O) {
- todo!()
+ let token = self.current_token.take().unwrap();
+ self.emit_token(token);
}
fn init_doctype(&mut self, offset: O) {
- todo!()
+ self.current_token = Some(Token::Doctype(Doctype {
+ name: None,
+ force_quirks: false,
+ public_id: None,
+ system_id: None,
+ }));
+ }
+
+ fn init_doctype_name(&mut self, offset: O) {
+ assume!(Some(Token::Doctype(doctype)), &mut self.current_token);
+ doctype.name = Some("".into());
}
fn push_doctype_name(&mut self, s: &str) {
- todo!()
+ assume!(
+ Some(Token::Doctype(Doctype {
+ name: Some(name),
+ ..
+ })),
+ &mut self.current_token
+ );
+ name.push_str(s);
}
fn init_doctype_public_id(&mut self, offset: O) {
- todo!()
+ assume!(Some(Token::Doctype(doctype)), &mut self.current_token);
+ doctype.public_id = Some("".to_owned());
}
fn push_doctype_public_id(&mut self, s: &str) {
- todo!()
+ assume!(
+ Some(Token::Doctype(Doctype {
+ public_id: Some(public_id),
+ ..
+ })),
+ &mut self.current_token
+ );
+ public_id.push_str(s);
}
fn init_doctype_system_id(&mut self, offset: O) {
- todo!()
+ assume!(Some(Token::Doctype(doctype)), &mut self.current_token);
+ doctype.system_id = Some("".to_owned());
}
fn push_doctype_system_id(&mut self, s: &str) {
- todo!()
+ assume!(
+ Some(Token::Doctype(Doctype {
+ system_id: Some(id),
+ ..
+ })),
+ &mut self.current_token
+ );
+ id.push_str(s);
}
fn set_force_quirks(&mut self) {
- todo!()
+ assume!(Some(Token::Doctype(doctype)), &mut self.current_token);
+ doctype.force_quirks = true;
}
fn emit_current_doctype(&mut self, offset: O) {
- todo!()
+ let token = self.current_token.take().unwrap();
+ self.emit_token(token);
+ }
+}
+
+impl<O> BasicEmitter<O> {
+ fn emit_token(&mut self, token: Token) {
+ self.emitted_tokens.push_front(token);
+ }
+
+ fn flush_current_attribute(&mut self)
+ where
+ O: Offset,
+ {
+ if self.current_attribute_name.is_empty() {
+ return;
+ }
+ let name = std::mem::take(&mut self.current_attribute_name);
+ let attr_internal = std::mem::take(&mut self.current_attr_internal);
+
+ match &mut self.current_token {
+ Some(Token::StartTag(tag)) => match tag.attributes.inner.entry(name) {
+ Entry::Vacant(vacant) => {
+ vacant.insert(attr_internal);
+ }
+ Entry::Occupied(_) => {
+ self.report_error(Error::DuplicateAttribute, self.attr_name_span.clone());
+ }
+ },
+ Some(Token::EndTag(_)) => {
+ if !self.seen_attributes.insert(name) {
+ self.report_error(Error::DuplicateAttribute, self.attr_name_span.clone());
+ }
+ }
+ other => debug_assert!(false, "unexpected current_token: {other:?}"),
+ }
}
}
diff --git a/src/naive_parser.rs b/src/naive_parser.rs
index 4f8dc0d..70b6522 100644
--- a/src/naive_parser.rs
+++ b/src/naive_parser.rs
@@ -12,7 +12,7 @@ use crate::{BasicEmitter, Emitter, Event, State, Tokenizer};
///
/// * it naively emits any CDATA sections as bogus comments, for example:
///
-/// ```no_run TODO: run again once BasicEmitter has been implemented
+/// ```
/// # use html5tokenizer::{NaiveParser, Token};
/// let html = "<svg><![CDATA[I love SVG]]>";
/// let mut tokens = NaiveParser::new(html).flatten();
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index decd4df..3359637 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -15,12 +15,11 @@ pub use machine::State as InternalState;
/// Iterating over the tokenizer directly without calling [`Tokenizer::set_state`]
/// results in wrong state transitions:
///
-/// ```ignore TODO: unignore once the BasicEmitter has been implemented
-/// # use html5tokenizer::{DefaultEmitter, Event, Tokenizer, Token};
-/// let emitter = DefaultEmitter::default();
+/// ```
+/// # use html5tokenizer::{BasicEmitter, Event, Tokenizer, Token};
+/// let emitter = BasicEmitter::default();
/// let html = "<script><b>";
/// let mut tokens = Tokenizer::new(html, emitter).flatten();
-/// let mut tokens = tokens.map(|event| match event { Event::Token((token, _)) => Event::Token(token), Event::CdataOpen => Event::CdataOpen }); // TODO: remove once BasicEmitter can be used instead
/// assert!(matches!(tokens.next(), Some(Event::Token(Token::StartTag(_)))));
/// assert!(matches!(tokens.next(), Some(Event::Token(Token::StartTag(_)))));
/// ```