diff options
author | Martin Fischer <martin@push-f.com> | 2023-09-12 08:23:52 +0200 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2023-09-28 10:36:08 +0200 |
commit | d913e6e91e43241b0105afbbad7db5c5bcda0255 (patch) | |
tree | 35258fc2df6e788315c4572f99e45c9830487738 | |
parent | 852d5c6f2e65a5ab466662ae1c649a0ed25c70a9 (diff) |
feat: implement BasicEmitter
-rw-r--r-- | CHANGELOG.md | 8 | ||||
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | integration_tests/tests/test_html5lib.rs | 30 | ||||
-rw-r--r-- | src/basic_emitter.rs | 178 | ||||
-rw-r--r-- | src/naive_parser.rs | 2 | ||||
-rw-r--r-- | src/tokenizer.rs | 7 |
6 files changed, 189 insertions, 38 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index f0c1ed6..52de087 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,12 +21,10 @@ * Added the `EndOfFile` variant. -* The `DefaultEmitter` has been renamed to `TracingEmitter`. +* The `DefaultEmitter` has been removed, there now is: -* The `DefaultEmitter` now yields `(Token, Trace)` instead of just `Token`. - -* The `DefaultEmitter` now emits `Token::EndOfFile` on the end-of-file. - (Previously it did not emit any token symbolizing the end-of-file.) + * the `BasicEmitter` which yields just `Token` + * the `TracingEmitter` which yields `(Token, Trace)` * `Emitter` trait @@ -7,7 +7,7 @@ Spec-compliant HTML parsing [requires both tokenization and tree-construction][p While this crate implements a spec-compliant HTML tokenizer it does not implement any tree-construction. Instead it just provides a `NaiveParser` that may be used as follows: -```rust no_run TODO: run again once BasicEmitter has been implemented +```rust use std::fmt::Write; use html5tokenizer::{NaiveParser, Token}; diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs index 42d93f1..3e07531 100644 --- a/integration_tests/tests/test_html5lib.rs +++ b/integration_tests/tests/test_html5lib.rs @@ -6,7 +6,8 @@ use html5lib_tests::{ use html5tokenizer::{ offset::{Offset, PosTrackingReader, Position}, reader::Reader, - CdataAction, Emitter, Error, Event, InternalState, Token, Tokenizer, TracingEmitter, + BasicEmitter, CdataAction, Emitter, Error, Event, InternalState, Token, Tokenizer, + TracingEmitter, }; use similar_asserts::assert_eq; @@ -68,7 +69,26 @@ fn test_tokenizer_file(path: &Path) { fn run_test(fname: &str, test_i: usize, test: Test) { for state in &test.initial_states { - // TODO: test BasicEmitter here once it's implemented + run_test_inner( + fname, + test_i, + &test, + state, + Tokenizer::new(&test.input, BasicEmitter::default()), + "BasicEmitter string", + ); + + run_test_inner( + fname, + test_i, + &test, + state, + Tokenizer::new( + BufReader::new(test.input.as_bytes()), + BasicEmitter::default(), + ), + "BasicEmitter bufread", + ); run_test_inner( fname, @@ -186,6 +206,12 @@ trait DrainErrors<O> { fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<O>)> + '_>; } +impl<O> DrainErrors<O> for BasicEmitter<O> { + fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<O>)> + '_> { + Box::new(self.drain_errors()) + } +} + impl DrainErrors<usize> for TracingEmitter { fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<usize>)> + '_> { Box::new(self.drain_errors()) diff --git a/src/basic_emitter.rs b/src/basic_emitter.rs index bcb3f41..e67447b 100644 --- a/src/basic_emitter.rs +++ b/src/basic_emitter.rs @@ -1,20 +1,36 @@ +use std::collections::btree_map::Entry; +use std::collections::BTreeSet; use std::collections::VecDeque; use std::ops::Range; +use crate::let_else::assume; +use crate::offset::NoopOffset; use crate::offset::Offset; +use crate::token::{Doctype, EndTag, StartTag, Token}; use crate::Emitter; use crate::Error; -use crate::Token; /// An [`Emitter`] implementation that yields [`Token`]. -pub struct BasicEmitter<O> { +pub struct BasicEmitter<O = NoopOffset> { + current_token: Option<Token>, + current_attribute_name: String, + current_attr_internal: crate::token::AttrInternal, + seen_attributes: BTreeSet<String>, + emitted_tokens: VecDeque<Token>, errors: VecDeque<(Error, Range<O>)>, + attr_name_span: Range<O>, } impl<O: Default> Default for BasicEmitter<O> { fn default() -> Self { BasicEmitter { + current_token: None, + current_attribute_name: String::new(), + current_attr_internal: Default::default(), + seen_attributes: BTreeSet::new(), + emitted_tokens: VecDeque::new(), errors: VecDeque::new(), + attr_name_span: Default::default(), } } } @@ -30,97 +46,209 @@ impl<O> Iterator for BasicEmitter<O> { type Item = Token; fn next(&mut self) -> Option<Self::Item> { - todo!() + self.emitted_tokens.pop_back() } } #[allow(unused_variables)] impl<O: Offset> Emitter<O> for BasicEmitter<O> { fn report_error(&mut self, error: Error, span: Range<O>) { - todo!() + self.errors.push_back((error, span)); } fn emit_char(&mut self, c: char) { - todo!() + self.emit_token(Token::Char(c)); } fn emit_eof(&mut self) { - todo!() + self.emit_token(Token::EndOfFile); } fn init_start_tag(&mut self, tag_offset: O, name_offset: O) { - todo!() + self.current_token = Some(Token::StartTag(StartTag { + self_closing: false, + name: String::new(), + attributes: Default::default(), + })); } fn init_end_tag(&mut self, tag_offset: O, name_offset: O) { - todo!() + self.current_token = Some(Token::EndTag(EndTag { + name: String::new(), + })); + self.seen_attributes.clear(); } fn push_tag_name(&mut self, s: &str) { - todo!() + assume!( + Some(Token::StartTag(StartTag { name, .. }) | Token::EndTag(EndTag { name, .. })), + &mut self.current_token + ); + name.push_str(s); } fn init_attribute_name(&mut self, offset: O) { - todo!() + self.flush_current_attribute(); + self.attr_name_span.start = offset; } fn push_attribute_name(&mut self, s: &str) { - todo!() + self.current_attribute_name.push_str(s); + } + + fn terminate_attribute_name(&mut self, offset: O) { + self.attr_name_span.end = offset; } fn push_attribute_value(&mut self, s: &str) { - todo!() + self.current_attr_internal.value.push_str(s); } fn set_self_closing(&mut self, slash_span: Range<O>) { - todo!() + let token = self.current_token.as_mut().unwrap(); + + match token { + Token::StartTag(tag) => { + tag.self_closing = true; + } + Token::EndTag(_) => { + self.report_error(Error::EndTagWithTrailingSolidus, slash_span); + } + other => debug_assert!(false, "unexpected current_token: {other:?}"), + } } fn emit_current_tag(&mut self, offset: O) { - todo!() + self.flush_current_attribute(); + let mut token = self.current_token.take().unwrap(); + match &mut token { + Token::EndTag(_) => { + if !self.seen_attributes.is_empty() { + self.report_error(Error::EndTagWithAttributes, self.attr_name_span.clone()); + } + self.seen_attributes.clear(); + } + Token::StartTag(_) => {} + other => { + debug_assert!(false, "unexpected current_token: {other:?}"); + return; + } + } + self.emit_token(token); } fn init_comment(&mut self, data_start_offset: O) { - todo!() + self.current_token = Some(Token::Comment(String::new())); } fn push_comment(&mut self, s: &str) { - todo!() + assume!(Some(Token::Comment(data)), &mut self.current_token); + data.push_str(s); } fn emit_current_comment(&mut self, data_end_offset: O) { - todo!() + let token = self.current_token.take().unwrap(); + self.emit_token(token); } fn init_doctype(&mut self, offset: O) { - todo!() + self.current_token = Some(Token::Doctype(Doctype { + name: None, + force_quirks: false, + public_id: None, + system_id: None, + })); + } + + fn init_doctype_name(&mut self, offset: O) { + assume!(Some(Token::Doctype(doctype)), &mut self.current_token); + doctype.name = Some("".into()); } fn push_doctype_name(&mut self, s: &str) { - todo!() + assume!( + Some(Token::Doctype(Doctype { + name: Some(name), + .. + })), + &mut self.current_token + ); + name.push_str(s); } fn init_doctype_public_id(&mut self, offset: O) { - todo!() + assume!(Some(Token::Doctype(doctype)), &mut self.current_token); + doctype.public_id = Some("".to_owned()); } fn push_doctype_public_id(&mut self, s: &str) { - todo!() + assume!( + Some(Token::Doctype(Doctype { + public_id: Some(public_id), + .. + })), + &mut self.current_token + ); + public_id.push_str(s); } fn init_doctype_system_id(&mut self, offset: O) { - todo!() + assume!(Some(Token::Doctype(doctype)), &mut self.current_token); + doctype.system_id = Some("".to_owned()); } fn push_doctype_system_id(&mut self, s: &str) { - todo!() + assume!( + Some(Token::Doctype(Doctype { + system_id: Some(id), + .. + })), + &mut self.current_token + ); + id.push_str(s); } fn set_force_quirks(&mut self) { - todo!() + assume!(Some(Token::Doctype(doctype)), &mut self.current_token); + doctype.force_quirks = true; } fn emit_current_doctype(&mut self, offset: O) { - todo!() + let token = self.current_token.take().unwrap(); + self.emit_token(token); + } +} + +impl<O> BasicEmitter<O> { + fn emit_token(&mut self, token: Token) { + self.emitted_tokens.push_front(token); + } + + fn flush_current_attribute(&mut self) + where + O: Offset, + { + if self.current_attribute_name.is_empty() { + return; + } + let name = std::mem::take(&mut self.current_attribute_name); + let attr_internal = std::mem::take(&mut self.current_attr_internal); + + match &mut self.current_token { + Some(Token::StartTag(tag)) => match tag.attributes.inner.entry(name) { + Entry::Vacant(vacant) => { + vacant.insert(attr_internal); + } + Entry::Occupied(_) => { + self.report_error(Error::DuplicateAttribute, self.attr_name_span.clone()); + } + }, + Some(Token::EndTag(_)) => { + if !self.seen_attributes.insert(name) { + self.report_error(Error::DuplicateAttribute, self.attr_name_span.clone()); + } + } + other => debug_assert!(false, "unexpected current_token: {other:?}"), + } } } diff --git a/src/naive_parser.rs b/src/naive_parser.rs index 4f8dc0d..70b6522 100644 --- a/src/naive_parser.rs +++ b/src/naive_parser.rs @@ -12,7 +12,7 @@ use crate::{BasicEmitter, Emitter, Event, State, Tokenizer}; /// /// * it naively emits any CDATA sections as bogus comments, for example: /// -/// ```no_run TODO: run again once BasicEmitter has been implemented +/// ``` /// # use html5tokenizer::{NaiveParser, Token}; /// let html = "<svg><![CDATA[I love SVG]]>"; /// let mut tokens = NaiveParser::new(html).flatten(); diff --git a/src/tokenizer.rs b/src/tokenizer.rs index decd4df..3359637 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -15,12 +15,11 @@ pub use machine::State as InternalState; /// Iterating over the tokenizer directly without calling [`Tokenizer::set_state`] /// results in wrong state transitions: /// -/// ```ignore TODO: unignore once the BasicEmitter has been implemented -/// # use html5tokenizer::{DefaultEmitter, Event, Tokenizer, Token}; -/// let emitter = DefaultEmitter::default(); +/// ``` +/// # use html5tokenizer::{BasicEmitter, Event, Tokenizer, Token}; +/// let emitter = BasicEmitter::default(); /// let html = "<script><b>"; /// let mut tokens = Tokenizer::new(html, emitter).flatten(); -/// let mut tokens = tokens.map(|event| match event { Event::Token((token, _)) => Event::Token(token), Event::CdataOpen => Event::CdataOpen }); // TODO: remove once BasicEmitter can be used instead /// assert!(matches!(tokens.next(), Some(Event::Token(Token::StartTag(_))))); /// assert!(matches!(tokens.next(), Some(Event::Token(Token::StartTag(_))))); /// ``` |