-rw-r--r-- | CHANGELOG.md | 2
-rw-r--r-- | README.md | 2
-rw-r--r-- | examples/spans.rs | 5
-rw-r--r-- | examples/tokenize.rs | 4
-rw-r--r-- | integration_tests/tests/test_html5lib.rs | 6
-rw-r--r-- | src/basic_emitter.rs | 126
-rw-r--r-- | src/lib.rs | 2
-rw-r--r-- | src/naive_parser.rs | 13
-rw-r--r-- | tests/test_spans.rs | 11
9 files changed, 152 insertions, 19 deletions
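
The diff below adds a (not yet functional) `BasicEmitter` and makes it the default emitter of `NaiveParser`, while tokenizer errors move out of the token stream and are queried via `drain_errors`. As a quick orientation, here is a minimal sketch of the driving pattern that the updated `examples/tokenize.rs` uses, adapted to an in-memory string instead of stdin; note that in this commit every `BasicEmitter` trait method is still `todo!()`, so the sketch compiles against the new API but would panic if run:

```rust
use html5tokenizer::{BasicEmitter, Tokenizer};

fn main() {
    // Same pattern as examples/tokenize.rs, but reading from a string
    // instead of stdin. BasicEmitter's methods are still todo!() in
    // this commit, so this panics at runtime until they are implemented.
    let mut tokenizer = Tokenizer::new("<h1>Hello world!</h1>", BasicEmitter::default());
    while let Some(token) = tokenizer.next() {
        // Errors are no longer part of the token stream; they have to be
        // drained from the emitter separately.
        for (error, _) in tokenizer.emitter_mut().drain_errors() {
            eprintln!("error: {:?}", error);
        }
        println!("{:?}", token);
    }
}
```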
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 06831c3..146d627 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,7 +12,7 @@
 * Removed the `Error` variant.
   (Errors now have to be queried separately with
-  `TracingEmitter::drain_errors`.)
+  `BasicEmitter::drain_errors` or `TracingEmitter::drain_errors`.)

 * Replaced the `String` variant with a new `Char` variant.
   (The tokenizer now emits chars instead of strings.)
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ Spec-compliant HTML parsing [requires both tokenization and tree-construction][p
 While this crate implements a spec-compliant HTML tokenizer it does not
 implement any tree-construction. Instead it just provides a `NaiveParser`
 that may be used as follows:

-```rust
+```rust no_run TODO: run again once BasicEmitter has been implemented
 use std::fmt::Write;
 use html5tokenizer::{NaiveParser, Token};
diff --git a/examples/spans.rs b/examples/spans.rs
index fc3c6a1..c1fe23b 100644
--- a/examples/spans.rs
+++ b/examples/spans.rs
@@ -4,11 +4,12 @@ use codespan_reporting::{
     term,
     term::termcolor::{ColorChoice, StandardStream},
 };
-use html5tokenizer::{offset::PosTrackingReader, NaiveParser, Token};
+use html5tokenizer::{offset::PosTrackingReader, NaiveParser, Token, TracingEmitter};

 fn main() {
     let html = r#"<img src=example.jpg alt="some description">"#;
-    let parser = NaiveParser::new(PosTrackingReader::new(html));
+    let parser =
+        NaiveParser::new_with_emitter(PosTrackingReader::new(html), TracingEmitter::default());

     let Token::StartTag(tag) = parser.flatten().next().unwrap() else {
         panic!()
diff --git a/examples/tokenize.rs b/examples/tokenize.rs
index 791db0f..a7c2214 100644
--- a/examples/tokenize.rs
+++ b/examples/tokenize.rs
@@ -1,13 +1,13 @@
 //! Let's you easily try out the tokenizer with e.g.
 //! printf '<h1>Hello world!</h1>' | cargo run --example=tokenize
-use html5tokenizer::{Tokenizer, TracingEmitter};
+use html5tokenizer::{BasicEmitter, Tokenizer};
 use std::io::BufReader;

 fn main() {
     let mut tokenizer = Tokenizer::new(
         BufReader::new(std::io::stdin().lock()),
-        TracingEmitter::default(),
+        BasicEmitter::default(),
     );
     while let Some(token) = tokenizer.next() {
         for (error, _) in tokenizer.emitter_mut().drain_errors() {
diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs
index 36fb880..0040a01 100644
--- a/integration_tests/tests/test_html5lib.rs
+++ b/integration_tests/tests/test_html5lib.rs
@@ -68,13 +68,15 @@ fn test_tokenizer_file(path: &Path) {
 fn run_test(fname: &str, test_i: usize, test: Test) {
     for state in &test.initial_states {
+        // TODO: test BasicEmitter here once it's implemented
+
         run_test_inner(
             fname,
             test_i,
             &test,
             state,
             Tokenizer::new(&test.input, TracingEmitter::default()),
-            "string",
+            "TracingEmitter string",
         );

         run_test_inner(
@@ -86,7 +88,7 @@ fn run_test(fname: &str, test_i: usize, test: Test) {
                 BufReader::new(test.input.as_bytes()),
                 TracingEmitter::default(),
             ),
-            "bufread",
+            "TracingEmitter bufread",
         );
     }
 }
diff --git a/src/basic_emitter.rs b/src/basic_emitter.rs
new file mode 100644
index 0000000..046b645
--- /dev/null
+++ b/src/basic_emitter.rs
@@ -0,0 +1,126 @@
+use std::collections::VecDeque;
+use std::ops::Range;
+
+use crate::offset::Offset;
+use crate::Emitter;
+use crate::Error;
+use crate::Token;
+
+/// An [`Emitter`] implementation that yields [`Token`].
+pub struct BasicEmitter<O> {
+    errors: VecDeque<(Error, Range<O>)>,
+}
+
+impl<O: Default> Default for BasicEmitter<O> {
+    fn default() -> Self {
+        BasicEmitter {
+            errors: VecDeque::new(),
+        }
+    }
+}
+
+impl<O> BasicEmitter<O> {
+    /// Removes all encountered tokenizer errors and returns them as an iterator.
+    pub fn drain_errors(&mut self) -> impl Iterator<Item = (Error, Range<O>)> + '_ {
+        self.errors.drain(0..)
+    }
+}
+
+impl<O> Iterator for BasicEmitter<O> {
+    type Item = Token<O>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        todo!()
+    }
+}
+
+#[allow(unused_variables)]
+impl<O: Offset> Emitter<O> for BasicEmitter<O> {
+    fn report_error(&mut self, error: Error, span: Range<O>) {
+        todo!()
+    }
+
+    fn emit_char(&mut self, c: char) {
+        todo!()
+    }
+
+    fn emit_eof(&mut self) {
+        todo!()
+    }
+
+    fn init_start_tag(&mut self, tag_offset: O, name_offset: O) {
+        todo!()
+    }
+
+    fn init_end_tag(&mut self, tag_offset: O, name_offset: O) {
+        todo!()
+    }
+
+    fn push_tag_name(&mut self, s: &str) {
+        todo!()
+    }
+
+    fn init_attribute_name(&mut self, offset: O) {
+        todo!()
+    }
+
+    fn push_attribute_name(&mut self, s: &str) {
+        todo!()
+    }
+
+    fn push_attribute_value(&mut self, s: &str) {
+        todo!()
+    }
+
+    fn set_self_closing(&mut self, slash_span: Range<O>) {
+        todo!()
+    }
+
+    fn emit_current_tag(&mut self, offset: O) {
+        todo!()
+    }
+
+    fn init_comment(&mut self, data_start_offset: O) {
+        todo!()
+    }
+
+    fn push_comment(&mut self, s: &str) {
+        todo!()
+    }
+
+    fn emit_current_comment(&mut self, data_end_offset: O) {
+        todo!()
+    }
+
+    fn init_doctype(&mut self, offset: O) {
+        todo!()
+    }
+
+    fn push_doctype_name(&mut self, s: &str) {
+        todo!()
+    }
+
+    fn init_doctype_public_id(&mut self, offset: O) {
+        todo!()
+    }
+
+    fn push_doctype_public_id(&mut self, s: &str) {
+        todo!()
+    }
+
+    fn init_doctype_system_id(&mut self, offset: O) {
+        todo!()
+    }
+
+    fn push_doctype_system_id(&mut self, s: &str) {
+        todo!()
+    }
+
+    fn set_force_quirks(&mut self) {
+        todo!()
+    }
+
+    fn emit_current_doctype(&mut self, offset: O) {
+        todo!()
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -7,6 +7,7 @@
 #![doc = concat!("[the LICENSE file]: ", file_url!("LICENSE"))]
 #![doc = include_str!("../README.md")]

+mod basic_emitter;
 mod emitter;
 mod entities;
 mod error;
@@ -25,6 +26,7 @@ pub mod offset;
 pub mod reader;
 pub mod token;

+pub use basic_emitter::BasicEmitter;
 pub use emitter::Emitter;
 pub use error::Error;
 pub use naive_parser::NaiveParser;
diff --git a/src/naive_parser.rs b/src/naive_parser.rs
index 91edbc0..4f8dc0d 100644
--- a/src/naive_parser.rs
+++ b/src/naive_parser.rs
@@ -1,8 +1,7 @@
 use crate::offset::{Offset, Position};
 use crate::reader::{IntoReader, Reader};
 use crate::tokenizer::CdataAction;
-use crate::tracing_emitter::TracingEmitter;
-use crate::{Emitter, Event, State, Tokenizer};
+use crate::{BasicEmitter, Emitter, Event, State, Tokenizer};

 /// A naive HTML parser (**not** spec-compliant since it doesn't do tree construction).
 ///
@@ -13,7 +12,7 @@ use crate::{Emitter, Event, State, Tokenizer};
 ///
 /// * it naively emits any CDATA sections as bogus comments, for example:
 ///
-/// ```
+/// ```no_run TODO: run again once BasicEmitter has been implemented
 /// # use html5tokenizer::{NaiveParser, Token};
 /// let html = "<svg><![CDATA[I love SVG]]>";
 /// let mut tokens = NaiveParser::new(html).flatten();
@@ -30,18 +29,18 @@ pub struct NaiveParser<R: Reader, O: Offset, E: Emitter<O>> {
     tokenizer: Tokenizer<R, O, E>,
 }

-impl<R, O> NaiveParser<R, O, TracingEmitter<O>>
+impl<R, O> NaiveParser<R, O, BasicEmitter<O>>
 where
     R: Reader + Position<O>,
     O: Offset,
 {
-    /// Constructs a new naive parser.
+    /// Constructs a new naive parser using the [`BasicEmitter`].
     // TODO: add example for NaiveParser::new
-    pub fn new<'a, IR>(reader: IR) -> NaiveParser<R, O, TracingEmitter<O>>
+    pub fn new<'a, IR>(reader: IR) -> NaiveParser<R, O, BasicEmitter<O>>
     where
         IR: IntoReader<'a, Reader = R>,
     {
-        NaiveParser::new_with_emitter(reader, TracingEmitter::default())
+        NaiveParser::new_with_emitter(reader, BasicEmitter::default())
     }
 }
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
index eb93d43..fdb9a78 100644
--- a/tests/test_spans.rs
+++ b/tests/test_spans.rs
@@ -10,7 +10,7 @@ use codespan_reporting::{
 use html5tokenizer::{
     offset::PosTrackingReader,
     reader::{IntoReader, Reader},
-    NaiveParser, Token,
+    NaiveParser, Token, TracingEmitter,
 };
 use insta::assert_snapshot;
 use similar_asserts::assert_eq;
@@ -27,9 +27,12 @@ fn parser<R>(reader: impl IntoReader<'static, Reader = R>) -> Parser
 where
     R: Reader<Error = Infallible> + 'static,
 {
-    NaiveParser::new(PosTrackingReader::new(
-        Box::new(reader.into_reader()) as Box<dyn Reader<Error = Infallible>>
-    ))
+    NaiveParser::new_with_emitter(
+        PosTrackingReader::new(
+            Box::new(reader.into_reader()) as Box<dyn Reader<Error = Infallible>>
+        ),
+        TracingEmitter::default(),
+    )
 }

 fn test_and_annotate<S: AsRef<str> + Clone>(
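
The naive_parser.rs hunk above leaves a `// TODO: add example for NaiveParser::new`. A minimal sketch of what such an example might look like once `BasicEmitter` actually emits tokens; the `Token::Char` variant comes from the CHANGELOG entry above, the rest is an assumption, and like the README example it cannot run until the `todo!()` stubs are filled in:

```rust
use html5tokenizer::{NaiveParser, Token};

fn main() {
    // NaiveParser::new now defaults to the BasicEmitter instead of the
    // TracingEmitter.
    let parser = NaiveParser::new("<title>hello world</title>");

    // Collect the character tokens between the title tags.
    let mut title = String::new();
    for token in parser.flatten() {
        // Per the CHANGELOG, character data is emitted as individual chars.
        if let Token::Char(c) = token {
            title.push(c);
        }
    }
    assert_eq!(title, "hello world");
}
```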