aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMartin Fischer <martin@push-f.com>2023-09-12 09:03:56 +0200
committerMartin Fischer <martin@push-f.com>2023-09-28 10:36:08 +0200
commit14bc6f2cceed0fa578d6a1195266885bf57a5d4c (patch)
tree50988abce274aa5e4aa5905fb4bcc5c8cc4de652
parentad6ac5f0a825775c231e76cdc9016e61e54f4141 (diff)
chore: add BasicEmitter stub
-rw-r--r--CHANGELOG.md2
-rw-r--r--README.md2
-rw-r--r--examples/spans.rs5
-rw-r--r--examples/tokenize.rs4
-rw-r--r--integration_tests/tests/test_html5lib.rs6
-rw-r--r--src/basic_emitter.rs126
-rw-r--r--src/lib.rs2
-rw-r--r--src/naive_parser.rs13
-rw-r--r--tests/test_spans.rs11
9 files changed, 152 insertions, 19 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 06831c3..146d627 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,7 +12,7 @@
* Removed the `Error` variant.
(Errors now have to be queried separately with
- `TracingEmitter::drain_errors`.)
+ `BasicEmitter::drain_errors` or `TracingEmitter::drain_errors`.)
* Replaced the `String` variant with a new `Char` variant.
(The tokenizer now emits chars instead of strings.)
diff --git a/README.md b/README.md
index 6513b61..a02a06c 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ Spec-compliant HTML parsing [requires both tokenization and tree-construction][p
While this crate implements a spec-compliant HTML tokenizer it does not implement any
tree-construction. Instead it just provides a `NaiveParser` that may be used as follows:
-```rust
+```rust no_run TODO: run again once BasicEmitter has been implemented
use std::fmt::Write;
use html5tokenizer::{NaiveParser, Token};
diff --git a/examples/spans.rs b/examples/spans.rs
index fc3c6a1..c1fe23b 100644
--- a/examples/spans.rs
+++ b/examples/spans.rs
@@ -4,11 +4,12 @@ use codespan_reporting::{
term,
term::termcolor::{ColorChoice, StandardStream},
};
-use html5tokenizer::{offset::PosTrackingReader, NaiveParser, Token};
+use html5tokenizer::{offset::PosTrackingReader, NaiveParser, Token, TracingEmitter};
fn main() {
let html = r#"<img src=example.jpg alt="some description">"#;
- let parser = NaiveParser::new(PosTrackingReader::new(html));
+ let parser =
+ NaiveParser::new_with_emitter(PosTrackingReader::new(html), TracingEmitter::default());
let Token::StartTag(tag) = parser.flatten().next().unwrap() else {
panic!()
diff --git a/examples/tokenize.rs b/examples/tokenize.rs
index 791db0f..a7c2214 100644
--- a/examples/tokenize.rs
+++ b/examples/tokenize.rs
@@ -1,13 +1,13 @@
//! Lets you easily try out the tokenizer with e.g.
//! printf '<h1>Hello world!</h1>' | cargo run --example=tokenize
-use html5tokenizer::{Tokenizer, TracingEmitter};
+use html5tokenizer::{BasicEmitter, Tokenizer};
use std::io::BufReader;
fn main() {
let mut tokenizer = Tokenizer::new(
BufReader::new(std::io::stdin().lock()),
- TracingEmitter::default(),
+ BasicEmitter::default(),
);
while let Some(token) = tokenizer.next() {
for (error, _) in tokenizer.emitter_mut().drain_errors() {
diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs
index 36fb880..0040a01 100644
--- a/integration_tests/tests/test_html5lib.rs
+++ b/integration_tests/tests/test_html5lib.rs
@@ -68,13 +68,15 @@ fn test_tokenizer_file(path: &Path) {
fn run_test(fname: &str, test_i: usize, test: Test) {
for state in &test.initial_states {
+ // TODO: test BasicEmitter here once it's implemented
+
run_test_inner(
fname,
test_i,
&test,
state,
Tokenizer::new(&test.input, TracingEmitter::default()),
- "string",
+ "TracingEmitter string",
);
run_test_inner(
@@ -86,7 +88,7 @@ fn run_test(fname: &str, test_i: usize, test: Test) {
BufReader::new(test.input.as_bytes()),
TracingEmitter::default(),
),
- "bufread",
+ "TracingEmitter bufread",
);
}
}
diff --git a/src/basic_emitter.rs b/src/basic_emitter.rs
new file mode 100644
index 0000000..046b645
--- /dev/null
+++ b/src/basic_emitter.rs
@@ -0,0 +1,126 @@
+use std::collections::VecDeque;
+use std::ops::Range;
+
+use crate::offset::Offset;
+use crate::Emitter;
+use crate::Error;
+use crate::Token;
+
+/// An [`Emitter`] implementation that yields [`Token`].
+pub struct BasicEmitter<O> {
+ errors: VecDeque<(Error, Range<O>)>,
+}
+
+impl<O: Default> Default for BasicEmitter<O> {
+ fn default() -> Self {
+ BasicEmitter {
+ errors: VecDeque::new(),
+ }
+ }
+}
+
+impl<O> BasicEmitter<O> {
+ /// Removes all encountered tokenizer errors and returns them as an iterator.
+ pub fn drain_errors(&mut self) -> impl Iterator<Item = (Error, Range<O>)> + '_ {
+ self.errors.drain(0..)
+ }
+}
+
+impl<O> Iterator for BasicEmitter<O> {
+ type Item = Token<O>;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ todo!()
+ }
+}
+
+#[allow(unused_variables)]
+impl<O: Offset> Emitter<O> for BasicEmitter<O> {
+ fn report_error(&mut self, error: Error, span: Range<O>) {
+ todo!()
+ }
+
+ fn emit_char(&mut self, c: char) {
+ todo!()
+ }
+
+ fn emit_eof(&mut self) {
+ todo!()
+ }
+
+ fn init_start_tag(&mut self, tag_offset: O, name_offset: O) {
+ todo!()
+ }
+
+ fn init_end_tag(&mut self, tag_offset: O, name_offset: O) {
+ todo!()
+ }
+
+ fn push_tag_name(&mut self, s: &str) {
+ todo!()
+ }
+
+ fn init_attribute_name(&mut self, offset: O) {
+ todo!()
+ }
+
+ fn push_attribute_name(&mut self, s: &str) {
+ todo!()
+ }
+
+ fn push_attribute_value(&mut self, s: &str) {
+ todo!()
+ }
+
+ fn set_self_closing(&mut self, slash_span: Range<O>) {
+ todo!()
+ }
+
+ fn emit_current_tag(&mut self, offset: O) {
+ todo!()
+ }
+
+ fn init_comment(&mut self, data_start_offset: O) {
+ todo!()
+ }
+
+ fn push_comment(&mut self, s: &str) {
+ todo!()
+ }
+
+ fn emit_current_comment(&mut self, data_end_offset: O) {
+ todo!()
+ }
+
+ fn init_doctype(&mut self, offset: O) {
+ todo!()
+ }
+
+ fn push_doctype_name(&mut self, s: &str) {
+ todo!()
+ }
+
+ fn init_doctype_public_id(&mut self, offset: O) {
+ todo!()
+ }
+
+ fn push_doctype_public_id(&mut self, s: &str) {
+ todo!()
+ }
+
+ fn init_doctype_system_id(&mut self, offset: O) {
+ todo!()
+ }
+
+ fn push_doctype_system_id(&mut self, s: &str) {
+ todo!()
+ }
+
+ fn set_force_quirks(&mut self) {
+ todo!()
+ }
+
+ fn emit_current_doctype(&mut self, offset: O) {
+ todo!()
+ }
+}
diff --git a/src/lib.rs b/src/lib.rs
index aecbef3..16728ad 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -7,6 +7,7 @@
#![doc = concat!("[the LICENSE file]: ", file_url!("LICENSE"))]
#![doc = include_str!("../README.md")]
+mod basic_emitter;
mod emitter;
mod entities;
mod error;
@@ -25,6 +26,7 @@ pub mod offset;
pub mod reader;
pub mod token;
+pub use basic_emitter::BasicEmitter;
pub use emitter::Emitter;
pub use error::Error;
pub use naive_parser::NaiveParser;
diff --git a/src/naive_parser.rs b/src/naive_parser.rs
index 91edbc0..4f8dc0d 100644
--- a/src/naive_parser.rs
+++ b/src/naive_parser.rs
@@ -1,8 +1,7 @@
use crate::offset::{Offset, Position};
use crate::reader::{IntoReader, Reader};
use crate::tokenizer::CdataAction;
-use crate::tracing_emitter::TracingEmitter;
-use crate::{Emitter, Event, State, Tokenizer};
+use crate::{BasicEmitter, Emitter, Event, State, Tokenizer};
/// A naive HTML parser (**not** spec-compliant since it doesn't do tree construction).
///
@@ -13,7 +12,7 @@ use crate::{Emitter, Event, State, Tokenizer};
///
/// * it naively emits any CDATA sections as bogus comments, for example:
///
-/// ```
+/// ```no_run TODO: run again once BasicEmitter has been implemented
/// # use html5tokenizer::{NaiveParser, Token};
/// let html = "<svg><![CDATA[I love SVG]]>";
/// let mut tokens = NaiveParser::new(html).flatten();
@@ -30,18 +29,18 @@ pub struct NaiveParser<R: Reader, O: Offset, E: Emitter<O>> {
tokenizer: Tokenizer<R, O, E>,
}
-impl<R, O> NaiveParser<R, O, TracingEmitter<O>>
+impl<R, O> NaiveParser<R, O, BasicEmitter<O>>
where
R: Reader + Position<O>,
O: Offset,
{
- /// Constructs a new naive parser.
+ /// Constructs a new naive parser using the [`BasicEmitter`].
// TODO: add example for NaiveParser::new
- pub fn new<'a, IR>(reader: IR) -> NaiveParser<R, O, TracingEmitter<O>>
+ pub fn new<'a, IR>(reader: IR) -> NaiveParser<R, O, BasicEmitter<O>>
where
IR: IntoReader<'a, Reader = R>,
{
- NaiveParser::new_with_emitter(reader, TracingEmitter::default())
+ NaiveParser::new_with_emitter(reader, BasicEmitter::default())
}
}
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
index eb93d43..fdb9a78 100644
--- a/tests/test_spans.rs
+++ b/tests/test_spans.rs
@@ -10,7 +10,7 @@ use codespan_reporting::{
use html5tokenizer::{
offset::PosTrackingReader,
reader::{IntoReader, Reader},
- NaiveParser, Token,
+ NaiveParser, Token, TracingEmitter,
};
use insta::assert_snapshot;
use similar_asserts::assert_eq;
@@ -27,9 +27,12 @@ fn parser<R>(reader: impl IntoReader<'static, Reader = R>) -> Parser
where
R: Reader<Error = Infallible> + 'static,
{
- NaiveParser::new(PosTrackingReader::new(
- Box::new(reader.into_reader()) as Box<dyn Reader<Error = Infallible>>
- ))
+ NaiveParser::new_with_emitter(
+ PosTrackingReader::new(
+ Box::new(reader.into_reader()) as Box<dyn Reader<Error = Infallible>>
+ ),
+ TracingEmitter::default(),
+ )
}
fn test_and_annotate<S: AsRef<str> + Clone>(