summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Martin Fischer <martin@push-f.com> 2023-08-12 11:06:02 +0200
committer Martin Fischer <martin@push-f.com> 2023-08-19 06:41:55 +0200
commit 9f1019afa7a8e9102d67356d85bd632044eb2d0c (patch)
tree 4c6664aad5a11a942d6684a62e507de28193f5bb
parent c3d60e88efa32329614178dfc9455ef33ea0a88d (diff)
break!: merge Tokenizer::new_with_emitter into Tokenizer::new
The Tokenizer does not perform any state switching, since proper state switching requires a feedback loop between tokenization and DOM tree building. Using the Tokenizer directly therefore is a bit of a pitfall, since you might not expect it to e.g. tokenize `<script><b>` as:

    StartTag(StartTag { name: "script", .. })
    StartTag(StartTag { name: "b", .. })

Since we don't want to make walking into pitfalls particularly easy, this commit changes the Tokenizer::new method so that you have to specify the Emitter. Since this makes new_with_emitter redundant it is removed.
-rw-r--r-- README.md 5
-rw-r--r-- examples/tokenize.rs 11
-rw-r--r-- integration_tests/tests/test_html5lib.rs 9
-rw-r--r-- src/emitter.rs 2
-rw-r--r-- src/reader.rs 4
-rw-r--r-- src/tokenizer.rs 36
-rw-r--r-- tests/test_spans.rs 2
7 files changed, 37 insertions, 32 deletions
diff --git a/README.md b/README.md
index 8305537..13d9cc2 100644
--- a/README.md
+++ b/README.md
@@ -8,12 +8,13 @@
```rust
use std::fmt::Write;
-use html5tokenizer::{Tokenizer, Token};
+use html5tokenizer::{DefaultEmitter, Tokenizer, Token};
let html = "<title >hello world</title>";
+let emitter = DefaultEmitter::<_, ()>::default();
let mut new_html = String::new();
-for token in Tokenizer::new(html).flatten() {
+for token in Tokenizer::new(html, emitter).flatten() {
match token {
Token::StartTag(tag) => {
write!(new_html, "<{}>", tag.name).unwrap();
diff --git a/examples/tokenize.rs b/examples/tokenize.rs
index ceb5751..5776362 100644
--- a/examples/tokenize.rs
+++ b/examples/tokenize.rs
@@ -1,9 +1,16 @@
//! Let's you easily try out the tokenizer with e.g.
//! printf '<h1>Hello world!</h1>' | cargo run --example=tokenize
-use html5tokenizer::{BufReadReader, Tokenizer};
+
+use html5tokenizer::{DefaultEmitter, Tokenizer};
+use std::io::BufReader;
fn main() {
- for token in Tokenizer::new(BufReadReader::new(std::io::stdin().lock())).flatten() {
+ for token in Tokenizer::new(
+ BufReader::new(std::io::stdin().lock()),
+ DefaultEmitter::<_, ()>::default(),
+ )
+ .flatten()
+ {
println!("{:?}", token);
}
}
diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs
index 23adec0..f5a69c3 100644
--- a/integration_tests/tests/test_html5lib.rs
+++ b/integration_tests/tests/test_html5lib.rs
@@ -3,7 +3,7 @@ use std::{fs::File, io::BufReader, path::Path};
use html5lib_tests::{
parse_tests, Error as TestError, InitialState, Output, Test, Token as TestToken,
};
-use html5tokenizer::{InternalState, Reader, Token, Tokenizer};
+use html5tokenizer::{DefaultEmitter, InternalState, Reader, Token, Tokenizer};
use pretty_assertions::assert_eq;
/// Path to a local checkout of [html5lib-tests], relative to the
@@ -69,7 +69,7 @@ fn run_test(fname: &str, test_i: usize, test: Test) {
test_i,
&test,
state,
- Tokenizer::new(&test.input),
+ Tokenizer::new(&test.input, DefaultEmitter::default()),
"string",
);
@@ -78,7 +78,10 @@ fn run_test(fname: &str, test_i: usize, test: Test) {
test_i,
&test,
state,
- Tokenizer::new(BufReader::new(test.input.as_bytes())),
+ Tokenizer::new(
+ BufReader::new(test.input.as_bytes()),
+ DefaultEmitter::default(),
+ ),
"bufread",
);
}
diff --git a/src/emitter.rs b/src/emitter.rs
index be712df..110ed5d 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -534,7 +534,7 @@ pub struct Doctype {
}
/// The token type used by default. You can define your own token type by implementing the
-/// [`crate::Emitter`] trait and using [`crate::Tokenizer::new_with_emitter`].
+/// [`crate::Emitter`] trait.
#[derive(Debug, Eq, PartialEq)]
pub enum Token<S> {
/// A HTML start tag.
diff --git a/src/reader.rs b/src/reader.rs
index f756c65..19929d4 100644
--- a/src/reader.rs
+++ b/src/reader.rs
@@ -30,8 +30,8 @@ pub trait Reader {
/// An object that can be converted into a [`crate::Reader`].
///
-/// For example, any utf8-string can be converted into a `StringReader`, such that
-/// `Tokenizer::new("mystring")` and `Tokenizer::new(&String::new("foo"))` work.
+/// For example, any utf8-string can be converted into a `StringReader`.
+// TODO: , such that [give concrete examples of not-yet-implemented parser API] work.
pub trait IntoReader<'a> {
/// The reader type into which this type should be converted.
type Reader: Reader + 'a;
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index d7db3b6..5abd6ba 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -42,10 +42,21 @@ pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, ()>> {
pub(crate) return_state: Option<InternalState>,
}
-impl<R: Reader> Tokenizer<R> {
- /// Create a new tokenizer from some input.
- pub fn new<'a, S: IntoReader<'a, Reader = R>>(input: S) -> Self {
- Tokenizer::<S::Reader>::new_with_emitter(input, DefaultEmitter::default())
+impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
+ /// Creates a new tokenizer from some input and an emitter.
+ ///
+ /// TODO: add warning about you needing to do the state switching
+ pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self {
+ Tokenizer {
+ reader: reader.into_reader(),
+ emitter,
+ state: InternalState::Data,
+ to_reconsume: Stack2::default(),
+ return_state: None,
+ temporary_buffer: String::new(),
+ character_reference_code: 0,
+ eof: false,
+ }
}
}
@@ -84,23 +95,6 @@ impl From<State> for InternalState {
}
impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
- /// Construct a new tokenizer from some input and a custom emitter.
- ///
- /// Use this method over [`Tokenizer::new`] when you want to have more control over string allocation for
- /// tokens.
- pub fn new_with_emitter<'a, S: IntoReader<'a, Reader = R>>(input: S, emitter: E) -> Self {
- Tokenizer {
- eof: false,
- state: InternalState::Data,
- emitter,
- temporary_buffer: String::new(),
- to_reconsume: Stack2::default(),
- reader: input.into_reader(),
- character_reference_code: 0,
- return_state: None,
- }
- }
-
/// Test-internal function to override internal state.
///
/// Only available with the `integration-tests` feature which is not public API.
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
index 64a64cf..8a820de 100644
--- a/tests/test_spans.rs
+++ b/tests/test_spans.rs
@@ -16,7 +16,7 @@ fn test() {
let file_id = files.add("test.html", html);
let mut labels = Vec::new();
- for token in Tokenizer::new_with_emitter(
+ for token in Tokenizer::new(
PosTrackingReader::new(html),
DefaultEmitter::<_, Range<usize>>::default(),
)