summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorMartin Fischer <martin@push-f.com>2023-08-12 11:06:02 +0200
committerMartin Fischer <martin@push-f.com>2023-08-19 06:41:55 +0200
commit9f1019afa7a8e9102d67356d85bd632044eb2d0c (patch)
tree4c6664aad5a11a942d6684a62e507de28193f5bb /src
parentc3d60e88efa32329614178dfc9455ef33ea0a88d (diff)
break!: merge Tokenizer::new_with_emitter into Tokenizer::new
The Tokenizer does not perform any state switching, since proper state switching requires a feedback loop between tokenization and DOM tree building. Using the Tokenizer directly therefore is a bit of a pitfall, since you might not expect it to e.g. tokenize `<script><b>` as: StartTag(StartTag { name: "script", .. }) StartTag(StartTag { name: "b", .. }) Since we don't want to make walking into pitfalls particularly easy, this commit changes the Tokenizer::new method so that you have to specify the Emitter. Since this makes new_with_emitter redundant it is removed.
Diffstat (limited to 'src')
-rw-r--r--src/emitter.rs2
-rw-r--r--src/reader.rs4
-rw-r--r--src/tokenizer.rs36
3 files changed, 18 insertions, 24 deletions
diff --git a/src/emitter.rs b/src/emitter.rs
index be712df..110ed5d 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -534,7 +534,7 @@ pub struct Doctype {
}
/// The token type used by default. You can define your own token type by implementing the
-/// [`crate::Emitter`] trait and using [`crate::Tokenizer::new_with_emitter`].
+/// [`crate::Emitter`] trait.
#[derive(Debug, Eq, PartialEq)]
pub enum Token<S> {
/// A HTML start tag.
diff --git a/src/reader.rs b/src/reader.rs
index f756c65..19929d4 100644
--- a/src/reader.rs
+++ b/src/reader.rs
@@ -30,8 +30,8 @@ pub trait Reader {
/// An object that can be converted into a [`crate::Reader`].
///
-/// For example, any utf8-string can be converted into a `StringReader`, such that
-/// `Tokenizer::new("mystring")` and `Tokenizer::new(&String::new("foo"))` work.
+/// For example, any utf8-string can be converted into a `StringReader`.
+// TODO: , such that [give concrete examples of not-yet-implemented parser API] work.
pub trait IntoReader<'a> {
/// The reader type into which this type should be converted.
type Reader: Reader + 'a;
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index d7db3b6..5abd6ba 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -42,10 +42,21 @@ pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, ()>> {
pub(crate) return_state: Option<InternalState>,
}
-impl<R: Reader> Tokenizer<R> {
- /// Create a new tokenizer from some input.
- pub fn new<'a, S: IntoReader<'a, Reader = R>>(input: S) -> Self {
- Tokenizer::<S::Reader>::new_with_emitter(input, DefaultEmitter::default())
+impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
+ /// Creates a new tokenizer from some input and an emitter.
+ ///
+ /// TODO: add warning about you needing to do the state switching
+ pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self {
+ Tokenizer {
+ reader: reader.into_reader(),
+ emitter,
+ state: InternalState::Data,
+ to_reconsume: Stack2::default(),
+ return_state: None,
+ temporary_buffer: String::new(),
+ character_reference_code: 0,
+ eof: false,
+ }
}
}
@@ -84,23 +95,6 @@ impl From<State> for InternalState {
}
impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
- /// Construct a new tokenizer from some input and a custom emitter.
- ///
- /// Use this method over [`Tokenizer::new`] when you want to have more control over string allocation for
- /// tokens.
- pub fn new_with_emitter<'a, S: IntoReader<'a, Reader = R>>(input: S, emitter: E) -> Self {
- Tokenizer {
- eof: false,
- state: InternalState::Data,
- emitter,
- temporary_buffer: String::new(),
- to_reconsume: Stack2::default(),
- reader: input.into_reader(),
- character_reference_code: 0,
- return_state: None,
- }
- }
-
/// Test-internal function to override internal state.
///
/// Only available with the `integration-tests` feature which is not public API.