diff options
author | Martin Fischer <martin@push-f.com> | 2023-08-12 11:06:02 +0200 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2023-08-19 06:41:55 +0200 |
commit | 9f1019afa7a8e9102d67356d85bd632044eb2d0c (patch) | |
tree | 4c6664aad5a11a942d6684a62e507de28193f5bb /src | |
parent | c3d60e88efa32329614178dfc9455ef33ea0a88d (diff) |
break!: merge Tokenizer::new_with_emitter into Tokenizer::new
The Tokenizer does not perform any state switching, since
proper state switching requires a feedback loop between
tokenization and DOM tree building. Using the Tokenizer
directly is therefore a bit of a pitfall, since you might
not expect it to tokenize e.g. `<script><b>` as:
StartTag(StartTag { name: "script", .. })
StartTag(StartTag { name: "b", .. })
Since we don't want to make walking into pitfalls
particularly easy, this commit changes the Tokenizer::new
method so that you have to specify the Emitter.
Since this makes new_with_emitter redundant, it is removed.
Diffstat (limited to 'src')
-rw-r--r-- | src/emitter.rs | 2 | ||||
-rw-r--r-- | src/reader.rs | 4 | ||||
-rw-r--r-- | src/tokenizer.rs | 36 |
3 files changed, 18 insertions, 24 deletions
diff --git a/src/emitter.rs b/src/emitter.rs index be712df..110ed5d 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -534,7 +534,7 @@ pub struct Doctype { } /// The token type used by default. You can define your own token type by implementing the -/// [`crate::Emitter`] trait and using [`crate::Tokenizer::new_with_emitter`]. +/// [`crate::Emitter`] trait. #[derive(Debug, Eq, PartialEq)] pub enum Token<S> { /// A HTML start tag. diff --git a/src/reader.rs b/src/reader.rs index f756c65..19929d4 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -30,8 +30,8 @@ pub trait Reader { /// An object that can be converted into a [`crate::Reader`]. /// -/// For example, any utf8-string can be converted into a `StringReader`, such that -/// `Tokenizer::new("mystring")` and `Tokenizer::new(&String::new("foo"))` work. +/// For example, any utf8-string can be converted into a `StringReader`. +// TODO: , such that [give concrete examples of not-yet-implemented parser API] work. pub trait IntoReader<'a> { /// The reader type into which this type should be converted. type Reader: Reader + 'a; diff --git a/src/tokenizer.rs b/src/tokenizer.rs index d7db3b6..5abd6ba 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -42,10 +42,21 @@ pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, ()>> { pub(crate) return_state: Option<InternalState>, } -impl<R: Reader> Tokenizer<R> { - /// Create a new tokenizer from some input. - pub fn new<'a, S: IntoReader<'a, Reader = R>>(input: S) -> Self { - Tokenizer::<S::Reader>::new_with_emitter(input, DefaultEmitter::default()) +impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { + /// Creates a new tokenizer from some input and an emitter. 
+ /// + /// TODO: add warning about you needing to do the state switching + pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self { + Tokenizer { + reader: reader.into_reader(), + emitter, + state: InternalState::Data, + to_reconsume: Stack2::default(), + return_state: None, + temporary_buffer: String::new(), + character_reference_code: 0, + eof: false, + } } } @@ -84,23 +95,6 @@ impl From<State> for InternalState { } impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { - /// Construct a new tokenizer from some input and a custom emitter. - /// - /// Use this method over [`Tokenizer::new`] when you want to have more control over string allocation for - /// tokens. - pub fn new_with_emitter<'a, S: IntoReader<'a, Reader = R>>(input: S, emitter: E) -> Self { - Tokenizer { - eof: false, - state: InternalState::Data, - emitter, - temporary_buffer: String::new(), - to_reconsume: Stack2::default(), - reader: input.into_reader(), - character_reference_code: 0, - return_state: None, - } - } - /// Test-internal function to override internal state. /// /// Only available with the `integration-tests` feature which is not public API. |