aboutsummaryrefslogtreecommitdiff
path: root/src/tokenizer.rs
diff options
context:
space:
mode:
authorMartin Fischer <martin@push-f.com>2023-08-17 09:40:47 +0200
committerMartin Fischer <martin@push-f.com>2023-08-19 13:41:55 +0200
commitc15895d44d17984386d3684e2aa85aca386ba3bf (patch)
treea7c92e5eff97bd7645c7d309c8bf94ea891459ad /src/tokenizer.rs
parentd5c9a851756b1e84b022c2fbf984137aae68e2c9 (diff)
refactor!: make Emitter generic over offset instead of reader
Emitters should not have access to the reader at all. Also, the current position of the reader at the time an Emitter method is called very much depends on machine implementation details, such as whether `Tokenizer::unread_char` is used. Having the Emitter methods take offsets lets the machine take care of providing the right offsets, as evidenced by the next commit.
Diffstat (limited to 'src/tokenizer.rs')
-rw-r--r--src/tokenizer.rs27
1 files changed, 16 insertions, 11 deletions
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 7eb33f7..02a4d62 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1,5 +1,7 @@
+use std::marker::PhantomData;
+
use crate::machine;
-use crate::offset::NoopOffset;
+use crate::offset::{NoopOffset, Offset, Position};
use crate::reader::{IntoReader, Reader};
use crate::utils::{
control_pat, noncharacter_pat, surrogate_pat, ControlToken, State as InternalState,
@@ -33,12 +35,13 @@ impl<T: Copy> Stack2<T> {
}
/// A HTML tokenizer. See crate-level docs for basic usage.
-pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, NoopOffset>> {
+pub struct Tokenizer<R: Reader, O = NoopOffset, E: Emitter<O> = DefaultEmitter<O>> {
eof: bool,
pub(crate) state: InternalState,
pub(crate) emitter: E,
pub(crate) temporary_buffer: String,
pub(crate) reader: R,
+ _offset: PhantomData<O>,
to_reconsume: Stack2<Option<char>>,
pub(crate) character_reference_code: u32,
pub(crate) return_state: Option<InternalState>,
@@ -47,7 +50,7 @@ pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, NoopOffset>> {
is_start_tag: bool,
}
-impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
+impl<R: Reader, O, E: Emitter<O>> Tokenizer<R, O, E> {
/// Creates a new tokenizer from some input and an emitter.
///
/// TODO: add warning about you needing to do the state switching
@@ -55,6 +58,7 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
Tokenizer {
reader: reader.into_reader(),
emitter,
+ _offset: PhantomData,
state: InternalState::Data,
to_reconsume: Stack2::default(),
return_state: None,
@@ -102,7 +106,7 @@ impl From<State> for InternalState {
}
}
-impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
+impl<R: Reader + Position<O>, O, E: Emitter<O>> Tokenizer<R, O, E> {
/// Test-internal function to override internal state.
///
/// Only available with the `integration-tests` feature which is not public API.
@@ -119,7 +123,7 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
/// Just a helper method for the machine.
#[inline]
pub(crate) fn emit_error(&mut self, error: Error) {
- self.emitter.emit_error(error, &self.reader);
+ self.emitter.emit_error(error, self.reader.position());
}
/// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise.
@@ -136,14 +140,14 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
#[inline]
pub(crate) fn init_start_tag(&mut self) {
- self.emitter.init_start_tag(&self.reader);
+ self.emitter.init_start_tag(self.reader.position());
self.current_tag_name.clear();
self.is_start_tag = true;
}
#[inline]
pub(crate) fn init_end_tag(&mut self) {
- self.emitter.init_end_tag(&self.reader);
+ self.emitter.init_end_tag(self.reader.position());
self.current_tag_name.clear();
self.is_start_tag = false;
}
@@ -270,10 +274,11 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
}
}
-impl<R, E> Iterator for Tokenizer<R, E>
+impl<O, R, E> Iterator for Tokenizer<R, O, E>
where
- R: Reader,
- E: Emitter<R>,
+ O: Offset,
+ R: Reader + Position<O>,
+ E: Emitter<O>,
{
type Item = Result<E::Token, R::Error>;
@@ -297,7 +302,7 @@ where
}
}
-impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
+impl<R: Reader, O, E: Emitter<O>> Tokenizer<R, O, E> {
/// Test-internal function to override internal state.
///
/// Only available with the `integration-tests` feature which is not public API.