about summary refs log tree commit diff
path: root/src/tokenizer.rs
diff options
context:
space:
mode:
author Martin Fischer <martin@push-f.com> 2023-08-12 12:58:08 +0200
committer Martin Fischer <martin@push-f.com> 2023-08-19 13:53:58 +0200
commit 0d9cd9ed44b676ccd4991cea27dc620b94ebe7e7 (patch)
tree aba2bff89958bbe4516a49caba5edffc866c64af /src/tokenizer.rs
parent b125bec9914bd211d77719bd60bc5a23bd9db579 (diff)
feat: introduce NaiveParser
Diffstat (limited to 'src/tokenizer.rs')
-rw-r--r-- src/tokenizer.rs 20
1 files changed, 18 insertions, 2 deletions
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 3a75e60..7cc4712 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1,4 +1,5 @@
use crate::machine;
+use crate::naive_parser::naive_next_state;
use crate::offset::{Offset, Position};
use crate::reader::{IntoReader, Reader};
use crate::utils::{
@@ -32,7 +33,12 @@ impl<T: Copy> Stack2<T> {
}
}
-/// An HTML tokenizer. See crate-level docs for basic usage.
+/// An HTML tokenizer.
+///
+/// Note that for proper HTML parsing, you'll have to implement [tree construction]
+/// based on this Tokenizer yourself (since this crate currently does not implement it).
+///
+/// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
pub struct Tokenizer<R: Reader, O, E: Emitter<O>> {
eof: bool,
pub(crate) state: InternalState,
@@ -46,12 +52,18 @@ pub struct Tokenizer<R: Reader, O, E: Emitter<O>> {
last_start_tag_name: String,
is_start_tag: bool,
pub(crate) doctype_offset: O,
+ /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`]
+ /// (it cannot call [`Tokenizer::set_state`] using the emitted start tags since they can be of an arbitrary type).
+ pub(crate) naively_switch_state: bool,
}
impl<R: Reader, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
/// Creates a new tokenizer from some input and an emitter.
///
- /// TODO: add warning about you needing to do the state switching
+ /// Note that properly parsing HTML with this tokenizer requires you to
+ /// implement [tree construction] and call [`Tokenizer::set_state`] accordingly.
+ ///
+ /// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self {
Tokenizer {
reader: reader.into_reader(),
@@ -66,6 +78,7 @@ impl<R: Reader, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
last_start_tag_name: String::new(),
is_start_tag: false,
doctype_offset: O::default(),
+ naively_switch_state: false,
}
}
}
@@ -175,6 +188,9 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
pub(crate) fn emit_current_tag(&mut self) {
self.emitter.emit_current_tag(self.reader.position() - 1);
if self.is_start_tag {
+ if self.naively_switch_state {
+ self.state = naive_next_state(&self.current_tag_name).into();
+ }
std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name);
}
}