feat: introduce NaiveParser

author: Martin Fischer <martin@push-f.com> 2023-08-12 12:58:08 +0200
committer: Martin Fischer <martin@push-f.com> 2023-08-19 13:53:58 +0200
commit: 0d9cd9ed44b676ccd4991cea27dc620b94ebe7e7 (patch)
tree: aba2bff89958bbe4516a49caba5edffc866c64af /src
parent: b125bec9914bd211d77719bd60bc5a23bd9db579 (diff)
4 files changed, 91 insertions, 3 deletions
diff --git a/src/lib.rs b/src/lib.rs
index c14613b..1cfb7c9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,6 +9,7 @@ mod emitter;
 mod entities;
 mod error;
 mod machine;
+mod naive_parser;
 pub mod offset;
 pub mod reader;
 mod tokenizer;
@@ -16,6 +17,7 @@ mod utils;
 
 pub use emitter::{Comment, Doctype, Emitter, EndTag, StartTag, Token};
 pub use error::Error;
+pub use naive_parser::NaiveParser;
 pub use tokenizer::{State, Tokenizer};
 
 #[cfg(feature = "integration-tests")]
diff --git a/src/naive_parser.rs b/src/naive_parser.rs
new file mode 100644
index 0000000..e229592
--- /dev/null
+++ b/src/naive_parser.rs
@@ -0,0 +1,70 @@
+use crate::emitter::DefaultEmitter;
+use crate::offset::{Offset, Position};
+use crate::reader::{IntoReader, Reader};
+use crate::{Emitter, State, Tokenizer};
+
+/// A naive HTML parser (**not** spec-compliant since it doesn't do tree construction).
+///
+/// * it **does not** correct [misnested tags]
+/// * it **does not** recognize void elements (implicitly self-closing
+///   elements like `<img>`); it simply emits a start tag token for them
+/// * it naively emits any CDATA sections as bogus comments
+///
+/// It has similar caveats to the [HTMLParser] from the Python standard library.
+/// It should suffice for web scraping but you wouldn't use it to implement a browser.
+///
+/// [misnested tags]: https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
+/// [HTMLParser]: https://docs.python.org/3/library/html.parser.html
+pub struct NaiveParser<R: Reader, O: Offset, E: Emitter<O>> {
+    tokenizer: Tokenizer<R, O, E>,
+}
+
+impl<R: Reader, O: Offset> NaiveParser<R, O, DefaultEmitter<O>> {
+    /// Constructs a new naive parser.
+    // TODO: add example for NaiveParser::new
+    pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self {
+        let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default());
+        tokenizer.naively_switch_state = true;
+        NaiveParser { tokenizer }
+    }
+}
+
+impl<R: Reader + Position<usize>> NaiveParser<R, usize, DefaultEmitter<usize>> {
+    /// Constructs a new naive parser with source code offsets and spans.
+    // TODO: add example for NaiveParser::new_with_spans
+    pub fn new_with_spans<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self {
+        let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default());
+        tokenizer.naively_switch_state = true;
+        NaiveParser { tokenizer }
+    }
+}
+
+impl<R: Reader, O: Offset, E: Emitter<O>> NaiveParser<R, O, E> {
+    /// Constructs a new naive parser with a custom emitter.
+    // TODO: add example for NaiveParser::new_with_emitter
+    pub fn new_with_emitter<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self {
+        let mut tokenizer = Tokenizer::new(reader, emitter);
+        tokenizer.naively_switch_state = true;
+        NaiveParser { tokenizer }
+    }
+}
+
+impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Iterator for NaiveParser<R, O, E> {
+    type Item = Result<E::Token, R::Error>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.tokenizer.next()
+    }
+}
+
+pub(crate) fn naive_next_state(tag_name: &str) -> State {
+    // These transitions are defined in https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments.
+    // TODO: investigate what state logic Python's HTMLParser is using
+    match tag_name {
+        "title" | "textarea" => State::RcData,
+        "style" | "xmp" | "iframe" | "noembed" | "noframes" => State::RawText,
+        "script" => State::ScriptData,
+        "plaintext" => State::PlainText,
+        _other => State::Data,
+    }
+}
diff --git a/src/reader.rs b/src/reader.rs
index e0161e5..b6e0905 100644
--- a/src/reader.rs
+++ b/src/reader.rs
@@ -33,7 +33,7 @@ pub trait Reader {
 /// An object that can be converted into a [`Reader`].
 ///
 /// For example, any utf8-string can be converted into a `StringReader`.
-// TODO: , such that [give concrete examples of not-yet-implemented parser API] work.
+// TODO: complete the sentence above with ", such that [concrete examples of NaiveParser::new] work."
 pub trait IntoReader<'a> {
     /// The reader type into which this type should be converted.
     type Reader: Reader + 'a;
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 3a75e60..7cc4712 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1,4 +1,5 @@
 use crate::machine;
+use crate::naive_parser::naive_next_state;
 use crate::offset::{Offset, Position};
 use crate::reader::{IntoReader, Reader};
 use crate::utils::{
@@ -32,7 +33,12 @@ impl<T: Copy> Stack2<T> {
     }
 }
 
-/// An HTML tokenizer. See crate-level docs for basic usage.
+/// An HTML tokenizer.
+///
+/// Note that for proper HTML parsing, you'll have to implement [tree construction]
+/// based on this Tokenizer yourself (since this crate currently does not implement it).
+///
+/// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
 pub struct Tokenizer<R: Reader, O, E: Emitter<O>> {
     eof: bool,
     pub(crate) state: InternalState,
@@ -46,12 +52,18 @@ pub struct Tokenizer<R: Reader, O, E: Emitter<O>> {
     last_start_tag_name: String,
     is_start_tag: bool,
     pub(crate) doctype_offset: O,
+    /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`]
+    /// (it cannot call [`Tokenizer::set_state`] using the emitted start tags since they can be of an arbitrary type).
+    pub(crate) naively_switch_state: bool,
 }
 
 impl<R: Reader, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
     /// Creates a new tokenizer from some input and an emitter.
     ///
-    /// TODO: add warning about you needing to do the state switching
+    /// Note that properly parsing HTML with this tokenizer requires you to
+    /// implement [tree construction] and call [`Tokenizer::set_state`] accordingly.
+    ///
+    /// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
     pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self {
         Tokenizer {
             reader: reader.into_reader(),
@@ -66,6 +78,7 @@ impl<R: Reader, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
             last_start_tag_name: String::new(),
             is_start_tag: false,
             doctype_offset: O::default(),
+            naively_switch_state: false,
         }
     }
 }
@@ -175,6 +188,9 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
     pub(crate) fn emit_current_tag(&mut self) {
         self.emitter.emit_current_tag(self.reader.position() - 1);
         if self.is_start_tag {
+            if self.naively_switch_state {
+                self.state = naive_next_state(&self.current_tag_name).into();
+            }
             std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name);
         }
     }
author	Martin Fischer <martin@push-f.com>	2023-08-12 12:58:08 +0200
committer	Martin Fischer <martin@push-f.com>	2023-08-19 13:53:58 +0200
commit	0d9cd9ed44b676ccd4991cea27dc620b94ebe7e7 (patch)
tree	aba2bff89958bbe4516a49caba5edffc866c64af /src
parent	b125bec9914bd211d77719bd60bc5a23bd9db579 (diff)