aboutsummaryrefslogtreecommitdiff
path: root/src/naive_parser.rs
diff options
context:
space:
mode:
author: Martin Fischer <martin@push-f.com> 2023-08-12 12:58:08 +0200
committer: Martin Fischer <martin@push-f.com> 2023-08-19 13:53:58 +0200
commit: 0d9cd9ed44b676ccd4991cea27dc620b94ebe7e7 (patch)
tree: aba2bff89958bbe4516a49caba5edffc866c64af /src/naive_parser.rs
parent: b125bec9914bd211d77719bd60bc5a23bd9db579 (diff)
feat: introduce NaiveParser
Diffstat (limited to 'src/naive_parser.rs')
-rw-r--r-- src/naive_parser.rs 70
1 files changed, 70 insertions, 0 deletions
diff --git a/src/naive_parser.rs b/src/naive_parser.rs
new file mode 100644
index 0000000..e229592
--- /dev/null
+++ b/src/naive_parser.rs
@@ -0,0 +1,70 @@
+use crate::emitter::DefaultEmitter;
+use crate::offset::{Offset, Position};
+use crate::reader::{IntoReader, Reader};
+use crate::{Emitter, State, Tokenizer};
+
+/// A naive HTML parser built on [`Tokenizer`] (**not** spec-compliant since it doesn't do tree construction).
+///
+/// * it **does not** correct [misnested tags]
+/// * it **does not** recognize implicitly self-closing elements like
+///   `<img>`; it simply emits a start tag token for them
+/// * it naively emits any CDATA sections as bogus comments
+///
+/// It has similar caveats to the [HTMLParser] from the Python standard library: it should suffice for
+/// web scraping but you wouldn't use it to implement a browser. Tokens are obtained by iterating over it.
+///
+/// [misnested tags]: https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
+/// [HTMLParser]: https://docs.python.org/3/library/html.parser.html
+pub struct NaiveParser<R: Reader, O: Offset, E: Emitter<O>> {
+ tokenizer: Tokenizer<R, O, E>, // the wrapped tokenizer; every constructor in this file sets its `naively_switch_state` flag
+}
+
+impl<R: Reader, O: Offset> NaiveParser<R, O, DefaultEmitter<O>> {
+ /// Constructs a new naive parser that reads from `reader` and uses a default-constructed [`DefaultEmitter`].
+ // TODO: add example for NaiveParser::new
+ pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self {
+ let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default());
+ tokenizer.naively_switch_state = true; // NOTE(review): presumably makes the tokenizer pick states via `naive_next_state` — confirm in Tokenizer
+ NaiveParser { tokenizer }
+ }
+}
+
+impl<R: Reader + Position<usize>> NaiveParser<R, usize, DefaultEmitter<usize>> {
+ /// Constructs a new naive parser with source code offsets and spans (offsets are `usize`; the reader must implement `Position<usize>`).
+ // TODO: add example for NaiveParser::new_with_spans
+ pub fn new_with_spans<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self {
+ let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default());
+ tokenizer.naively_switch_state = true; // NOTE(review): presumably makes the tokenizer pick states via `naive_next_state` — confirm in Tokenizer
+ NaiveParser { tokenizer }
+ }
+}
+
+impl<R: Reader, O: Offset, E: Emitter<O>> NaiveParser<R, O, E> {
+ /// Constructs a new naive parser with a custom emitter; `emitter` is handed to the underlying [`Tokenizer`] as-is.
+ // TODO: add example for NaiveParser::new_with_emitter
+ pub fn new_with_emitter<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self {
+ let mut tokenizer = Tokenizer::new(reader, emitter);
+ tokenizer.naively_switch_state = true; // NOTE(review): presumably makes the tokenizer pick states via `naive_next_state` — confirm in Tokenizer
+ NaiveParser { tokenizer }
+ }
+}
+
+impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Iterator for NaiveParser<R, O, E> {
+ type Item = Result<E::Token, R::Error>; // the emitter's token type on success, the reader's error type on failure
+
+ fn next(&mut self) -> Option<Self::Item> {
+ self.tokenizer.next() // forwarded unchanged from the wrapped tokenizer
+ }
+}
+
+pub(crate) fn naive_next_state(tag_name: &str) -> State { // maps a tag name to a tokenizer `State`; the name is matched exactly (case-sensitively)
+ // These transitions are defined in https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments (the spec's `noscript` case, which depends on the scripting flag, has no arm here).
+ // TODO: investigate what state logic Python's HTMLParser is using
+ match tag_name {
+ "title" | "textarea" => State::RcData,
+ "style" | "xmp" | "iframe" | "noembed" | "noframes" => State::RawText,
+ "script" => State::ScriptData,
+ "plaintext" => State::PlainText,
+ _other => State::Data, // every tag name not listed above
+ }
+}