diff options
Diffstat (limited to 'src/naive_parser.rs')
-rw-r--r-- | src/naive_parser.rs | 70 |
1 files changed, 70 insertions, 0 deletions
diff --git a/src/naive_parser.rs b/src/naive_parser.rs new file mode 100644 index 0000000..e229592 --- /dev/null +++ b/src/naive_parser.rs @@ -0,0 +1,70 @@ +use crate::emitter::DefaultEmitter; +use crate::offset::{Offset, Position}; +use crate::reader::{IntoReader, Reader}; +use crate::{Emitter, State, Tokenizer}; + +/// A naive HTML parser (**not** spec-compliant since it doesn't do tree construction). +/// +/// * it **does not** correct [misnested tags] +/// * it **does not** recognize implicitly self-closing elements like +/// `<img>`, it will simply emit a start token +/// * it naively emits any CDATA sections as bogus comments +/// +/// It has similar caveats to the [HTMLParser] from the Python standard library. +/// It should suffice for web scraping but you wouldn't use it to implement a browser. +/// +/// [misnested tags]: https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser +/// [HTMLParser]: https://docs.python.org/3/library/html.parser.html +pub struct NaiveParser<R: Reader, O: Offset, E: Emitter<O>> { + tokenizer: Tokenizer<R, O, E>, +} + +impl<R: Reader, O: Offset> NaiveParser<R, O, DefaultEmitter<O>> { + /// Constructs a new naive parser. + // TODO: add example for NaiveParser::new + pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self { + let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default()); + tokenizer.naively_switch_state = true; + NaiveParser { tokenizer } + } +} + +impl<R: Reader + Position<usize>> NaiveParser<R, usize, DefaultEmitter<usize>> { + /// Constructs a new naive parser with source code offsets and spans. + // TODO: add example for NaiveParser::new_with_spans + pub fn new_with_spans<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self { + let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default()); + tokenizer.naively_switch_state = true; + NaiveParser { tokenizer } + } +} + +impl<R: Reader, O: Offset, E: Emitter<O>> NaiveParser<R, O, E> { + /// Constructs a new naive parser with a custom emitter. + // TODO: add example for NaiveParser::new_with_emitter + pub fn new_with_emitter<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self { + let mut tokenizer = Tokenizer::new(reader, emitter); + tokenizer.naively_switch_state = true; + NaiveParser { tokenizer } + } +} + +impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Iterator for NaiveParser<R, O, E> { + type Item = Result<E::Token, R::Error>; + + fn next(&mut self) -> Option<Self::Item> { + self.tokenizer.next() + } +} + +pub(crate) fn naive_next_state(tag_name: &str) -> State { + // These transitions are defined in https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments. + // TODO: investigate what state logic Python's HTMLParser is using + match tag_name { + "title" | "textarea" => State::RcData, + "style" | "xmp" | "iframe" | "noembed" | "noframes" => State::RawText, + "script" => State::ScriptData, + "plaintext" => State::PlainText, + _other => State::Data, + } +} |