use crate::emitter::DefaultEmitter; use crate::offset::{Offset, Position}; use crate::reader::{IntoReader, Reader}; use crate::{Emitter, State, Tokenizer}; /// A naive HTML parser (**not** spec-compliant since it doesn't do tree construction). /// /// * it **does not** correct [misnested tags] /// * it **does not** recognize implicitly self-closing elements like /// ``, it will simply emit a start token /// * it naively emits any CDATA sections as bogus comments /// /// It has similar caveats to the [HTMLParser] from the Python standard library. /// It should suffice for web scraping but you wouldn't use it to implement a browser. /// /// [misnested tags]: https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser /// [HTMLParser]: https://docs.python.org/3/library/html.parser.html pub struct NaiveParser> { tokenizer: Tokenizer, } impl NaiveParser> { /// Constructs a new naive parser. // TODO: add example for NaiveParser::new pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self { let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default()); tokenizer.naively_switch_state = true; NaiveParser { tokenizer } } } impl> NaiveParser> { /// Constructs a new naive parser with source code offsets and spans. // TODO: add example for NaiveParser::new_with_spans pub fn new_with_spans<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self { let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default()); tokenizer.naively_switch_state = true; NaiveParser { tokenizer } } } impl> NaiveParser { /// Constructs a new naive parser with a custom emitter. // TODO: add example for NaiveParser::new_with_emitter pub fn new_with_emitter<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self { let mut tokenizer = Tokenizer::new(reader, emitter); tokenizer.naively_switch_state = true; NaiveParser { tokenizer } } } impl, O: Offset, E: Emitter> Iterator for NaiveParser { type Item = Result; fn next(&mut self) -> Option { self.tokenizer.next() } } pub(crate) fn naive_next_state(tag_name: &str) -> State { // These transitions are defined in https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments. // TODO: investigate what state logic Python's HTMLParser is using match tag_name { "title" | "textarea" => State::RcData, "style" | "xmp" | "iframe" | "noembed" | "noframes" => State::RawText, "script" => State::ScriptData, "plaintext" => State::PlainText, _other => State::Data, } }