use crate::offset::{Offset, Position}; use crate::reader::{IntoReader, Reader}; use crate::tokenizer::CdataAction; use crate::{BasicEmitter, Emitter, Event, State, Tokenizer}; /// A naive HTML parser (**not** spec-compliant since it doesn't do tree construction). /// /// * it **does not** correct [misnested tags] /// /// * it **does not** recognize implicitly self-closing elements like /// ``, it will simply emit a start token /// /// * it naively emits any CDATA sections as bogus comments, for example: /// /// ``` /// # use html5tokenizer::{NaiveParser, Token}; /// let html = "I love SVG"; /// let mut tokens = NaiveParser::new(html).flatten(); /// assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "svg")); /// assert!(matches!(tokens.next().unwrap(), Token::Comment(_bogus_comment))); /// ``` /// /// It has similar caveats to the [HTMLParser] from the Python standard library. /// It should suffice for web scraping but you wouldn't use it to implement a browser. /// /// [misnested tags]: https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser /// [HTMLParser]: https://docs.python.org/3/library/html.parser.html pub struct NaiveParser> { tokenizer: Tokenizer, } impl NaiveParser> where R: Reader + Position, O: Offset, { /// Constructs a new naive parser using the [`BasicEmitter`]. // TODO: add example for NaiveParser::new pub fn new<'a, IR>(reader: IR) -> NaiveParser> where IR: IntoReader<'a, Reader = R>, { NaiveParser::new_with_emitter(reader, BasicEmitter::default()) } } impl NaiveParser where R: Reader + Position, O: Offset, E: Emitter, { /// Constructs a new naive parser with a custom emitter. // TODO: add example for NaiveParser::new_with_emitter pub fn new_with_emitter<'a, IR>(reader: IR, emitter: E) -> NaiveParser where IR: IntoReader<'a, Reader = R>, { let mut tokenizer = Tokenizer::new(reader, emitter); tokenizer.enable_naive_state_switching(); NaiveParser { tokenizer } } /// Returns a mutable reference to the emitter. pub fn emitter_mut(&mut self) -> &mut E { self.tokenizer.emitter_mut() } } impl Iterator for NaiveParser where R: Reader + Position, O: Offset, E: Emitter + Iterator, { type Item = Result; fn next(&mut self) -> Option { loop { let event = self.tokenizer.next()?; match event { Err(e) => return Some(Err(e)), Ok(Event::Token(t)) => { // A proper parser would follow the steps described under section '13.2.6 Tree construction' // of the spec. Since this parser is naive, we directly return the token instead. return Some(Ok(t)); } Ok(Event::CdataOpen) => { // Naively parse any CDATA sections as bogus comments. self.tokenizer.handle_cdata_open(CdataAction::BogusComment) } } } } } pub(crate) fn naive_next_state(tag_name: &str) -> State { // These transitions are defined in https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments. // TODO: investigate what state logic Python's HTMLParser is using match tag_name { "title" | "textarea" => State::RcData, "style" | "xmp" | "iframe" | "noembed" | "noframes" => State::RawText, "script" => State::ScriptData, "plaintext" => State::PlainText, _other => State::Data, } }