use crate::default_emitter::DefaultEmitter; use crate::offset::{Offset, Position}; use crate::reader::{IntoReader, Reader}; use crate::tokenizer::CdataAction; use crate::{Emitter, Event, State, Tokenizer}; /// A naive HTML parser (**not** spec-compliant since it doesn't do tree construction). /// /// * it **does not** correct [misnested tags] /// /// * it **does not** recognize implicitly self-closing elements like /// ``, it will simply emit a start token /// /// * it naively emits any CDATA sections as bogus comments, for example: /// /// ``` /// # use html5tokenizer::{Error, NaiveParser, Tokenizer, Token}; /// let html = "I love SVG"; /// let mut tokens = NaiveParser::new(html).flatten(); /// assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "svg")); /// assert!(matches!(tokens.next().unwrap(), Token::Error {error: Error::CdataInHtmlContent, ..})); /// assert!(matches!(tokens.next().unwrap(), Token::Comment(_bogus_comment))); /// ``` /// /// It has similar caveats to the [HTMLParser] from the Python standard library. /// It should suffice for web scraping but you wouldn't use it to implement a browser. /// /// [misnested tags]: https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser /// [HTMLParser]: https://docs.python.org/3/library/html.parser.html pub struct NaiveParser> { tokenizer: Tokenizer, } impl, O: Offset> NaiveParser> { /// Constructs a new naive parser. // TODO: add example for NaiveParser::new pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self { let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default()); tokenizer.naively_switch_state = true; NaiveParser { tokenizer } } } impl> NaiveParser> { /// Constructs a new naive parser with source code offsets and spans. // TODO: add example for NaiveParser::new_with_spans pub fn new_with_spans<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self { let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default()); tokenizer.naively_switch_state = true; NaiveParser { tokenizer } } } impl, O: Offset, E: Emitter> NaiveParser { /// Constructs a new naive parser with a custom emitter. // TODO: add example for NaiveParser::new_with_emitter pub fn new_with_emitter<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self { let mut tokenizer = Tokenizer::new(reader, emitter); tokenizer.naively_switch_state = true; NaiveParser { tokenizer } } } impl Iterator for NaiveParser where R: Reader + Position, O: Offset, E: Emitter + Iterator, { type Item = Result; fn next(&mut self) -> Option { loop { let event = self.tokenizer.next()?; match event { Err(e) => return Some(Err(e)), Ok(Event::Token(t)) => { // A proper parser would follow the steps described under section '13.2.6 Tree construction' // of the spec. Since this parser is naive, we directly return the token instead. return Some(Ok(t)); } Ok(Event::CdataOpen) => { // Naively parse any CDATA sections as bogus comments. self.tokenizer.handle_cdata_open(CdataAction::BogusComment) } } } } } pub(crate) fn naive_next_state(tag_name: &str) -> State { // These transitions are defined in https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments. // TODO: investigate what state logic Python's HTMLParser is using match tag_name { "title" | "textarea" => State::RcData, "style" | "xmp" | "iframe" | "noembed" | "noframes" => State::RawText, "script" => State::ScriptData, "plaintext" => State::PlainText, _other => State::Data, } }