use crate::emitter::DefaultEmitter;
use crate::offset::{Offset, Position};
use crate::reader::{IntoReader, Reader};
use crate::{Emitter, State, Tokenizer};
/// A naive HTML parser (**not** spec-compliant since it doesn't do tree construction).
///
/// * it **does not** correct [misnested tags]
/// * it **does not** recognize implicitly self-closing elements like
///   `<br>`, it will simply emit a start token
/// * it naively emits any CDATA sections as bogus comments
///
/// It has similar caveats to the [HTMLParser] from the Python standard library.
/// It should suffice for web scraping but you wouldn't use it to implement a browser.
///
/// [misnested tags]: https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
/// [HTMLParser]: https://docs.python.org/3/library/html.parser.html#html.parser.HTMLParser
// NOTE(review): the generic parameter list of this struct and the type arguments of
// its `Tokenizer` field appear to have been stripped (a stray `>` remains), and the
// closing brace is missing. Presumably the parameters are a reader, an offset and an
// emitter type, matching the `impl` headers below — restore from version control.
pub struct NaiveParser> {
/// The wrapped tokenizer. Each constructor visible here sets its
/// `naively_switch_state` flag to `true` before storing it.
tokenizer: Tokenizer,
// NOTE(review): this `impl` header is truncated (the start of the generic parameter
// list and the type arguments of `NaiveParser` are missing), and the closing braces
// of the method and the `impl` are absent. Presumably this is the impl for the
// `DefaultEmitter` case, since that is what `new` constructs — confirm and restore.
impl, O: Offset> NaiveParser> {
/// Constructs a new naive parser.
///
/// Builds a [`Tokenizer`] over `reader` with a default-constructed
/// [`DefaultEmitter`], enables the tokenizer's `naively_switch_state` flag and
/// wraps it.
// TODO: add example for NaiveParser::new
pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self {
let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default());
// Opt in to naive state switching. Presumably the tokenizer then chooses its next
// state from the start tag name (cf. `naive_next_state`) — confirm in `Tokenizer`.
tokenizer.naively_switch_state = true;
NaiveParser { tokenizer }
// NOTE(review): this `impl` header is truncated (generic parameters and the type
// arguments of `NaiveParser` are missing), and the closing braces are absent. The
// concrete offset type that distinguishes this impl from the one providing `new`
// is not recoverable from this excerpt — restore from version control.
impl> NaiveParser> {
/// Constructs a new naive parser with source code offsets and spans.
///
/// The body is identical to that of `new`: a [`Tokenizer`] over `reader` with a
/// default-constructed [`DefaultEmitter`] and `naively_switch_state` enabled. Any
/// difference in behavior therefore comes from the type parameters of this `impl`.
// TODO: add example for NaiveParser::new_with_spans
pub fn new_with_spans<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self {
let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default());
// Opt in to naive state switching. Presumably the tokenizer then chooses its next
// state from the start tag name (cf. `naive_next_state`) — confirm in `Tokenizer`.
tokenizer.naively_switch_state = true;
NaiveParser { tokenizer }
// NOTE(review): this `impl` header is truncated (the start of the generic parameter
// list, the type argument of `Emitter` and the type arguments of `NaiveParser` are
// missing), and the closing braces are absent — restore from version control.
impl, O: Offset, E: Emitter> NaiveParser {
/// Constructs a new naive parser with a custom emitter.
///
/// Builds a [`Tokenizer`] over `reader` that uses the caller-supplied `emitter`
/// instead of a [`DefaultEmitter`], enables the tokenizer's
/// `naively_switch_state` flag and wraps it.
// TODO: add example for NaiveParser::new_with_emitter
pub fn new_with_emitter<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self {
let mut tokenizer = Tokenizer::new(reader, emitter);
// Opt in to naive state switching. Presumably the tokenizer then chooses its next
// state from the start tag name (cf. `naive_next_state`) — confirm in `Tokenizer`.
tokenizer.naively_switch_state = true;
NaiveParser { tokenizer }
// Iterator implementation for the parser: each item is a `Result`.
// NOTE(review): the generic parameter list of this `impl`, the type arguments of
// `NaiveParser`, `Result` and `Option`, the whole body of `next` and all closing
// braces are missing from this excerpt. Do not infer the iteration logic from here —
// restore it from version control.
impl, O: Offset, E: Emitter> Iterator for NaiveParser {
// NOTE(review): the success and error types of this `Result` were stripped.
type Item = Result;
// NOTE(review): no body is visible for this method.
fn next(&mut self) -> Option {
/// Maps a tag name to the tokenizer [`State`] associated with it.
///
/// * `title`, `textarea` → [`State::RcData`]
/// * `style`, `xmp`, `iframe`, `noembed`, `noframes` → [`State::RawText`]
/// * `script` → [`State::ScriptData`]
/// * `plaintext` → [`State::PlainText`]
/// * anything else → [`State::Data`]
///
/// The comparison is an exact, case-sensitive string match; presumably callers pass
/// an already lowercased tag name — confirm at the call site.
// NOTE(review): the closing braces of the `match` and of this function are missing
// from this excerpt.
pub(crate) fn naive_next_state(tag_name: &str) -> State {
// These transitions are defined in
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
// (the context-element step of the fragment parsing algorithm). `noscript`, which
// the spec maps to RAWTEXT only when scripting is enabled, is not special-cased here.
// TODO: investigate what state logic Python's HTMLParser is using
match tag_name {
"title" | "textarea" => State::RcData,
"style" | "xmp" | "iframe" | "noembed" | "noframes" => State::RawText,
"script" => State::ScriptData,
"plaintext" => State::PlainText,
// Every other element keeps the tokenizer in the regular data state.
_other => State::Data,