// src/naive_parser.rs

use crate::offset::{Offset, Position};
use crate::reader::{IntoReader, Reader};
use crate::tokenizer::CdataAction;
use crate::{BasicEmitter, Emitter, Event, State, Tokenizer};

/// A naive HTML parser layered directly on top of the tokenizer.
///
/// It is **not** spec-compliant because it performs no tree construction. In particular:
///
/// * it **does not** correct [misnested tags]
///
/// * it **does not** recognize implicitly self-closing elements like
///   `<img>`; it simply emits a start token for them
///
/// * it naively emits any CDATA sections as bogus comments, for example:
///
///   ```
///   # use html5tokenizer::{NaiveParser, Token};
///   let html = "<svg><![CDATA[I love SVG]]>";
///   let mut tokens = NaiveParser::new(html).flatten();
///   assert!(matches!(tokens.next().unwrap(), Token::StartTag(tag) if tag.name == "svg"));
///   assert!(matches!(tokens.next().unwrap(), Token::Comment(_bogus_comment)));
///   ```
///
/// These caveats are similar to those of the [HTMLParser] from the Python standard library.
/// The parser should suffice for web scraping, but you wouldn't use it to implement a browser.
///
/// [misnested tags]: https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
/// [HTMLParser]: https://docs.python.org/3/library/html.parser.html
pub struct NaiveParser<R, O, E>
where
    R: Reader,
    O: Offset,
    E: Emitter<O>,
{
    /// The wrapped tokenizer that produces the events this parser consumes.
    tokenizer: Tokenizer<R, O, E>,
}

impl<R, O> NaiveParser<R, O, BasicEmitter<O>>
where
    R: Reader + Position<O>,
    O: Offset,
{
    /// Creates a naive parser that emits tokens through a default-constructed [`BasicEmitter`].
    ///
    /// `reader` may be any value that converts into the reader type `R` via [`IntoReader`].
    /// This is a shorthand for [`NaiveParser::new_with_emitter`] with `BasicEmitter::default()`.
    // TODO: add example for NaiveParser::new
    pub fn new<'a, IR: IntoReader<'a, Reader = R>>(reader: IR) -> Self {
        let emitter = BasicEmitter::default();
        Self::new_with_emitter(reader, emitter)
    }
}

impl<R, O, E> NaiveParser<R, O, E>
where
    R: Reader + Position<O>,
    O: Offset,
    E: Emitter<O>,
{
    /// Creates a naive parser that reports tokens through the given `emitter`.
    ///
    /// A [`Tokenizer`] is built from `reader` and `emitter`, and naive state
    /// switching is enabled on it before it is wrapped.
    // TODO: add example for NaiveParser::new_with_emitter
    pub fn new_with_emitter<'a, IR: IntoReader<'a, Reader = R>>(reader: IR, emitter: E) -> Self {
        let mut inner = Tokenizer::new(reader, emitter);
        inner.enable_naive_state_switching();
        Self { tokenizer: inner }
    }

    /// Gives mutable access to the emitter owned by the underlying tokenizer.
    pub fn emitter_mut(&mut self) -> &mut E {
        let tokenizer = &mut self.tokenizer;
        tokenizer.emitter_mut()
    }
}

impl<R, O, E> Iterator for NaiveParser<R, O, E>
where
    R: Reader + Position<O>,
    O: Offset,
    E: Emitter<O> + Iterator,
{
    /// An item of the emitter's iterator, or an error of the reader's error type.
    type Item = Result<E::Item, R::Error>;

    /// Pulls events from the tokenizer until one of them yields an item.
    ///
    /// Tokens and errors are passed through unchanged. A CDATA-open event yields
    /// nothing itself: the tokenizer is told to treat the section as a bogus comment
    /// and the loop continues. Returns `None` once the tokenizer is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // `?` ends the iteration as soon as the tokenizer has no further events.
            match self.tokenizer.next()? {
                // A proper parser would follow the steps described under section
                // '13.2.6 Tree construction' of the spec. Since this parser is naive,
                // the token is handed to the caller directly instead.
                Ok(Event::Token(token)) => break Some(Ok(token)),
                // Naively parse any CDATA sections as bogus comments.
                Ok(Event::CdataOpen) => {
                    self.tokenizer.handle_cdata_open(CdataAction::BogusComment);
                }
                Err(error) => break Some(Err(error)),
            }
        }
    }
}

/// Maps a tag name to the tokenizer [`State`] associated with that element.
///
/// The name is compared exactly (case-sensitively) against a fixed list of
/// element names; any name not in the list maps to [`State::Data`].
// NOTE(review): presumably callers pass an already-lowercased tag name — confirm.
pub(crate) fn naive_next_state(tag_name: &str) -> State {
    // The transitions below are the ones listed in
    // https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments.
    // TODO: investigate what state logic Python's HTMLParser is using
    match tag_name {
        "plaintext" => State::PlainText,
        "script" => State::ScriptData,
        "textarea" | "title" => State::RcData,
        "iframe" | "noembed" | "noframes" | "style" | "xmp" => State::RawText,
        _ => State::Data,
    }
}