diff options
author | Martin Fischer <martin@push-f.com> | 2023-08-12 12:58:08 +0200 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2023-08-19 13:53:58 +0200 |
commit | 0d9cd9ed44b676ccd4991cea27dc620b94ebe7e7 (patch) | |
tree | aba2bff89958bbe4516a49caba5edffc866c64af | |
parent | b125bec9914bd211d77719bd60bc5a23bd9db579 (diff) |
feat: introduce NaiveParser
-rw-r--r-- | README.md | 48 | ||||
-rw-r--r-- | examples/naive-parser.rs | 13 | ||||
-rw-r--r-- | src/lib.rs | 2 | ||||
-rw-r--r-- | src/naive_parser.rs | 70 | ||||
-rw-r--r-- | src/reader.rs | 2 | ||||
-rw-r--r-- | src/tokenizer.rs | 20 | ||||
-rw-r--r-- | tests/test_spans.rs | 9 |
7 files changed, 126 insertions, 38 deletions
@@ -3,19 +3,18 @@ [![docs.rs](https://img.shields.io/docsrs/html5tokenizer)](https://docs.rs/html5tokenizer) [![crates.io](https://img.shields.io/crates/l/html5tokenizer.svg)](https://crates.io/crates/html5tokenizer) -`html5tokenizer` is a WHATWG-compliant HTML tokenizer -(forked from [html5gum] with added code span support). +Spec-compliant HTML parsing [requires both tokenization and tree-construction][parsing model]. +While this crate implements a spec-compliant HTML tokenizer it does not implement any +tree-construction. Instead it just provides a `NaiveParser` that may be used as follows: -<!-- TODO: update to use NaiveParser API --> -```ignore +``` use std::fmt::Write; -use html5tokenizer::{DefaultEmitter, Tokenizer, Token}; +use html5tokenizer::{NaiveParser, Token}; let html = "<title >hello world</title>"; -let emitter = DefaultEmitter::default(); let mut new_html = String::new(); -for token in Tokenizer::new(html, emitter).flatten() { +for token in NaiveParser::new(html).flatten() { match token { Token::StartTag(tag) => { write!(new_html, "<{}>", tag.name).unwrap(); @@ -33,31 +32,25 @@ for token in Tokenizer::new(html, emitter).flatten() { assert_eq!(new_html, "<title>hello world</title>"); ``` -## What a tokenizer does and what it does not do +## Compared to html5gum -`html5tokenizer` fully implements [13.2.5 of the WHATWG HTML spec][tokenization], -i.e. is able to tokenize HTML documents and passes [html5lib's tokenizer test suite]. -Since it is just a tokenizer, this means: +`html5tokenizer` was forked from [html5gum] 0.2.1. -* `html5tokenizer` **does not** implement [charset detection]. - This implementation requires all input to be Rust strings and therefore valid UTF-8. -* `html5tokenizer` **does not** correct [misnested tags]. -* `html5tokenizer` **does not** recognize implicitly self-closing elements like - `<img>`, as a tokenizer it will simply emit a start token. It does however - emit a self-closing tag for `<img .. />`. 
-* `html5tokenizer` **does not** generally qualify as a browser-grade HTML *parser* as - per the WHATWG spec. This can change in the future. +* Code span support has been added. +* The API has been revised. -With those caveats in mind, `html5tokenizer` can pretty much ~parse~ _tokenize_ -anything that browsers can. +html5gum has since switched its parsing to operate on bytes, +which html5tokenizer doesn't yet support. +`html5tokenizer` **does not** implement [charset detection]. +This implementation requires all input to be Rust strings and therefore valid UTF-8. -## The `Emitter` trait +Both crates pass the [html5lib tokenizer test suite]. -A distinguishing feature of `html5tokenizer` is that you can bring your own token -datastructure and hook into token creation by implementing the `Emitter` trait. +Both crates have an `Emitter` trait that lets you bring your own token data +structure and hook into token creation by implementing the `Emitter` trait. This allows you to: -* Rewrite all per-HTML-tag allocations to use a custom allocator or datastructure. +* Rewrite all per-HTML-tag allocations to use a custom allocator or data structure. * Efficiently filter out uninteresting categories data without ever allocating for it. For example if any plaintext between tokens is not of interest to @@ -69,9 +62,8 @@ This allows you to: Licensed under the MIT license, see [the LICENSE file]. 
+[parsing model]: https://html.spec.whatwg.org/multipage/parsing.html#overview-of-the-parsing-model [html5gum]: https://crates.io/crates/html5gum -[tokenization]: https://html.spec.whatwg.org/multipage/parsing.html#tokenization -[html5lib's tokenizer test suite]: https://github.com/html5lib/html5lib-tests/tree/master/tokenizer +[html5lib tokenizer test suite]: https://github.com/html5lib/html5lib-tests/tree/master/tokenizer [charset detection]: https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding -[misnested tags]: https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser [the LICENSE file]: ./LICENSE diff --git a/examples/naive-parser.rs b/examples/naive-parser.rs new file mode 100644 index 0000000..10fdf34 --- /dev/null +++ b/examples/naive-parser.rs @@ -0,0 +1,13 @@ +//! Lets you easily try out the NaiveParser with e.g. +//! printf '<style><b>Hello world!</b></style>' | cargo run --example=naive-parser +use html5tokenizer::NaiveParser; +use std::io::{stdin, BufReader}; + +fn main() { + let stdin = stdin(); + + for token in NaiveParser::new(BufReader::new(stdin.lock())) { + let token = token.unwrap(); + println!("{:?}", token); + } +} @@ -9,6 +9,7 @@ mod emitter; mod entities; mod error; mod machine; +mod naive_parser; pub mod offset; pub mod reader; mod tokenizer; @@ -16,6 +17,7 @@ mod utils; pub use emitter::{Comment, Doctype, Emitter, EndTag, StartTag, Token}; pub use error::Error; +pub use naive_parser::NaiveParser; pub use tokenizer::{State, Tokenizer}; #[cfg(feature = "integration-tests")] diff --git a/src/naive_parser.rs b/src/naive_parser.rs new file mode 100644 index 0000000..e229592 --- /dev/null +++ b/src/naive_parser.rs @@ -0,0 +1,70 @@ +use crate::emitter::DefaultEmitter; +use crate::offset::{Offset, Position}; +use crate::reader::{IntoReader, Reader}; +use crate::{Emitter, State, Tokenizer}; + +/// A naive HTML parser (**not** spec-compliant since it 
doesn't do tree construction). +/// +/// * it **does not** correct [misnested tags] +/// * it **does not** recognize implicitly self-closing elements like +/// `<img>`, it will simply emit a start token +/// * it naively emits any CDATA sections as bogus comments +/// +/// It has similar caveats to the [HTMLParser] from the Python standard library. +/// It should suffice for web scraping but you wouldn't use it to implement a browser. +/// +/// [misnested tags]: https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser +/// [HTMLParser]: https://docs.python.org/3/library/html.parser.html +pub struct NaiveParser<R: Reader, O: Offset, E: Emitter<O>> { + tokenizer: Tokenizer<R, O, E>, +} + +impl<R: Reader, O: Offset> NaiveParser<R, O, DefaultEmitter<O>> { + /// Constructs a new naive parser. + // TODO: add example for NaiveParser::new + pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self { + let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default()); + tokenizer.naively_switch_state = true; + NaiveParser { tokenizer } + } +} + +impl<R: Reader + Position<usize>> NaiveParser<R, usize, DefaultEmitter<usize>> { + /// Constructs a new naive parser with source code offsets and spans. + // TODO: add example for NaiveParser::new_with_spans + pub fn new_with_spans<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self { + let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default()); + tokenizer.naively_switch_state = true; + NaiveParser { tokenizer } + } +} + +impl<R: Reader, O: Offset, E: Emitter<O>> NaiveParser<R, O, E> { + /// Constructs a new naive parser with a custom emitter. 
+ // TODO: add example for NaiveParser::new_with_emitter + pub fn new_with_emitter<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self { + let mut tokenizer = Tokenizer::new(reader, emitter); + tokenizer.naively_switch_state = true; + NaiveParser { tokenizer } + } +} + +impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Iterator for NaiveParser<R, O, E> { + type Item = Result<E::Token, R::Error>; + + fn next(&mut self) -> Option<Self::Item> { + self.tokenizer.next() + } +} + +pub(crate) fn naive_next_state(tag_name: &str) -> State { + // These transitions are defined in https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments. + // TODO: investigate what state logic Python's HTMLParser is using + match tag_name { + "title" | "textarea" => State::RcData, + "style" | "xmp" | "iframe" | "noembed" | "noframes" => State::RawText, + "script" => State::ScriptData, + "plaintext" => State::PlainText, + _other => State::Data, + } +} diff --git a/src/reader.rs b/src/reader.rs index e0161e5..b6e0905 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -33,7 +33,7 @@ pub trait Reader { /// An object that can be converted into a [`Reader`]. /// /// For example, any utf8-string can be converted into a `StringReader`. -// TODO: , such that [give concrete examples of not-yet-implemented parser API] work. +// TODO: , such that [give concrete examples of NaiveParser::new] work. pub trait IntoReader<'a> { /// The reader type into which this type should be converted. type Reader: Reader + 'a; diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 3a75e60..7cc4712 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1,4 +1,5 @@ use crate::machine; +use crate::naive_parser::naive_next_state; use crate::offset::{Offset, Position}; use crate::reader::{IntoReader, Reader}; use crate::utils::{ @@ -32,7 +33,12 @@ impl<T: Copy> Stack2<T> { } } -/// An HTML tokenizer. See crate-level docs for basic usage. +/// An HTML tokenizer. 
+/// +/// Note that for proper HTML parsing, you'll have to implement [tree construction] +/// based on this Tokenizer yourself (since this crate currently does not implement it). +/// +/// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction pub struct Tokenizer<R: Reader, O, E: Emitter<O>> { eof: bool, pub(crate) state: InternalState, @@ -46,12 +52,18 @@ pub struct Tokenizer<R: Reader, O, E: Emitter<O>> { last_start_tag_name: String, is_start_tag: bool, pub(crate) doctype_offset: O, + /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`] + /// (it cannot call [`Tokenizer::set_state`] using the emitted start tags since they can be of an arbitrary type). + pub(crate) naively_switch_state: bool, } impl<R: Reader, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { /// Creates a new tokenizer from some input and an emitter. /// - /// TODO: add warning about you needing to do the state switching + /// Note that properly parsing HTML with this tokenizer requires you to + /// implement [tree construction] and call [`Tokenizer::set_state`] accordingly. 
+ /// + /// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self { Tokenizer { reader: reader.into_reader(), @@ -66,6 +78,7 @@ impl<R: Reader, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { last_start_tag_name: String::new(), is_start_tag: false, doctype_offset: O::default(), + naively_switch_state: false, } } } @@ -175,6 +188,9 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { pub(crate) fn emit_current_tag(&mut self) { self.emitter.emit_current_tag(self.reader.position() - 1); if self.is_start_tag { + if self.naively_switch_state { + self.state = naive_next_state(&self.current_tag_name).into(); + } std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name); } } diff --git a/tests/test_spans.rs b/tests/test_spans.rs index 791f1a6..d3e62ae 100644 --- a/tests/test_spans.rs +++ b/tests/test_spans.rs @@ -1,4 +1,3 @@ -#![cfg(feature = "integration-tests")] // TODO: switch to NaiveParser API use std::ops::Range; use codespan_reporting::{ @@ -7,16 +6,12 @@ use codespan_reporting::{ files::SimpleFiles, term::{self, termcolor::Buffer}, }; -use html5tokenizer::{offset::PosTrackingReader, DefaultEmitter, Token, Tokenizer}; +use html5tokenizer::{offset::PosTrackingReader, NaiveParser, Token}; use insta::assert_snapshot; use similar_asserts::assert_eq; fn tokenizer(html: &'static str) -> impl Iterator<Item = Token<usize>> { - Tokenizer::new( - PosTrackingReader::new(html), - DefaultEmitter::<usize>::default(), - ) - .flatten() + NaiveParser::new(PosTrackingReader::new(html)).flatten() } fn annotate(html: &str, labels: Vec<(Range<usize>, impl AsRef<str>)>) -> String { |