aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMartin Fischer <martin@push-f.com>2023-08-12 12:58:08 +0200
committerMartin Fischer <martin@push-f.com>2023-08-19 13:53:58 +0200
commit0d9cd9ed44b676ccd4991cea27dc620b94ebe7e7 (patch)
treeaba2bff89958bbe4516a49caba5edffc866c64af
parentb125bec9914bd211d77719bd60bc5a23bd9db579 (diff)
feat: introduce NaiveParser
-rw-r--r--README.md48
-rw-r--r--examples/naive-parser.rs13
-rw-r--r--src/lib.rs2
-rw-r--r--src/naive_parser.rs70
-rw-r--r--src/reader.rs2
-rw-r--r--src/tokenizer.rs20
-rw-r--r--tests/test_spans.rs9
7 files changed, 126 insertions, 38 deletions
diff --git a/README.md b/README.md
index ce68663..54cf1bd 100644
--- a/README.md
+++ b/README.md
@@ -3,19 +3,18 @@
[![docs.rs](https://img.shields.io/docsrs/html5tokenizer)](https://docs.rs/html5tokenizer)
[![crates.io](https://img.shields.io/crates/l/html5tokenizer.svg)](https://crates.io/crates/html5tokenizer)
-`html5tokenizer` is a WHATWG-compliant HTML tokenizer
-(forked from [html5gum] with added code span support).
+Spec-compliant HTML parsing [requires both tokenization and tree construction][parsing model].
+While this crate implements a spec-compliant HTML tokenizer, it does not implement any
+tree construction. Instead it just provides a `NaiveParser` that may be used as follows:
-<!-- TODO: update to use NaiveParser API -->
-```ignore
+```
use std::fmt::Write;
-use html5tokenizer::{DefaultEmitter, Tokenizer, Token};
+use html5tokenizer::{NaiveParser, Token};
let html = "<title >hello world</title>";
-let emitter = DefaultEmitter::default();
let mut new_html = String::new();
-for token in Tokenizer::new(html, emitter).flatten() {
+for token in NaiveParser::new(html).flatten() {
match token {
Token::StartTag(tag) => {
write!(new_html, "<{}>", tag.name).unwrap();
@@ -33,31 +32,25 @@ for token in Tokenizer::new(html, emitter).flatten() {
assert_eq!(new_html, "<title>hello world</title>");
```
-## What a tokenizer does and what it does not do
+## Compared to html5gum
-`html5tokenizer` fully implements [13.2.5 of the WHATWG HTML spec][tokenization],
-i.e. is able to tokenize HTML documents and passes [html5lib's tokenizer test suite].
-Since it is just a tokenizer, this means:
+`html5tokenizer` was forked from [html5gum] 0.2.1.
-* `html5tokenizer` **does not** implement [charset detection].
- This implementation requires all input to be Rust strings and therefore valid UTF-8.
-* `html5tokenizer` **does not** correct [misnested tags].
-* `html5tokenizer` **does not** recognize implicitly self-closing elements like
- `<img>`, as a tokenizer it will simply emit a start token. It does however
- emit a self-closing tag for `<img .. />`.
-* `html5tokenizer` **does not** generally qualify as a browser-grade HTML *parser* as
- per the WHATWG spec. This can change in the future.
+* Code span support has been added.
+* The API has been revised.
-With those caveats in mind, `html5tokenizer` can pretty much ~parse~ _tokenize_
-anything that browsers can.
+html5gum has since switched its parsing to operate on bytes,
+which html5tokenizer doesn't yet support.
+`html5tokenizer` **does not** implement [charset detection].
+This implementation requires all input to be Rust strings and therefore valid UTF-8.
-## The `Emitter` trait
+Both crates pass the [html5lib tokenizer test suite].
-A distinguishing feature of `html5tokenizer` is that you can bring your own token
-datastructure and hook into token creation by implementing the `Emitter` trait.
+Both crates have an `Emitter` trait: by implementing it you can bring your own
+token data structure and hook into token creation.
This allows you to:
-* Rewrite all per-HTML-tag allocations to use a custom allocator or datastructure.
+* Rewrite all per-HTML-tag allocations to use a custom allocator or data structure.
* Efficiently filter out uninteresting categories data without ever allocating
for it. For example if any plaintext between tokens is not of interest to
@@ -69,9 +62,8 @@ This allows you to:
Licensed under the MIT license, see [the LICENSE file].
+[parsing model]: https://html.spec.whatwg.org/multipage/parsing.html#overview-of-the-parsing-model
[html5gum]: https://crates.io/crates/html5gum
-[tokenization]: https://html.spec.whatwg.org/multipage/parsing.html#tokenization
-[html5lib's tokenizer test suite]: https://github.com/html5lib/html5lib-tests/tree/master/tokenizer
+[html5lib tokenizer test suite]: https://github.com/html5lib/html5lib-tests/tree/master/tokenizer
[charset detection]: https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
-[misnested tags]: https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
[the LICENSE file]: ./LICENSE
diff --git a/examples/naive-parser.rs b/examples/naive-parser.rs
new file mode 100644
index 0000000..10fdf34
--- /dev/null
+++ b/examples/naive-parser.rs
@@ -0,0 +1,13 @@
+//! Lets you easily try out the NaiveParser with e.g.
+//! printf '<style><b>Hello world!</b></style>' | cargo run --example=naive-parser
+use html5tokenizer::NaiveParser;
+use std::io::{stdin, BufReader};
+
+fn main() {
+ let stdin = stdin();
+
+ for token in NaiveParser::new(BufReader::new(stdin.lock())) {
+ let token = token.unwrap();
+ println!("{:?}", token);
+ }
+}
diff --git a/src/lib.rs b/src/lib.rs
index c14613b..1cfb7c9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,6 +9,7 @@ mod emitter;
mod entities;
mod error;
mod machine;
+mod naive_parser;
pub mod offset;
pub mod reader;
mod tokenizer;
@@ -16,6 +17,7 @@ mod utils;
pub use emitter::{Comment, Doctype, Emitter, EndTag, StartTag, Token};
pub use error::Error;
+pub use naive_parser::NaiveParser;
pub use tokenizer::{State, Tokenizer};
#[cfg(feature = "integration-tests")]
diff --git a/src/naive_parser.rs b/src/naive_parser.rs
new file mode 100644
index 0000000..e229592
--- /dev/null
+++ b/src/naive_parser.rs
@@ -0,0 +1,70 @@
+use crate::emitter::DefaultEmitter;
+use crate::offset::{Offset, Position};
+use crate::reader::{IntoReader, Reader};
+use crate::{Emitter, State, Tokenizer};
+
+/// A naive HTML parser (**not** spec-compliant since it doesn't do tree construction).
+///
+/// * it **does not** correct [misnested tags]
+/// * it **does not** recognize implicitly self-closing elements like
+/// `<img>`, it will simply emit a start token
+/// * it naively emits any CDATA sections as bogus comments
+///
+/// It has similar caveats to the [HTMLParser] from the Python standard library.
+/// It should suffice for web scraping but you wouldn't use it to implement a browser.
+///
+/// [misnested tags]: https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
+/// [HTMLParser]: https://docs.python.org/3/library/html.parser.html
+pub struct NaiveParser<R: Reader, O: Offset, E: Emitter<O>> {
+ tokenizer: Tokenizer<R, O, E>,
+}
+
+impl<R: Reader, O: Offset> NaiveParser<R, O, DefaultEmitter<O>> {
+ /// Constructs a new naive parser.
+ // TODO: add example for NaiveParser::new
+ pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self {
+ let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default());
+ tokenizer.naively_switch_state = true;
+ NaiveParser { tokenizer }
+ }
+}
+
+impl<R: Reader + Position<usize>> NaiveParser<R, usize, DefaultEmitter<usize>> {
+ /// Constructs a new naive parser with source code offsets and spans.
+ // TODO: add example for NaiveParser::new_with_spans
+ pub fn new_with_spans<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self {
+ let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default());
+ tokenizer.naively_switch_state = true;
+ NaiveParser { tokenizer }
+ }
+}
+
+impl<R: Reader, O: Offset, E: Emitter<O>> NaiveParser<R, O, E> {
+ /// Constructs a new naive parser with a custom emitter.
+ // TODO: add example for NaiveParser::new_with_emitter
+ pub fn new_with_emitter<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self {
+ let mut tokenizer = Tokenizer::new(reader, emitter);
+ tokenizer.naively_switch_state = true;
+ NaiveParser { tokenizer }
+ }
+}
+
+impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Iterator for NaiveParser<R, O, E> {
+ type Item = Result<E::Token, R::Error>;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ self.tokenizer.next()
+ }
+}
+
+pub(crate) fn naive_next_state(tag_name: &str) -> State {
+ // These transitions are defined in https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments.
+ // TODO: investigate what state logic Python's HTMLParser is using
+ match tag_name {
+ "title" | "textarea" => State::RcData,
+ "style" | "xmp" | "iframe" | "noembed" | "noframes" => State::RawText,
+ "script" => State::ScriptData,
+ "plaintext" => State::PlainText,
+ _other => State::Data,
+ }
+}
diff --git a/src/reader.rs b/src/reader.rs
index e0161e5..b6e0905 100644
--- a/src/reader.rs
+++ b/src/reader.rs
@@ -33,7 +33,7 @@ pub trait Reader {
/// An object that can be converted into a [`Reader`].
///
/// For example, any utf8-string can be converted into a `StringReader`.
-// TODO: , such that [give concrete examples of not-yet-implemented parser API] work.
+// TODO: , such that [give concrete examples of NaiveParser::new] work.
pub trait IntoReader<'a> {
/// The reader type into which this type should be converted.
type Reader: Reader + 'a;
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 3a75e60..7cc4712 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1,4 +1,5 @@
use crate::machine;
+use crate::naive_parser::naive_next_state;
use crate::offset::{Offset, Position};
use crate::reader::{IntoReader, Reader};
use crate::utils::{
@@ -32,7 +33,12 @@ impl<T: Copy> Stack2<T> {
}
}
-/// An HTML tokenizer. See crate-level docs for basic usage.
+/// An HTML tokenizer.
+///
+/// Note that for proper HTML parsing, you'll have to implement [tree construction]
+/// based on this Tokenizer yourself (since this crate currently does not implement it).
+///
+/// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
pub struct Tokenizer<R: Reader, O, E: Emitter<O>> {
eof: bool,
pub(crate) state: InternalState,
@@ -46,12 +52,18 @@ pub struct Tokenizer<R: Reader, O, E: Emitter<O>> {
last_start_tag_name: String,
is_start_tag: bool,
pub(crate) doctype_offset: O,
+ /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`]
+ /// (it cannot call [`Tokenizer::set_state`] using the emitted start tags since they can be of an arbitrary type).
+ pub(crate) naively_switch_state: bool,
}
impl<R: Reader, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
/// Creates a new tokenizer from some input and an emitter.
///
- /// TODO: add warning about you needing to do the state switching
+ /// Note that properly parsing HTML with this tokenizer requires you to
+ /// implement [tree construction] and call [`Tokenizer::set_state`] accordingly.
+ ///
+ /// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self {
Tokenizer {
reader: reader.into_reader(),
@@ -66,6 +78,7 @@ impl<R: Reader, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
last_start_tag_name: String::new(),
is_start_tag: false,
doctype_offset: O::default(),
+ naively_switch_state: false,
}
}
}
@@ -175,6 +188,9 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
pub(crate) fn emit_current_tag(&mut self) {
self.emitter.emit_current_tag(self.reader.position() - 1);
if self.is_start_tag {
+ if self.naively_switch_state {
+ self.state = naive_next_state(&self.current_tag_name).into();
+ }
std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name);
}
}
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
index 791f1a6..d3e62ae 100644
--- a/tests/test_spans.rs
+++ b/tests/test_spans.rs
@@ -1,4 +1,3 @@
-#![cfg(feature = "integration-tests")] // TODO: switch to NaiveParser API
use std::ops::Range;
use codespan_reporting::{
@@ -7,16 +6,12 @@ use codespan_reporting::{
files::SimpleFiles,
term::{self, termcolor::Buffer},
};
-use html5tokenizer::{offset::PosTrackingReader, DefaultEmitter, Token, Tokenizer};
+use html5tokenizer::{offset::PosTrackingReader, NaiveParser, Token};
use insta::assert_snapshot;
use similar_asserts::assert_eq;
fn tokenizer(html: &'static str) -> impl Iterator<Item = Token<usize>> {
- Tokenizer::new(
- PosTrackingReader::new(html),
- DefaultEmitter::<usize>::default(),
- )
- .flatten()
+ NaiveParser::new(PosTrackingReader::new(html)).flatten()
}
fn annotate(html: &str, labels: Vec<(Range<usize>, impl AsRef<str>)>) -> String {