diff options
-rw-r--r-- | Cargo.toml               | 10 |
-rw-r--r-- | LICENSE                  |  2 |
-rw-r--r-- | README.md                | 55 |
-rw-r--r-- | examples/switch-state.rs |  2 |
-rw-r--r-- | examples/tokenize.rs     |  2 |
-rw-r--r-- | src/reader.rs            |  4 |
-rw-r--r-- | tests/test_html5lib.rs   |  2 |
-rw-r--r-- | tests/test_spans.rs      |  2 |
8 files changed, 25 insertions, 54 deletions
@@ -1,13 +1,13 @@ [package] -name = "html5gum" -authors = ["Markus Unterwaditzer <markus-honeypot@unterwaditzer.net>"] -description = "A WHATWG-compliant HTML5 tokenizer and tag soup parser." +name = "html5tokenizer" +authors = ["Markus Unterwaditzer <markus-honeypot@unterwaditzer.net>", "Martin Fischer <martin@push-f.com>"] +description = "An HTML5 tokenizer with code span support." edition = "2018" readme = "README.md" keywords = ["html", "html5", "whatwg", "parser", "tokenizer"] license = "MIT" -repository = "https://github.com/untitaker/html5gum" -version = "0.2.1" +repository = "https://git.push-f.com/html5tokenizer/" +version = "0.4.0" include = ["src/**/*", "LICENSE", "README.md"] [dev-dependencies] @@ -1,4 +1,4 @@ -Copyright (c) 2021 Markus Unterwaditzer +Copyright (c) 2021 Markus Unterwaditzer & Martin Fischer Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -1,13 +1,14 @@ -# html5gum +# html5tokenizer -[![docs.rs](https://img.shields.io/docsrs/html5gum)](https://docs.rs/html5gum) -[![crates.io](https://img.shields.io/crates/l/html5gum.svg)](https://crates.io/crates/html5gum) +[![docs.rs](https://img.shields.io/docsrs/html5tokenizer)](https://docs.rs/html5tokenizer) +[![crates.io](https://img.shields.io/crates/l/html5tokenizer.svg)](https://crates.io/crates/html5tokenizer) -`html5gum` is a WHATWG-compliant HTML tokenizer. +`html5tokenizer` is a WHATWG-compliant HTML tokenizer (forked from +[html5gum](https://crates.io/crates/html5gum) with added code span support). 
```rust use std::fmt::Write; -use html5gum::{Tokenizer, Token}; +use html5tokenizer::{Tokenizer, Token}; let html = "<title >hello world</title>"; let mut new_html = String::new(); @@ -32,28 +33,28 @@ assert_eq!(new_html, "<title>hello world</title>"); ## What a tokenizer does and what it does not do -`html5gum` fully implements [13.2.5 of the WHATWG HTML +`html5tokenizer` fully implements [13.2.5 of the WHATWG HTML spec](https://html.spec.whatwg.org/#tokenization), i.e. is able to tokenize HTML documents and passes [html5lib's tokenizer test suite](https://github.com/html5lib/html5lib-tests/tree/master/tokenizer). Since it is just a tokenizer, this means: -* `html5gum` **does not** [implement charset +* `html5tokenizer` **does not** [implement charset detection.](https://html.spec.whatwg.org/#determining-the-character-encoding) This implementation requires all input to be Rust strings and therefore valid UTF-8. -* `html5gum` **does not** [correct mis-nested +* `html5tokenizer` **does not** [correct mis-nested tags.](https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser) -* `html5gum` **does not** recognize implicitly self-closing elements like +* `html5tokenizer` **does not** recognize implicitly self-closing elements like `<img>`, as a tokenizer it will simply emit a start token. It does however emit a self-closing tag for `<img .. />`. -* `html5gum` **does not** generally qualify as a browser-grade HTML *parser* as +* `html5tokenizer` **does not** generally qualify as a browser-grade HTML *parser* as per the WHATWG spec. This can change in the future. -With those caveats in mind, `html5gum` can pretty much ~parse~ _tokenize_ +With those caveats in mind, `html5tokenizer` can pretty much ~parse~ _tokenize_ anything that browsers can. 
## The `Emitter` trait -A distinguishing feature of `html5gum` is that you can bring your own token +A distinguishing feature of `html5tokenizer` is that you can bring your own token datastructure and hook into token creation by implementing the `Emitter` trait. This allows you to: @@ -64,36 +65,6 @@ This allows you to: you, you can implement the respective trait methods as noop and therefore avoid any overhead creating plaintext tokens. -## Alternative HTML parsers - -`html5gum` was created out of a need to parse HTML tag soup efficiently. Previous options were to: - -* use [quick-xml](https://github.com/tafia/quick-xml/) or - [xmlparser](https://github.com/RazrFalcon/xmlparser) with some hacks to make - either one not choke on bad HTML. For some (rather large) set of HTML input - this works well (particularly `quick-xml` can be configured to be very - lenient about parsing errors) and parsing speed is stellar. But neither can - parse all HTML. - - For my own usecase `html5gum` is about 2x slower than `quick-xml`. - -* use [html5ever's own - tokenizer](https://docs.rs/html5ever/0.25.1/html5ever/tokenizer/index.html) - to avoid as much tree-building overhead as possible. This was functional but - had poor performance for my own usecase (10-15x slower than `quick-xml`). - -* use [lol-html](https://github.com/cloudflare/lol-html), which would probably - perform at least as well as `html5gum`, but comes with a closure-based API - that I didn't manage to get working for my usecase. - -## Etymology - -Why is this library called `html5gum`? - -* G.U.M: **G**iant **U**nreadable **M**atch-statement - -* \<insert "how it feels to <s>chew 5 gum</s> _parse HTML_" meme here\> - ## License Licensed under the MIT license, see [`./LICENSE`](./LICENSE). diff --git a/examples/switch-state.rs b/examples/switch-state.rs index e966687..9ebc673 100644 --- a/examples/switch-state.rs +++ b/examples/switch-state.rs @@ -1,6 +1,6 @@ //! Let's you easily try out the tokenizer with e.g. //! 
printf '<style><b>Hello world!</b></style>' | cargo run --example=switch-state -use html5gum::{BufReadReader, Token, Tokenizer}; +use html5tokenizer::{BufReadReader, Token, Tokenizer}; use std::io::stdin; fn main() { diff --git a/examples/tokenize.rs b/examples/tokenize.rs index 9a039c3..ceb5751 100644 --- a/examples/tokenize.rs +++ b/examples/tokenize.rs @@ -1,6 +1,6 @@ //! Let's you easily try out the tokenizer with e.g. //! printf '<h1>Hello world!</h1>' | cargo run --example=tokenize -use html5gum::{BufReadReader, Tokenizer}; +use html5tokenizer::{BufReadReader, Tokenizer}; fn main() { for token in Tokenizer::new(BufReadReader::new(std::io::stdin().lock())).flatten() { diff --git a/src/reader.rs b/src/reader.rs index b7a63b2..eb2b479 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -54,7 +54,7 @@ impl<'a, R: 'a + Reader> Readable<'a> for R { /// /// ```rust /// use std::fmt::Write; -/// use html5gum::{Tokenizer, Token}; +/// use html5tokenizer::{Tokenizer, Token}; /// /// let html = "<title >hello world</title>"; /// let mut new_html = String::new(); @@ -144,7 +144,7 @@ impl<'a> Readable<'a> for &'a String { /// ```rust /// use std::io::BufReader; /// use std::fmt::Write; -/// use html5gum::{Token, BufReadReader, Tokenizer}; +/// use html5tokenizer::{Token, BufReadReader, Tokenizer}; /// /// let tokenizer = Tokenizer::new(BufReader::new("<title>hello world</title>".as_bytes())); /// // or alternatively: diff --git a/tests/test_html5lib.rs b/tests/test_html5lib.rs index cd3785f..cda932c 100644 --- a/tests/test_html5lib.rs +++ b/tests/test_html5lib.rs @@ -1,4 +1,4 @@ -use html5gum::{ +use html5tokenizer::{ Attribute, Doctype, EndTag, Error, InternalState as State, Reader, StartTag, Token, Tokenizer, }; use pretty_assertions::assert_eq; diff --git a/tests/test_spans.rs b/tests/test_spans.rs index 5b1e814..93330db 100644 --- a/tests/test_spans.rs +++ b/tests/test_spans.rs @@ -6,7 +6,7 @@ use codespan_reporting::{ files::SimpleFiles, term::{self, 
termcolor::Buffer}, }; -use html5gum::{spans::PosTracker, DefaultEmitter, Readable, StringReader, Token, Tokenizer}; +use html5tokenizer::{spans::PosTracker, DefaultEmitter, Readable, StringReader, Token, Tokenizer}; #[test] fn test() { |