diff options
author | Martin Fischer <martin@push-f.com> | 2023-08-18 14:36:21 +0200 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2023-08-19 13:41:55 +0200 |
commit | fba3706e29d049e05da36d522b25cc8fc75d689a (patch) | |
tree | 7b29c7cce79cd40878945c044b82ca620471c57d | |
parent | 11316f041985345dd3a712d14bea749790f937a4 (diff) |
chore: use link reference definitions in Markdown
-rw-r--r-- | README.md | 26 | ||||
-rw-r--r-- | src/emitter.rs | 4 | ||||
-rw-r--r-- | src/error.rs | 4 | ||||
-rw-r--r-- | src/tokenizer.rs | 33 |
4 files changed, 45 insertions, 22 deletions
@@ -3,8 +3,8 @@ [![docs.rs](https://img.shields.io/docsrs/html5tokenizer)](https://docs.rs/html5tokenizer) [![crates.io](https://img.shields.io/crates/l/html5tokenizer.svg)](https://crates.io/crates/html5tokenizer) -`html5tokenizer` is a WHATWG-compliant HTML tokenizer (forked from -[html5gum](https://crates.io/crates/html5gum) with added code span support). +`html5tokenizer` is a WHATWG-compliant HTML tokenizer +(forked from [html5gum] with added code span support). ```rust use std::fmt::Write; @@ -34,16 +34,13 @@ assert_eq!(new_html, "<title>hello world</title>"); ## What a tokenizer does and what it does not do -`html5tokenizer` fully implements [13.2.5 of the WHATWG HTML -spec](https://html.spec.whatwg.org/#tokenization), i.e. is able to tokenize HTML documents and passes [html5lib's tokenizer -test suite](https://github.com/html5lib/html5lib-tests/tree/master/tokenizer). Since it is just a tokenizer, this means: +`html5tokenizer` fully implements [13.2.5 of the WHATWG HTML spec][tokenization], +i.e. is able to tokenize HTML documents and passes [html5lib's tokenizer test suite]. +Since it is just a tokenizer, this means: -* `html5tokenizer` **does not** [implement charset - detection.](https://html.spec.whatwg.org/#determining-the-character-encoding) - This implementation requires all input to be Rust strings and therefore valid - UTF-8. -* `html5tokenizer` **does not** [correct mis-nested - tags.](https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser) +* `html5tokenizer` **does not** implement [charset detection]. + This implementation requires all input to be Rust strings and therefore valid UTF-8. +* `html5tokenizer` **does not** correct [misnested tags]. * `html5tokenizer` **does not** recognize implicitly self-closing elements like `<img>`, as a tokenizer it will simply emit a start token. It does however emit a self-closing tag for `<img .. />`. @@ -69,3 +66,10 @@ This allows you to: ## License Licensed under the MIT license, see [`./LICENSE`](./LICENSE). + + +[html5gum]: https://crates.io/crates/html5gum +[tokenization]: https://html.spec.whatwg.org/#tokenization +[html5lib's tokenizer test suite]: https://github.com/html5lib/html5lib-tests/tree/master/tokenizer +[charset detection]: https://html.spec.whatwg.org/#determining-the-character-encoding +[misnested tags]: https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser diff --git a/src/emitter.rs b/src/emitter.rs index 5b64acd..4fc2159 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -519,7 +519,9 @@ impl<O: Offset> Comment<O> { /// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>` #[derive(Debug, Eq, PartialEq)] pub struct Doctype<O> { - /// The ["force quirks"](https://html.spec.whatwg.org/#force-quirks-flag) flag. + /// The [force-quirks flag]. + /// + /// [force-quirks flag]: https://html.spec.whatwg.org/#force-quirks-flag pub force_quirks: bool, /// The doctype's name. For HTML documents this is "html". diff --git a/src/error.rs b/src/error.rs index 4ff9dfd..401937b 100644 --- a/src/error.rs +++ b/src/error.rs @@ -2,7 +2,9 @@ macro_rules! impl_error { ($( $string:literal <=> $variant:ident, )*) => { - /// All [parsing errors](https://html.spec.whatwg.org/#parse-errors) this tokenizer can emit. + /// All [parse errors] this tokenizer can emit. + /// + /// [parse errors]: https://html.spec.whatwg.org/#parse-errors #[derive(Debug, Eq, PartialEq)] pub enum Error { $( diff --git a/src/tokenizer.rs b/src/tokenizer.rs index d272b14..3a6fb32 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -74,19 +74,33 @@ impl<R: Reader, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { #[derive(Debug)] #[non_exhaustive] pub enum State { - /// The [Data state](https://html.spec.whatwg.org/#data-state). + /// The [data state]. + /// + /// [data state]: https://html.spec.whatwg.org/#data-state Data, - /// The [PLAINTEXT state](https://html.spec.whatwg.org/#plaintext-state). + /// The [PLAINTEXT state]. + /// + /// [PLAINTEXT state]: https://html.spec.whatwg.org/#plaintext-state PlainText, - /// The [RCDATA state](https://html.spec.whatwg.org/#rcdata-state). + /// The [RCDATA state]. + /// + /// [RCDATA state]: https://html.spec.whatwg.org/#rcdata-state RcData, - /// The [RAWTEXT state](https://html.spec.whatwg.org/#rawtext-state). + /// The [RAWTEXT state]. + /// + /// [RAWTEXT state]: https://html.spec.whatwg.org/#rawtext-state RawText, - /// The [Script data state](https://html.spec.whatwg.org/#script-data-state). + /// The [script data state]. + /// + /// [script data state]: https://html.spec.whatwg.org/#script-data-state ScriptData, - /// The [Script data escaped state](https://html.spec.whatwg.org/#script-data-escaped-state). + /// The [script data escaped state]. + /// + /// [script data escaped state]: https://html.spec.whatwg.org/#script-data-escaped-state ScriptDataEscaped, - /// The [Script data double escaped state](https://html.spec.whatwg.org/#script-data-double-escaped-state). + /// The [script data double escaped state]. + /// + /// [script data double escaped state]: https://html.spec.whatwg.org/#script-data-double-escaped-state ScriptDataDoubleEscaped, } @@ -129,8 +143,9 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { /// * the _last start tag_ exists /// * the current end tag token's name equals to the last start tag's name. /// - /// See also [WHATWG's definition of "appropriate end tag - /// token"](https://html.spec.whatwg.org/#appropriate-end-tag-token). + /// See also WHATWG's definition of [appropriate end tag token]. + /// + /// [appropriate end tag token]: https://html.spec.whatwg.org/#appropriate-end-tag-token #[inline] pub(crate) fn current_end_tag_is_appropriate(&mut self) -> bool { self.current_tag_name == self.last_start_tag_name |