summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- README.md 26
-rw-r--r-- src/emitter.rs 4
-rw-r--r-- src/error.rs 4
-rw-r--r-- src/tokenizer.rs 33
4 files changed, 45 insertions, 22 deletions
diff --git a/README.md b/README.md
index 98a09c5..652edd8 100644
--- a/README.md
+++ b/README.md
@@ -3,8 +3,8 @@
[![docs.rs](https://img.shields.io/docsrs/html5tokenizer)](https://docs.rs/html5tokenizer)
[![crates.io](https://img.shields.io/crates/l/html5tokenizer.svg)](https://crates.io/crates/html5tokenizer)
-`html5tokenizer` is a WHATWG-compliant HTML tokenizer (forked from
-[html5gum](https://crates.io/crates/html5gum) with added code span support).
+`html5tokenizer` is a WHATWG-compliant HTML tokenizer
+(forked from [html5gum] with added code span support).
```rust
use std::fmt::Write;
@@ -34,16 +34,13 @@ assert_eq!(new_html, "<title>hello world</title>");
## What a tokenizer does and what it does not do
-`html5tokenizer` fully implements [13.2.5 of the WHATWG HTML
-spec](https://html.spec.whatwg.org/#tokenization), i.e. is able to tokenize HTML documents and passes [html5lib's tokenizer
-test suite](https://github.com/html5lib/html5lib-tests/tree/master/tokenizer). Since it is just a tokenizer, this means:
+`html5tokenizer` fully implements [13.2.5 of the WHATWG HTML spec][tokenization],
+i.e. is able to tokenize HTML documents and passes [html5lib's tokenizer test suite].
+Since it is just a tokenizer, this means:
-* `html5tokenizer` **does not** [implement charset
- detection.](https://html.spec.whatwg.org/#determining-the-character-encoding)
- This implementation requires all input to be Rust strings and therefore valid
- UTF-8.
-* `html5tokenizer` **does not** [correct mis-nested
- tags.](https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser)
+* `html5tokenizer` **does not** implement [charset detection].
+ This implementation requires all input to be Rust strings and therefore valid UTF-8.
+* `html5tokenizer` **does not** correct [misnested tags].
* `html5tokenizer` **does not** recognize implicitly self-closing elements like
`<img>`, as a tokenizer it will simply emit a start token. It does however
emit a self-closing tag for `<img .. />`.
@@ -69,3 +66,10 @@ This allows you to:
## License
Licensed under the MIT license, see [`./LICENSE`](./LICENSE).
+
+
+[html5gum]: https://crates.io/crates/html5gum
+[tokenization]: https://html.spec.whatwg.org/#tokenization
+[html5lib's tokenizer test suite]: https://github.com/html5lib/html5lib-tests/tree/master/tokenizer
+[charset detection]: https://html.spec.whatwg.org/#determining-the-character-encoding
+[misnested tags]: https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser
diff --git a/src/emitter.rs b/src/emitter.rs
index 5b64acd..4fc2159 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -519,7 +519,9 @@ impl<O: Offset> Comment<O> {
/// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>`
#[derive(Debug, Eq, PartialEq)]
pub struct Doctype<O> {
- /// The ["force quirks"](https://html.spec.whatwg.org/#force-quirks-flag) flag.
+ /// The [force-quirks flag].
+ ///
+ /// [force-quirks flag]: https://html.spec.whatwg.org/#force-quirks-flag
pub force_quirks: bool,
/// The doctype's name. For HTML documents this is "html".
diff --git a/src/error.rs b/src/error.rs
index 4ff9dfd..401937b 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -2,7 +2,9 @@ macro_rules! impl_error {
($(
$string:literal <=> $variant:ident,
)*) => {
- /// All [parsing errors](https://html.spec.whatwg.org/#parse-errors) this tokenizer can emit.
+ /// All [parse errors] this tokenizer can emit.
+ ///
+ /// [parse errors]: https://html.spec.whatwg.org/#parse-errors
#[derive(Debug, Eq, PartialEq)]
pub enum Error {
$(
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index d272b14..3a6fb32 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -74,19 +74,33 @@ impl<R: Reader, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
#[derive(Debug)]
#[non_exhaustive]
pub enum State {
- /// The [Data state](https://html.spec.whatwg.org/#data-state).
+ /// The [data state].
+ ///
+ /// [data state]: https://html.spec.whatwg.org/#data-state
Data,
- /// The [PLAINTEXT state](https://html.spec.whatwg.org/#plaintext-state).
+ /// The [PLAINTEXT state].
+ ///
+ /// [PLAINTEXT state]: https://html.spec.whatwg.org/#plaintext-state
PlainText,
- /// The [RCDATA state](https://html.spec.whatwg.org/#rcdata-state).
+ /// The [RCDATA state].
+ ///
+ /// [RCDATA state]: https://html.spec.whatwg.org/#rcdata-state
RcData,
- /// The [RAWTEXT state](https://html.spec.whatwg.org/#rawtext-state).
+ /// The [RAWTEXT state].
+ ///
+ /// [RAWTEXT state]: https://html.spec.whatwg.org/#rawtext-state
RawText,
- /// The [Script data state](https://html.spec.whatwg.org/#script-data-state).
+ /// The [script data state].
+ ///
+ /// [script data state]: https://html.spec.whatwg.org/#script-data-state
ScriptData,
- /// The [Script data escaped state](https://html.spec.whatwg.org/#script-data-escaped-state).
+ /// The [script data escaped state].
+ ///
+ /// [script data escaped state]: https://html.spec.whatwg.org/#script-data-escaped-state
ScriptDataEscaped,
- /// The [Script data double escaped state](https://html.spec.whatwg.org/#script-data-double-escaped-state).
+ /// The [script data double escaped state].
+ ///
+ /// [script data double escaped state]: https://html.spec.whatwg.org/#script-data-double-escaped-state
ScriptDataDoubleEscaped,
}
@@ -129,8 +143,9 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
/// * the _last start tag_ exists
/// * the current end tag token's name equals to the last start tag's name.
///
- /// See also [WHATWG's definition of "appropriate end tag
- /// token"](https://html.spec.whatwg.org/#appropriate-end-tag-token).
+ /// See also WHATWG's definition of [appropriate end tag token].
+ ///
+ /// [appropriate end tag token]: https://html.spec.whatwg.org/#appropriate-end-tag-token
#[inline]
pub(crate) fn current_end_tag_is_appropriate(&mut self) -> bool {
self.current_tag_name == self.last_start_tag_name