aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Martin Fischer <martin@push-f.com> 2023-08-18 16:54:43 +0200
committer Martin Fischer <martin@push-f.com> 2023-08-19 13:41:55 +0200
commit 4682f104ea24fc257c22dc12db1a3dad1323662a (patch)
tree 8815652555207038cc0837f25b9ef35068bbc16a
parent 0c495ba984436cccc6caeed66639a2b61095dbad (diff)
docs: link multipage version of HTML spec
-rw-r--r-- Cargo.toml 1
-rw-r--r-- README.md 6
-rw-r--r-- src/emitter.rs 2
-rw-r--r-- src/error.rs 2
-rw-r--r-- src/tokenizer.rs 16
-rw-r--r-- tests/misc.rs 47
6 files changed, 61 insertions, 13 deletions
diff --git a/Cargo.toml b/Cargo.toml
index 943f710..d32b406 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,6 +21,7 @@ include = ["src/**/*", "LICENSE", "README.md"]
codespan-reporting = "0.11.1"
insta = "1.31.0"
similar-asserts = { workspace = true }
+walkdir = "2.3.3"
[features]
# Feature used by integration tests in tests/ to get access to library internals.
diff --git a/README.md b/README.md
index 652edd8..9700826 100644
--- a/README.md
+++ b/README.md
@@ -69,7 +69,7 @@ Licensed under the MIT license, see [`./LICENSE`](./LICENSE).
[html5gum]: https://crates.io/crates/html5gum
-[tokenization]: https://html.spec.whatwg.org/#tokenization
+[tokenization]: https://html.spec.whatwg.org/multipage/parsing.html#tokenization
[html5lib's tokenizer test suite]: https://github.com/html5lib/html5lib-tests/tree/master/tokenizer
-[charset detection]: https://html.spec.whatwg.org/#determining-the-character-encoding
-[misnested tags]: https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser
+[charset detection]: https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
+[misnested tags]: https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
diff --git a/src/emitter.rs b/src/emitter.rs
index 4fc2159..d1180a5 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -521,7 +521,7 @@ impl<O: Offset> Comment<O> {
pub struct Doctype<O> {
/// The [force-quirks flag].
///
- /// [force-quirks flag]: https://html.spec.whatwg.org/#force-quirks-flag
+ /// [force-quirks flag]: https://html.spec.whatwg.org/multipage/parsing.html#force-quirks-flag
pub force_quirks: bool,
/// The doctype's name. For HTML documents this is "html".
diff --git a/src/error.rs b/src/error.rs
index 401937b..3ba8f63 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -4,7 +4,7 @@ macro_rules! impl_error {
)*) => {
/// All [parse errors] this tokenizer can emit.
///
- /// [parse errors]: https://html.spec.whatwg.org/#parse-errors
+ /// [parse errors]: https://html.spec.whatwg.org/multipage/parsing.html#parse-errors
#[derive(Debug, Eq, PartialEq)]
pub enum Error {
$(
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 3a6fb32..469cbd1 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -76,31 +76,31 @@ impl<R: Reader, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
pub enum State {
/// The [data state].
///
- /// [data state]: https://html.spec.whatwg.org/#data-state
+ /// [data state]: https://html.spec.whatwg.org/multipage/parsing.html#data-state
Data,
/// The [PLAINTEXT state].
///
- /// [PLAINTEXT state]: https://html.spec.whatwg.org/#plaintext-state
+ /// [PLAINTEXT state]: https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
PlainText,
/// The [RCDATA state].
///
- /// [RCDATA state]: https://html.spec.whatwg.org/#rcdata-state
+ /// [RCDATA state]: https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
RcData,
/// The [RAWTEXT state].
///
- /// [RAWTEXT state]: https://html.spec.whatwg.org/#rawtext-state
+ /// [RAWTEXT state]: https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
RawText,
/// The [script data state].
///
- /// [script data state]: https://html.spec.whatwg.org/#script-data-state
+ /// [script data state]: https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
ScriptData,
/// The [script data escaped state].
///
- /// [script data escaped state]: https://html.spec.whatwg.org/#script-data-escaped-state
+ /// [script data escaped state]: https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
ScriptDataEscaped,
/// The [script data double escaped state].
///
- /// [script data double escaped state]: https://html.spec.whatwg.org/#script-data-double-escaped-state
+ /// [script data double escaped state]: https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
ScriptDataDoubleEscaped,
}
@@ -145,7 +145,7 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
///
/// See also WHATWG's definition of [appropriate end tag token].
///
- /// [appropriate end tag token]: https://html.spec.whatwg.org/#appropriate-end-tag-token
+ /// [appropriate end tag token]: https://html.spec.whatwg.org/multipage/parsing.html#appropriate-end-tag-token
#[inline]
pub(crate) fn current_end_tag_is_appropriate(&mut self) -> bool {
self.current_tag_name == self.last_start_tag_name
diff --git a/tests/misc.rs b/tests/misc.rs
new file mode 100644
index 0000000..416e506
--- /dev/null
+++ b/tests/misc.rs
@@ -0,0 +1,47 @@
+use similar_asserts::assert_eq;
+use walkdir::{DirEntry, WalkDir};
+
+#[test]
+fn links_to_html_spec_use_multipage_version() { // fails if any file accepted by is_source_file links the one-page HTML spec
+ for entry in WalkDir::new(".")
+ .min_depth(1) // skip the "." root entry itself
+ .into_iter()
+ .filter_entry(is_source_file) // rejected directories are not descended into
+ .flatten() // walk entries that are Err are silently dropped here
+ {
+ if !entry.file_type().is_file() {
+ continue; // only regular files are read; directories were needed just for traversal
+ }
+
+ let actual = match std::fs::read_to_string(entry.path()) {
+ Ok(content) => content,
+ Err(err) => panic!("invalid UTF-8 in file content: {:?}: {}", entry.path(), err), // NOTE(review): reached for any read error, not only invalid UTF-8
+ };
+
+ let expected = actual.replace(
+ concat!("://html.spec.whatwg.org/", "#"), // presumably split with concat! so this file does not contain the pattern it searches for
+ concat!("://html.spec.whatwg.org/multipage/???.html#"), // "???" is a placeholder: the correct page name cannot be derived automatically
+ );
+
+ assert_eq!(
+ actual, // unchanged file content
+ expected, // differs from `actual` only if a one-page link was found
+ "Found a link to the one-page version of the HTML spec, which is huge and takes long to load. We want to link the multipage version instead."
+ );
+ }
+}
+
+fn is_source_file(entry: &DirEntry) -> bool { // predicate passed to filter_entry above; false excludes the entry (files and directories alike)
+ let Some(filename) = entry.file_name().to_str() else {
+ panic!("invalid UTF-8 in filename: {:?}", entry.path()) // non-UTF-8 names abort the test instead of being skipped
+ };
+
+ if entry.depth() == 1 && filename == "target" || filename == "Cargo.lock" { // `&&` binds tighter than `||`: "target" is rejected only at depth 1, "Cargo.lock" at any depth
+ return false; // cargo files
+ }
+ if filename == "html5lib-tests" { // matched by name at any depth
+ return false; // git submodule
+ }
+
+ !filename.starts_with('.') // .git, etc.
+}