aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Cargo.toml4
-rw-r--r--html5lib_tests/Cargo.toml14
-rw-r--r--html5lib_tests/src/lib.rs252
-rw-r--r--integration_tests/Cargo.toml3
-rw-r--r--integration_tests/tests/test_html5lib.rs291
5 files changed, 284 insertions, 280 deletions
diff --git a/Cargo.toml b/Cargo.toml
index 64306d8..efa299d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
[workspace]
-members = [".", "integration_tests"]
-default-members = [".", "integration_tests"]
+members = [".", "html5lib_tests", "integration_tests"]
+default-members = [".", "html5lib_tests", "integration_tests"]
[package]
name = "html5tokenizer"
diff --git a/html5lib_tests/Cargo.toml b/html5lib_tests/Cargo.toml
new file mode 100644
index 0000000..06fc6b8
--- /dev/null
+++ b/html5lib_tests/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "html5lib_tests"
+authors = ["Martin Fischer <martin@push-f.com>", "Markus Unterwaditzer <markus-honeypot@unterwaditzer.net>"]
+description = "Deserializable types for the .test files from html5lib-tests."
+version = "0.0.0"
+edition = "2021"
+publish = false # prevent accidental publishes until it's ready to be published
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+serde = { version = "1.0.130", features = ["derive"] }
+serde_json = "1.0.71"
+html5tokenizer = { path = ".." } # TODO: get rid of this dependency
diff --git a/html5lib_tests/src/lib.rs b/html5lib_tests/src/lib.rs
new file mode 100644
index 0000000..5678b0d
--- /dev/null
+++ b/html5lib_tests/src/lib.rs
@@ -0,0 +1,252 @@
+use html5tokenizer::{Attribute, Doctype, EndTag, Error, StartTag, Token};
+use serde::{de::Error as _, Deserialize};
+use std::collections::BTreeMap;
+
+/// Reads one html5lib-tests tokenizer `.test` file from `reader`.
+///
+/// The content must be a JSON object with a top-level `tests` array. Tests
+/// flagged as double-escaped get their input and their expected character and
+/// comment tokens unescaped; that work is done lazily, as the returned
+/// iterator is advanced.
+///
+/// # Errors
+///
+/// Returns the `serde_json` error if the content cannot be deserialized.
+///
+/// # Panics
+///
+/// Advancing the iterator panics if a double-escaped test contains a
+/// malformed escape sequence.
+pub fn parse_tests(
+    reader: impl std::io::Read,
+) -> Result<impl Iterator<Item = Test>, serde_json::Error> {
+    let parsed: Tests = serde_json::from_reader(reader)?;
+    let unescaped = parsed.tests.into_iter().map(undo_double_escaping);
+    Ok(unescaped)
+}
+
+/// The tokens a test expects the tokenizer to emit, converted to [`Token`]s
+/// that carry no span information.
+pub struct ExpectedOutputTokens(pub Vec<Token<()>>);
+
+impl<'de> Deserialize<'de> for ExpectedOutputTokens {
+    /// Deserializes a list of token arrays such as `["StartTag", "a", {}]`,
+    /// where the first element names the token kind.
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        // This macro is a horrible way to define a type that deserializes only from a
+        // particular string. Together with serde(untagged) this gives us really flexible
+        // enum tagging with really terrible error messages.
+        macro_rules! def_const {
+            ($str:expr, $ty:ident) => {
+                #[derive(Deserialize)]
+                enum $ty {
+                    #[serde(rename = $str)]
+                    $ty,
+                }
+            };
+        }
+
+        def_const!("DOCTYPE", DoctypeConst);
+        def_const!("StartTag", StartTagConst);
+        def_const!("EndTag", EndTagConst);
+        def_const!("Comment", CommentConst);
+        def_const!("Character", CharacterConst);
+
+        type Attributes = BTreeMap<String, String>;
+
+        #[derive(Deserialize)]
+        #[serde(untagged)]
+        enum OutputToken {
+            // "DOCTYPE", name, public_id, system_id, correctness
+            Doctype(
+                DoctypeConst,
+                Option<String>,
+                Option<String>,
+                Option<String>,
+                bool,
+            ),
+            // "StartTag", name, attributes (self_closing omitted)
+            StartTag(StartTagConst, String, Attributes),
+            // "StartTag", name, attributes, self_closing
+            StartTag2(StartTagConst, String, Attributes, bool),
+            // "EndTag", name
+            EndTag(EndTagConst, String),
+            // "Comment", data
+            Comment(CommentConst, String),
+            // "Character", data
+            Character(CharacterConst, String),
+        }
+
+        // Builds the start tag token for both the three- and the four-element form,
+        // so the attribute conversion exists only once.
+        fn start_tag(name: String, attributes: Attributes, self_closing: bool) -> Token<()> {
+            Token::StartTag(StartTag {
+                self_closing,
+                name,
+                attributes: attributes
+                    .into_iter()
+                    .map(|(k, v)| {
+                        (
+                            k,
+                            Attribute {
+                                value: v,
+                                ..Default::default()
+                            },
+                        )
+                    })
+                    .collect(),
+                name_span: (),
+            })
+        }
+
+        let output_tokens: Vec<OutputToken> = Vec::deserialize(deserializer)?;
+
+        Ok(ExpectedOutputTokens(
+            output_tokens
+                .into_iter()
+                .map(|output_token| match output_token {
+                    OutputToken::Doctype(
+                        _,
+                        name,
+                        public_identifier,
+                        system_identifier,
+                        correctness,
+                    ) => Token::Doctype(Doctype {
+                        // A missing (null) name becomes the empty string.
+                        name: name.unwrap_or_default(),
+                        public_identifier,
+                        system_identifier,
+                        // The test files store "correctness", the tokenizer its negation.
+                        force_quirks: !correctness,
+                    }),
+                    // Without an explicit flag the tag is not self-closing.
+                    OutputToken::StartTag(_, name, attributes) => {
+                        start_tag(name, attributes, false)
+                    }
+                    OutputToken::StartTag2(_, name, attributes, self_closing) => {
+                        start_tag(name, attributes, self_closing)
+                    }
+                    OutputToken::EndTag(_, name) => Token::EndTag(EndTag {
+                        name,
+                        name_span: (),
+                    }),
+                    OutputToken::Comment(_, data) => Token::Comment(data),
+                    OutputToken::Character(_, data) => Token::String(data),
+                })
+                .collect::<Vec<Token<()>>>(),
+        ))
+    }
+}
+
+/// A tokenizer state a test can ask to start in.
+///
+/// Each variant deserializes from the state name used in the `.test` files,
+/// e.g. `"RCDATA state"`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize)]
+pub enum InitialState {
+    /// `"Data state"`
+    #[serde(rename = "Data state")]
+    Data,
+    /// `"PLAINTEXT state"`
+    #[serde(rename = "PLAINTEXT state")]
+    PlainText,
+    /// `"RCDATA state"`
+    #[serde(rename = "RCDATA state")]
+    RcData,
+    /// `"RAWTEXT state"`
+    #[serde(rename = "RAWTEXT state")]
+    RawText,
+    /// `"Script data state"`
+    #[serde(rename = "Script data state")]
+    ScriptData,
+    /// `"CDATA section state"`
+    #[serde(rename = "CDATA section state")]
+    CdataSection,
+}
+
+/// Serde default for [`Test::initial_states`]: only the data state.
+fn initial_states_default() -> Vec<InitialState> {
+    Vec::from([InitialState::Data])
+}
+
+/// A single tokenizer test case, deserialized from an entry of the `tests` array.
+///
+/// Field names are mapped from camelCase keys (e.g. `initialStates`, `lastStartTag`).
+#[derive(Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct Test {
+ /// Human-readable description of the test.
+ pub description: String,
+ /// The text to feed to the tokenizer.
+ pub input: String,
+ /// The tokens the tokenizer is expected to produce.
+ pub output: ExpectedOutputTokens,
+ /// The states to start the tokenizer in; defaults to just the data state when absent.
+ #[serde(default = "initial_states_default")]
+ pub initial_states: Vec<InitialState>,
+ // Whether `input` and `output` still contain `\uXXXX` escapes; defaults to false.
+ // Kept private: `undo_double_escaping` reads it to decide whether to unescape.
+ #[serde(default)]
+ double_escaped: bool,
+ /// Name of the last start tag, if the test specifies one.
+ #[serde(default)]
+ pub last_start_tag: Option<String>,
+ /// The parse errors listed for the test; empty when absent.
+ #[serde(default)]
+ pub errors: Vec<ParseError>,
+}
+
+/// A tokenizer [`Error`] that deserializes from its string representation.
+#[derive(Debug, Eq, PartialEq)]
+pub struct ParseErrorInner(pub Error);
+
+impl<'de> Deserialize<'de> for ParseErrorInner {
+    /// Reads a string and parses it into an [`Error`]; an unparsable string
+    /// is reported as a custom deserialization error.
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let code = String::deserialize(deserializer)?;
+        match code.parse::<Error>() {
+            Ok(error) => Ok(Self(error)),
+            Err(_) => Err(D::Error::custom(&format!(
+                "failed to deserialize error: {}",
+                code
+            ))),
+        }
+    }
+}
+
+/// One entry of a test's `errors` list.
+///
+/// Only the `code` key is deserialized into a field.
+#[derive(Deserialize, Debug, Eq, PartialEq)]
+#[serde(rename_all = "camelCase")]
+pub struct ParseError {
+ /// The error code, parsed into the tokenizer's error type.
+ pub code: ParseErrorInner,
+ // TODO: lineno and column?
+}
+
+/// Top-level shape of a `.test` file: an object with a `tests` array.
+#[derive(Deserialize)]
+struct Tests {
+ // The test cases, in file order.
+ tests: Vec<Test>,
+}
+
+/// Resolves the `\uXXXX` escapes of a double-escaped test.
+///
+/// Tests that are not flagged as double-escaped are returned untouched.
+/// Otherwise the input is unescaped first, then the data of every expected
+/// character and comment token; all other tokens are left as they are.
+fn undo_double_escaping(mut test: Test) -> Test {
+    if !test.double_escaped {
+        return test;
+    }
+
+    test.input = unescape(&test.input);
+
+    for token in &mut test.output.0 {
+        match token {
+            Token::String(text) | Token::Comment(text) => *text = unescape(text),
+            _ => {}
+        }
+    }
+
+    test
+}
+
+/// Implements the escape sequences described in the tokenizer tests of html5lib-tests (and
+/// nothing more): every `\uXXXX`, with exactly four hexadecimal digits, is replaced by the
+/// character with that code point. All other characters are copied unchanged.
+///
+/// # Panics
+///
+/// Panics if a backslash is not followed by `u`, if fewer than four characters follow `\u`,
+/// if any of them is not a hexadecimal digit, or if the resulting number is not a valid
+/// `char` (e.g. a surrogate).
+fn unescape(data: &str) -> String {
+    let mut stream = data.chars();
+    // Unescaping never makes the text longer, so this avoids any regrowth.
+    let mut rv = String::with_capacity(data.len());
+
+    while let Some(c) = stream.next() {
+        if c != '\\' {
+            rv.push(c);
+            continue;
+        }
+
+        match stream.next() {
+            Some('u') => (),
+            x => panic!("unexpected escape: {:?}", x),
+        }
+
+        // Accumulate digit by digit: unlike `u32::from_str_radix` this does not
+        // accept a leading `+`, so `\u+041` is rejected instead of decoded.
+        let mut code = 0u32;
+        for _ in 0..4 {
+            let digit = stream
+                .next()
+                .expect("unexpected eof after \\u")
+                .to_digit(16)
+                .expect("failed to parse as hex");
+            code = code * 16 + digit;
+        }
+
+        rv.push(char::from_u32(code).expect("bad character"));
+    }
+
+    rv
+}
diff --git a/integration_tests/Cargo.toml b/integration_tests/Cargo.toml
index 1e68a0b..cc27798 100644
--- a/integration_tests/Cargo.toml
+++ b/integration_tests/Cargo.toml
@@ -16,7 +16,6 @@ edition = "2021"
[dev-dependencies]
glob = "0.3.1"
+html5lib_tests = { path = "../html5lib_tests" }
html5tokenizer = { path = "..", features = ["integration-tests"] }
pretty_assertions = "1.0.0"
-serde = { version = "1.0.130", features = ["derive"] }
-serde_json = "1.0.71"
diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs
index 61e2133..3236f0f 100644
--- a/integration_tests/tests/test_html5lib.rs
+++ b/integration_tests/tests/test_html5lib.rs
@@ -1,212 +1,8 @@
-use html5tokenizer::{
- Attribute, Doctype, EndTag, Error, InternalState as State, Reader, StartTag, Token, Tokenizer,
-};
-use pretty_assertions::assert_eq;
-use serde::{de::Error as _, Deserialize};
-use std::{collections::BTreeMap, fs::File, io::BufReader, path::Path};
-
-fn parse_tests(
- reader: impl std::io::Read,
-) -> Result<impl Iterator<Item = Test>, serde_json::Error> {
- let Tests { tests } = serde_json::from_reader(reader)?;
- Ok(tests.into_iter().map(undo_double_escaping))
-}
-
-struct ExpectedOutputTokens(Vec<Token<()>>);
-
-impl<'de> Deserialize<'de> for ExpectedOutputTokens {
- fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
- where
- D: serde::Deserializer<'de>,
- {
- // this macro is a horrible way to define a type that deserializes only from a particular
- // string. Together with serde(untagged) this gives us really flexible enum tagging with really
- // terrible error messages.
- macro_rules! def_const {
- ($str:expr, $ty:ident) => {
- #[derive(Deserialize)]
- enum $ty {
- #[serde(rename = $str)]
- $ty,
- }
- };
- }
-
- def_const!("DOCTYPE", DoctypeConst);
- def_const!("StartTag", StartTagConst);
- def_const!("EndTag", EndTagConst);
- def_const!("Comment", CommentConst);
- def_const!("Character", CharacterConst);
-
- type Attributes = BTreeMap<String, String>;
-
- #[derive(Deserialize)]
- #[serde(untagged)]
- enum OutputToken {
- // "DOCTYPE", name, public_id, system_id, correctness
- Doctype(
- DoctypeConst,
- Option<String>,
- Option<String>,
- Option<String>,
- bool,
- ),
- // "StartTag", name, attributes, self_closing
- StartTag(StartTagConst, String, Attributes),
- StartTag2(StartTagConst, String, Attributes, bool),
- // "EndTag", name
- EndTag(EndTagConst, String),
- // "Comment", data
- Comment(CommentConst, String),
- // "Character", data
- Character(CharacterConst, String),
- }
-
- Ok(ExpectedOutputTokens(
- Vec::deserialize(deserializer)?
- .into_iter()
- .map(|output_token| match output_token {
- OutputToken::Doctype(
- _,
- name,
- public_identifier,
- system_identifier,
- correctness,
- ) => Token::Doctype(Doctype {
- name: name.unwrap_or_default(),
- public_identifier,
- system_identifier,
- force_quirks: !correctness,
- }),
- OutputToken::StartTag(_, name, attributes) => Token::StartTag(StartTag {
- self_closing: false,
- name,
- attributes: attributes
- .into_iter()
- .map(|(k, v)| {
- (
- k,
- Attribute {
- value: v,
- ..Default::default()
- },
- )
- })
- .collect(),
- name_span: (),
- }),
- OutputToken::StartTag2(_, name, attributes, self_closing) => {
- Token::StartTag(StartTag {
- self_closing,
- name,
- attributes: attributes
- .into_iter()
- .map(|(k, v)| {
- (
- k,
- Attribute {
- value: v,
- ..Default::default()
- },
- )
- })
- .collect(),
- name_span: (),
- })
- }
- OutputToken::EndTag(_, name) => Token::EndTag(EndTag {
- name,
- name_span: (),
- }),
- OutputToken::Comment(_, data) => Token::Comment(data),
- OutputToken::Character(_, data) => Token::String(data),
- })
- .collect::<Vec<Token<()>>>(),
- ))
- }
-}
-
-struct InitialState(State);
-
-impl<'de> Deserialize<'de> for InitialState {
- fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
- where
- D: serde::Deserializer<'de>,
- {
- #[derive(Deserialize)]
- enum RawInitialState {
- #[serde(rename = "Data state")]
- Data,
- #[serde(rename = "PLAINTEXT state")]
- PlainText,
- #[serde(rename = "RCDATA state")]
- RcData,
- #[serde(rename = "RAWTEXT state")]
- RawText,
- #[serde(rename = "Script data state")]
- ScriptData,
- #[serde(rename = "CDATA section state")]
- CdataSection,
- }
-
- Ok(Self(match RawInitialState::deserialize(deserializer)? {
- RawInitialState::Data => State::Data,
- RawInitialState::PlainText => State::PlainText,
- RawInitialState::RcData => State::RcData,
- RawInitialState::RawText => State::RawText,
- RawInitialState::ScriptData => State::ScriptData,
- RawInitialState::CdataSection => State::CdataSection,
- }))
- }
-}
-
-fn initial_states_default() -> Vec<InitialState> {
- vec![InitialState(State::Data)]
-}
-
-#[derive(Deserialize)]
-#[serde(rename_all = "camelCase")]
-struct Test {
- description: String,
- input: String,
- output: ExpectedOutputTokens,
- #[serde(default = "initial_states_default")]
- initial_states: Vec<InitialState>,
- #[serde(default)]
- double_escaped: bool,
- #[serde(default)]
- last_start_tag: Option<String>,
- #[serde(default)]
- errors: Vec<ParseError>,
-}
-
-#[derive(Debug, Eq, PartialEq)]
-struct ParseErrorInner(Error);
+use std::{fs::File, io::BufReader, path::Path};
-impl<'de> Deserialize<'de> for ParseErrorInner {
- fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
- where
- D: serde::Deserializer<'de>,
- {
- let str_err = String::deserialize(deserializer)?;
- let err: Error = str_err
- .parse()
- .map_err(|_| D::Error::custom(&format!("failed to deserialize error: {}", str_err)))?;
- Ok(ParseErrorInner(err))
- }
-}
-
-#[derive(Deserialize, Debug, Eq, PartialEq)]
-#[serde(rename_all = "camelCase")]
-struct ParseError {
- code: ParseErrorInner,
- // TODO: lineno and column?
-}
-
-#[derive(Deserialize)]
-struct Tests {
- tests: Vec<Test>,
-}
+use html5lib_tests::{parse_tests, InitialState, ParseError, ParseErrorInner, Test};
+use html5tokenizer::{InternalState, Reader, Token, Tokenizer};
+use pretty_assertions::assert_eq;
/// Path to a local checkout of [html5lib-tests], relative to the
/// directory containing the `Cargo.toml` file of the current crate.
@@ -264,38 +60,13 @@ fn test_tokenizer_file(path: &Path) {
}
}
-fn undo_double_escaping(mut test: Test) -> Test {
- test.input = if test.double_escaped {
- unescape(&test.input)
- } else {
- test.input
- };
-
- test.output = if test.double_escaped {
- ExpectedOutputTokens(
- test.output
- .0
- .into_iter()
- .map(|token| match token {
- Token::String(x) => Token::String(unescape(&x)),
- Token::Comment(x) => Token::Comment(unescape(&x)),
- token => token,
- })
- .collect(),
- )
- } else {
- ExpectedOutputTokens(test.output.0)
- };
- test
-}
-
fn run_test(fname: &str, test_i: usize, test: Test) {
for state in &test.initial_states {
run_test_inner(
fname,
test_i,
&test,
- state.0,
+ state,
Tokenizer::new(&test.input),
"string",
);
@@ -304,7 +75,7 @@ fn run_test(fname: &str, test_i: usize, test: Test) {
fname,
test_i,
&test,
- state.0,
+ state,
Tokenizer::new(BufReader::new(test.input.as_bytes())),
"bufread",
);
@@ -315,7 +86,7 @@ fn run_test_inner<R: Reader>(
fname: &str,
test_i: usize,
test: &Test,
- state: State,
+ state: &InitialState,
mut tokenizer: Tokenizer<R>,
tokenizer_info: &str,
) {
@@ -324,7 +95,14 @@ fn run_test_inner<R: Reader>(
fname, test_i, state, tokenizer_info,
);
println!("description: {}", test.description);
- tokenizer.set_internal_state(state);
+ tokenizer.set_internal_state(match state {
+ InitialState::Data => InternalState::Data,
+ InitialState::PlainText => InternalState::PlainText,
+ InitialState::RcData => InternalState::RcData,
+ InitialState::RawText => InternalState::RawText,
+ InitialState::ScriptData => InternalState::ScriptData,
+ InitialState::CdataSection => InternalState::CdataSection,
+ });
if let Some(last_start_tag) = &test.last_start_tag {
tokenizer.set_last_start_tag(last_start_tag);
}
@@ -348,42 +126,3 @@ fn run_test_inner<R: Reader>(
assert_eq!(test.errors, actual_errors);
}
-
-/// Implements the escape sequences described in the tokenizer tests of html5lib-tests (and nothing
-/// more)
-fn unescape(data: &str) -> String {
- let mut stream = data.chars();
- let mut rv = String::new();
-
- loop {
- match stream.next() {
- Some('\\') => (),
- Some(x) => {
- rv.push(x);
- continue;
- }
- None => break,
- }
-
- match stream.next() {
- Some('u') => (),
- x => panic!("unexpected escape: {:?}", x),
- }
-
- let orig_len = rv.len();
-
- for _ in 0..4 {
- rv.push(match stream.next() {
- Some(x) => x,
- None => panic!("unexpected eof after \\u"),
- });
- }
-
- let c = u32::from_str_radix(&rv[orig_len..], 16).expect("failed to parse as hex");
- let c = char::from_u32(c).expect("bad character");
- rv.truncate(orig_len);
- rv.push(c);
- }
-
- rv
-}