aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--html5lib_tests/Cargo.toml1
-rw-r--r--html5lib_tests/src/lib.rs237
-rw-r--r--integration_tests/tests/test_html5lib.rs44
3 files changed, 137 insertions, 145 deletions
diff --git a/html5lib_tests/Cargo.toml b/html5lib_tests/Cargo.toml
index 06fc6b8..66e4624 100644
--- a/html5lib_tests/Cargo.toml
+++ b/html5lib_tests/Cargo.toml
@@ -11,4 +11,3 @@ publish = false # prevent accidental publishes until it's ready to be published
[dependencies]
serde = { version = "1.0.130", features = ["derive"] }
serde_json = "1.0.71"
-html5tokenizer = { path = ".." } # TODO: get rid of this dependency
diff --git a/html5lib_tests/src/lib.rs b/html5lib_tests/src/lib.rs
index c007317..6cf46db 100644
--- a/html5lib_tests/src/lib.rs
+++ b/html5lib_tests/src/lib.rs
@@ -1,5 +1,4 @@
-use html5tokenizer::{Attribute, Doctype, EndTag, Error, StartTag, Token};
-use serde::{de::Error as _, Deserialize};
+use serde::{de, Deserialize};
use std::collections::BTreeMap;
pub fn parse_tests(
@@ -9,120 +8,6 @@ pub fn parse_tests(
Ok(tests.into_iter().map(undo_double_escaping))
}
-pub struct ExpectedOutputTokens(pub Vec<Token<()>>);
-
-impl<'de> Deserialize<'de> for ExpectedOutputTokens {
- fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
- where
- D: serde::Deserializer<'de>,
- {
- // this macro is a horrible way to define a type that deserializes only from a particular
- // string. Together with serde(untagged) this gives us really flexible enum tagging with really
- // terrible error messages.
- macro_rules! def_const {
- ($str:expr, $ty:ident) => {
- #[derive(Deserialize)]
- enum $ty {
- #[serde(rename = $str)]
- $ty,
- }
- };
- }
-
- def_const!("DOCTYPE", DoctypeConst);
- def_const!("StartTag", StartTagConst);
- def_const!("EndTag", EndTagConst);
- def_const!("Comment", CommentConst);
- def_const!("Character", CharacterConst);
-
- type Attributes = BTreeMap<String, String>;
-
- #[derive(Deserialize)]
- #[serde(untagged)]
- enum OutputToken {
- // "DOCTYPE", name, public_id, system_id, correctness
- Doctype(
- DoctypeConst,
- Option<String>,
- Option<String>,
- Option<String>,
- bool,
- ),
- // "StartTag", name, attributes, self_closing
- StartTag(StartTagConst, String, Attributes),
- StartTag2(StartTagConst, String, Attributes, bool),
- // "EndTag", name
- EndTag(EndTagConst, String),
- // "Comment", data
- Comment(CommentConst, String),
- // "Character", data
- Character(CharacterConst, String),
- }
-
- Ok(ExpectedOutputTokens(
- Vec::deserialize(deserializer)?
- .into_iter()
- .map(|output_token| match output_token {
- OutputToken::Doctype(
- _,
- name,
- public_identifier,
- system_identifier,
- correctness,
- ) => Token::Doctype(Doctype {
- name: name.unwrap_or_default(),
- public_identifier,
- system_identifier,
- force_quirks: !correctness,
- }),
- OutputToken::StartTag(_, name, attributes) => Token::StartTag(StartTag {
- self_closing: false,
- name,
- attributes: attributes
- .into_iter()
- .map(|(k, v)| {
- (
- k,
- Attribute {
- value: v,
- ..Default::default()
- },
- )
- })
- .collect(),
- name_span: (),
- }),
- OutputToken::StartTag2(_, name, attributes, self_closing) => {
- Token::StartTag(StartTag {
- self_closing,
- name,
- attributes: attributes
- .into_iter()
- .map(|(k, v)| {
- (
- k,
- Attribute {
- value: v,
- ..Default::default()
- },
- )
- })
- .collect(),
- name_span: (),
- })
- }
- OutputToken::EndTag(_, name) => Token::EndTag(EndTag {
- name,
- name_span: (),
- }),
- OutputToken::Comment(_, data) => Token::Comment(data),
- OutputToken::Character(_, data) => Token::String(data),
- })
- .collect::<Vec<Token<()>>>(),
- ))
- }
-}
-
#[derive(Debug, Deserialize)]
pub enum InitialState {
#[serde(rename = "Data state")]
@@ -148,37 +33,127 @@ fn initial_states_default() -> Vec<InitialState> {
pub struct Test {
pub description: String,
pub input: String,
- pub output: ExpectedOutputTokens,
#[serde(default = "initial_states_default")]
pub initial_states: Vec<InitialState>,
+ #[serde(flatten)]
+ pub output: Output,
#[serde(default)]
double_escaped: bool,
#[serde(default)]
pub last_start_tag: Option<String>,
+}
+
+/// The errors and tokens of one test, kept together so that the expected and the
+/// actual tokenizer output can be compared with a single equality check.
+#[derive(Deserialize, PartialEq, Eq, Debug)]
+pub struct Output {
+ /// Parse errors; an empty list when the key is absent from the test description.
#[serde(default)]
- pub errors: Vec<ParseError>,
+ pub errors: Vec<Error>,
+ /// Tokens, read from the key named `output` in the test description.
+ #[serde(rename = "output")]
+ pub tokens: Vec<Token>,
}
-#[derive(Debug, Eq, PartialEq)]
-pub struct ParseErrorInner(pub Error);
+/// A token in the shape used by the test files. It is defined in this crate so that
+/// the test data can be deserialized without depending on the tokenizer's own types.
+#[derive(Debug, PartialEq, Eq)]
+pub enum Token {
+ /// Deserialized from `["DOCTYPE", name, public_id, system_id, correctness]`.
+ Doctype {
+ name: Option<String>,
+ public_id: Option<String>,
+ system_id: Option<String>,
+ /// Negation of the `correctness` element of the array form.
+ force_quirks: bool,
+ },
+ /// Deserialized from `["StartTag", name, attributes]` with an optional trailing
+ /// self-closing flag.
+ StartTag {
+ name: String,
+ attributes: BTreeMap<String, String>,
+ /// `false` when the array form omits the self-closing element.
+ self_closing: bool,
+ },
+ /// Deserialized from `["EndTag", name]`.
+ EndTag {
+ name: String,
+ },
+ /// Deserialized from `["Comment", data]`.
+ Comment(String),
+ /// Deserialized from `["Character", data]`.
+ Character(String),
+}
-impl<'de> Deserialize<'de> for ParseErrorInner {
+// A token is described by a sequence whose first element names the token type, so
+// deserialization is delegated to `TokenVisitor` through `deserialize_seq`.
+impl<'de> Deserialize<'de> for Token {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
- let str_err = String::deserialize(deserializer)?;
- let err: Error = str_err
- .parse()
- .map_err(|_| D::Error::custom(&format!("failed to deserialize error: {}", str_err)))?;
- Ok(ParseErrorInner(err))
+ deserializer.deserialize_seq(TokenVisitor)
+ }
+}
+
+/// The tag stored as the first element of a token array; it selects which
+/// `Token` variant the remaining elements are read into.
+#[derive(Deserialize)]
+enum TokenType {
+ // The only tag whose spelling differs from the variant name.
+ #[serde(rename = "DOCTYPE")]
+ Doctype,
+ StartTag,
+ EndTag,
+ Comment,
+ Character,
+}
+
+/// Builds a [`Token`] from its array form: the first element names the token
+/// type and the remaining elements are that token's fields, in order.
+struct TokenVisitor;
+
+impl<'de> de::Visitor<'de> for TokenVisitor {
+    type Value = Token;
+
+    fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+        formatter.write_str("an array describing a token")
+    }
+
+    /// Reads the type tag and then the fields of the matching token.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the tag is absent or unknown, when a required element is
+    /// absent, or when an element has the wrong type. The trailing
+    /// self-closing flag of a start tag is optional and defaults to `false`.
+    fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
+    where
+        A: serde::de::SeqAccess<'de>,
+    {
+        // Error values are built inside closures so that nothing is constructed
+        // (and immediately dropped) when the element is present.
+        let typ: TokenType = seq.next_element()?.ok_or_else(|| {
+            de::Error::custom(
+                r#"expected first array element to be one of "DOCTYPE", "StartTag", "EndTag", "Comment" or "Character""#,
+            )
+        })?;
+
+        Ok(match typ {
+            TokenType::Doctype => Token::Doctype {
+                name: seq
+                    .next_element()?
+                    .ok_or_else(|| de::Error::missing_field("name"))?,
+                public_id: seq
+                    .next_element()?
+                    .ok_or_else(|| de::Error::missing_field("public_id"))?,
+                system_id: seq
+                    .next_element()?
+                    .ok_or_else(|| de::Error::missing_field("system_id"))?,
+                // The array stores "correctness"; the token stores its negation.
+                force_quirks: !seq
+                    .next_element()?
+                    .ok_or_else(|| de::Error::missing_field("correctness"))?,
+            },
+            TokenType::StartTag => Token::StartTag {
+                name: seq
+                    .next_element()?
+                    .ok_or_else(|| de::Error::missing_field("name"))?,
+                attributes: seq
+                    .next_element()?
+                    .ok_or_else(|| de::Error::missing_field("attributes"))?,
+                // An absent self-closing element means "not self-closing".
+                self_closing: seq.next_element()?.unwrap_or_default(),
+            },
+            TokenType::EndTag => Token::EndTag {
+                name: seq
+                    .next_element()?
+                    .ok_or_else(|| de::Error::missing_field("name"))?,
+            },
+            TokenType::Comment => Token::Comment(
+                seq.next_element()?
+                    .ok_or_else(|| de::Error::missing_field("data"))?,
+            ),
+            TokenType::Character => Token::Character(
+                seq.next_element()?
+                    .ok_or_else(|| de::Error::missing_field("data"))?,
+            ),
+        })
}
}
+/// A parse error listed in a test description, identified only by its `code`
+/// string so that it can be compared with the string form of a tokenizer error.
#[derive(Deserialize, Debug, Eq, PartialEq)]
#[serde(rename_all = "camelCase")]
-pub struct ParseError {
- pub code: ParseErrorInner,
+pub struct Error {
+ pub code: String,
// TODO: lineno and column?
}
@@ -191,12 +166,12 @@ fn undo_double_escaping(mut test: Test) -> Test {
if test.double_escaped {
test.input = unescape(&test.input);
- test.output.0 = test
+ test.output.tokens = test
.output
- .0
+ .tokens
.into_iter()
.map(|token| match token {
- Token::String(x) => Token::String(unescape(&x)),
+ Token::Character(x) => Token::Character(unescape(&x)),
Token::Comment(x) => Token::Comment(unescape(&x)),
token => token,
})
diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs
index 3236f0f..23adec0 100644
--- a/integration_tests/tests/test_html5lib.rs
+++ b/integration_tests/tests/test_html5lib.rs
@@ -1,6 +1,8 @@
use std::{fs::File, io::BufReader, path::Path};
-use html5lib_tests::{parse_tests, InitialState, ParseError, ParseErrorInner, Test};
+use html5lib_tests::{
+ parse_tests, Error as TestError, InitialState, Output, Test, Token as TestToken,
+};
use html5tokenizer::{InternalState, Reader, Token, Tokenizer};
use pretty_assertions::assert_eq;
@@ -107,22 +109,38 @@ fn run_test_inner<R: Reader>(
tokenizer.set_last_start_tag(last_start_tag);
}
- let mut actual_tokens = Vec::new();
- let mut actual_errors = Vec::new();
+ let mut actual = Output {
+ errors: Vec::new(),
+ tokens: Vec::new(),
+ };
for token in tokenizer {
let token = token.unwrap();
- if let Token::Error { error, .. } = token {
- actual_errors.push(ParseError {
- code: ParseErrorInner(error),
- });
- } else {
- actual_tokens.push(token);
- }
+ match token {
+ Token::Error { error, .. } => actual.errors.push(TestError {
+ code: error.to_string(),
+ }),
+ Token::StartTag(tag) => actual.tokens.push(TestToken::StartTag {
+ name: tag.name,
+ attributes: tag
+ .attributes
+ .into_iter()
+ .map(|(name, map_val)| (name, map_val.value))
+ .collect(),
+ self_closing: tag.self_closing,
+ }),
+ Token::EndTag(tag) => actual.tokens.push(TestToken::EndTag { name: tag.name }),
+ Token::String(data) => actual.tokens.push(TestToken::Character(data)),
+ Token::Comment(data) => actual.tokens.push(TestToken::Comment(data)),
+ Token::Doctype(doctype) => actual.tokens.push(TestToken::Doctype {
+ name: Some(doctype.name).filter(|name| !name.is_empty()),
+ public_id: doctype.public_identifier,
+ system_id: doctype.system_identifier,
+ force_quirks: doctype.force_quirks,
+ }),
+ };
}
- assert_eq!(test.output.0, actual_tokens);
-
- assert_eq!(test.errors, actual_errors);
+ assert_eq!(test.output, actual);
}