use std::{fs::File, io::BufReader, path::Path};
use html5lib_tests::{parse_tests, InitialState, ParseError, ParseErrorInner, Test};
use html5tokenizer::{InternalState, Reader, Token, Tokenizer};
use pretty_assertions::assert_eq;

/// Path to a local checkout of [html5lib-tests], relative to the
/// directory containing the `Cargo.toml` file of the current crate.
///
/// [html5lib-tests]: https://github.com/html5lib/html5lib-tests
const HTML5LIB_TESTS_PATH: &str = "html5lib-tests";

// FUTURE: it would be nice to assert that HTML5LIB_TESTS_PATH matches the path defined in .gitmodules
// but this is currently blocked by:
// * Cargo not setting CARGO_WORKSPACE_DIR (see https://github.com/rust-lang/cargo/issues/3946)
// * gix-config having more dependencies than I'd want to add for this

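/// Runs the html5lib-tests tokenizer test suite against our tokenizer.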
#[test]
fn tokenizer() {
// TODO: use a custom test harness with e.g. libtest-mimic
let test_dir = format!("{HTML5LIB_TESTS_PATH}/tokenizer");
let mut test_paths = glob::glob(&format!("{test_dir}/*.test"))
.unwrap()
.peekable();
    if test_paths.peek().is_none() {
        panic!(
            "could not find any .test files in {test_dir}, maybe try `git submodule update --init`"
        );
    }
for test_path in test_paths {
let test_path = test_path.unwrap();
test_tokenizer_file(&test_path);
}
}
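
/// Runs all test cases from a single html5lib-tests `.test` file.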
fn test_tokenizer_file(path: &Path) {
let fname = path.file_name().unwrap().to_str().unwrap();
if matches!(
fname,
        // We don't implement the "Coercing an HTML DOM into an infoset" section.
        "xmlViolation.test" |
        // Our tokenizer does not operate on bytes; these test inputs contain
        // invalid UTF-8 and therefore cannot be represented as a Rust &str.
        "unicodeCharsProblematic.test"
) {
return;
}
let f = File::open(path).unwrap();
let bf = BufReader::new(f);
    let tests = parse_tests(bf)
        .unwrap_or_else(|err| panic!("failed to parse {path:?}: {err:?}"));
for (i, test) in tests.into_iter().enumerate() {
run_test(fname, i, test);
}
}
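
/// Runs a single test case in each of its initial states, once with the
/// string-based reader and once with the `BufRead`-based reader.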
fn run_test(fname: &str, test_i: usize, test: Test) {
for state in &test.initial_states {
run_test_inner(
fname,
test_i,
&test,
state,
Tokenizer::new(&test.input),
"string",
);
run_test_inner(
fname,
test_i,
&test,
state,
Tokenizer::new(BufReader::new(test.input.as_bytes())),
"bufread",
);
}
}
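
/// Runs one test case against a concrete tokenizer and asserts that the
/// emitted tokens and errors match the expected output.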
fn run_test_inner<R: Reader>(
fname: &str,
test_i: usize,
test: &Test,
state: &InitialState,
mut tokenizer: Tokenizer<R>,
tokenizer_info: &str,
) {
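    // Printed output is captured by the test harness and only shown for
    // failing tests, which makes the failing case easy to identify.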
println!(
"==== FILE {}, TEST {}, STATE {:?}, TOKENIZER {} ====",
fname, test_i, state, tokenizer_info,
);
println!("description: {}", test.description);
tokenizer.set_internal_state(match state {
InitialState::Data => InternalState::Data,
InitialState::PlainText => InternalState::PlainText,
InitialState::RcData => InternalState::RcData,
InitialState::RawText => InternalState::RawText,
InitialState::ScriptData => InternalState::ScriptData,
InitialState::CdataSection => InternalState::CdataSection,
});
if let Some(last_start_tag) = &test.last_start_tag {
tokenizer.set_last_start_tag(last_start_tag);
}
let mut actual_tokens = Vec::new();
let mut actual_errors = Vec::new();
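
    // The tokenizer reports errors in-band as `Token::Error`; separate them
    // from the regular tokens so both can be compared independently below.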
for token in tokenizer {
let token = token.unwrap();
if let Token::Error { error, .. } = token {
actual_errors.push(ParseError {
code: ParseErrorInner(error),
});
} else {
actual_tokens.push(token);
}
}
assert_eq!(test.output.0, actual_tokens);
assert_eq!(test.errors, actual_errors);
}