Diffstat (limited to 'integration_tests/tests/test_html5lib.rs')
-rw-r--r--  integration_tests/tests/test_html5lib.rs  |  218
1 file changed, 0 insertions(+), 218 deletions(-)
diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs
deleted file mode 100644
index 2d3e4cb..0000000
--- a/integration_tests/tests/test_html5lib.rs
+++ /dev/null
@@ -1,218 +0,0 @@
-use std::{fs::File, io::BufReader, ops::Range, path::Path};
-
-use html5lib_tests::{
-    parse_tests, Error as TestError, InitialState, Output, Test, Token as TestToken,
-};
-use html5tokenizer::{
-    offset::{Offset, PosTrackingReader, Position},
-    reader::Reader,
-    BasicEmitter, Emitter, Error, Event, InternalState, Token, Tokenizer, TracingEmitter,
-};
-use similar_asserts::assert_eq;
-
-/// Path to a local checkout of [html5lib-tests], relative to the
-/// directory containing the `Cargo.toml` file of the current crate.
-///
-/// [html5lib-tests]: https://github.com/html5lib/html5lib-tests
-const HTML5LIB_TESTS_PATH: &str = "html5lib-tests";
-
-// FUTURE: it would be nice to assert that HTML5LIB_TESTS_PATH matches the path defined in .gitmodules
-// but this is currently blocked by:
-// * Cargo not setting CARGO_WORKSPACE_DIR (see https://github.com/rust-lang/cargo/issues/3946)
-// * gix-config having more dependencies than I'd want to add for this
-
-#[test]
-fn tokenizer() {
-    // TODO: use a custom test harness with e.g. libtest-mimic
-    let test_dir = format!("{HTML5LIB_TESTS_PATH}/tokenizer");
-
-    let mut test_paths = glob::glob(&format!("{test_dir}/*.test"))
-        .unwrap()
-        .peekable();
-
-    if test_paths.peek().is_none() {
-        panic!(
-            "could not find any .test files in {}, maybe try `git submodule update --init`",
-            test_dir
-        );
-    }
-
-    for test_path in test_paths {
-        let test_path = test_path.unwrap();
-
-        test_tokenizer_file(&test_path);
-    }
-}
-
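-/// Runs all tokenizer tests from a single html5lib-tests `.test` file.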
-fn test_tokenizer_file(path: &Path) {
-    let fname = path.file_name().unwrap().to_str().unwrap();
-
-    if matches!(
-        fname,
-        // We don't implement the "Coercing an HTML DOM into an infoset" section.
-        "xmlViolation.test" |
-        // Our parser does not operate on bytes; these test inputs aren't valid Rust &str.
-        "unicodeCharsProblematic.test"
-    ) {
-        return;
-    }
-
-    let f = File::open(path).unwrap();
-    let bf = BufReader::new(f);
-    let tests = parse_tests(bf).expect(&format!("failed to parse {path:?}"));
-
-    for (i, test) in tests.into_iter().enumerate() {
-        run_test(fname, i, test);
-    }
-}
-
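-/// Runs a single test under every initial state it requests, once for each
-/// supported reader/emitter combination.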
-fn run_test(fname: &str, test_i: usize, test: Test) {
-    for state in &test.initial_states {
-        run_test_inner(
-            fname,
-            test_i,
-            &test,
-            state,
-            Tokenizer::new(&test.input, BasicEmitter::default()),
-            "BasicEmitter string",
-        );
-
-        run_test_inner(
-            fname,
-            test_i,
-            &test,
-            state,
-            Tokenizer::new(
-                BufReader::new(test.input.as_bytes()),
-                BasicEmitter::default(),
-            ),
-            "BasicEmitter bufread",
-        );
-
-        run_test_inner(
-            fname,
-            test_i,
-            &test,
-            state,
-            Tokenizer::new(
-                PosTrackingReader::new(&test.input),
-                TracingEmitter::default(),
-            ),
-            "TracingEmitter string",
-        );
-
-        run_test_inner(
-            fname,
-            test_i,
-            &test,
-            state,
-            Tokenizer::new(
-                PosTrackingReader::new(BufReader::new(test.input.as_bytes())),
-                TracingEmitter::default(),
-            ),
-            "TracingEmitter bufread",
-        );
-    }
-}
-
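-/// Tokenizes the test input with the given tokenizer and asserts that the
-/// emitted tokens and errors match the test's expected output.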
-fn run_test_inner<R, O, E, T>(
-    fname: &str,
-    test_i: usize,
-    test: &Test,
-    state: &InitialState,
-    mut tokenizer: Tokenizer<R, O, E>,
-    tokenizer_info: &str,
-) where
-    R: Reader + Position<O>,
-    O: Offset,
-    E: Emitter<O> + Iterator<Item = T> + DrainErrors<O>,
-    T: Into<Token>,
-{
-    println!(
-        "==== FILE {}, TEST {}, STATE {:?}, TOKENIZER {} ====",
-        fname, test_i, state, tokenizer_info,
-    );
-    println!("description: {}", test.description);
-    tokenizer.set_internal_state(match state {
-        InitialState::Data => InternalState::Data,
-        InitialState::Plaintext => InternalState::Plaintext,
-        InitialState::Rcdata => InternalState::Rcdata,
-        InitialState::Rawtext => InternalState::Rawtext,
-        InitialState::ScriptData => InternalState::ScriptData,
-        InitialState::CdataSection => InternalState::CdataSection,
-    });
-    if let Some(last_start_tag) = &test.last_start_tag {
-        tokenizer.set_last_start_tag(last_start_tag);
-    }
-
-    let mut actual_tokens = Vec::new();
-
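-    // Drive the tokenizer by hand: a CdataOpen event asks the caller to
-    // decide (via handle_cdata_open) how `<![CDATA[` should be tokenized.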
-    while let Some(event) = tokenizer.next() {
-        let token = match event.unwrap() {
-            Event::CdataOpen => {
-                // The test inputs never contain foreign content,
-                // the only place where CDATA sections are allowed.
-                tokenizer.handle_cdata_open(false);
-                continue;
-            }
-            Event::Token(token) => token.into(),
-        };
-
-        match token {
-            Token::StartTag(tag) => actual_tokens.push(TestToken::StartTag {
-                name: tag.name,
-                attributes: tag
-                    .attributes
-                    .into_iter()
-                    .map(|attr| (attr.name, attr.value))
-                    .collect(),
-                self_closing: tag.self_closing,
-            }),
-            Token::EndTag(tag) => actual_tokens.push(TestToken::EndTag { name: tag.name }),
-            Token::Char(c) => {
-                // Coalesce all adjacent character tokens into a single string.
-                if let Some(TestToken::Character(s)) = actual_tokens.last_mut() {
-                    s.push(c);
-                } else {
-                    actual_tokens.push(TestToken::Character(c.into()));
-                }
-            }
-            Token::Comment(comment) => actual_tokens.push(TestToken::Comment(comment)),
-            Token::Doctype(doctype) => actual_tokens.push(TestToken::Doctype {
-                name: doctype.name,
-                public_id: doctype.public_id,
-                system_id: doctype.system_id,
-                force_quirks: doctype.force_quirks,
-            }),
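-            // The expected outputs of html5lib-tests don't list an EOF token.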
-            Token::EndOfFile => {}
-        };
-    }
-
-    assert_eq!(
-        Output {
-            errors: tokenizer
-                .emitter_mut()
-                .drain_errors()
-                .map(|(e, _)| TestError {
-                    code: e.code().to_string()
-                })
-                .collect(),
-            tokens: actual_tokens,
-        },
-        test.output,
-    );
-}
-
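-/// Unifies the inherent `drain_errors` methods of [`BasicEmitter`] and
-/// [`TracingEmitter`] under one interface so `run_test_inner` can stay
-/// generic over the emitter.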
-trait DrainErrors<O> {
-    fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<O>)> + '_>;
-}
-
-impl<O> DrainErrors<O> for BasicEmitter<O> {
-    fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<O>)> + '_> {
-        Box::new(self.drain_errors())
-    }
-}
-
-impl DrainErrors<usize> for TracingEmitter {
-    fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<usize>)> + '_> {
-        Box::new(self.drain_errors())
-    }
-}