use std::{fs::File, io::BufReader, ops::Range, path::Path};

use html5lib_tests::{
    parse_tests, Error as TestError, InitialState, Output, Test, Token as TestToken,
};
use html5tokenizer::{
    offset::{Offset, PosTrackingReader, Position},
    reader::Reader,
    BasicEmitter, Emitter, Error, Event, InternalState, Token, Tokenizer, TracingEmitter,
};
use similar_asserts::assert_eq;

/// Path to a local checkout of [html5lib-tests], relative to the
/// directory containing the `Cargo.toml` file of the current crate.
///
/// [html5lib-tests]: https://github.com/html5lib/html5lib-tests
const HTML5LIB_TESTS_PATH: &str = "html5lib-tests";

// FUTURE: it would be nice to assert that HTML5LIB_TESTS_PATH matches the
// path defined in .gitmodules, but this is currently blocked by:
// * Cargo not setting CARGO_WORKSPACE_DIR (see https://github.com/rust-lang/cargo/issues/3946)
// * gix-config having more dependencies than I'd want to add for this

#[test]
fn tokenizer() {
    // TODO: use a custom test harness with e.g. libtest-mimic
    let test_dir = format!("{HTML5LIB_TESTS_PATH}/tokenizer");

    let mut test_paths = glob::glob(&format!("{test_dir}/*.test"))
        .unwrap()
        .peekable();

    if test_paths.peek().is_none() {
        panic!(
            "could not find any .test files in {}, maybe try `git submodule update --init`",
            test_dir
        );
    }

    for test_path in test_paths {
        let test_path = test_path.unwrap();
        test_tokenizer_file(&test_path);
    }
}

fn test_tokenizer_file(path: &Path) {
    let fname = path.file_name().unwrap().to_str().unwrap();

    if matches!(
        fname,
        // We don't implement the "Coercing an HTML DOM into an infoset" section.
        "xmlViolation.test" |
        // Our parser does not operate on bytes; the input isn't valid as a Rust &str.
        "unicodeCharsProblematic.test"
    ) {
        return;
    }

    let f = File::open(path).unwrap();
    let bf = BufReader::new(f);
    let tests = parse_tests(bf).expect(&format!("failed to parse {path:?}"));

    for (i, test) in tests.into_iter().enumerate() {
        run_test(fname, i, test);
    }
}

fn run_test(fname: &str, test_i: usize, test: Test) {
    for state in &test.initial_states {
        run_test_inner(
            fname,
            test_i,
            &test,
            state,
            Tokenizer::new(&test.input, BasicEmitter::default()),
            "BasicEmitter string",
        );

        run_test_inner(
            fname,
            test_i,
            &test,
            state,
            Tokenizer::new(
                BufReader::new(test.input.as_bytes()),
                BasicEmitter::default(),
            ),
            "BasicEmitter bufread",
        );

        run_test_inner(
            fname,
            test_i,
            &test,
            state,
            Tokenizer::new(
                PosTrackingReader::new(&test.input),
                TracingEmitter::default(),
            ),
            "TracingEmitter string",
        );

        run_test_inner(
            fname,
            test_i,
            &test,
            state,
            Tokenizer::new(
                PosTrackingReader::new(BufReader::new(test.input.as_bytes())),
                TracingEmitter::default(),
            ),
            "TracingEmitter bufread",
        );
    }
}
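
/// Runs a single html5lib test case against one reader/emitter combination,
/// asserting that the emitted tokens and errors match the expected output.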
fn run_test_inner<R, O, E, T>(
    fname: &str,
    test_i: usize,
    test: &Test,
    state: &InitialState,
    mut tokenizer: Tokenizer<R, O, E>,
    tokenizer_info: &str,
) where
    R: Reader + Position<O>,
    O: Offset,
    E: Emitter<O> + Iterator<Item = T> + DrainErrors<O>,
    T: Into<Token>,
{
    println!(
        "==== FILE {}, TEST {}, STATE {:?}, TOKENIZER {} ====",
        fname, test_i, state, tokenizer_info,
    );
    println!("description: {}", test.description);

    tokenizer.set_internal_state(match state {
        InitialState::Data => InternalState::Data,
        InitialState::PlainText => InternalState::PlainText,
        InitialState::RcData => InternalState::RcData,
        InitialState::RawText => InternalState::RawText,
        InitialState::ScriptData => InternalState::ScriptData,
        InitialState::CdataSection => InternalState::CdataSection,
    });

    if let Some(last_start_tag) = &test.last_start_tag {
        tokenizer.set_last_start_tag(last_start_tag);
    }

    let mut actual_tokens = Vec::new();

    // `while let` instead of a `for` loop, since handling `Event::CdataOpen`
    // requires mutable access to the tokenizer within the loop body.
    while let Some(event) = tokenizer.next() {
        let token = match event.unwrap() {
            Event::CdataOpen => {
                // The tokenizer tests never involve foreign content,
                // so `<![CDATA[` is always treated as a bogus comment.
                tokenizer.handle_cdata_open(false);
                continue;
            }
            Event::Token(token) => token.into(),
        };

        match token {
            Token::StartTag(tag) => actual_tokens.push(TestToken::StartTag {
                name: tag.name,
                attributes: tag
                    .attributes
                    .into_iter()
                    .map(|attr| (attr.name, attr.value))
                    .collect(),
                self_closing: tag.self_closing,
            }),
            Token::EndTag(tag) => actual_tokens.push(TestToken::EndTag { name: tag.name }),
            Token::Char(c) => {
                // Coalesce all adjacent character tokens into a single string.
                if let Some(TestToken::Character(s)) = actual_tokens.last_mut() {
                    s.push(c);
                } else {
                    actual_tokens.push(TestToken::Character(c.into()));
                }
            }
            Token::Comment(comment) => actual_tokens.push(TestToken::Comment(comment)),
            Token::Doctype(doctype) => actual_tokens.push(TestToken::Doctype {
                name: doctype.name,
                public_id: doctype.public_id,
                system_id: doctype.system_id,
                force_quirks: doctype.force_quirks,
            }),
            Token::EndOfFile => {}
        };
    }

    assert_eq!(
        Output {
            errors: tokenizer
                .emitter_mut()
                .drain_errors()
                .map(|(e, _)| TestError {
                    code: e.code().to_string()
                })
                .collect(),
            tokens: actual_tokens,
        },
        test.output,
    );
}

/// Unifies the inherent `drain_errors` methods of the two emitters behind
/// a boxed iterator, so that `run_test_inner` can be generic over both.
trait DrainErrors<O> {
    fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<O>)> + '_>;
}

impl<O: Offset> DrainErrors<O> for BasicEmitter<O> {
    fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<O>)> + '_> {
        Box::new(self.drain_errors())
    }
}

impl DrainErrors<usize> for TracingEmitter {
    fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<usize>)> + '_> {
        Box::new(self.drain_errors())
    }
}
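
// Added smoke test (not part of the html5lib suite): a minimal sketch that
// drives the tokenizer directly, assuming only the string-reader +
// `BasicEmitter` API already exercised by the harness above. The test name
// is ours, not the crate's.
#[test]
fn basic_emitter_smoke() {
    let mut tokenizer = Tokenizer::new("<p>hi</p>", BasicEmitter::default());
    let mut text = String::new();

    while let Some(event) = tokenizer.next() {
        match event.unwrap() {
            // As above: no foreign content, so `<![CDATA[` would be a bogus comment.
            Event::CdataOpen => tokenizer.handle_cdata_open(false),
            Event::Token(token) => {
                // Collect only the character tokens between the two tags.
                let token: Token = token.into();
                if let Token::Char(c) = token {
                    text.push(c);
                }
            }
        }
    }

    assert_eq!(text, "hi");
}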