diff options
Diffstat (limited to 'integration_tests/tests/tokenizer.rs')
-rw-r--r-- | integration_tests/tests/tokenizer.rs | 218 |
1 files changed, 218 insertions, 0 deletions
diff --git a/integration_tests/tests/tokenizer.rs b/integration_tests/tests/tokenizer.rs new file mode 100644 index 0000000..2d3e4cb --- /dev/null +++ b/integration_tests/tests/tokenizer.rs @@ -0,0 +1,218 @@ +use std::{fs::File, io::BufReader, ops::Range, path::Path}; + +use html5lib_tests::{ + parse_tests, Error as TestError, InitialState, Output, Test, Token as TestToken, +}; +use html5tokenizer::{ + offset::{Offset, PosTrackingReader, Position}, + reader::Reader, + BasicEmitter, Emitter, Error, Event, InternalState, Token, Tokenizer, TracingEmitter, +}; +use similar_asserts::assert_eq; + +/// Path to a local checkout of [html5lib-tests], relative to the +/// directory containing the `Cargo.toml` file of the current crate. +/// +/// [html5lib-tests]: https://github.com/html5lib/html5lib-tests +const HTML5LIB_TESTS_PATH: &str = "html5lib-tests"; + +// FUTURE: it would be nice to assert that HTML5LIB_TESTS_PATH matches the path defined in .gitmodules +// but this is currently blocked by: +// * Cargo not setting CARGO_WORKSPACE_DIR (see https://github.com/rust-lang/cargo/issues/3946) +// * gix-config having more dependencies than I'd want to add for this + +#[test] +fn tokenizer() { + // TODO: use a custom test harness with e.g. libtest-mimic + let test_dir = format!("{HTML5LIB_TESTS_PATH}/tokenizer"); + + let mut test_paths = glob::glob(&format!("{test_dir}/*.test")) + .unwrap() + .peekable(); + + if test_paths.peek().is_none() { + panic!( + "could not find any .test files in {}, maybe try `git submodule update --init`", + test_dir + ); + } + + for test_path in test_paths { + let test_path = test_path.unwrap(); + + test_tokenizer_file(&test_path); + } +} + +fn test_tokenizer_file(path: &Path) { + let fname = path.file_name().unwrap().to_str().unwrap(); + + if matches!( + fname, + // We don't implement "Coercing an HTML DOM into an infoset" section + "xmlViolation.test" | + // Our parser does not operate on bytes, the input isn't valid Rust &str + "unicodeCharsProblematic.test" + ) { + return; + } + + let f = File::open(path).unwrap(); + let bf = BufReader::new(f); + let tests = parse_tests(bf).expect(&format!("failed to parse {path:?}")); + + for (i, test) in tests.into_iter().enumerate() { + run_test(fname, i, test); + } +} + +fn run_test(fname: &str, test_i: usize, test: Test) { + for state in &test.initial_states { + run_test_inner( + fname, + test_i, + &test, + state, + Tokenizer::new(&test.input, BasicEmitter::default()), + "BasicEmitter string", + ); + + run_test_inner( + fname, + test_i, + &test, + state, + Tokenizer::new( + BufReader::new(test.input.as_bytes()), + BasicEmitter::default(), + ), + "BasicEmitter bufread", + ); + + run_test_inner( + fname, + test_i, + &test, + state, + Tokenizer::new( + PosTrackingReader::new(&test.input), + TracingEmitter::default(), + ), + "TracingEmitter string", + ); + + run_test_inner( + fname, + test_i, + &test, + state, + Tokenizer::new( + PosTrackingReader::new(BufReader::new(test.input.as_bytes())), + TracingEmitter::default(), + ), + "TracingEmitter bufread", + ); + } +} + +fn run_test_inner<R, O, E, T>( + fname: &str, + test_i: usize, + test: &Test, + state: &InitialState, + mut tokenizer: Tokenizer<R, O, E>, + tokenizer_info: &str, +) where + R: Reader + Position<O>, + O: Offset, + E: Emitter<O> + Iterator<Item = T> + DrainErrors<O>, + T: Into<Token>, +{ + println!( + "==== FILE {}, TEST {}, STATE {:?}, TOKENIZER {} ====", + fname, test_i, state, tokenizer_info, + ); + println!("description: {}", test.description); + tokenizer.set_internal_state(match state { + InitialState::Data => InternalState::Data, + InitialState::Plaintext => InternalState::Plaintext, + InitialState::Rcdata => InternalState::Rcdata, + InitialState::Rawtext => InternalState::Rawtext, + InitialState::ScriptData => InternalState::ScriptData, + InitialState::CdataSection => InternalState::CdataSection, + }); + if let Some(last_start_tag) = &test.last_start_tag { + tokenizer.set_last_start_tag(last_start_tag); + } + + let mut actual_tokens = Vec::new(); + + while let Some(event) = tokenizer.next() { + let token = match event.unwrap() { + Event::CdataOpen => { + tokenizer.handle_cdata_open(false); + continue; + } + Event::Token(token) => token.into(), + }; + + match token { + Token::StartTag(tag) => actual_tokens.push(TestToken::StartTag { + name: tag.name, + attributes: tag + .attributes + .into_iter() + .map(|attr| (attr.name, attr.value)) + .collect(), + self_closing: tag.self_closing, + }), + Token::EndTag(tag) => actual_tokens.push(TestToken::EndTag { name: tag.name }), + Token::Char(c) => { + // Coalesce all adjacent character tokens into a single string. + if let Some(TestToken::Character(s)) = actual_tokens.last_mut() { + s.push(c); + } else { + actual_tokens.push(TestToken::Character(c.into())); + } + } + Token::Comment(comment) => actual_tokens.push(TestToken::Comment(comment)), + Token::Doctype(doctype) => actual_tokens.push(TestToken::Doctype { + name: doctype.name, + public_id: doctype.public_id, + system_id: doctype.system_id, + force_quirks: doctype.force_quirks, + }), + Token::EndOfFile => {} + }; + } + + assert_eq!( + Output { + errors: tokenizer + .emitter_mut() + .drain_errors() + .map(|(e, _)| TestError { + code: e.code().to_string() + }) + .collect(), + tokens: actual_tokens, + }, + test.output, + ); +} + +trait DrainErrors<O> { + fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<O>)> + '_>; +} + +impl<O> DrainErrors<O> for BasicEmitter<O> { + fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<O>)> + '_> { + Box::new(self.drain_errors()) + } +} + +impl DrainErrors<usize> for TracingEmitter { + fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<usize>)> + '_> { + Box::new(self.drain_errors()) + } +} |