From b027ecdb397c2e378491f847660f8eeb740e8cf6 Mon Sep 17 00:00:00 2001
From: Martin Fischer <martin@push-f.com>
Date: Wed, 27 Sep 2023 09:25:12 +0200
Subject: chore: rename integration tests

---
 integration_tests/tests/test_html5lib.rs | 218 -----------
 integration_tests/tests/tokenizer.rs     | 218 +++++++++++
 tests/spans.rs                           | 626 +++++++++++++++++++++++++++++++
 tests/test_spans.rs                      | 626 -------------------------------
 4 files changed, 844 insertions(+), 844 deletions(-)
 delete mode 100644 integration_tests/tests/test_html5lib.rs
 create mode 100644 integration_tests/tests/tokenizer.rs
 create mode 100644 tests/spans.rs
 delete mode 100644 tests/test_spans.rs

diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs
deleted file mode 100644
index 2d3e4cb..0000000
--- a/integration_tests/tests/test_html5lib.rs
+++ /dev/null
@@ -1,218 +0,0 @@
-use std::{fs::File, io::BufReader, ops::Range, path::Path};
-
-use html5lib_tests::{
-    parse_tests, Error as TestError, InitialState, Output, Test, Token as TestToken,
-};
-use html5tokenizer::{
-    offset::{Offset, PosTrackingReader, Position},
-    reader::Reader,
-    BasicEmitter, Emitter, Error, Event, InternalState, Token, Tokenizer, TracingEmitter,
-};
-use similar_asserts::assert_eq;
-
-/// Path to a local checkout of [html5lib-tests], relative to the
-/// directory containing the `Cargo.toml` file of the current crate.
-///
-/// [html5lib-tests]: https://github.com/html5lib/html5lib-tests
-const HTML5LIB_TESTS_PATH: &str = "html5lib-tests";
-
-// FUTURE: it would be nice to assert that HTML5LIB_TESTS_PATH matches the path defined in .gitmodules
-// but this is currently blocked by:
-// * Cargo not setting CARGO_WORKSPACE_DIR (see https://github.com/rust-lang/cargo/issues/3946)
-// * gix-config having more dependencies than I'd want to add for this
-
-#[test]
-fn tokenizer() {
-    // TODO: use a custom test harness with e.g. libtest-mimic
-    let test_dir = format!("{HTML5LIB_TESTS_PATH}/tokenizer");
-
-    let mut test_paths = glob::glob(&format!("{test_dir}/*.test"))
-        .unwrap()
-        .peekable();
-
-    if test_paths.peek().is_none() {
-        panic!(
-            "could not find any .test files in {}, maybe try `git submodule update --init`",
-            test_dir
-        );
-    }
-
-    for test_path in test_paths {
-        let test_path = test_path.unwrap();
-
-        test_tokenizer_file(&test_path);
-    }
-}
-
-fn test_tokenizer_file(path: &Path) {
-    let fname = path.file_name().unwrap().to_str().unwrap();
-
-    if matches!(
-        fname,
-        // We don't implement "Coercing an HTML DOM into an infoset" section
-        "xmlViolation.test" |
-        // Our parser does not operate on bytes, the input isn't valid Rust &str
-        "unicodeCharsProblematic.test"
-    ) {
-        return;
-    }
-
-    let f = File::open(path).unwrap();
-    let bf = BufReader::new(f);
-    let tests = parse_tests(bf).expect(&format!("failed to parse {path:?}"));
-
-    for (i, test) in tests.into_iter().enumerate() {
-        run_test(fname, i, test);
-    }
-}
-
-fn run_test(fname: &str, test_i: usize, test: Test) {
-    for state in &test.initial_states {
-        run_test_inner(
-            fname,
-            test_i,
-            &test,
-            state,
-            Tokenizer::new(&test.input, BasicEmitter::default()),
-            "BasicEmitter string",
-        );
-
-        run_test_inner(
-            fname,
-            test_i,
-            &test,
-            state,
-            Tokenizer::new(
-                BufReader::new(test.input.as_bytes()),
-                BasicEmitter::default(),
-            ),
-            "BasicEmitter bufread",
-        );
-
-        run_test_inner(
-            fname,
-            test_i,
-            &test,
-            state,
-            Tokenizer::new(
-                PosTrackingReader::new(&test.input),
-                TracingEmitter::default(),
-            ),
-            "TracingEmitter string",
-        );
-
-        run_test_inner(
-            fname,
-            test_i,
-            &test,
-            state,
-            Tokenizer::new(
-                PosTrackingReader::new(BufReader::new(test.input.as_bytes())),
-                TracingEmitter::default(),
-            ),
-            "TracingEmitter bufread",
-        );
-    }
-}
-
-fn run_test_inner<T, R, O, E>(
-    fname: &str,
-    test_i: usize,
-    test: &Test,
-    state: &InitialState,
-    mut tokenizer: Tokenizer<R, O, E>,
-    tokenizer_info: &str,
-) where
-    R: Reader + Position<O>,
-    O: Offset,
-    E: Emitter<O> + Iterator<Item = T> + DrainErrors<O>,
-    T: Into<Token>,
-{
-    println!(
-        "==== FILE {}, TEST {}, STATE {:?}, TOKENIZER {} ====",
-        fname, test_i, state, tokenizer_info,
-    );
-    println!("description: {}", test.description);
-    tokenizer.set_internal_state(match state {
-        InitialState::Data => InternalState::Data,
-        InitialState::Plaintext => InternalState::Plaintext,
-        InitialState::Rcdata => InternalState::Rcdata,
-        InitialState::Rawtext => InternalState::Rawtext,
-        InitialState::ScriptData => InternalState::ScriptData,
-        InitialState::CdataSection => InternalState::CdataSection,
-    });
-    if let Some(last_start_tag) = &test.last_start_tag {
-        tokenizer.set_last_start_tag(last_start_tag);
-    }
-
-    let mut actual_tokens = Vec::new();
-
-    while let Some(event) = tokenizer.next() {
-        let token = match event.unwrap() {
-            Event::CdataOpen => {
-                tokenizer.handle_cdata_open(false);
-                continue;
-            }
-            Event::Token(token) => token.into(),
-        };
-
-        match token {
-            Token::StartTag(tag) => actual_tokens.push(TestToken::StartTag {
-                name: tag.name,
-                attributes: tag
-                    .attributes
-                    .into_iter()
-                    .map(|attr| (attr.name, attr.value))
-                    .collect(),
-                self_closing: tag.self_closing,
-            }),
-            Token::EndTag(tag) => actual_tokens.push(TestToken::EndTag { name: tag.name }),
-            Token::Char(c) => {
-                // Coalesce all adjacent character tokens into a single string.
-                if let Some(TestToken::Character(s)) = actual_tokens.last_mut() {
-                    s.push(c);
-                } else {
-                    actual_tokens.push(TestToken::Character(c.into()));
-                }
-            }
-            Token::Comment(comment) => actual_tokens.push(TestToken::Comment(comment)),
-            Token::Doctype(doctype) => actual_tokens.push(TestToken::Doctype {
-                name: doctype.name,
-                public_id: doctype.public_id,
-                system_id: doctype.system_id,
-                force_quirks: doctype.force_quirks,
-            }),
-            Token::EndOfFile => {}
-        };
-    }
-
-    assert_eq!(
-        Output {
-            errors: tokenizer
-                .emitter_mut()
-                .drain_errors()
-                .map(|(e, _)| TestError {
-                    code: e.code().to_string()
-                })
-                .collect(),
-            tokens: actual_tokens,
-        },
-        test.output,
-    );
-}
-
-trait DrainErrors<O> {
-    fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<O>)> + '_>;
-}
-
-impl<O> DrainErrors<O> for BasicEmitter<O> {
-    fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<O>)> + '_> {
-        Box::new(self.drain_errors())
-    }
-}
-
-impl DrainErrors<usize> for TracingEmitter {
-    fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<usize>)> + '_> {
-        Box::new(self.drain_errors())
-    }
-}
diff --git a/integration_tests/tests/tokenizer.rs b/integration_tests/tests/tokenizer.rs
new file mode 100644
index 0000000..2d3e4cb
--- /dev/null
+++ b/integration_tests/tests/tokenizer.rs
@@ -0,0 +1,218 @@
+use std::{fs::File, io::BufReader, ops::Range, path::Path};
+
+use html5lib_tests::{
+    parse_tests, Error as TestError, InitialState, Output, Test, Token as TestToken,
+};
+use html5tokenizer::{
+    offset::{Offset, PosTrackingReader, Position},
+    reader::Reader,
+    BasicEmitter, Emitter, Error, Event, InternalState, Token, Tokenizer, TracingEmitter,
+};
+use similar_asserts::assert_eq;
+
+/// Path to a local checkout of [html5lib-tests], relative to the
+/// directory containing the `Cargo.toml` file of the current crate.
+///
+/// [html5lib-tests]: https://github.com/html5lib/html5lib-tests
+const HTML5LIB_TESTS_PATH: &str = "html5lib-tests";
+
+// FUTURE: it would be nice to assert that HTML5LIB_TESTS_PATH matches the path defined in .gitmodules
+// but this is currently blocked by:
+// * Cargo not setting CARGO_WORKSPACE_DIR (see https://github.com/rust-lang/cargo/issues/3946)
+// * gix-config having more dependencies than I'd want to add for this
+
+#[test]
+fn tokenizer() {
+    // TODO: use a custom test harness with e.g. libtest-mimic
+    let test_dir = format!("{HTML5LIB_TESTS_PATH}/tokenizer");
+
+    let mut test_paths = glob::glob(&format!("{test_dir}/*.test"))
+        .unwrap()
+        .peekable();
+
+    if test_paths.peek().is_none() {
+        panic!(
+            "could not find any .test files in {}, maybe try `git submodule update --init`",
+            test_dir
+        );
+    }
+
+    for test_path in test_paths {
+        let test_path = test_path.unwrap();
+
+        test_tokenizer_file(&test_path);
+    }
+}
+
+fn test_tokenizer_file(path: &Path) {
+    let fname = path.file_name().unwrap().to_str().unwrap();
+
+    if matches!(
+        fname,
+        // We don't implement "Coercing an HTML DOM into an infoset" section
+        "xmlViolation.test" |
+        // Our parser does not operate on bytes, the input isn't valid Rust &str
+        "unicodeCharsProblematic.test"
+    ) {
+        return;
+    }
+
+    let f = File::open(path).unwrap();
+    let bf = BufReader::new(f);
+    let tests = parse_tests(bf).expect(&format!("failed to parse {path:?}"));
+
+    for (i, test) in tests.into_iter().enumerate() {
+        run_test(fname, i, test);
+    }
+}
+
+fn run_test(fname: &str, test_i: usize, test: Test) {
+    for state in &test.initial_states {
+        run_test_inner(
+            fname,
+            test_i,
+            &test,
+            state,
+            Tokenizer::new(&test.input, BasicEmitter::default()),
+            "BasicEmitter string",
+        );
+
+        run_test_inner(
+            fname,
+            test_i,
+            &test,
+            state,
+            Tokenizer::new(
+                BufReader::new(test.input.as_bytes()),
+                BasicEmitter::default(),
+            ),
+            "BasicEmitter bufread",
+        );
+
+        run_test_inner(
+            fname,
+            test_i,
+            &test,
+            state,
+            Tokenizer::new(
+                PosTrackingReader::new(&test.input),
+                TracingEmitter::default(),
+            ),
+            "TracingEmitter string",
+        );
+
+        run_test_inner(
+            fname,
+            test_i,
+            &test,
+            state,
+            Tokenizer::new(
+                PosTrackingReader::new(BufReader::new(test.input.as_bytes())),
+                TracingEmitter::default(),
+            ),
+            "TracingEmitter bufread",
+        );
+    }
+}
+
+fn run_test_inner<T, R, O, E>(
+    fname: &str,
+    test_i: usize,
+    test: &Test,
+    state: &InitialState,
+    mut tokenizer: Tokenizer<R, O, E>,
+    tokenizer_info: &str,
+) where
+    R: Reader + Position<O>,
+    O: Offset,
+    E: Emitter<O> + Iterator<Item = T> + DrainErrors<O>,
+    T: Into<Token>,
+{
+    println!(
+        "==== FILE {}, TEST {}, STATE {:?}, TOKENIZER {} ====",
+        fname, test_i, state, tokenizer_info,
+    );
+    println!("description: {}", test.description);
+    tokenizer.set_internal_state(match state {
+        InitialState::Data => InternalState::Data,
+        InitialState::Plaintext => InternalState::Plaintext,
+        InitialState::Rcdata => InternalState::Rcdata,
+        InitialState::Rawtext => InternalState::Rawtext,
+        InitialState::ScriptData => InternalState::ScriptData,
+        InitialState::CdataSection => InternalState::CdataSection,
+    });
+    if let Some(last_start_tag) = &test.last_start_tag {
+        tokenizer.set_last_start_tag(last_start_tag);
+    }
+
+    let mut actual_tokens = Vec::new();
+
+    while let Some(event) = tokenizer.next() {
+        let token = match event.unwrap() {
+            Event::CdataOpen => {
+                tokenizer.handle_cdata_open(false);
+                continue;
+            }
+            Event::Token(token) => token.into(),
+        };
+
+        match token {
+            Token::StartTag(tag) => actual_tokens.push(TestToken::StartTag {
+                name: tag.name,
+                attributes: tag
+                    .attributes
+                    .into_iter()
+                    .map(|attr| (attr.name, attr.value))
+                    .collect(),
+                self_closing: tag.self_closing,
+            }),
+            Token::EndTag(tag) => actual_tokens.push(TestToken::EndTag { name: tag.name }),
+            Token::Char(c) => {
+                // Coalesce all adjacent character tokens into a single string.
+                if let Some(TestToken::Character(s)) = actual_tokens.last_mut() {
+                    s.push(c);
+                } else {
+                    actual_tokens.push(TestToken::Character(c.into()));
+                }
+            }
+            Token::Comment(comment) => actual_tokens.push(TestToken::Comment(comment)),
+            Token::Doctype(doctype) => actual_tokens.push(TestToken::Doctype {
+                name: doctype.name,
+                public_id: doctype.public_id,
+                system_id: doctype.system_id,
+                force_quirks: doctype.force_quirks,
+            }),
+            Token::EndOfFile => {}
+        };
+    }
+
+    assert_eq!(
+        Output {
+            errors: tokenizer
+                .emitter_mut()
+                .drain_errors()
+                .map(|(e, _)| TestError {
+                    code: e.code().to_string()
+                })
+                .collect(),
+            tokens: actual_tokens,
+        },
+        test.output,
+    );
+}
+
+trait DrainErrors<O> {
+    fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<O>)> + '_>;
+}
+
+impl<O> DrainErrors<O> for BasicEmitter<O> {
+    fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<O>)> + '_> {
+        Box::new(self.drain_errors())
+    }
+}
+
+impl DrainErrors<usize> for TracingEmitter {
+    fn drain_errors(&mut self) -> Box<dyn Iterator<Item = (Error, Range<usize>)> + '_> {
+        Box::new(self.drain_errors())
+    }
+}
diff --git a/tests/spans.rs b/tests/spans.rs
new file mode 100644
index 0000000..b10808c
--- /dev/null
+++ b/tests/spans.rs
@@ -0,0 +1,626 @@
+use std::convert::Infallible;
+use std::ops::Range;
+
+use codespan_reporting::{
+    self,
+    diagnostic::{Diagnostic, Label},
+    files::SimpleFiles,
+    term::{self, termcolor::Buffer},
+};
+use html5tokenizer::{
+    offset::PosTrackingReader,
+    reader::{IntoReader, Reader},
+    trace::Trace,
+    NaiveParser, Token,
+};
+use insta::assert_snapshot;
+use similar_asserts::assert_eq;
+
+/// Just a convenient type alias for labeler closures since Rust
+/// apparently cannot infer the type (requiring an annotation).
+type Parser = NaiveParser<
+    PosTrackingReader<Box<dyn Reader<Error = Infallible>>>,
+    usize,
+    html5tokenizer::TracingEmitter,
+>;
+
+fn parser<R>(reader: impl IntoReader<'static, Reader = R>) -> Parser
+where
+    R: Reader<Error = Infallible> + 'static,
+{
+    NaiveParser::new_with_emitter(
+        PosTrackingReader::new(
+            Box::new(reader.into_reader()) as Box<dyn Reader<Error = Infallible>>
+        ),
+        html5tokenizer::TracingEmitter::default(),
+    )
+}
+
+fn test_and_annotate<S: AsRef<str> + Clone>(
+    html: &'static str,
+    labeler: impl Fn(Parser) -> Vec<(Range<usize>, S)>,
+) -> String {
+    let labels = labeler(parser(html));
+
+    assert_char_encoding_independence(html, labeler);
+
+    annotate(html, labels)
+}
+
+fn annotate(html: &str, labels: Vec<(Range<usize>, impl AsRef<str>)>) -> String {
+    let mut files = SimpleFiles::new();
+    let file_id = files.add("test.html", html);
+
+    let diagnostic = Diagnostic::note().with_labels(
+        labels
+            .into_iter()
+            .map(|(span, text)| Label::primary(file_id, span).with_message(text.as_ref()))
+            .collect(),
+    );
+
+    let mut writer = Buffer::no_color();
+    let config = codespan_reporting::term::Config::default();
+    term::emit(&mut writer, &config, &files, &diagnostic).unwrap();
+    let msg = std::str::from_utf8(writer.as_slice()).unwrap();
+
+    // strip the filename and the line numbers since we don't need them
+    // (apparently they cannot be disabled in codespan_reporting)
+    msg.lines()
+        .skip(3)
+        .flat_map(|l| l.split_once("│ ").map(|s| format!("{}\n", s.1.trim_end())))
+        .collect::<Vec<_>>()
+        .join("")
+}
+
+#[test]
+fn char_span() {
+    let html = "X & &doesntexist; ѣ ";
+    let labeler = |parser: Parser| {
+        let mut labels = Vec::new();
+        for (_, trace) in parser.flatten() {
+            if let Trace::StartTag(trace) = trace {
+                labels.push((trace.span, ""));
+            }
+        }
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+
+    ^^^ ^^^^^ ^^^^^^^ ^^^^^^
+    "###);
+}
+
+#[test]
+fn end_tag_span() {
+    let html = " ";
+    let labeler = |parser: Parser| {
+        let mut labels = Vec::new();
+        for (_, trace) in parser.flatten() {
+            if let Trace::EndTag(trace) = trace {
+                labels.push((trace.span, ""));
+            }
+        }
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+
+    ^^^^ ^^^^^^ ^^^^^^^^ ^^^^^^^
+    "###);
+}
+
+#[test]
+fn start_tag_name_span() {
+    let html = " ";
+    let labeler = |parser: Parser| {
+        let mut labels = Vec::new();
+        for (_, trace) in parser.flatten() {
+            if let Trace::StartTag(trace) = trace {
+                labels.push((trace.name_span, ""));
+            }
+        }
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+
+    ^ ^^^ ^^^ ^^^
+    "###);
+}
+
+#[test]
+fn end_tag_name_span() {
+    let html = " ";
+    let labeler = |parser: Parser| {
+        let mut labels = Vec::new();
+        for (_, trace) in parser.flatten() {
+            if let Trace::EndTag(trace) = trace {
+                labels.push((trace.name_span, ""));
+            }
+        }
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+
+    ^ ^^^ ^^^ ^^^
+    "###);
+}
+
+#[test]
+fn attribute_name_span() {
+    let html = "";
+    let labeler = |parser: Parser| {
+        let mut labels = Vec::new();
+        let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap()
+        else {
+            panic!("expected start tag")
+        };
+        for attr in &tag.attributes {
+            labels.push((
+                trace.attribute_traces[attr.trace_idx().unwrap()].name_span(),
+                "",
+            ));
+        }
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+
+    ^ ^^^ ^ ^^ ^ ^^^
+    "###);
+}
+
+#[test]
+fn attribute_value_span() {
+    let html = "";
+    let labeler = |parser: Parser| {
+        let mut labels = Vec::new();
+        let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap()
+        else {
+            panic!("expected start tag")
+        };
+        for attr in &tag.attributes {
+            labels.push((
+                trace.attribute_traces[attr.trace_idx().unwrap()]
+                    .value_span()
+                    .unwrap(),
+                "",
+            ));
+        }
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+
+    ^^^^^^^^ ^^^^^^^^ ^^^^^^^^^^^^^ ^^^^^^^^^^^^^ ^
+    "###);
+}
+
+#[test]
+fn attribute_value_with_char_ref() {
+    let html = "";
+    let labeler = |parser: Parser| {
+        let mut labels = Vec::new();
+        let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap()
+        else {
+            panic!("expected start tag")
+        };
+        for attr in &tag.attributes {
+            labels.push((
+                trace.attribute_traces[attr.trace_idx().unwrap()]
+                    .value_span()
+                    .unwrap(),
+                "",
+            ));
+        }
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+
+    ^^^^^ ^^^^^ ^^^^^
+    "###);
+}
+
+#[test]
+fn comment_data_span() {
+    #[rustfmt::skip]
+    let cases = [
+        "",
+        "",
+        "",
+        "",
+        "",
+        "
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+    ^
+
+    ^
+
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+    "#,
+    ];
+
+    let mut annotated = String::new();
+    for case in cases {
+        let labeler = |parser: Parser| {
+            let (_, Trace::Doctype(trace)) = parser.flatten().next().unwrap() else {
+                panic!("expected doctype");
+            };
+            vec![(trace.span(), "")]
+        };
+        annotated.push_str(&test_and_annotate(case, labeler));
+    }
+
+    assert_snapshot!(annotated, @r###"
+
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+    "###);
+}
+
+#[test]
+fn doctype_id_spans() {
+    #[rustfmt::skip]
+    let cases = [
+        r#""#,
+    ];
+
+    let mut annotated = String::new();
+    for case in cases {
+        let labeler = |parser: Parser| {
+            let (_, Trace::Doctype(trace)) = parser.flatten().next().unwrap() else {
+                panic!("expected doctype");
+            };
+
+            let mut labels = Vec::new();
+            if let Some(name_span) = trace.name_span() {
+                labels.push((name_span, "name"));
+            }
+            if let Some(public_id_span) = trace.public_id_span() {
+                labels.push((public_id_span, "public id"));
+            }
+            if let Some(system_id_span) = trace.system_id_span() {
+                labels.push((system_id_span, "system id"));
+            }
+            labels
+        };
+
+        annotated.push_str(&test_and_annotate(case, labeler));
+    }
+
+    assert_snapshot!(annotated, @r###"
+
+    ^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ system id
+    │    │
+    │    public id
+    name
+    "###);
+}
+
+#[test]
+fn eof_offset() {
+    let html = "Where does it end?";
+    let labeler = |parser: Parser| {
+        let mut labels = Vec::new();
+        for (_, trace) in parser.flatten() {
+            if let Trace::EndOfFile(offset) = trace {
+                labels.push((offset..offset, "here"));
+            }
+        }
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+    Where does it end?
+                      ^ here
+    "###);
+}
+
+fn annotate_errors(html: &'static str) -> String {
+    let mut parser = parser(html);
+    for _ in parser.by_ref() {}
+    let errors: Vec<_> = parser.emitter_mut().drain_errors().collect();
+
+    for (_, span) in errors {
+        if span.start == span.end {
+            if span.start != html.len() {
+                panic!("empty error spans are only allowed at the very end of the source (for eof errors)");
+            }
+        } else {
+            assert!(span.start < span.end);
+            assert!(span.end <= html.len());
+        }
+    }
+
+    let labeler = |mut parser: Parser| {
+        let mut labels = Vec::new();
+        for _ in parser.by_ref() {}
+        for (error, span) in parser.emitter_mut().drain_errors() {
+            labels.push((span, error.code()));
+        }
+        labels
+    };
+
+    test_and_annotate(html, labeler)
+}
+
+#[test]
+fn tests_for_errors_are_sorted() {
+    let source_of_this_file = std::fs::read_to_string(file!()).unwrap();
+    let mut error_tests: Vec<_> = source_of_this_file
+        .lines()
+        .filter(|l| l.starts_with("fn error_"))
+        .collect();
+    let error_tests_found_order = error_tests.join("\n");
+    error_tests.sort();
+    let error_tests_sorted = error_tests.join("\n");
+    assert_eq!(error_tests_found_order, error_tests_sorted);
+}
+
+#[test]
+fn error_char_ref_absence_of_digits() {
+    let html = "&#qux;";
+    assert_snapshot!(annotate_errors(html), @r###"
+    &#qux;
+    ^^^ absence-of-digits-in-numeric-character-reference
+    "###);
+}
+
+#[test]
+fn error_char_ref_control_char() {
+    let html = "";
+    assert_snapshot!(annotate_errors(html), @r###"
+    
+    ^^^^^^ control-character-reference
+    "###);
+}
+
+#[test]
+fn error_char_ref_missing_semicolon() {
+    let html = "¬";
+    assert_snapshot!(annotate_errors(html), @r###"
+    ¬
+    ^ missing-semicolon-after-character-reference
+    "###);
+}
+
+#[test]
+fn error_char_ref_noncharacter() {
+    let html = "﷐";
+    assert_snapshot!(annotate_errors(html), @r###"
+    ﷐
+    ^^^^^^^^ noncharacter-character-reference
+    "###);
+}
+
+#[test]
+fn error_char_ref_null_char() {
+    let html = "�";
+    assert_snapshot!(annotate_errors(html), @r###"
+    �
+    ^^^^ null-character-reference
+    "###);
+}
+
+#[test]
+fn error_char_ref_outside_unicode_range() {
+    let html = "�";
+    assert_snapshot!(annotate_errors(html), @r###"
+    �
+    ^^^^^^^^^^ character-reference-outside-unicode-range
+    "###);
+}
+
+#[test]
+fn error_char_ref_surrogate() {
+    let html = "�";
+    assert_snapshot!(annotate_errors(html), @r###"
+    �
+    ^^^^^^^^ surrogate-character-reference
+    "###);
+}
+
+#[test]
+fn error_char_ref_unknown_named() {
+    let html = "The pirate says &arrrrr;";
+    assert_snapshot!(annotate_errors(html), @r###"
+    The pirate says &arrrrr;
+                    ^^^^^^^^ unknown-named-character-reference
+    "###);
+}
+
+#[test]
+fn error_duplicate_attribute() {
+    let html = "Does this open two pages? ";
+    assert_snapshot!(annotate_errors(html), @r###"
+    Does this open two pages? 
+    ^^^^ duplicate-attribute
+    "###);
+}
+
+#[test]
+fn error_end_tag_with_attributes() {
+    let html = "";
+    assert_snapshot!(annotate_errors(html), @r###"
+
+    ^^^^^^ end-tag-with-attributes
+    "###);
+}
+
+#[test]
+fn error_end_tag_with_trailing_solidus() {
+    let html = "Do you start or do you end? ";
+    assert_snapshot!(annotate_errors(html), @r###"
+    Do you start or do you end? 
+    ^ end-tag-with-trailing-solidus
+    "###);
+}
+
+#[test]
+fn error_eof_before_tag_name() {
+    let html = "<";
+    assert_snapshot!(annotate_errors(html), @r###"
+    <
+    ^ eof-before-tag-name
+    "###);
+}
+
+// TODO: add error_eof_in_cdata test
+// blocked by lack of proper tree constructor (NaiveParser doesn't parse CDATA sections)
+
+#[test]
+fn error_eof_in_comment() {
+    let html = "
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
deleted file mode 100644
index b10808c..0000000
--- a/tests/test_spans.rs
+++ /dev/null
@@ -1,626 +0,0 @@
-use std::convert::Infallible;
-use std::ops::Range;
-
-use codespan_reporting::{
-    self,
-    diagnostic::{Diagnostic, Label},
-    files::SimpleFiles,
-    term::{self, termcolor::Buffer},
-};
-use html5tokenizer::{
-    offset::PosTrackingReader,
-    reader::{IntoReader, Reader},
-    trace::Trace,
-    NaiveParser, Token,
-};
-use insta::assert_snapshot;
-use similar_asserts::assert_eq;
-
-/// Just a convenient type alias for labeler closures since Rust
-/// apparently cannot infer the type (requiring an annotation).
-type Parser = NaiveParser<
-    PosTrackingReader<Box<dyn Reader<Error = Infallible>>>,
-    usize,
-    html5tokenizer::TracingEmitter,
->;
-
-fn parser<R>(reader: impl IntoReader<'static, Reader = R>) -> Parser
-where
-    R: Reader<Error = Infallible> + 'static,
-{
-    NaiveParser::new_with_emitter(
-        PosTrackingReader::new(
-            Box::new(reader.into_reader()) as Box<dyn Reader<Error = Infallible>>
-        ),
-        html5tokenizer::TracingEmitter::default(),
-    )
-}
-
-fn test_and_annotate<S: AsRef<str> + Clone>(
-    html: &'static str,
-    labeler: impl Fn(Parser) -> Vec<(Range<usize>, S)>,
-) -> String {
-    let labels = labeler(parser(html));
-
-    assert_char_encoding_independence(html, labeler);
-
-    annotate(html, labels)
-}
-
-fn annotate(html: &str, labels: Vec<(Range<usize>, impl AsRef<str>)>) -> String {
-    let mut files = SimpleFiles::new();
-    let file_id = files.add("test.html", html);
-
-    let diagnostic = Diagnostic::note().with_labels(
-        labels
-            .into_iter()
-            .map(|(span, text)| Label::primary(file_id, span).with_message(text.as_ref()))
-            .collect(),
-    );
-
-    let mut writer = Buffer::no_color();
-    let config = codespan_reporting::term::Config::default();
-    term::emit(&mut writer, &config, &files, &diagnostic).unwrap();
-    let msg = std::str::from_utf8(writer.as_slice()).unwrap();
-
-    // strip the filename and the line numbers since we don't need them
-    // (apparently they cannot be disabled in codespan_reporting)
-    msg.lines()
-        .skip(3)
-        .flat_map(|l| l.split_once("│ ").map(|s| format!("{}\n", s.1.trim_end())))
-        .collect::<Vec<_>>()
-        .join("")
-}
-
-#[test]
-fn char_span() {
-    let html = "X & &doesntexist; ѣ ";
-    let labeler = |parser: Parser| {
-        let mut labels = Vec::new();
-        for (_, trace) in parser.flatten() {
-            if let Trace::StartTag(trace) = trace {
-                labels.push((trace.span, ""));
-            }
-        }
-        labels
-    };
-    assert_snapshot!(test_and_annotate(html, labeler), @r###"
-
-    ^^^ ^^^^^ ^^^^^^^ ^^^^^^
-    "###);
-}
-
-#[test]
-fn end_tag_span() {
-    let html = " ";
-    let labeler = |parser: Parser| {
-        let mut labels = Vec::new();
-        for (_, trace) in parser.flatten() {
-            if let Trace::EndTag(trace) = trace {
-                labels.push((trace.span, ""));
-            }
-        }
-        labels
-    };
-    assert_snapshot!(test_and_annotate(html, labeler), @r###"
-
-    ^^^^ ^^^^^^ ^^^^^^^^ ^^^^^^^
-    "###);
-}
-
-#[test]
-fn start_tag_name_span() {
-    let html = " ";
-    let labeler = |parser: Parser| {
-        let mut labels = Vec::new();
-        for (_, trace) in parser.flatten() {
-            if let Trace::StartTag(trace) = trace {
-                labels.push((trace.name_span, ""));
-            }
-        }
-        labels
-    };
-    assert_snapshot!(test_and_annotate(html, labeler), @r###"
-
-    ^ ^^^ ^^^ ^^^
-    "###);
-}
-
-#[test]
-fn end_tag_name_span() {
-    let html = " ";
-    let labeler = |parser: Parser| {
-        let mut labels = Vec::new();
-        for (_, trace) in parser.flatten() {
-            if let Trace::EndTag(trace) = trace {
-                labels.push((trace.name_span, ""));
-            }
-        }
-        labels
-    };
-    assert_snapshot!(test_and_annotate(html, labeler), @r###"
-
-    ^ ^^^ ^^^ ^^^
-    "###);
-}
-
-#[test]
-fn attribute_name_span() {
-    let html = "";
-    let labeler = |parser: Parser| {
-        let mut labels = Vec::new();
-        let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap()
-        else {
-            panic!("expected start tag")
-        };
-        for attr in &tag.attributes {
-            labels.push((
-                trace.attribute_traces[attr.trace_idx().unwrap()].name_span(),
-                "",
-            ));
-        }
-        labels
-    };
-    assert_snapshot!(test_and_annotate(html, labeler), @r###"
-
-    ^ ^^^ ^ ^^ ^ ^^^
-    "###);
-}
-
-#[test]
-fn attribute_value_span() {
-    let html = "";
-    let labeler = |parser: Parser| {
-        let mut labels = Vec::new();
-        let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap()
-        else {
-            panic!("expected start tag")
-        };
-        for attr in &tag.attributes {
-            labels.push((
-                trace.attribute_traces[attr.trace_idx().unwrap()]
-                    .value_span()
-                    .unwrap(),
-                "",
-            ));
-        }
-        labels
-    };
-    assert_snapshot!(test_and_annotate(html, labeler), @r###"
-
-    ^^^^^^^^ ^^^^^^^^ ^^^^^^^^^^^^^ ^^^^^^^^^^^^^ ^
-    "###);
-}
-
-#[test]
-fn attribute_value_with_char_ref() {
-    let html = "";
-    let labeler = |parser: Parser| {
-        let mut labels = Vec::new();
-        let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap()
-        else {
-            panic!("expected start tag")
-        };
-        for attr in &tag.attributes {
-            labels.push((
-                trace.attribute_traces[attr.trace_idx().unwrap()]
-                    .value_span()
-                    .unwrap(),
-                "",
-            ));
-        }
-        labels
-    };
-    assert_snapshot!(test_and_annotate(html, labeler), @r###"
-
-    ^^^^^ ^^^^^ ^^^^^
-    "###);
-}
-
-#[test]
-fn comment_data_span() {
-    #[rustfmt::skip]
-    let cases = [
-        "",
-        "",
-        "",
-        "",
-        "",
-        "
-    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-    ^
-
-    ^
-
-    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-    "#,
-    ];
-
-    let mut annotated = String::new();
-    for case in cases {
-        let labeler = |parser: Parser| {
-            let (_, Trace::Doctype(trace)) = parser.flatten().next().unwrap() else {
-                panic!("expected doctype");
-            };
-            vec![(trace.span(), "")]
-        };
-        annotated.push_str(&test_and_annotate(case, labeler));
-    }
-
-    assert_snapshot!(annotated, @r###"
-
-    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-    "###);
-}
-
-#[test]
-fn doctype_id_spans() {
-    #[rustfmt::skip]
-    let cases = [
-        r#""#,
-    ];
-
-    let mut annotated = String::new();
-    for case in cases {
-        let labeler = |parser: Parser| {
-            let (_, Trace::Doctype(trace)) = parser.flatten().next().unwrap() else {
-                panic!("expected doctype");
-            };
-
-            let mut labels = Vec::new();
-            if let Some(name_span) = trace.name_span() {
-                labels.push((name_span, "name"));
-            }
-            if let Some(public_id_span) = trace.public_id_span() {
-                labels.push((public_id_span, "public id"));
-            }
-            if let Some(system_id_span) = trace.system_id_span() {
-                labels.push((system_id_span, "system id"));
-            }
-            labels
-        };
-
-        annotated.push_str(&test_and_annotate(case, labeler));
-    }
-
-    assert_snapshot!(annotated, @r###"
-
-    ^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ system id
-    │    │
-    │    public id
-    name
-    "###);
-}
-
-#[test]
-fn eof_offset() {
-    let html = "Where does it end?";
-    let labeler = |parser: Parser| {
-        let mut labels = Vec::new();
-        for (_, trace) in parser.flatten() {
-            if let Trace::EndOfFile(offset) = trace {
-                labels.push((offset..offset, "here"));
-            }
-        }
-        labels
-    };
-    assert_snapshot!(test_and_annotate(html, labeler), @r###"
-    Where does it end?
-                      ^ here
-    "###);
-}
-
-fn annotate_errors(html: &'static str) -> String {
-    let mut parser = parser(html);
-    for _ in parser.by_ref() {}
-    let errors: Vec<_> = parser.emitter_mut().drain_errors().collect();
-
-    for (_, span) in errors {
-        if span.start == span.end {
-            if span.start != html.len() {
-                panic!("empty error spans are only allowed at the very end of the source (for eof errors)");
-            }
-        } else {
-            assert!(span.start < span.end);
-            assert!(span.end <= html.len());
-        }
-    }
-
-    let labeler = |mut parser: Parser| {
-        let mut labels = Vec::new();
-        for _ in parser.by_ref() {}
-        for (error, span) in parser.emitter_mut().drain_errors() {
-            labels.push((span, error.code()));
-        }
-        labels
-    };
-
-    test_and_annotate(html, labeler)
-}
-
-#[test]
-fn tests_for_errors_are_sorted() {
-    let source_of_this_file = std::fs::read_to_string(file!()).unwrap();
-    let mut error_tests: Vec<_> = source_of_this_file
-        .lines()
-        .filter(|l| l.starts_with("fn error_"))
-        .collect();
-    let error_tests_found_order = error_tests.join("\n");
-    error_tests.sort();
-    let error_tests_sorted = error_tests.join("\n");
-    assert_eq!(error_tests_found_order, error_tests_sorted);
-}
-
-#[test]
-fn error_char_ref_absence_of_digits() {
-    let html = "&#qux;";
-    assert_snapshot!(annotate_errors(html), @r###"
-    &#qux;
-    ^^^ absence-of-digits-in-numeric-character-reference
-    "###);
-}
-
-#[test]
-fn error_char_ref_control_char() {
-    let html = "";
-    assert_snapshot!(annotate_errors(html), @r###"
-    
-    ^^^^^^ control-character-reference
-    "###);
-}
-
-#[test]
-fn error_char_ref_missing_semicolon() {
-    let html = "¬";
-    assert_snapshot!(annotate_errors(html), @r###"
-    ¬
-    ^ missing-semicolon-after-character-reference
-    "###);
-}
-
-#[test]
-fn error_char_ref_noncharacter() {
-    let html = "﷐";
-    assert_snapshot!(annotate_errors(html), @r###"
-    ﷐
-    ^^^^^^^^ noncharacter-character-reference
-    "###);
-}
-
-#[test]
-fn error_char_ref_null_char() {
-    let html = "�";
-    assert_snapshot!(annotate_errors(html), @r###"
-    �
-    ^^^^ null-character-reference
-    "###);
-}
-
-#[test]
-fn error_char_ref_outside_unicode_range() {
-    let html = "�";
-    assert_snapshot!(annotate_errors(html), @r###"
-    �
-    ^^^^^^^^^^ character-reference-outside-unicode-range
-    "###);
-}
-
-#[test]
-fn error_char_ref_surrogate() {
-    let html = "�";
-    assert_snapshot!(annotate_errors(html), @r###"
-    �
-    ^^^^^^^^ surrogate-character-reference
-    "###);
-}
-
-#[test]
-fn error_char_ref_unknown_named() {
-    let html = "The pirate says &arrrrr;";
-    assert_snapshot!(annotate_errors(html), @r###"
-    The pirate says &arrrrr;
-                    ^^^^^^^^ unknown-named-character-reference
-    "###);
-}
-
-#[test]
-fn error_duplicate_attribute() {
-    let html = "Does this open two pages? ";
-    assert_snapshot!(annotate_errors(html), @r###"
-    Does this open two pages? 
-    ^^^^ duplicate-attribute
-    "###);
-}
-
-#[test]
-fn error_end_tag_with_attributes() {
-    let html = "";
-    assert_snapshot!(annotate_errors(html), @r###"
-
-    ^^^^^^ end-tag-with-attributes
-    "###);
-}
-
-#[test]
-fn error_end_tag_with_trailing_solidus() {
-    let html = "Do you start or do you end? ";
-    assert_snapshot!(annotate_errors(html), @r###"
-    Do you start or do you end? 
-    ^ end-tag-with-trailing-solidus
-    "###);
-}
-
-#[test]
-fn error_eof_before_tag_name() {
-    let html = "<";
-    assert_snapshot!(annotate_errors(html), @r###"
-    <
-    ^ eof-before-tag-name
-    "###);
-}
-
-// TODO: add error_eof_in_cdata test
-// blocked by lack of proper tree constructor (NaiveParser doesn't parse CDATA sections)
-
-#[test]
-fn error_eof_in_comment() {
-    let html = "