// integration_tests/tests/test_html5lib.rs
use std::{fs::File, io::BufReader, path::Path};

use html5lib_tests::{
    parse_tests, Error as TestError, InitialState, Output, Test, Token as TestToken,
};
use html5tokenizer::{DefaultEmitter, InternalState, Reader, Token, Tokenizer};
use pretty_assertions::assert_eq;

/// Path to a local checkout of [html5lib-tests], relative to the
/// directory containing the `Cargo.toml` file of the current crate.
///
/// [html5lib-tests]: https://github.com/html5lib/html5lib-tests
const HTML5LIB_TESTS_PATH: &str = "html5lib-tests";

// FUTURE: it would be nice to assert that HTML5LIB_TESTS_PATH matches the path defined in .gitmodules
// but this is currently blocked by:
// * Cargo not setting CARGO_WORKSPACE_DIR (see https://github.com/rust-lang/cargo/issues/3946)
// * gix-config having more dependencies than I'd want to add for this

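/// Runs every tokenizer test file from the html5lib-tests checkout.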
#[test]
fn tokenizer() {
    // TODO: use a custom test harness with e.g. libtest-mimic
    let test_dir = format!("{HTML5LIB_TESTS_PATH}/tokenizer");

    let mut test_paths = glob::glob(&format!("{test_dir}/*.test"))
        .unwrap()
        .peekable();

    if test_paths.peek().is_none() {
        panic!(
            "could not find any .test files in {}, maybe try `git submodule update --init`",
            test_dir
        );
    }

    for test_path in test_paths {
        let test_path = test_path.unwrap();

        test_tokenizer_file(&test_path);
    }
}

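/// Parses a single `.test` file and runs every test it contains, skipping
/// files that exercise behavior this crate intentionally doesn't support.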
fn test_tokenizer_file(path: &Path) {
    let fname = path.file_name().unwrap().to_str().unwrap();

    if matches!(
        fname,
        // We don't implement the "Coercing an HTML DOM into an infoset" section
        "xmlViolation.test" |
        // Our parser does not operate on bytes; the input isn't valid Rust &str
        "unicodeCharsProblematic.test"
    ) {
        return;
    }

    let f = File::open(path).unwrap();
    let bf = BufReader::new(f);
    let tests = parse_tests(bf)
        .unwrap_or_else(|err| panic!("failed to parse {path:?}: {err:?}"));

    for (i, test) in tests.into_iter().enumerate() {
        run_test(fname, i, test);
    }
}

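/// Runs one test in each of its declared initial states, once with the
/// string-based tokenizer and once with the `BufRead`-based tokenizer.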
fn run_test(fname: &str, test_i: usize, test: Test) {
    for state in &test.initial_states {
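        // Exercise both reader implementations: a string slice and a BufReader.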
        run_test_inner(
            fname,
            test_i,
            &test,
            state,
            Tokenizer::new(&test.input, DefaultEmitter::default()),
            "string",
        );

        run_test_inner(
            fname,
            test_i,
            &test,
            state,
            Tokenizer::new(
                BufReader::new(test.input.as_bytes()),
                DefaultEmitter::default(),
            ),
            "bufread",
        );
    }
}

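/// Runs one test against a concrete tokenizer and asserts that the emitted
/// tokens and errors match the expected output.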
fn run_test_inner<R: Reader>(
    fname: &str,
    test_i: usize,
    test: &Test,
    state: &InitialState,
    mut tokenizer: Tokenizer<R>,
    tokenizer_info: &str,
) {
    println!(
        "==== FILE {}, TEST {}, STATE {:?}, TOKENIZER {} ====",
        fname, test_i, state, tokenizer_info,
    );
    println!("description: {}", test.description);
    tokenizer.set_internal_state(match state {
        InitialState::Data => InternalState::Data,
        InitialState::PlainText => InternalState::PlainText,
        InitialState::RcData => InternalState::RcData,
        InitialState::RawText => InternalState::RawText,
        InitialState::ScriptData => InternalState::ScriptData,
        InitialState::CdataSection => InternalState::CdataSection,
    });
    if let Some(last_start_tag) = &test.last_start_tag {
        tokenizer.set_last_start_tag(last_start_tag);
    }

    let mut actual = Output {
        errors: Vec::new(),
        tokens: Vec::new(),
    };

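    // Drive the tokenizer to completion, converting each emitted token into
    // the html5lib-tests representation so it can be compared to `test.output`.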
    for token in tokenizer {
        let token = token.unwrap();

        match token {
            Token::Error { error, .. } => actual.errors.push(TestError {
                code: error.to_string(),
            }),
            Token::StartTag(tag) => actual.tokens.push(TestToken::StartTag {
                name: tag.name,
                attributes: tag
                    .attributes
                    .into_iter()
                    .map(|(name, map_val)| (name, map_val.value))
                    .collect(),
                self_closing: tag.self_closing,
            }),
            Token::EndTag(tag) => actual.tokens.push(TestToken::EndTag { name: tag.name }),
            Token::String(data) => actual.tokens.push(TestToken::Character(data)),
            Token::Comment(data) => actual.tokens.push(TestToken::Comment(data)),
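            // An empty doctype name maps to `None`, which is how
            // html5lib-tests represents a missing name.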
            Token::Doctype(doctype) => actual.tokens.push(TestToken::Doctype {
                name: Some(doctype.name).filter(|name| !name.is_empty()),
                public_id: doctype.public_identifier,
                system_id: doctype.system_identifier,
                force_quirks: doctype.force_quirks,
            }),
        };
    }

    assert_eq!(test.output, actual);
}