diff options
| author | Martin Fischer <martin@push-f.com> | 2021-04-08 08:42:01 +0200 | 
|---|---|---|
| committer | Martin Fischer <martin@push-f.com> | 2021-04-08 15:40:37 +0200 | 
| commit | 57e7eefcbe6fb8c3dc4b01c707be9de4c34963a7 (patch) | |
| tree | 6a9d296389bf3023396592c8514ed6712e011c7f /benches | |
import https://github.com/servo/html5ever
commit d1206daa740305f55a5fa159e43eb33afc359cb4
Diffstat (limited to 'benches')
| -rw-r--r-- | benches/html5ever.rs | 81 | 
1 file changed, 81 insertions, 0 deletions
| diff --git a/benches/html5ever.rs b/benches/html5ever.rs new file mode 100644 index 0000000..ff20c4f --- /dev/null +++ b/benches/html5ever.rs @@ -0,0 +1,81 @@ +#[macro_use] +extern crate criterion; +extern crate html5ever; + +use std::fs; +use std::path::PathBuf; + +use criterion::{black_box, Criterion}; + +use html5ever::tendril::*; +use html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer}; + +struct Sink; + +impl TokenSink for Sink { +    type Handle = (); + +    fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> { +        // Don't use the token, but make sure we don't get +        // optimized out entirely. +        black_box(token); +        TokenSinkResult::Continue +    } +} + +fn run_bench(c: &mut Criterion, name: &str) { +    let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); +    path.push("data/bench/"); +    path.push(name); +    let mut file = fs::File::open(&path).ok().expect("can't open file"); + +    // Read the file and treat it as an infinitely repeating sequence of characters. +    let mut file_input = ByteTendril::new(); +    file.read_to_tendril(&mut file_input) +        .ok() +        .expect("can't read file"); +    let file_input: StrTendril = file_input.try_reinterpret().unwrap(); +    let size = file_input.len(); +    let mut stream = file_input.chars().cycle(); + +    // Break the input into chunks of 1024 chars (= a few kB). +    // This simulates reading from the network. +    let mut input = vec![]; +    let mut total = 0usize; +    while total < size { +        // The by_ref() call is important, otherwise we get wrong results! +        // See rust-lang/rust#18045. 
+        let sz = std::cmp::min(1024, size - total); +        input.push(stream.by_ref().take(sz).collect::<String>().to_tendril()); +        total += sz; +    } + +    let test_name = format!("html tokenizing {}", name); + +    c.bench_function(&test_name, move |b| { +        b.iter(|| { +            let mut tok = Tokenizer::new(Sink, Default::default()); +            let mut buffer = BufferQueue::new(); +            // We are doing clone inside the bench function, this is not ideal, but possibly +            // necessary since our iterator consumes the underlying buffer. +            for buf in input.clone().into_iter() { +                buffer.push_back(buf); +                let _ = tok.feed(&mut buffer); +            } +            let _ = tok.feed(&mut buffer); +            tok.end(); +        }) +    }); +} + +fn html5ever_benchmark(c: &mut Criterion) { +    run_bench(c, "lipsum.html"); +    run_bench(c, "lipsum-zh.html"); +    run_bench(c, "medium-fragment.html"); +    run_bench(c, "small-fragment.html"); +    run_bench(c, "tiny-fragment.html"); +    run_bench(c, "strong.html"); +} + +criterion_group!(benches, html5ever_benchmark); +criterion_main!(benches); | 
