// Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 or the MIT license
// , at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! The `BufferQueue` struct and helper types.
//!
//! This type is designed for the efficient parsing of string data, especially where many
//! significant characters are from the ascii range 0-63. This includes, for example, important
//! characters in xml/html parsing.
//!
//! Good and predictable performance is achieved by avoiding allocation where possible (a.k.a. zero
//! copy).
//!
//! [`BufferQueue`]: struct.BufferQueue.html
use std::collections::VecDeque;
pub use self::SetResult::{FromSet, NotFromSet};
use crate::util::smallcharset::SmallCharSet;
/// Result from [`pop_except_from`] containing either a character from a [`SmallCharSet`], or a
/// string buffer of characters not from the set.
///
/// [`pop_except_from`]: struct.BufferQueue.html#method.pop_except_from
/// [`SmallCharSet`]: ../struct.SmallCharSet.html
#[derive(PartialEq, Eq, Debug)]
pub enum SetResult {
/// A character from the `SmallCharSet`.
FromSet(char),
/// A string buffer containing no characters from the `SmallCharSet`.
NotFromSet(String),
}
/// A queue of owned string buffers, which supports incrementally consuming characters.
///
/// Internally it uses [`VecDeque`] and has the same complexity properties.
///
/// [`VecDeque`]: https://doc.rust-lang.org/std/collections/struct.VecDeque.html
#[derive(Debug)]
pub struct BufferQueue {
/// Buffers to process.
buffers: VecDeque<(usize, String)>,
}
impl BufferQueue {
/// Create an empty BufferQueue.
#[inline]
pub fn new() -> BufferQueue {
BufferQueue {
buffers: VecDeque::with_capacity(16),
}
}
/// Returns whether the queue is empty.
#[inline]
pub fn is_empty(&self) -> bool {
self.buffers.is_empty()
}
/// Get the buffer at the beginning of the queue.
#[inline]
pub fn pop_front(&mut self) -> Option {
if let Some((i, s)) = self.buffers.pop_front() {
return Some(s[i..].into());
}
None
// self.buffers.pop_front().map(|(i, s)| &s[i..])
}
/// Add a buffer to the beginning of the queue.
///
/// If the buffer is empty, it will be skipped.
pub fn push_front(&mut self, buf: String) {
if buf.len() == 0 {
return;
}
self.buffers.push_front((0, buf));
}
/// Add a buffer to the end of the queue.
///
/// If the buffer is empty, it will be skipped.
pub fn push_back(&mut self, buf: String) {
if buf.len() == 0 {
return;
}
self.buffers.push_back((0, buf));
}
/// Look at the next available character without removing it, if the queue is not empty.
pub fn peek(&self) -> Option {
debug_assert!(
self.buffers
.iter()
.find(|(i, s)| s[*i..].is_empty())
.is_none(),
"invariant \"all buffers in the queue are non-empty\" failed"
);
self.buffers
.front()
.map(|(i, s)| s[*i..].chars().next().unwrap())
}
/// Get the next character if one is available, removing it from the queue.
///
/// This function manages the buffers, removing them as they become empty.
pub fn next(&mut self) -> Option {
let (result, now_empty) = match self.buffers.front_mut() {
None => (None, false),
Some((i, buf)) => {
let c = &buf[*i..].chars().next().expect("empty buffer in queue");
*i += c.len_utf8();
(Some(*c), buf[*i..].is_empty())
}
};
if now_empty {
self.buffers.pop_front();
}
result
}
/// Pops and returns either a single character from the given set, or
/// a buffer of characters none of which are in the set.
pub(crate) fn pop_except_from(&mut self, set: SmallCharSet) -> Option {
let (result, now_empty) = match self.buffers.front_mut() {
None => (None, false),
Some((i, buf)) => {
let n = set.nonmember_prefix_len(&buf[*i..]);
if n > 0 {
let out = buf.drain(*i..*i + n).collect();
(Some(NotFromSet(out)), buf[*i..].is_empty())
} else {
let c = &buf[*i..].chars().next().expect("empty buffer in queue");
*i += c.len_utf8();
(Some(FromSet(*c)), buf[*i..].is_empty())
}
}
};
// Unborrow self for this part.
if now_empty {
self.buffers.pop_front();
}
result
}
/// Consume bytes matching the pattern, using a custom comparison function `eq`.
///
/// Returns `Some(true)` if there is a match, `Some(false)` if there is no match, or `None` if
/// it wasn't possible to know (more data is needed).
///
/// The custom comparison function is used elsewhere to compare ascii-case-insensitively.
pub fn eat bool>(&mut self, pat: &str, eq: F) -> Option {
let mut buffers_exhausted = 0;
let mut consumed_from_last = 0;
self.buffers.front()?;
for pattern_byte in pat.bytes() {
if buffers_exhausted >= self.buffers.len() {
return None;
}
let (i, buf) = &self.buffers[buffers_exhausted];
if !eq(&buf[*i..].as_bytes()[consumed_from_last], &pattern_byte) {
return Some(false);
}
consumed_from_last += 1;
if consumed_from_last >= buf.len() {
buffers_exhausted += 1;
consumed_from_last = 0;
}
}
// We have a match. Commit changes to the BufferQueue.
for _ in 0..buffers_exhausted {
self.buffers.pop_front();
}
match self.buffers.front_mut() {
None => assert_eq!(consumed_from_last, 0),
Some((i, _buf)) => {
*i += consumed_from_last;
}
}
Some(true)
}
}
#[cfg(test)]
#[allow(non_snake_case)]
mod test {
use super::BufferQueue;
use super::SetResult::{FromSet, NotFromSet};
#[test]
fn smoke_test() {
let mut bq = BufferQueue::new();
assert_eq!(bq.peek(), None);
assert_eq!(bq.next(), None);
bq.push_back("abc".into());
assert_eq!(bq.peek(), Some('a'));
assert_eq!(bq.next(), Some('a'));
assert_eq!(bq.peek(), Some('b'));
assert_eq!(bq.peek(), Some('b'));
assert_eq!(bq.next(), Some('b'));
assert_eq!(bq.peek(), Some('c'));
assert_eq!(bq.next(), Some('c'));
assert_eq!(bq.peek(), None);
assert_eq!(bq.next(), None);
}
#[test]
fn can_unconsume() {
let mut bq = BufferQueue::new();
bq.push_back("abc".into());
assert_eq!(bq.next(), Some('a'));
bq.push_front("xy".into());
assert_eq!(bq.next(), Some('x'));
assert_eq!(bq.next(), Some('y'));
assert_eq!(bq.next(), Some('b'));
assert_eq!(bq.next(), Some('c'));
assert_eq!(bq.next(), None);
}
#[test]
fn can_pop_except_set() {
let mut bq = BufferQueue::new();
bq.push_back("abc&def".into());
let mut pop = || bq.pop_except_from(small_char_set!('&'));
assert_eq!(pop(), Some(NotFromSet("abc".into())));
assert_eq!(pop(), Some(FromSet('&')));
assert_eq!(pop(), Some(NotFromSet("def".into())));
assert_eq!(pop(), None);
}
#[test]
fn can_eat() {
// This is not very comprehensive. We rely on the tokenizer
// integration tests for more thorough testing with many
// different input buffer splits.
let mut bq = BufferQueue::new();
bq.push_back("a".into());
bq.push_back("bc".into());
assert_eq!(bq.eat("abcd", u8::eq_ignore_ascii_case), None);
assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false));
assert_eq!(bq.eat("ab", u8::eq_ignore_ascii_case), Some(true));
assert_eq!(bq.next(), Some('c'));
assert_eq!(bq.next(), None);
}
}