diff options
Diffstat (limited to 'src/util/buffer_queue.rs')
-rw-r--r-- | src/util/buffer_queue.rs | 303 |
1 files changed, 303 insertions, 0 deletions
diff --git a/src/util/buffer_queue.rs b/src/util/buffer_queue.rs new file mode 100644 index 0000000..d572489 --- /dev/null +++ b/src/util/buffer_queue.rs @@ -0,0 +1,303 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! The `BufferQueue` struct and helper types. +//! +//! This type is designed for the efficient parsing of string data, especially where many +//! significant characters are from the ascii range 0-63. This includes, for example, important +//! characters in xml/html parsing. +//! +//! Good and predictable performance is achieved by avoiding allocation where possible (a.k.a. zero +//! copy). +//! +//! [`BufferQueue`]: struct.BufferQueue.html + +use std::collections::VecDeque; + +use tendril::StrTendril; + +pub use self::SetResult::{FromSet, NotFromSet}; +use crate::util::smallcharset::SmallCharSet; + +/// Result from [`pop_except_from`] containing either a character from a [`SmallCharSet`], or a +/// string buffer of characters not from the set. +/// +/// [`pop_except_from`]: struct.BufferQueue.html#method.pop_except_from +/// [`SmallCharSet`]: ../struct.SmallCharSet.html +#[derive(PartialEq, Eq, Debug)] +pub enum SetResult { + /// A character from the `SmallCharSet`. + FromSet(char), + /// A string buffer containing no characters from the `SmallCharSet`. + NotFromSet(StrTendril), +} + +/// A queue of owned string buffers, which supports incrementally consuming characters. +/// +/// Internally it uses [`VecDeque`] and has the same complexity properties. +/// +/// [`VecDeque`]: https://doc.rust-lang.org/std/collections/struct.VecDeque.html +#[derive(Debug)] +pub struct BufferQueue { + /// Buffers to process. + buffers: VecDeque<StrTendril>, +} + +impl BufferQueue { + /// Create an empty BufferQueue. + #[inline] + pub fn new() -> BufferQueue { + BufferQueue { + buffers: VecDeque::with_capacity(16), + } + } + + /// Returns whether the queue is empty. + #[inline] + pub fn is_empty(&self) -> bool { + self.buffers.is_empty() + } + + /// Get the buffer at the beginning of the queue. + #[inline] + pub fn pop_front(&mut self) -> Option<StrTendril> { + self.buffers.pop_front() + } + + /// Add a buffer to the beginning of the queue. + /// + /// If the buffer is empty, it will be skipped. + pub fn push_front(&mut self, buf: StrTendril) { + if buf.len32() == 0 { + return; + } + self.buffers.push_front(buf); + } + + /// Add a buffer to the end of the queue. + /// + /// If the buffer is empty, it will be skipped. + pub fn push_back(&mut self, buf: StrTendril) { + if buf.len32() == 0 { + return; + } + self.buffers.push_back(buf); + } + + /// Look at the next available character without removing it, if the queue is not empty. + pub fn peek(&self) -> Option<char> { + debug_assert!( + self.buffers + .iter() + .find(|el| el.len32() == 0) + .is_none(), + "invariant \"all buffers in the queue are non-empty\" failed" + ); + self.buffers.front().map(|b| b.chars().next().unwrap()) + } + + /// Get the next character if one is available, removing it from the queue. + /// + /// This function manages the buffers, removing them as they become empty. + pub fn next(&mut self) -> Option<char> { + let (result, now_empty) = match self.buffers.front_mut() { + None => (None, false), + Some(buf) => { + let c = buf.pop_front_char().expect("empty buffer in queue"); + (Some(c), buf.is_empty()) + }, + }; + + if now_empty { + self.buffers.pop_front(); + } + + result + } + + /// Pops and returns either a single character from the given set, or + /// a buffer of characters none of which are in the set. + /// + /// # Examples + /// + /// ``` + /// # #[macro_use] extern crate markup5ever; + /// # #[macro_use] extern crate tendril; + /// # fn main() { + /// use markup5ever::buffer_queue::{BufferQueue, SetResult}; + /// + /// let mut queue = BufferQueue::new(); + /// queue.push_back(format_tendril!(r#"<some_tag attr="text">SomeText</some_tag>"#)); + /// let set = small_char_set!(b'<' b'>' b' ' b'=' b'"' b'/'); + /// let tag = format_tendril!("some_tag"); + /// let attr = format_tendril!("attr"); + /// let attr_val = format_tendril!("text"); + /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('<'))); + /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(tag))); + /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet(' '))); + /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr))); + /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('='))); + /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('"'))); + /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr_val))); + /// // ... + /// # } + /// ``` + pub fn pop_except_from(&mut self, set: SmallCharSet) -> Option<SetResult> { + let (result, now_empty) = match self.buffers.front_mut() { + None => (None, false), + Some(buf) => { + let n = set.nonmember_prefix_len(&buf); + if n > 0 { + let out; + unsafe { + out = buf.unsafe_subtendril(0, n); + buf.unsafe_pop_front(n); + } + (Some(NotFromSet(out)), buf.is_empty()) + } else { + let c = buf.pop_front_char().expect("empty buffer in queue"); + (Some(FromSet(c)), buf.is_empty()) + } + }, + }; + + // Unborrow self for this part. + if now_empty { + self.buffers.pop_front(); + } + + result + } + + /// Consume bytes matching the pattern, using a custom comparison function `eq`. + /// + /// Returns `Some(true)` if there is a match, `Some(false)` if there is no match, or `None` if + /// it wasn't possible to know (more data is needed). + /// + /// The custom comparison function is used elsewhere to compare ascii-case-insensitively. + /// + /// # Examples + /// + /// ``` + /// # extern crate markup5ever; + /// # #[macro_use] extern crate tendril; + /// # fn main() { + /// use markup5ever::buffer_queue::{BufferQueue}; + /// + /// let mut queue = BufferQueue::new(); + /// queue.push_back(format_tendril!("testtext")); + /// let test_str = "test"; + /// assert_eq!(queue.eat("test", |&a, &b| a == b), Some(true)); + /// assert_eq!(queue.eat("text", |&a, &b| a == b), Some(true)); + /// assert!(queue.is_empty()); + /// # } + /// ``` + pub fn eat<F: Fn(&u8, &u8) -> bool>(&mut self, pat: &str, eq: F) -> Option<bool> { + let mut buffers_exhausted = 0; + let mut consumed_from_last = 0; + + self.buffers.front()?; + + for pattern_byte in pat.bytes() { + if buffers_exhausted >= self.buffers.len() { + return None; + } + let buf = &self.buffers[buffers_exhausted]; + + if !eq(&buf.as_bytes()[consumed_from_last], &pattern_byte) { + return Some(false); + } + + consumed_from_last += 1; + if consumed_from_last >= buf.len() { + buffers_exhausted += 1; + consumed_from_last = 0; + } + } + + // We have a match. Commit changes to the BufferQueue. + for _ in 0..buffers_exhausted { + self.buffers.pop_front(); + } + + match self.buffers.front_mut() { + None => assert_eq!(consumed_from_last, 0), + Some(ref mut buf) => buf.pop_front(consumed_from_last as u32), + } + + Some(true) + } +} + +#[cfg(test)] +#[allow(non_snake_case)] +mod test { + use tendril::SliceExt; + + use super::BufferQueue; + use super::SetResult::{FromSet, NotFromSet}; + + #[test] + fn smoke_test() { + let mut bq = BufferQueue::new(); + assert_eq!(bq.peek(), None); + assert_eq!(bq.next(), None); + + bq.push_back("abc".to_tendril()); + assert_eq!(bq.peek(), Some('a')); + assert_eq!(bq.next(), Some('a')); + assert_eq!(bq.peek(), Some('b')); + assert_eq!(bq.peek(), Some('b')); + assert_eq!(bq.next(), Some('b')); + assert_eq!(bq.peek(), Some('c')); + assert_eq!(bq.next(), Some('c')); + assert_eq!(bq.peek(), None); + assert_eq!(bq.next(), None); + } + + #[test] + fn can_unconsume() { + let mut bq = BufferQueue::new(); + bq.push_back("abc".to_tendril()); + assert_eq!(bq.next(), Some('a')); + + bq.push_front("xy".to_tendril()); + assert_eq!(bq.next(), Some('x')); + assert_eq!(bq.next(), Some('y')); + assert_eq!(bq.next(), Some('b')); + assert_eq!(bq.next(), Some('c')); + assert_eq!(bq.next(), None); + } + + #[test] + fn can_pop_except_set() { + let mut bq = BufferQueue::new(); + bq.push_back("abc&def".to_tendril()); + let mut pop = || bq.pop_except_from(small_char_set!('&')); + assert_eq!(pop(), Some(NotFromSet("abc".to_tendril()))); + assert_eq!(pop(), Some(FromSet('&'))); + assert_eq!(pop(), Some(NotFromSet("def".to_tendril()))); + assert_eq!(pop(), None); + } + + #[test] + fn can_eat() { + // This is not very comprehensive. We rely on the tokenizer + // integration tests for more thorough testing with many + // different input buffer splits. + let mut bq = BufferQueue::new(); + bq.push_back("a".to_tendril()); + bq.push_back("bc".to_tendril()); + assert_eq!(bq.eat("abcd", u8::eq_ignore_ascii_case), None); + assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false)); + assert_eq!(bq.eat("ab", u8::eq_ignore_ascii_case), Some(true)); + assert_eq!(bq.next(), Some('c')); + assert_eq!(bq.next(), None); + } +} |