// Copyright 2014-2017 The html5ever Project Developers. See the // COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! The `BufferQueue` struct and helper types. //! //! This type is designed for the efficient parsing of string data, especially where many //! significant characters are from the ascii range 0-63. This includes, for example, important //! characters in xml/html parsing. //! //! Good and predictable performance is achieved by avoiding allocation where possible (a.k.a. zero //! copy). //! //! [`BufferQueue`]: struct.BufferQueue.html use std::collections::VecDeque; pub use self::SetResult::{FromSet, NotFromSet}; use crate::util::smallcharset::SmallCharSet; /// Result from [`pop_except_from`] containing either a character from a [`SmallCharSet`], or a /// string buffer of characters not from the set. /// /// [`pop_except_from`]: struct.BufferQueue.html#method.pop_except_from /// [`SmallCharSet`]: ../struct.SmallCharSet.html #[derive(PartialEq, Eq, Debug)] pub enum SetResult { /// A character from the `SmallCharSet`. FromSet(char), /// A string buffer containing no characters from the `SmallCharSet`. NotFromSet(String), } /// A queue of owned string buffers, which supports incrementally consuming characters. /// /// Internally it uses [`VecDeque`] and has the same complexity properties. /// /// [`VecDeque`]: https://doc.rust-lang.org/std/collections/struct.VecDeque.html #[derive(Debug)] pub struct BufferQueue { /// Buffers to process. buffers: VecDeque<(usize, String)>, } impl BufferQueue { /// Create an empty BufferQueue. #[inline] pub fn new() -> BufferQueue { BufferQueue { buffers: VecDeque::with_capacity(16), } } /// Returns whether the queue is empty. #[inline] pub fn is_empty(&self) -> bool { self.buffers.is_empty() } /// Get the buffer at the beginning of the queue. #[inline] pub fn pop_front(&mut self) -> Option { if let Some((i, s)) = self.buffers.pop_front() { return Some(s[i..].into()); } None // self.buffers.pop_front().map(|(i, s)| &s[i..]) } /// Add a buffer to the beginning of the queue. /// /// If the buffer is empty, it will be skipped. pub fn push_front(&mut self, buf: String) { if buf.len() == 0 { return; } self.buffers.push_front((0, buf)); } /// Add a buffer to the end of the queue. /// /// If the buffer is empty, it will be skipped. pub fn push_back(&mut self, buf: String) { if buf.len() == 0 { return; } self.buffers.push_back((0, buf)); } /// Look at the next available character without removing it, if the queue is not empty. pub fn peek(&self) -> Option { debug_assert!( self.buffers .iter() .find(|(i, s)| s[*i..].is_empty()) .is_none(), "invariant \"all buffers in the queue are non-empty\" failed" ); self.buffers .front() .map(|(i, s)| s[*i..].chars().next().unwrap()) } /// Get the next character if one is available, removing it from the queue. /// /// This function manages the buffers, removing them as they become empty. pub fn next(&mut self) -> Option { let (result, now_empty) = match self.buffers.front_mut() { None => (None, false), Some((i, buf)) => { let c = &buf[*i..].chars().next().expect("empty buffer in queue"); *i += c.len_utf8(); (Some(*c), buf[*i..].is_empty()) } }; if now_empty { self.buffers.pop_front(); } result } /// Pops and returns either a single character from the given set, or /// a buffer of characters none of which are in the set. pub(crate) fn pop_except_from(&mut self, set: SmallCharSet) -> Option { let (result, now_empty) = match self.buffers.front_mut() { None => (None, false), Some((i, buf)) => { let n = set.nonmember_prefix_len(&buf[*i..]); if n > 0 { let out = buf.drain(*i..*i + n).collect(); (Some(NotFromSet(out)), buf[*i..].is_empty()) } else { let c = &buf[*i..].chars().next().expect("empty buffer in queue"); *i += c.len_utf8(); (Some(FromSet(*c)), buf[*i..].is_empty()) } } }; // Unborrow self for this part. if now_empty { self.buffers.pop_front(); } result } /// Consume bytes matching the pattern, using a custom comparison function `eq`. /// /// Returns `Some(true)` if there is a match, `Some(false)` if there is no match, or `None` if /// it wasn't possible to know (more data is needed). /// /// The custom comparison function is used elsewhere to compare ascii-case-insensitively. pub fn eat bool>(&mut self, pat: &str, eq: F) -> Option { let mut buffers_exhausted = 0; let mut consumed_from_last = 0; self.buffers.front()?; for pattern_byte in pat.bytes() { if buffers_exhausted >= self.buffers.len() { return None; } let (i, buf) = &self.buffers[buffers_exhausted]; if !eq(&buf[*i..].as_bytes()[consumed_from_last], &pattern_byte) { return Some(false); } consumed_from_last += 1; if consumed_from_last >= buf.len() { buffers_exhausted += 1; consumed_from_last = 0; } } // We have a match. Commit changes to the BufferQueue. for _ in 0..buffers_exhausted { self.buffers.pop_front(); } match self.buffers.front_mut() { None => assert_eq!(consumed_from_last, 0), Some((i, _buf)) => { *i += consumed_from_last; } } Some(true) } } #[cfg(test)] #[allow(non_snake_case)] mod test { use super::BufferQueue; use super::SetResult::{FromSet, NotFromSet}; #[test] fn smoke_test() { let mut bq = BufferQueue::new(); assert_eq!(bq.peek(), None); assert_eq!(bq.next(), None); bq.push_back("abc".into()); assert_eq!(bq.peek(), Some('a')); assert_eq!(bq.next(), Some('a')); assert_eq!(bq.peek(), Some('b')); assert_eq!(bq.peek(), Some('b')); assert_eq!(bq.next(), Some('b')); assert_eq!(bq.peek(), Some('c')); assert_eq!(bq.next(), Some('c')); assert_eq!(bq.peek(), None); assert_eq!(bq.next(), None); } #[test] fn can_unconsume() { let mut bq = BufferQueue::new(); bq.push_back("abc".into()); assert_eq!(bq.next(), Some('a')); bq.push_front("xy".into()); assert_eq!(bq.next(), Some('x')); assert_eq!(bq.next(), Some('y')); assert_eq!(bq.next(), Some('b')); assert_eq!(bq.next(), Some('c')); assert_eq!(bq.next(), None); } #[test] fn can_pop_except_set() { let mut bq = BufferQueue::new(); bq.push_back("abc&def".into()); let mut pop = || bq.pop_except_from(small_char_set!('&')); assert_eq!(pop(), Some(NotFromSet("abc".into()))); assert_eq!(pop(), Some(FromSet('&'))); assert_eq!(pop(), Some(NotFromSet("def".into()))); assert_eq!(pop(), None); } #[test] fn can_eat() { // This is not very comprehensive. We rely on the tokenizer // integration tests for more thorough testing with many // different input buffer splits. let mut bq = BufferQueue::new(); bq.push_back("a".into()); bq.push_back("bc".into()); assert_eq!(bq.eat("abcd", u8::eq_ignore_ascii_case), None); assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false)); assert_eq!(bq.eat("ab", u8::eq_ignore_ascii_case), Some(true)); assert_eq!(bq.next(), Some('c')); assert_eq!(bq.next(), None); } }