diff options
-rw-r--r-- | src/lib.rs | 23 | ||||
-rw-r--r-- | src/tokenizer/char_ref/mod.rs | 2 | ||||
-rw-r--r-- | src/tokenizer/mod.rs | 10 | ||||
-rw-r--r-- | src/util/buffer_queue.rs | 303 | ||||
-rw-r--r-- | src/util/smallcharset.rs | 90 |
5 files changed, 422 insertions, 6 deletions
@@ -18,8 +18,31 @@ pub use tendril; #[macro_use] mod macros; +/// Create a [`SmallCharSet`], with each space-separated number stored in the set. +/// +/// # Examples +/// +/// ``` +/// # #[macro_use] extern crate markup5ever; +/// # fn main() { +/// let set = small_char_set!(12 54 42); +/// assert_eq!(set.bits, +/// 0b00000000_01000000_00000100_00000000_00000000_00000000_00010000_00000000); +/// # } +/// ``` +/// +/// [`SmallCharSet`]: struct.SmallCharSet.html +#[macro_export] +macro_rules! small_char_set ( ($($e:expr)+) => ( + $ crate ::util::smallcharset::SmallCharSet { + bits: $( (1 << ($e as usize)) )|+ + } +)); + mod util { pub mod str; + pub mod buffer_queue; + pub mod smallcharset; } pub mod tokenizer; diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs index e0369c7..484a9e1 100644 --- a/src/tokenizer/char_ref/mod.rs +++ b/src/tokenizer/char_ref/mod.rs @@ -8,8 +8,8 @@ // except according to those terms. use super::{TokenSink, Tokenizer}; -use markup5ever::buffer_queue::BufferQueue; use tendril::StrTendril; +use crate::util::buffer_queue::BufferQueue; use crate::util::str::is_ascii_alnum; use log::debug; diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 33c0a88..6c5823f 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -21,19 +21,19 @@ use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped}; use self::char_ref::{CharRef, CharRefTokenizer}; -use crate::util::str::lower_ascii_letter; +use crate::util::{smallcharset::SmallCharSet, str::lower_ascii_letter}; use log::debug; use mac::{_tt_as_expr_hack, format_if, matches}; -use markup5ever::{namespace_url, ns, small_char_set}; +use markup5ever::{namespace_url, ns}; use std::borrow::Cow::{self, Borrowed}; use std::collections::BTreeMap; use std::default::Default; use std::mem::replace; -pub use markup5ever::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult}; +pub use crate::util::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult}; use tendril::StrTendril; -use markup5ever::{Attribute, LocalName, QualName, SmallCharSet}; +use markup5ever::{Attribute, LocalName, QualName}; mod char_ref; mod interface; @@ -1535,7 +1535,7 @@ mod test { use super::interface::{EndTag, StartTag, Tag, TagKind}; use super::interface::{TagToken, Token}; - use markup5ever::buffer_queue::BufferQueue; + use crate::util::buffer_queue::BufferQueue; use std::mem::replace; use markup5ever::LocalName; diff --git a/src/util/buffer_queue.rs b/src/util/buffer_queue.rs new file mode 100644 index 0000000..d572489 --- /dev/null +++ b/src/util/buffer_queue.rs @@ -0,0 +1,303 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! The `BufferQueue` struct and helper types. +//! +//! This type is designed for the efficient parsing of string data, especially where many +//! significant characters are from the ascii range 0-63. This includes, for example, important +//! characters in xml/html parsing. +//! +//! Good and predictable performance is achieved by avoiding allocation where possible (a.k.a. zero +//! copy). +//! +//! [`BufferQueue`]: struct.BufferQueue.html + +use std::collections::VecDeque; + +use tendril::StrTendril; + +pub use self::SetResult::{FromSet, NotFromSet}; +use crate::util::smallcharset::SmallCharSet; + +/// Result from [`pop_except_from`] containing either a character from a [`SmallCharSet`], or a +/// string buffer of characters not from the set. +/// +/// [`pop_except_from`]: struct.BufferQueue.html#method.pop_except_from +/// [`SmallCharSet`]: ../struct.SmallCharSet.html +#[derive(PartialEq, Eq, Debug)] +pub enum SetResult { + /// A character from the `SmallCharSet`. + FromSet(char), + /// A string buffer containing no characters from the `SmallCharSet`. + NotFromSet(StrTendril), +} + +/// A queue of owned string buffers, which supports incrementally consuming characters. +/// +/// Internally it uses [`VecDeque`] and has the same complexity properties. +/// +/// [`VecDeque`]: https://doc.rust-lang.org/std/collections/struct.VecDeque.html +#[derive(Debug)] +pub struct BufferQueue { + /// Buffers to process. + buffers: VecDeque<StrTendril>, +} + +impl BufferQueue { + /// Create an empty BufferQueue. + #[inline] + pub fn new() -> BufferQueue { + BufferQueue { + buffers: VecDeque::with_capacity(16), + } + } + + /// Returns whether the queue is empty. + #[inline] + pub fn is_empty(&self) -> bool { + self.buffers.is_empty() + } + + /// Get the buffer at the beginning of the queue. + #[inline] + pub fn pop_front(&mut self) -> Option<StrTendril> { + self.buffers.pop_front() + } + + /// Add a buffer to the beginning of the queue. + /// + /// If the buffer is empty, it will be skipped. + pub fn push_front(&mut self, buf: StrTendril) { + if buf.len32() == 0 { + return; + } + self.buffers.push_front(buf); + } + + /// Add a buffer to the end of the queue. + /// + /// If the buffer is empty, it will be skipped. + pub fn push_back(&mut self, buf: StrTendril) { + if buf.len32() == 0 { + return; + } + self.buffers.push_back(buf); + } + + /// Look at the next available character without removing it, if the queue is not empty. + pub fn peek(&self) -> Option<char> { + debug_assert!( + self.buffers + .iter() + .find(|el| el.len32() == 0) + .is_none(), + "invariant \"all buffers in the queue are non-empty\" failed" + ); + self.buffers.front().map(|b| b.chars().next().unwrap()) + } + + /// Get the next character if one is available, removing it from the queue. + /// + /// This function manages the buffers, removing them as they become empty. + pub fn next(&mut self) -> Option<char> { + let (result, now_empty) = match self.buffers.front_mut() { + None => (None, false), + Some(buf) => { + let c = buf.pop_front_char().expect("empty buffer in queue"); + (Some(c), buf.is_empty()) + }, + }; + + if now_empty { + self.buffers.pop_front(); + } + + result + } + + /// Pops and returns either a single character from the given set, or + /// a buffer of characters none of which are in the set. + /// + /// # Examples + /// + /// ``` + /// # #[macro_use] extern crate markup5ever; + /// # #[macro_use] extern crate tendril; + /// # fn main() { + /// use markup5ever::buffer_queue::{BufferQueue, SetResult}; + /// + /// let mut queue = BufferQueue::new(); + /// queue.push_back(format_tendril!(r#"<some_tag attr="text">SomeText</some_tag>"#)); + /// let set = small_char_set!(b'<' b'>' b' ' b'=' b'"' b'/'); + /// let tag = format_tendril!("some_tag"); + /// let attr = format_tendril!("attr"); + /// let attr_val = format_tendril!("text"); + /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('<'))); + /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(tag))); + /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet(' '))); + /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr))); + /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('='))); + /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('"'))); + /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr_val))); + /// // ... + /// # } + /// ``` + pub fn pop_except_from(&mut self, set: SmallCharSet) -> Option<SetResult> { + let (result, now_empty) = match self.buffers.front_mut() { + None => (None, false), + Some(buf) => { + let n = set.nonmember_prefix_len(&buf); + if n > 0 { + let out; + unsafe { + out = buf.unsafe_subtendril(0, n); + buf.unsafe_pop_front(n); + } + (Some(NotFromSet(out)), buf.is_empty()) + } else { + let c = buf.pop_front_char().expect("empty buffer in queue"); + (Some(FromSet(c)), buf.is_empty()) + } + }, + }; + + // Unborrow self for this part. + if now_empty { + self.buffers.pop_front(); + } + + result + } + + /// Consume bytes matching the pattern, using a custom comparison function `eq`. + /// + /// Returns `Some(true)` if there is a match, `Some(false)` if there is no match, or `None` if + /// it wasn't possible to know (more data is needed). + /// + /// The custom comparison function is used elsewhere to compare ascii-case-insensitively. + /// + /// # Examples + /// + /// ``` + /// # extern crate markup5ever; + /// # #[macro_use] extern crate tendril; + /// # fn main() { + /// use markup5ever::buffer_queue::{BufferQueue}; + /// + /// let mut queue = BufferQueue::new(); + /// queue.push_back(format_tendril!("testtext")); + /// let test_str = "test"; + /// assert_eq!(queue.eat("test", |&a, &b| a == b), Some(true)); + /// assert_eq!(queue.eat("text", |&a, &b| a == b), Some(true)); + /// assert!(queue.is_empty()); + /// # } + /// ``` + pub fn eat<F: Fn(&u8, &u8) -> bool>(&mut self, pat: &str, eq: F) -> Option<bool> { + let mut buffers_exhausted = 0; + let mut consumed_from_last = 0; + + self.buffers.front()?; + + for pattern_byte in pat.bytes() { + if buffers_exhausted >= self.buffers.len() { + return None; + } + let buf = &self.buffers[buffers_exhausted]; + + if !eq(&buf.as_bytes()[consumed_from_last], &pattern_byte) { + return Some(false); + } + + consumed_from_last += 1; + if consumed_from_last >= buf.len() { + buffers_exhausted += 1; + consumed_from_last = 0; + } + } + + // We have a match. Commit changes to the BufferQueue. + for _ in 0..buffers_exhausted { + self.buffers.pop_front(); + } + + match self.buffers.front_mut() { + None => assert_eq!(consumed_from_last, 0), + Some(ref mut buf) => buf.pop_front(consumed_from_last as u32), + } + + Some(true) + } +} + +#[cfg(test)] +#[allow(non_snake_case)] +mod test { + use tendril::SliceExt; + + use super::BufferQueue; + use super::SetResult::{FromSet, NotFromSet}; + + #[test] + fn smoke_test() { + let mut bq = BufferQueue::new(); + assert_eq!(bq.peek(), None); + assert_eq!(bq.next(), None); + + bq.push_back("abc".to_tendril()); + assert_eq!(bq.peek(), Some('a')); + assert_eq!(bq.next(), Some('a')); + assert_eq!(bq.peek(), Some('b')); + assert_eq!(bq.peek(), Some('b')); + assert_eq!(bq.next(), Some('b')); + assert_eq!(bq.peek(), Some('c')); + assert_eq!(bq.next(), Some('c')); + assert_eq!(bq.peek(), None); + assert_eq!(bq.next(), None); + } + + #[test] + fn can_unconsume() { + let mut bq = BufferQueue::new(); + bq.push_back("abc".to_tendril()); + assert_eq!(bq.next(), Some('a')); + + bq.push_front("xy".to_tendril()); + assert_eq!(bq.next(), Some('x')); + assert_eq!(bq.next(), Some('y')); + assert_eq!(bq.next(), Some('b')); + assert_eq!(bq.next(), Some('c')); + assert_eq!(bq.next(), None); + } + + #[test] + fn can_pop_except_set() { + let mut bq = BufferQueue::new(); + bq.push_back("abc&def".to_tendril()); + let mut pop = || bq.pop_except_from(small_char_set!('&')); + assert_eq!(pop(), Some(NotFromSet("abc".to_tendril()))); + assert_eq!(pop(), Some(FromSet('&'))); + assert_eq!(pop(), Some(NotFromSet("def".to_tendril()))); + assert_eq!(pop(), None); + } + + #[test] + fn can_eat() { + // This is not very comprehensive. We rely on the tokenizer + // integration tests for more thorough testing with many + // different input buffer splits. + let mut bq = BufferQueue::new(); + bq.push_back("a".to_tendril()); + bq.push_back("bc".to_tendril()); + assert_eq!(bq.eat("abcd", u8::eq_ignore_ascii_case), None); + assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false)); + assert_eq!(bq.eat("ab", u8::eq_ignore_ascii_case), Some(true)); + assert_eq!(bq.next(), Some('c')); + assert_eq!(bq.next(), None); + } +} diff --git a/src/util/smallcharset.rs b/src/util/smallcharset.rs new file mode 100644 index 0000000..957dad7 --- /dev/null +++ b/src/util/smallcharset.rs @@ -0,0 +1,90 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! This module contains a single struct [`SmallCharSet`]. See its documentation for details. +//! +//! [`SmallCharSet`]: struct.SmallCharSet.html + +/// Represents a set of "small characters", those with Unicode scalar +/// values less than 64. +/// +/// This is stored as a bitmap, with 1 bit for each value. +#[derive(Debug, Eq, PartialEq, Clone, Copy, Hash)] +pub struct SmallCharSet { + pub bits: u64, +} + +impl SmallCharSet { + /// Checks whether a character (u8 value below 64) is stored in the SmallCharSet. + /// + /// # Examples + /// + /// ```ignore + /// # use markup5ever::SmallCharSet; + /// let set = SmallCharSet { + /// bits: 0b00000000_01000000_00000100_00000000_00000000_00000000_00010000_00000000 + /// }; + /// assert!(set.contains(64)); + /// assert!(set.contains(b'6')); // `b'6'` is the same as 64u8 + /// ``` + #[inline] + fn contains(&self, n: u8) -> bool { + 0 != (self.bits & (1 << (n as usize))) + } + + /// Count the number of bytes of characters at the beginning of `buf` which are not in the set. + /// + /// This functionality is used in [`BufferQueue::pop_except_from`]. + /// + /// # Examples + /// + /// ``` + /// # #[macro_use] extern crate markup5ever; + /// # fn main() { + /// let set = small_char_set!(48 49 50); // '0' '1' '2' + /// // `test` is 4 chars, ๐ is 4 chars, then we meet a character in the set + /// let test_str = "test๐01232afd"; + /// assert_eq!(set.nonmember_prefix_len(test_str), 8); + /// # } + /// ``` + /// + /// [`BufferQueue::pop_except_from`]: buffer_queue/struct.BufferQueue.html#method.pop_except_from + pub fn nonmember_prefix_len(&self, buf: &str) -> u32 { + let mut n = 0; + for b in buf.bytes() { + if b >= 64 || !self.contains(b) { + n += 1; + } else { + break; + } + } + n + } +} + +#[cfg(test)] +mod test { + use std::iter::repeat; + + #[test] + fn nonmember_prefix() { + for &c in ['&', '\0'].iter() { + for x in 0..48u32 { + for y in 0..48u32 { + let mut s = repeat("x").take(x as usize).collect::<String>(); + s.push(c); + s.push_str(&repeat("x").take(y as usize).collect::<String>()); + let set = small_char_set!('&' '\0'); + + assert_eq!(x, set.nonmember_prefix_len(&s)); + } + } + } + } +} |