diff options
| -rw-r--r-- | src/lib.rs | 23 | ||||
| -rw-r--r-- | src/tokenizer/char_ref/mod.rs | 2 | ||||
| -rw-r--r-- | src/tokenizer/mod.rs | 10 | ||||
| -rw-r--r-- | src/util/buffer_queue.rs | 303 | ||||
| -rw-r--r-- | src/util/smallcharset.rs | 90 | 
5 files changed, 422 insertions, 6 deletions
| @@ -18,8 +18,31 @@ pub use tendril;  #[macro_use]  mod macros; +/// Create a [`SmallCharSet`], with each space-separated number stored in the set. +/// +/// # Examples +/// +/// ``` +/// # #[macro_use] extern crate markup5ever; +/// # fn main() { +/// let set = small_char_set!(12 54 42); +/// assert_eq!(set.bits, +///            0b00000000_01000000_00000100_00000000_00000000_00000000_00010000_00000000); +/// # } +/// ``` +/// +/// [`SmallCharSet`]: struct.SmallCharSet.html +#[macro_export] +macro_rules! small_char_set ( ($($e:expr)+) => ( +    $ crate ::util::smallcharset::SmallCharSet { +        bits: $( (1 << ($e as usize)) )|+ +    } +)); +  mod util {      pub mod str; +    pub mod buffer_queue; +    pub mod smallcharset;  }  pub mod tokenizer; diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs index e0369c7..484a9e1 100644 --- a/src/tokenizer/char_ref/mod.rs +++ b/src/tokenizer/char_ref/mod.rs @@ -8,8 +8,8 @@  // except according to those terms.  use super::{TokenSink, Tokenizer}; -use markup5ever::buffer_queue::BufferQueue;  use tendril::StrTendril; +use crate::util::buffer_queue::BufferQueue;  use crate::util::str::is_ascii_alnum;  use log::debug; diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 33c0a88..6c5823f 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -21,19 +21,19 @@ use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};  use self::char_ref::{CharRef, CharRefTokenizer}; -use crate::util::str::lower_ascii_letter; +use crate::util::{smallcharset::SmallCharSet, str::lower_ascii_letter};  use log::debug;  use mac::{_tt_as_expr_hack, format_if, matches}; -use markup5ever::{namespace_url, ns, small_char_set}; +use markup5ever::{namespace_url, ns};  use std::borrow::Cow::{self, Borrowed};  use std::collections::BTreeMap;  use std::default::Default;  use std::mem::replace; -pub use markup5ever::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult}; +pub use crate::util::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};  use tendril::StrTendril; -use markup5ever::{Attribute, LocalName, QualName, SmallCharSet}; +use markup5ever::{Attribute, LocalName, QualName};  mod char_ref;  mod interface; @@ -1535,7 +1535,7 @@ mod test {      use super::interface::{EndTag, StartTag, Tag, TagKind};      use super::interface::{TagToken, Token}; -    use markup5ever::buffer_queue::BufferQueue; +    use crate::util::buffer_queue::BufferQueue;      use std::mem::replace;      use markup5ever::LocalName; diff --git a/src/util/buffer_queue.rs b/src/util/buffer_queue.rs new file mode 100644 index 0000000..d572489 --- /dev/null +++ b/src/util/buffer_queue.rs @@ -0,0 +1,303 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! The `BufferQueue` struct and helper types. +//! +//! This type is designed for the efficient parsing of string data, especially where many +//! significant characters are from the ascii range 0-63. This includes, for example, important +//! characters in xml/html parsing. +//! +//! Good and predictable performance is achieved by avoiding allocation where possible (a.k.a. zero +//! copy). +//! +//! [`BufferQueue`]: struct.BufferQueue.html + +use std::collections::VecDeque; + +use tendril::StrTendril; + +pub use self::SetResult::{FromSet, NotFromSet}; +use crate::util::smallcharset::SmallCharSet; + +/// Result from [`pop_except_from`] containing either a character from a [`SmallCharSet`], or a +/// string buffer of characters not from the set. +/// +/// [`pop_except_from`]: struct.BufferQueue.html#method.pop_except_from +/// [`SmallCharSet`]: ../struct.SmallCharSet.html +#[derive(PartialEq, Eq, Debug)] +pub enum SetResult { +    /// A character from the `SmallCharSet`. +    FromSet(char), +    /// A string buffer containing no characters from the `SmallCharSet`. +    NotFromSet(StrTendril), +} + +/// A queue of owned string buffers, which supports incrementally consuming characters. +/// +/// Internally it uses [`VecDeque`] and has the same complexity properties. +/// +/// [`VecDeque`]: https://doc.rust-lang.org/std/collections/struct.VecDeque.html +#[derive(Debug)] +pub struct BufferQueue { +    /// Buffers to process. +    buffers: VecDeque<StrTendril>, +} + +impl BufferQueue { +    /// Create an empty BufferQueue. +    #[inline] +    pub fn new() -> BufferQueue { +        BufferQueue { +            buffers: VecDeque::with_capacity(16), +        } +    } + +    /// Returns whether the queue is empty. +    #[inline] +    pub fn is_empty(&self) -> bool { +        self.buffers.is_empty() +    } + +    /// Get the buffer at the beginning of the queue. +    #[inline] +    pub fn pop_front(&mut self) -> Option<StrTendril> { +        self.buffers.pop_front() +    } + +    /// Add a buffer to the beginning of the queue. +    /// +    /// If the buffer is empty, it will be skipped. +    pub fn push_front(&mut self, buf: StrTendril) { +        if buf.len32() == 0 { +            return; +        } +        self.buffers.push_front(buf); +    } + +    /// Add a buffer to the end of the queue. +    /// +    /// If the buffer is empty, it will be skipped. +    pub fn push_back(&mut self, buf: StrTendril) { +        if buf.len32() == 0 { +            return; +        } +        self.buffers.push_back(buf); +    } + +    /// Look at the next available character without removing it, if the queue is not empty. +    pub fn peek(&self) -> Option<char> { +        debug_assert!( +            self.buffers +                .iter() +                .find(|el| el.len32() == 0) +                .is_none(), +            "invariant \"all buffers in the queue are non-empty\" failed" +        ); +        self.buffers.front().map(|b| b.chars().next().unwrap()) +    } + +    /// Get the next character if one is available, removing it from the queue. +    /// +    /// This function manages the buffers, removing them as they become empty. +    pub fn next(&mut self) -> Option<char> { +        let (result, now_empty) = match self.buffers.front_mut() { +            None => (None, false), +            Some(buf) => { +                let c = buf.pop_front_char().expect("empty buffer in queue"); +                (Some(c), buf.is_empty()) +            }, +        }; + +        if now_empty { +            self.buffers.pop_front(); +        } + +        result +    } + +    /// Pops and returns either a single character from the given set, or +    /// a buffer of characters none of which are in the set. +    /// +    /// # Examples +    /// +    /// ``` +    /// # #[macro_use] extern crate markup5ever; +    /// # #[macro_use] extern crate tendril; +    /// # fn main() { +    /// use markup5ever::buffer_queue::{BufferQueue, SetResult}; +    /// +    /// let mut queue = BufferQueue::new(); +    /// queue.push_back(format_tendril!(r#"<some_tag attr="text">SomeText</some_tag>"#)); +    /// let set = small_char_set!(b'<' b'>' b' ' b'=' b'"' b'/'); +    /// let tag = format_tendril!("some_tag"); +    /// let attr = format_tendril!("attr"); +    /// let attr_val = format_tendril!("text"); +    /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('<'))); +    /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(tag))); +    /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet(' '))); +    /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr))); +    /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('='))); +    /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('"'))); +    /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr_val))); +    /// // ... +    /// # } +    /// ``` +    pub fn pop_except_from(&mut self, set: SmallCharSet) -> Option<SetResult> { +        let (result, now_empty) = match self.buffers.front_mut() { +            None => (None, false), +            Some(buf) => { +                let n = set.nonmember_prefix_len(&buf); +                if n > 0 { +                    let out; +                    unsafe { +                        out = buf.unsafe_subtendril(0, n); +                        buf.unsafe_pop_front(n); +                    } +                    (Some(NotFromSet(out)), buf.is_empty()) +                } else { +                    let c = buf.pop_front_char().expect("empty buffer in queue"); +                    (Some(FromSet(c)), buf.is_empty()) +                } +            }, +        }; + +        // Unborrow self for this part. +        if now_empty { +            self.buffers.pop_front(); +        } + +        result +    } + +    /// Consume bytes matching the pattern, using a custom comparison function `eq`. +    /// +    /// Returns `Some(true)` if there is a match, `Some(false)` if there is no match, or `None` if +    /// it wasn't possible to know (more data is needed). +    /// +    /// The custom comparison function is used elsewhere to compare ascii-case-insensitively. +    /// +    /// # Examples +    /// +    /// ``` +    /// # extern crate markup5ever; +    /// # #[macro_use] extern crate tendril; +    /// # fn main() { +    /// use markup5ever::buffer_queue::{BufferQueue}; +    /// +    /// let mut queue = BufferQueue::new(); +    /// queue.push_back(format_tendril!("testtext")); +    /// let test_str = "test"; +    /// assert_eq!(queue.eat("test", |&a, &b| a == b), Some(true)); +    /// assert_eq!(queue.eat("text", |&a, &b| a == b), Some(true)); +    /// assert!(queue.is_empty()); +    /// # } +    /// ``` +    pub fn eat<F: Fn(&u8, &u8) -> bool>(&mut self, pat: &str, eq: F) -> Option<bool> { +        let mut buffers_exhausted = 0; +        let mut consumed_from_last = 0; + +        self.buffers.front()?; + +        for pattern_byte in pat.bytes() { +            if buffers_exhausted >= self.buffers.len() { +                return None; +            } +            let buf = &self.buffers[buffers_exhausted]; + +            if !eq(&buf.as_bytes()[consumed_from_last], &pattern_byte) { +                return Some(false); +            } + +            consumed_from_last += 1; +            if consumed_from_last >= buf.len() { +                buffers_exhausted += 1; +                consumed_from_last = 0; +            } +        } + +        // We have a match. Commit changes to the BufferQueue. +        for _ in 0..buffers_exhausted { +            self.buffers.pop_front(); +        } + +        match self.buffers.front_mut() { +            None => assert_eq!(consumed_from_last, 0), +            Some(ref mut buf) => buf.pop_front(consumed_from_last as u32), +        } + +        Some(true) +    } +} + +#[cfg(test)] +#[allow(non_snake_case)] +mod test { +    use tendril::SliceExt; + +    use super::BufferQueue; +    use super::SetResult::{FromSet, NotFromSet}; + +    #[test] +    fn smoke_test() { +        let mut bq = BufferQueue::new(); +        assert_eq!(bq.peek(), None); +        assert_eq!(bq.next(), None); + +        bq.push_back("abc".to_tendril()); +        assert_eq!(bq.peek(), Some('a')); +        assert_eq!(bq.next(), Some('a')); +        assert_eq!(bq.peek(), Some('b')); +        assert_eq!(bq.peek(), Some('b')); +        assert_eq!(bq.next(), Some('b')); +        assert_eq!(bq.peek(), Some('c')); +        assert_eq!(bq.next(), Some('c')); +        assert_eq!(bq.peek(), None); +        assert_eq!(bq.next(), None); +    } + +    #[test] +    fn can_unconsume() { +        let mut bq = BufferQueue::new(); +        bq.push_back("abc".to_tendril()); +        assert_eq!(bq.next(), Some('a')); + +        bq.push_front("xy".to_tendril()); +        assert_eq!(bq.next(), Some('x')); +        assert_eq!(bq.next(), Some('y')); +        assert_eq!(bq.next(), Some('b')); +        assert_eq!(bq.next(), Some('c')); +        assert_eq!(bq.next(), None); +    } + +    #[test] +    fn can_pop_except_set() { +        let mut bq = BufferQueue::new(); +        bq.push_back("abc&def".to_tendril()); +        let mut pop = || bq.pop_except_from(small_char_set!('&')); +        assert_eq!(pop(), Some(NotFromSet("abc".to_tendril()))); +        assert_eq!(pop(), Some(FromSet('&'))); +        assert_eq!(pop(), Some(NotFromSet("def".to_tendril()))); +        assert_eq!(pop(), None); +    } + +    #[test] +    fn can_eat() { +        // This is not very comprehensive.  We rely on the tokenizer +        // integration tests for more thorough testing with many +        // different input buffer splits. +        let mut bq = BufferQueue::new(); +        bq.push_back("a".to_tendril()); +        bq.push_back("bc".to_tendril()); +        assert_eq!(bq.eat("abcd", u8::eq_ignore_ascii_case), None); +        assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false)); +        assert_eq!(bq.eat("ab", u8::eq_ignore_ascii_case), Some(true)); +        assert_eq!(bq.next(), Some('c')); +        assert_eq!(bq.next(), None); +    } +} diff --git a/src/util/smallcharset.rs b/src/util/smallcharset.rs new file mode 100644 index 0000000..957dad7 --- /dev/null +++ b/src/util/smallcharset.rs @@ -0,0 +1,90 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! This module contains a single struct [`SmallCharSet`]. See its documentation for details. +//! +//! [`SmallCharSet`]: struct.SmallCharSet.html + +/// Represents a set of "small characters", those with Unicode scalar +/// values less than 64. +/// +/// This is stored as a bitmap, with 1 bit for each value. +#[derive(Debug, Eq, PartialEq, Clone, Copy, Hash)] +pub struct SmallCharSet { +    pub bits: u64, +} + +impl SmallCharSet { +    /// Checks whether a character (u8 value below 64) is stored in the SmallCharSet. +    /// +    /// # Examples +    /// +    /// ```ignore +    /// # use markup5ever::SmallCharSet; +    /// let set = SmallCharSet { +    ///     bits: 0b00000000_01000000_00000100_00000000_00000000_00000000_00010000_00000000 +    /// }; +    /// assert!(set.contains(64)); +    /// assert!(set.contains(b'6')); // `b'6'` is the same as 64u8 +    /// ``` +    #[inline] +    fn contains(&self, n: u8) -> bool { +        0 != (self.bits & (1 << (n as usize))) +    } + +    /// Count the number of bytes of characters at the beginning of `buf` which are not in the set. +    /// +    /// This functionality is used in [`BufferQueue::pop_except_from`]. +    /// +    /// # Examples +    /// +    /// ``` +    /// # #[macro_use] extern crate markup5ever; +    /// # fn main() { +    /// let set = small_char_set!(48 49 50); // '0' '1' '2' +    /// // `test` is 4 chars, ๐ is 4 chars, then we meet a character in the set +    /// let test_str = "test๐01232afd"; +    /// assert_eq!(set.nonmember_prefix_len(test_str), 8); +    /// # } +    /// ``` +    /// +    /// [`BufferQueue::pop_except_from`]: buffer_queue/struct.BufferQueue.html#method.pop_except_from +    pub fn nonmember_prefix_len(&self, buf: &str) -> u32 { +        let mut n = 0; +        for b in buf.bytes() { +            if b >= 64 || !self.contains(b) { +                n += 1; +            } else { +                break; +            } +        } +        n +    } +} + +#[cfg(test)] +mod test { +    use std::iter::repeat; + +    #[test] +    fn nonmember_prefix() { +        for &c in ['&', '\0'].iter() { +            for x in 0..48u32 { +                for y in 0..48u32 { +                    let mut s = repeat("x").take(x as usize).collect::<String>(); +                    s.push(c); +                    s.push_str(&repeat("x").take(y as usize).collect::<String>()); +                    let set = small_char_set!('&' '\0'); + +                    assert_eq!(x, set.nonmember_prefix_len(&s)); +                } +            } +        } +    } +} | 
