summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMartin Fischer <martin@push-f.com>2021-04-08 09:51:44 +0200
committerMartin Fischer <martin@push-f.com>2021-04-08 15:40:48 +0200
commit69c070d9436028a3eed97596243bb52aee198210 (patch)
tree9708a10a3225d46571fc3d15bf2f9130d520d6b1
parent071a1bf860900482079c0f602430ebdc425e5fee (diff)
merge buffer_queue and smallcharset from markup5ever
-rw-r--r--src/lib.rs23
-rw-r--r--src/tokenizer/char_ref/mod.rs2
-rw-r--r--src/tokenizer/mod.rs10
-rw-r--r--src/util/buffer_queue.rs303
-rw-r--r--src/util/smallcharset.rs90
5 files changed, 422 insertions, 6 deletions
diff --git a/src/lib.rs b/src/lib.rs
index 1d6e2d1..95647ff 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -18,8 +18,31 @@ pub use tendril;
#[macro_use]
mod macros;
+/// Create a [`SmallCharSet`], with each space-separated number stored in the set.
+///
+/// # Examples
+///
+/// ```
+/// # #[macro_use] extern crate markup5ever;
+/// # fn main() {
+/// let set = small_char_set!(12 54 42);
+/// assert_eq!(set.bits,
+/// 0b00000000_01000000_00000100_00000000_00000000_00000000_00010000_00000000);
+/// # }
+/// ```
+///
+/// [`SmallCharSet`]: struct.SmallCharSet.html
+#[macro_export]
+macro_rules! small_char_set ( ($($e:expr)+) => (
+ $ crate ::util::smallcharset::SmallCharSet {
+ bits: $( (1 << ($e as usize)) )|+
+ }
+));
+
mod util {
pub mod str;
+ pub mod buffer_queue;
+ pub mod smallcharset;
}
pub mod tokenizer;
diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs
index e0369c7..484a9e1 100644
--- a/src/tokenizer/char_ref/mod.rs
+++ b/src/tokenizer/char_ref/mod.rs
@@ -8,8 +8,8 @@
// except according to those terms.
use super::{TokenSink, Tokenizer};
-use markup5ever::buffer_queue::BufferQueue;
use tendril::StrTendril;
+use crate::util::buffer_queue::BufferQueue;
use crate::util::str::is_ascii_alnum;
use log::debug;
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 33c0a88..6c5823f 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -21,19 +21,19 @@ use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
use self::char_ref::{CharRef, CharRefTokenizer};
-use crate::util::str::lower_ascii_letter;
+use crate::util::{smallcharset::SmallCharSet, str::lower_ascii_letter};
use log::debug;
use mac::{_tt_as_expr_hack, format_if, matches};
-use markup5ever::{namespace_url, ns, small_char_set};
+use markup5ever::{namespace_url, ns};
use std::borrow::Cow::{self, Borrowed};
use std::collections::BTreeMap;
use std::default::Default;
use std::mem::replace;
-pub use markup5ever::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
+pub use crate::util::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
use tendril::StrTendril;
-use markup5ever::{Attribute, LocalName, QualName, SmallCharSet};
+use markup5ever::{Attribute, LocalName, QualName};
mod char_ref;
mod interface;
@@ -1535,7 +1535,7 @@ mod test {
use super::interface::{EndTag, StartTag, Tag, TagKind};
use super::interface::{TagToken, Token};
- use markup5ever::buffer_queue::BufferQueue;
+ use crate::util::buffer_queue::BufferQueue;
use std::mem::replace;
use markup5ever::LocalName;
diff --git a/src/util/buffer_queue.rs b/src/util/buffer_queue.rs
new file mode 100644
index 0000000..d572489
--- /dev/null
+++ b/src/util/buffer_queue.rs
@@ -0,0 +1,303 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! The `BufferQueue` struct and helper types.
+//!
+//! This type is designed for the efficient parsing of string data, especially where many
+//! significant characters are from the ascii range 0-63. This includes, for example, important
+//! characters in xml/html parsing.
+//!
+//! Good and predictable performance is achieved by avoiding allocation where possible (a.k.a. zero
+//! copy).
+//!
+//! [`BufferQueue`]: struct.BufferQueue.html
+
+use std::collections::VecDeque;
+
+use tendril::StrTendril;
+
+pub use self::SetResult::{FromSet, NotFromSet};
+use crate::util::smallcharset::SmallCharSet;
+
+/// Result from [`pop_except_from`] containing either a character from a [`SmallCharSet`], or a
+/// string buffer of characters not from the set.
+///
+/// [`pop_except_from`]: struct.BufferQueue.html#method.pop_except_from
+/// [`SmallCharSet`]: ../struct.SmallCharSet.html
+#[derive(PartialEq, Eq, Debug)]
+pub enum SetResult {
+ /// A character from the `SmallCharSet`.
+ FromSet(char),
+ /// A string buffer containing no characters from the `SmallCharSet`.
+ NotFromSet(StrTendril),
+}
+
+/// A queue of owned string buffers, which supports incrementally consuming characters.
+///
+/// Internally it uses [`VecDeque`] and has the same complexity properties.
+///
+/// [`VecDeque`]: https://doc.rust-lang.org/std/collections/struct.VecDeque.html
+#[derive(Debug)]
+pub struct BufferQueue {
+ /// Buffers to process.
+ buffers: VecDeque<StrTendril>,
+}
+
+impl BufferQueue {
+ /// Create an empty BufferQueue.
+ #[inline]
+ pub fn new() -> BufferQueue {
+ BufferQueue {
+ buffers: VecDeque::with_capacity(16),
+ }
+ }
+
+ /// Returns whether the queue is empty.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.buffers.is_empty()
+ }
+
+ /// Get the buffer at the beginning of the queue.
+ #[inline]
+ pub fn pop_front(&mut self) -> Option<StrTendril> {
+ self.buffers.pop_front()
+ }
+
+ /// Add a buffer to the beginning of the queue.
+ ///
+ /// If the buffer is empty, it will be skipped.
+ pub fn push_front(&mut self, buf: StrTendril) {
+ if buf.len32() == 0 {
+ return;
+ }
+ self.buffers.push_front(buf);
+ }
+
+ /// Add a buffer to the end of the queue.
+ ///
+ /// If the buffer is empty, it will be skipped.
+ pub fn push_back(&mut self, buf: StrTendril) {
+ if buf.len32() == 0 {
+ return;
+ }
+ self.buffers.push_back(buf);
+ }
+
+ /// Look at the next available character without removing it, if the queue is not empty.
+ pub fn peek(&self) -> Option<char> {
+ debug_assert!(
+ self.buffers
+ .iter()
+ .find(|el| el.len32() == 0)
+ .is_none(),
+ "invariant \"all buffers in the queue are non-empty\" failed"
+ );
+ self.buffers.front().map(|b| b.chars().next().unwrap())
+ }
+
+ /// Get the next character if one is available, removing it from the queue.
+ ///
+ /// This function manages the buffers, removing them as they become empty.
+ pub fn next(&mut self) -> Option<char> {
+ let (result, now_empty) = match self.buffers.front_mut() {
+ None => (None, false),
+ Some(buf) => {
+ let c = buf.pop_front_char().expect("empty buffer in queue");
+ (Some(c), buf.is_empty())
+ },
+ };
+
+ if now_empty {
+ self.buffers.pop_front();
+ }
+
+ result
+ }
+
+ /// Pops and returns either a single character from the given set, or
+ /// a buffer of characters none of which are in the set.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # #[macro_use] extern crate markup5ever;
+ /// # #[macro_use] extern crate tendril;
+ /// # fn main() {
+ /// use markup5ever::buffer_queue::{BufferQueue, SetResult};
+ ///
+ /// let mut queue = BufferQueue::new();
+ /// queue.push_back(format_tendril!(r#"<some_tag attr="text">SomeText</some_tag>"#));
+ /// let set = small_char_set!(b'<' b'>' b' ' b'=' b'"' b'/');
+ /// let tag = format_tendril!("some_tag");
+ /// let attr = format_tendril!("attr");
+ /// let attr_val = format_tendril!("text");
+ /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('<')));
+ /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(tag)));
+ /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet(' ')));
+ /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr)));
+ /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('=')));
+ /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('"')));
+ /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr_val)));
+ /// // ...
+ /// # }
+ /// ```
+ pub fn pop_except_from(&mut self, set: SmallCharSet) -> Option<SetResult> {
+ let (result, now_empty) = match self.buffers.front_mut() {
+ None => (None, false),
+ Some(buf) => {
+ let n = set.nonmember_prefix_len(&buf);
+ if n > 0 {
+ let out;
+ unsafe {
+ out = buf.unsafe_subtendril(0, n);
+ buf.unsafe_pop_front(n);
+ }
+ (Some(NotFromSet(out)), buf.is_empty())
+ } else {
+ let c = buf.pop_front_char().expect("empty buffer in queue");
+ (Some(FromSet(c)), buf.is_empty())
+ }
+ },
+ };
+
+ // Unborrow self for this part.
+ if now_empty {
+ self.buffers.pop_front();
+ }
+
+ result
+ }
+
+ /// Consume bytes matching the pattern, using a custom comparison function `eq`.
+ ///
+ /// Returns `Some(true)` if there is a match, `Some(false)` if there is no match, or `None` if
+ /// it wasn't possible to know (more data is needed).
+ ///
+ /// The custom comparison function is used elsewhere to compare ascii-case-insensitively.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # extern crate markup5ever;
+ /// # #[macro_use] extern crate tendril;
+ /// # fn main() {
+ /// use markup5ever::buffer_queue::{BufferQueue};
+ ///
+ /// let mut queue = BufferQueue::new();
+ /// queue.push_back(format_tendril!("testtext"));
+ /// let test_str = "test";
+ /// assert_eq!(queue.eat("test", |&a, &b| a == b), Some(true));
+ /// assert_eq!(queue.eat("text", |&a, &b| a == b), Some(true));
+ /// assert!(queue.is_empty());
+ /// # }
+ /// ```
+ pub fn eat<F: Fn(&u8, &u8) -> bool>(&mut self, pat: &str, eq: F) -> Option<bool> {
+ let mut buffers_exhausted = 0;
+ let mut consumed_from_last = 0;
+
+ self.buffers.front()?;
+
+ for pattern_byte in pat.bytes() {
+ if buffers_exhausted >= self.buffers.len() {
+ return None;
+ }
+ let buf = &self.buffers[buffers_exhausted];
+
+ if !eq(&buf.as_bytes()[consumed_from_last], &pattern_byte) {
+ return Some(false);
+ }
+
+ consumed_from_last += 1;
+ if consumed_from_last >= buf.len() {
+ buffers_exhausted += 1;
+ consumed_from_last = 0;
+ }
+ }
+
+ // We have a match. Commit changes to the BufferQueue.
+ for _ in 0..buffers_exhausted {
+ self.buffers.pop_front();
+ }
+
+ match self.buffers.front_mut() {
+ None => assert_eq!(consumed_from_last, 0),
+ Some(ref mut buf) => buf.pop_front(consumed_from_last as u32),
+ }
+
+ Some(true)
+ }
+}
+
+#[cfg(test)]
+#[allow(non_snake_case)]
+mod test {
+ use tendril::SliceExt;
+
+ use super::BufferQueue;
+ use super::SetResult::{FromSet, NotFromSet};
+
+ #[test]
+ fn smoke_test() {
+ let mut bq = BufferQueue::new();
+ assert_eq!(bq.peek(), None);
+ assert_eq!(bq.next(), None);
+
+ bq.push_back("abc".to_tendril());
+ assert_eq!(bq.peek(), Some('a'));
+ assert_eq!(bq.next(), Some('a'));
+ assert_eq!(bq.peek(), Some('b'));
+ assert_eq!(bq.peek(), Some('b'));
+ assert_eq!(bq.next(), Some('b'));
+ assert_eq!(bq.peek(), Some('c'));
+ assert_eq!(bq.next(), Some('c'));
+ assert_eq!(bq.peek(), None);
+ assert_eq!(bq.next(), None);
+ }
+
+ #[test]
+ fn can_unconsume() {
+ let mut bq = BufferQueue::new();
+ bq.push_back("abc".to_tendril());
+ assert_eq!(bq.next(), Some('a'));
+
+ bq.push_front("xy".to_tendril());
+ assert_eq!(bq.next(), Some('x'));
+ assert_eq!(bq.next(), Some('y'));
+ assert_eq!(bq.next(), Some('b'));
+ assert_eq!(bq.next(), Some('c'));
+ assert_eq!(bq.next(), None);
+ }
+
+ #[test]
+ fn can_pop_except_set() {
+ let mut bq = BufferQueue::new();
+ bq.push_back("abc&def".to_tendril());
+ let mut pop = || bq.pop_except_from(small_char_set!('&'));
+ assert_eq!(pop(), Some(NotFromSet("abc".to_tendril())));
+ assert_eq!(pop(), Some(FromSet('&')));
+ assert_eq!(pop(), Some(NotFromSet("def".to_tendril())));
+ assert_eq!(pop(), None);
+ }
+
+ #[test]
+ fn can_eat() {
+ // This is not very comprehensive. We rely on the tokenizer
+ // integration tests for more thorough testing with many
+ // different input buffer splits.
+ let mut bq = BufferQueue::new();
+ bq.push_back("a".to_tendril());
+ bq.push_back("bc".to_tendril());
+ assert_eq!(bq.eat("abcd", u8::eq_ignore_ascii_case), None);
+ assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false));
+ assert_eq!(bq.eat("ab", u8::eq_ignore_ascii_case), Some(true));
+ assert_eq!(bq.next(), Some('c'));
+ assert_eq!(bq.next(), None);
+ }
+}
diff --git a/src/util/smallcharset.rs b/src/util/smallcharset.rs
new file mode 100644
index 0000000..957dad7
--- /dev/null
+++ b/src/util/smallcharset.rs
@@ -0,0 +1,90 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! This module contains a single struct [`SmallCharSet`]. See its documentation for details.
+//!
+//! [`SmallCharSet`]: struct.SmallCharSet.html
+
+/// Represents a set of "small characters", those with Unicode scalar
+/// values less than 64.
+///
+/// This is stored as a bitmap, with 1 bit for each value.
+#[derive(Debug, Eq, PartialEq, Clone, Copy, Hash)]
+pub struct SmallCharSet {
+ pub bits: u64,
+}
+
+impl SmallCharSet {
+ /// Checks whether a character (u8 value below 64) is stored in the SmallCharSet.
+ ///
+ /// # Examples
+ ///
+ /// ```ignore
+ /// # use markup5ever::SmallCharSet;
+ /// let set = SmallCharSet {
+ /// bits: 0b00000000_01000000_00000100_00000000_00000000_00000000_00010000_00000000
+ /// };
+ /// assert!(set.contains(64));
+ /// assert!(set.contains(b'6')); // `b'6'` is the same as 64u8
+ /// ```
+ #[inline]
+ fn contains(&self, n: u8) -> bool {
+ 0 != (self.bits & (1 << (n as usize)))
+ }
+
+ /// Count the number of bytes of characters at the beginning of `buf` which are not in the set.
+ ///
+ /// This functionality is used in [`BufferQueue::pop_except_from`].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # #[macro_use] extern crate markup5ever;
+ /// # fn main() {
+ /// let set = small_char_set!(48 49 50); // '0' '1' '2'
+ /// // `test` is 4 chars, ๐Ÿ˜ is 4 chars, then we meet a character in the set
+ /// let test_str = "test๐Ÿ˜01232afd";
+ /// assert_eq!(set.nonmember_prefix_len(test_str), 8);
+ /// # }
+ /// ```
+ ///
+ /// [`BufferQueue::pop_except_from`]: buffer_queue/struct.BufferQueue.html#method.pop_except_from
+ pub fn nonmember_prefix_len(&self, buf: &str) -> u32 {
+ let mut n = 0;
+ for b in buf.bytes() {
+ if b >= 64 || !self.contains(b) {
+ n += 1;
+ } else {
+ break;
+ }
+ }
+ n
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use std::iter::repeat;
+
+ #[test]
+ fn nonmember_prefix() {
+ for &c in ['&', '\0'].iter() {
+ for x in 0..48u32 {
+ for y in 0..48u32 {
+ let mut s = repeat("x").take(x as usize).collect::<String>();
+ s.push(c);
+ s.push_str(&repeat("x").take(y as usize).collect::<String>());
+ let set = small_char_set!('&' '\0');
+
+ assert_eq!(x, set.nonmember_prefix_len(&s));
+ }
+ }
+ }
+ }
+}