author    Martin Fischer <martin@push-f.com>  2021-04-08 12:42:04 +0200
committer Martin Fischer <martin@push-f.com>  2021-04-08 15:40:48 +0200
commit    e0bef0105e0cc64bb610889b6921fd94897431d9 (patch)
tree      4601b8a86778c10b65d232f99c1f5fd4b289c22a
parent    8bb20dcdeec57b2109b05351663ec1dba9c65f84 (diff)
drop tendril dependency
-rw-r--r--  Cargo.toml                      1
-rw-r--r--  benches/html5ever.rs           13
-rw-r--r--  examples/noop-tokenize.rs       8
-rw-r--r--  examples/tokenize.rs            8
-rw-r--r--  src/lib.rs                      2
-rw-r--r--  src/tokenizer/char_ref/mod.rs  21
-rw-r--r--  src/tokenizer/interface.rs     17
-rw-r--r--  src/tokenizer/mod.rs          127
-rw-r--r--  src/util/buffer_queue.rs       80
-rw-r--r--  src/util/smallcharset.rs        4
10 files changed, 132 insertions, 149 deletions
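
The substitution is mechanical through most of the patch: tendril's StrTendril becomes std's String (and ByteTendril becomes Vec<u8>). As orientation for the hunks below, a minimal sketch of the std equivalents of the removed tendril calls (method names taken from the removed lines in this diff):

    fn main() {
        let mut s = String::new();        // was StrTendril::new()
        s.push('x');                      // was s.push_char('x')
        s.push_str("yz");                 // was s.push_slice("yz") / s.push_tendril(&t)
        let marker = String::from('#');   // was StrTendril::from_char('#')
        assert_eq!(s, "xyz");
        assert_eq!(marker, "#");
    }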
diff --git a/Cargo.toml b/Cargo.toml
index e32901f..ebb1ce2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,6 @@ edition = "2018"
[dependencies]
log = "0.4"
mac = "0.1"
-tendril = "0.4"
[dev-dependencies]
typed-arena = "1.3.0"
diff --git a/benches/html5ever.rs b/benches/html5ever.rs
index ff20c4f..9f4b815 100644
--- a/benches/html5ever.rs
+++ b/benches/html5ever.rs
@@ -2,12 +2,10 @@
extern crate criterion;
extern crate html5ever;
-use std::fs;
use std::path::PathBuf;
use criterion::{black_box, Criterion};
-use html5ever::tendril::*;
use html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer};
struct Sink;
@@ -27,14 +25,7 @@ fn run_bench(c: &mut Criterion, name: &str) {
let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
path.push("data/bench/");
path.push(name);
- let mut file = fs::File::open(&path).ok().expect("can't open file");
-
- // Read the file and treat it as an infinitely repeating sequence of characters.
- let mut file_input = ByteTendril::new();
- file.read_to_tendril(&mut file_input)
- .ok()
- .expect("can't read file");
- let file_input: StrTendril = file_input.try_reinterpret().unwrap();
+ let file_input: String = std::fs::read_to_string(&path).expect("can't open file");
let size = file_input.len();
let mut stream = file_input.chars().cycle();
@@ -46,7 +37,7 @@ fn run_bench(c: &mut Criterion, name: &str) {
// The by_ref() call is important, otherwise we get wrong results!
// See rust-lang/rust#18045.
let sz = std::cmp::min(1024, size - total);
- input.push(stream.by_ref().take(sz).collect::<String>().to_tendril());
+ input.push(stream.by_ref().take(sz).collect::<String>());
total += sz;
}
diff --git a/examples/noop-tokenize.rs b/examples/noop-tokenize.rs
index d6c62f1..323c429 100644
--- a/examples/noop-tokenize.rs
+++ b/examples/noop-tokenize.rs
@@ -14,8 +14,8 @@ extern crate html5ever;
use std::default::Default;
use std::io;
-use html5ever::tendril::*;
use html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer};
+use io::Read;
struct Sink(Vec<Token>);
@@ -31,10 +31,10 @@ impl TokenSink for Sink {
}
fn main() {
- let mut chunk = ByteTendril::new();
- io::stdin().read_to_tendril(&mut chunk).unwrap();
+ let mut chunk = Vec::new();
+ io::stdin().read_to_end(&mut chunk).unwrap();
let mut input = BufferQueue::new();
- input.push_back(chunk.try_reinterpret().unwrap());
+ input.push_back(std::str::from_utf8(&chunk).unwrap().to_string());
let mut tok = Tokenizer::new(Sink(Vec::new()), Default::default());
let _ = tok.feed(&mut input);
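
A note on the stdin handling here (and in examples/tokenize.rs below): from_utf8(&chunk).unwrap().to_string() validates the bytes and then copies them. A sketch of an equivalent that takes ownership of the Vec and skips the second copy — an alternative, not what this commit does:

    use std::io::Read;

    fn read_stdin_utf8() -> String {
        let mut chunk = Vec::new();
        std::io::stdin().read_to_end(&mut chunk).unwrap();
        String::from_utf8(chunk).unwrap() // consumes the Vec, no extra copy
    }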
diff --git a/examples/tokenize.rs b/examples/tokenize.rs
index c422f0c..943513a 100644
--- a/examples/tokenize.rs
+++ b/examples/tokenize.rs
@@ -12,12 +12,12 @@ extern crate html5ever;
use std::default::Default;
use std::io;
-use html5ever::tendril::*;
use html5ever::tokenizer::BufferQueue;
use html5ever::tokenizer::{CharacterTokens, EndTag, NullCharacterToken, StartTag, TagToken};
use html5ever::tokenizer::{
ParseError, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
};
+use io::Read;
#[derive(Copy, Clone)]
struct TokenPrinter {
@@ -84,10 +84,10 @@ impl TokenSink for TokenPrinter {
fn main() {
let mut sink = TokenPrinter { in_char_run: false };
- let mut chunk = ByteTendril::new();
- io::stdin().read_to_tendril(&mut chunk).unwrap();
+ let mut chunk = Vec::new();
+ io::stdin().read_to_end(&mut chunk).unwrap();
let mut input = BufferQueue::new();
- input.push_back(chunk.try_reinterpret().unwrap());
+ input.push_back(std::str::from_utf8(&chunk).unwrap().to_string());
let mut tok = Tokenizer::new(
sink,
diff --git a/src/lib.rs b/src/lib.rs
index 88934f0..bc0950f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -12,8 +12,6 @@
#![cfg_attr(test, deny(warnings))]
#![allow(unused_parens)]
-pub use tendril;
-
#[macro_use]
mod macros;
diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs
index 484a9e1..6daeb13 100644
--- a/src/tokenizer/char_ref/mod.rs
+++ b/src/tokenizer/char_ref/mod.rs
@@ -8,7 +8,6 @@
// except according to those terms.
use super::{TokenSink, Tokenizer};
-use tendril::StrTendril;
use crate::util::buffer_queue::BufferQueue;
use crate::util::str::is_ascii_alnum;
@@ -55,7 +54,7 @@ pub struct CharRefTokenizer {
seen_digit: bool,
hex_marker: Option<char>,
- name_buf_opt: Option<StrTendril>,
+ name_buf_opt: Option<String>,
name_match: Option<(u32, u32)>,
name_len: usize,
}
@@ -84,13 +83,13 @@ impl CharRefTokenizer {
self.result.expect("get_result called before done")
}
- fn name_buf(&self) -> &StrTendril {
+ fn name_buf(&self) -> &str {
self.name_buf_opt
.as_ref()
.expect("name_buf missing in named character reference")
}
- fn name_buf_mut(&mut self) -> &mut StrTendril {
+ fn name_buf_mut(&mut self) -> &mut String {
self.name_buf_opt
.as_mut()
.expect("name_buf missing in named character reference")
@@ -189,7 +188,7 @@ impl CharRefTokenizer {
_ => {
self.state = Named;
- self.name_buf_opt = Some(StrTendril::new());
+ self.name_buf_opt = Some(String::new());
Progress
},
}
@@ -265,9 +264,9 @@ impl CharRefTokenizer {
tokenizer: &mut Tokenizer<Sink>,
input: &mut BufferQueue,
) -> Status {
- let mut unconsume = StrTendril::from_char('#');
+ let mut unconsume = String::from('#');
match self.hex_marker {
- Some(c) => unconsume.push_char(c),
+ Some(c) => unconsume.push(c),
None => (),
}
@@ -316,7 +315,7 @@ impl CharRefTokenizer {
input: &mut BufferQueue,
) -> Status {
let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
- self.name_buf_mut().push_char(c);
+ self.name_buf_mut().push(c);
self.finish_named(tokenizer, input, Some(c))
}
@@ -411,7 +410,7 @@ impl CharRefTokenizer {
self.unconsume_name(input);
self.finish_none()
} else {
- input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
+ input.push_front(String::from(&self.name_buf()[name_len..]));
self.result = Some(CharRef {
chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
num_chars: if c2 == 0 { 1 } else { 2 },
@@ -428,7 +427,7 @@ impl CharRefTokenizer {
input: &mut BufferQueue,
) -> Status {
let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
- self.name_buf_mut().push_char(c);
+ self.name_buf_mut().push(c);
match c {
_ if is_ascii_alnum(c) => return Progress,
';' => self.emit_name_error(tokenizer),
@@ -462,7 +461,7 @@ impl CharRefTokenizer {
},
Octothorpe => {
- input.push_front(StrTendril::from_slice("#"));
+ input.push_front(String::from("#"));
tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
self.finish_none();
},
diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs
index c331a0e..dfd9a9f 100644
--- a/src/tokenizer/interface.rs
+++ b/src/tokenizer/interface.rs
@@ -7,7 +7,6 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.
-use tendril::StrTendril;
use crate::tokenizer::states;
use std::borrow::Cow;
@@ -19,9 +18,9 @@ pub use self::Token::{EOFToken, NullCharacterToken, ParseError};
// FIXME: already exists in Servo DOM
#[derive(PartialEq, Eq, Clone, Debug)]
pub struct Doctype {
- pub name: Option<StrTendril>,
- pub public_id: Option<StrTendril>,
- pub system_id: Option<StrTendril>,
+ pub name: Option<String>,
+ pub public_id: Option<String>,
+ pub system_id: Option<String>,
pub force_quirks: bool,
}
@@ -51,16 +50,16 @@ pub enum TagKind {
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug)]
pub struct Attribute {
/// The name of the attribute (e.g. the `class` in `<div class="test">`)
- pub name: StrTendril,
+ pub name: String,
/// The value of the attribute (e.g. the `"test"` in `<div class="test">`)
- pub value: StrTendril,
+ pub value: String,
}
/// A tag token.
#[derive(PartialEq, Eq, Clone, Debug)]
pub struct Tag {
pub kind: TagKind,
- pub name: StrTendril,
+ pub name: String,
pub self_closing: bool,
pub attrs: Vec<Attribute>,
}
@@ -86,8 +85,8 @@ impl Tag {
pub enum Token {
DoctypeToken(Doctype),
TagToken(Tag),
- CommentToken(StrTendril),
- CharacterTokens(StrTendril),
+ CommentToken(String),
+ CharacterTokens(String),
NullCharacterToken,
EOFToken,
ParseError(Cow<'static, str>),
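
With the plain-String fields above, tokens can now be built without tendril conversions. An illustrative construction, assuming Tag and Attribute are re-exported from the tokenizer module alongside the variant re-exports the examples already use:

    use html5ever::tokenizer::{Attribute, StartTag, Tag, TagToken, Token};

    fn sample_token() -> Token {
        TagToken(Tag {
            kind: StartTag,
            name: String::from("div"),
            self_closing: false,
            attrs: vec![Attribute {
                name: String::from("class"),
                value: String::from("test"),
            }],
        })
    }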
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index f45c917..eb22b11 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -31,7 +31,6 @@ use std::default::Default;
use std::mem::replace;
pub use crate::util::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
-use tendril::StrTendril;
mod char_ref;
mod interface;
@@ -49,10 +48,10 @@ pub enum TokenizerResult<Handle> {
Script(Handle),
}
-fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
+fn option_push(opt_str: &mut Option<String>, c: char) {
match *opt_str {
- Some(ref mut s) => s.push_char(c),
- None => *opt_str = Some(StrTendril::from_char(c)),
+ Some(ref mut s) => s.push(c),
+ None => *opt_str = Some(String::from(c)),
}
}
@@ -132,7 +131,7 @@ pub struct Tokenizer<Sink> {
current_tag_kind: TagKind,
/// Current tag name.
- current_tag_name: StrTendril,
+ current_tag_name: String,
/// Current tag is self-closing?
current_tag_self_closing: bool,
@@ -141,22 +140,22 @@ pub struct Tokenizer<Sink> {
current_tag_attrs: Vec<Attribute>,
/// Current attribute name.
- current_attr_name: StrTendril,
+ current_attr_name: String,
/// Current attribute value.
- current_attr_value: StrTendril,
+ current_attr_value: String,
/// Current comment.
- current_comment: StrTendril,
+ current_comment: String,
/// Current doctype token.
current_doctype: Doctype,
/// Last start tag name, for use in checking "appropriate end tag".
- last_start_tag_name: Option<StrTendril>,
+ last_start_tag_name: Option<String>,
/// The "temporary buffer" mentioned in the spec.
- temp_buf: StrTendril,
+ temp_buf: String,
/// Record of how many ns we spent in each state, if profiling is enabled.
state_profile: BTreeMap<states::State, u64>,
@@ -173,8 +172,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
let start_tag_name = opts
.last_start_tag_name
- .take()
- .map(|s| StrTendril::from(s));
+ .take();
let state = opts.initial_state.unwrap_or(states::Data);
let discard_bom = opts.discard_bom;
Tokenizer {
@@ -188,15 +186,15 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
ignore_lf: false,
discard_bom,
current_tag_kind: StartTag,
- current_tag_name: StrTendril::new(),
+ current_tag_name: String::new(),
current_tag_self_closing: false,
current_tag_attrs: vec![],
- current_attr_name: StrTendril::new(),
- current_attr_value: StrTendril::new(),
- current_comment: StrTendril::new(),
+ current_attr_name: String::new(),
+ current_attr_value: String::new(),
+ current_comment: String::new(),
current_doctype: Doctype::new(),
last_start_tag_name: start_tag_name,
- temp_buf: StrTendril::new(),
+ temp_buf: String::new(),
state_profile: BTreeMap::new(),
time_in_sink: 0,
current_line: 1,
@@ -324,12 +322,12 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
pat: &str,
eq: fn(&u8, &u8) -> bool,
) -> Option<bool> {
- input.push_front(replace(&mut self.temp_buf, StrTendril::new()));
+ input.push_front(replace(&mut self.temp_buf, String::new()));
match input.eat(pat, eq) {
None if self.at_eof => Some(false),
None => {
while let Some(c) = input.next() {
- self.temp_buf.push_char(c);
+ self.temp_buf.push(c);
}
None
},
@@ -398,12 +396,12 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
fn emit_char(&mut self, c: char) {
self.process_token_and_continue(match c {
'\0' => NullCharacterToken,
- _ => CharacterTokens(StrTendril::from_char(c)),
+ _ => CharacterTokens(String::from(c)),
});
}
// The string must not contain '\0'!
- fn emit_chars(&mut self, b: StrTendril) {
+ fn emit_chars(&mut self, b: String) {
self.process_token_and_continue(CharacterTokens(b));
}
@@ -453,7 +451,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
fn emit_temp_buf(&mut self) {
// FIXME: Make sure that clearing on emit is spec-compatible.
- let buf = replace(&mut self.temp_buf, StrTendril::new());
+ let buf = replace(&mut self.temp_buf, String::new());
self.emit_chars(buf);
}
@@ -463,7 +461,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
}
fn emit_current_comment(&mut self) {
- let comment = replace(&mut self.current_comment, StrTendril::new());
+ let comment = replace(&mut self.current_comment, String::new());
self.process_token_and_continue(CommentToken(comment));
}
@@ -475,7 +473,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
fn create_tag(&mut self, kind: TagKind, c: char) {
self.discard_tag();
- self.current_tag_name.push_char(c);
+ self.current_tag_name.push(c);
self.current_tag_kind = kind;
}
@@ -489,7 +487,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
fn create_attribute(&mut self, c: char) {
self.finish_attribute();
- self.current_attr_name.push_char(c);
+ self.current_attr_name.push(c);
}
fn finish_attribute(&mut self) {
@@ -516,7 +514,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
self.current_attr_name.clear();
self.current_tag_attrs.push(Attribute {
name: name,
- value: replace(&mut self.current_attr_value, StrTendril::new()),
+ value: replace(&mut self.current_attr_value, String::new()),
});
}
}
@@ -526,7 +524,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
self.process_token_and_continue(DoctypeToken(doctype));
}
- fn doctype_id(&mut self, kind: DoctypeIdKind) -> &mut Option<StrTendril> {
+ fn doctype_id(&mut self, kind: DoctypeIdKind) -> &mut Option<String> {
match kind {
Public => &mut self.current_doctype.public_id,
System => &mut self.current_doctype.system_id,
@@ -537,7 +535,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
let id = self.doctype_id(kind);
match *id {
Some(ref mut s) => s.clear(),
- None => *id = Some(StrTendril::new()),
+ None => *id = Some(String::new()),
}
}
@@ -573,18 +571,18 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
macro_rules! shorthand (
( $me:ident : emit $c:expr ) => ( $me.emit_char($c); );
( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c); );
- ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.push_char($c); );
+ ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.push($c); );
( $me:ident : discard_tag ) => ( $me.discard_tag(); );
( $me:ident : discard_char $input:expr ) => ( $me.discard_char($input); );
- ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.push_char($c); );
+ ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.push($c); );
( $me:ident : emit_temp ) => ( $me.emit_temp_buf(); );
( $me:ident : clear_temp ) => ( $me.clear_temp_buf(); );
( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c); );
- ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.push_char($c); );
- ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.push_char($c); );
- ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.push_tendril($c); );
- ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.push_char($c); );
- ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.push_slice($c); );
+ ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.push($c); );
+ ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.push($c); );
+ ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.push_str($c); );
+ ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.push($c); );
+ ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.push_str($c); );
( $me:ident : emit_comment ) => ( $me.emit_current_comment(); );
( $me:ident : clear_comment ) => ( $me.current_comment.clear(); );
( $me:ident : create_doctype ) => ( $me.current_doctype = Doctype::new(); );
@@ -1523,7 +1521,6 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
#[allow(non_snake_case)]
mod test {
use super::option_push; // private items
- use tendril::{SliceExt, StrTendril};
use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
@@ -1539,7 +1536,7 @@ mod test {
// vector is a collection of the line numbers that each token is on.
struct LinesMatch {
tokens: Vec<Token>,
- current_str: StrTendril,
+ current_str: String,
lines: Vec<(Token, u64)>,
}
@@ -1547,7 +1544,7 @@ mod test {
fn new() -> LinesMatch {
LinesMatch {
tokens: vec![],
- current_str: StrTendril::new(),
+ current_str: String::new(),
lines: vec![],
}
}
@@ -1559,7 +1556,7 @@ mod test {
fn finish_str(&mut self) {
if self.current_str.len() > 0 {
- let s = replace(&mut self.current_str, StrTendril::new());
+ let s = replace(&mut self.current_str, String::new());
self.tokens.push(CharacterTokens(s));
}
}
@@ -1575,11 +1572,11 @@ mod test {
) -> TokenSinkResult<Self::Handle> {
match token {
CharacterTokens(b) => {
- self.current_str.push_slice(&b);
+ self.current_str.push_str(&b);
},
NullCharacterToken => {
- self.current_str.push_char('\0');
+ self.current_str.push('\0');
},
ParseError(_) => {
@@ -1610,7 +1607,7 @@ mod test {
// Take in tokens, process them, and return vector with line
// numbers that each token is on
- fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
+ fn tokenize(input: Vec<String>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
let sink = LinesMatch::new();
let mut tok = Tokenizer::new(sink, opts);
let mut buffer = BufferQueue::new();
@@ -1623,7 +1620,7 @@ mod test {
}
// Create a tag token
- fn create_tag(token: StrTendril, tagkind: TagKind) -> Token {
+ fn create_tag(token: String, tagkind: TagKind) -> Token {
let name = token;
let token = TagToken(Tag {
kind: tagkind,
@@ -1636,23 +1633,23 @@ mod test {
#[test]
fn push_to_None_gives_singleton() {
- let mut s: Option<StrTendril> = None;
+ let mut s: Option<String> = None;
option_push(&mut s, 'x');
- assert_eq!(s, Some("x".to_tendril()));
+ assert_eq!(s, Some("x".into()));
}
#[test]
fn push_to_empty_appends() {
- let mut s: Option<StrTendril> = Some(StrTendril::new());
+ let mut s: Option<String> = Some(String::new());
option_push(&mut s, 'x');
- assert_eq!(s, Some("x".to_tendril()));
+ assert_eq!(s, Some("x".into()));
}
#[test]
fn push_to_nonempty_appends() {
- let mut s: Option<StrTendril> = Some(StrTendril::from_slice("y"));
+ let mut s: Option<String> = Some(String::from("y"));
option_push(&mut s, 'x');
- assert_eq!(s, Some("yx".to_tendril()));
+ assert_eq!(s, Some("yx".into()));
}
#[test]
@@ -1665,16 +1662,16 @@ mod test {
last_start_tag_name: None,
};
let vector = vec![
- StrTendril::from("<a>\n"),
- StrTendril::from("<b>\n"),
- StrTendril::from("</b>\n"),
- StrTendril::from("</a>\n"),
+ String::from("<a>\n"),
+ String::from("<b>\n"),
+ String::from("</b>\n"),
+ String::from("</a>\n"),
];
let expected = vec![
- (create_tag(StrTendril::from("a"), StartTag), 1),
- (create_tag(StrTendril::from("b"), StartTag), 2),
- (create_tag(StrTendril::from("b"), EndTag), 3),
- (create_tag(StrTendril::from("a"), EndTag), 4),
+ (create_tag(String::from("a"), StartTag), 1),
+ (create_tag(String::from("b"), StartTag), 2),
+ (create_tag(String::from("b"), EndTag), 3),
+ (create_tag(String::from("a"), EndTag), 4),
];
let results = tokenize(vector, opts);
assert_eq!(results, expected);
@@ -1690,16 +1687,16 @@ mod test {
last_start_tag_name: None,
};
let vector = vec![
- StrTendril::from("<a>\r\n"),
- StrTendril::from("<b>\r\n"),
- StrTendril::from("</b>\r\n"),
- StrTendril::from("</a>\r\n"),
+ String::from("<a>\r\n"),
+ String::from("<b>\r\n"),
+ String::from("</b>\r\n"),
+ String::from("</a>\r\n"),
];
let expected = vec![
- (create_tag(StrTendril::from("a"), StartTag), 1),
- (create_tag(StrTendril::from("b"), StartTag), 2),
- (create_tag(StrTendril::from("b"), EndTag), 3),
- (create_tag(StrTendril::from("a"), EndTag), 4),
+ (create_tag(String::from("a"), StartTag), 1),
+ (create_tag(String::from("b"), StartTag), 2),
+ (create_tag(String::from("b"), EndTag), 3),
+ (create_tag(String::from("a"), EndTag), 4),
];
let results = tokenize(vector, opts);
assert_eq!(results, expected);
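
Aside: the replace(&mut buf, String::new()) pattern that recurs in this file is equivalent to std::mem::take, available since Rust 1.40 — an alternative spelling, not what this commit uses:

    use std::mem;

    fn drain(buf: &mut String) -> String {
        mem::take(buf) // same effect as mem::replace(buf, String::new())
    }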
diff --git a/src/util/buffer_queue.rs b/src/util/buffer_queue.rs
index 7f8c3cc..5201a57 100644
--- a/src/util/buffer_queue.rs
+++ b/src/util/buffer_queue.rs
@@ -20,8 +20,6 @@
use std::collections::VecDeque;
-use tendril::StrTendril;
-
pub use self::SetResult::{FromSet, NotFromSet};
use crate::util::smallcharset::SmallCharSet;
@@ -35,7 +33,7 @@ pub enum SetResult {
/// A character from the `SmallCharSet`.
FromSet(char),
/// A string buffer containing no characters from the `SmallCharSet`.
- NotFromSet(StrTendril),
+ NotFromSet(String),
}
/// A queue of owned string buffers, which supports incrementally consuming characters.
@@ -46,7 +44,7 @@ pub enum SetResult {
#[derive(Debug)]
pub struct BufferQueue {
/// Buffers to process.
- buffers: VecDeque<StrTendril>,
+ buffers: VecDeque<(usize, String)>,
}
impl BufferQueue {
@@ -66,28 +64,32 @@ impl BufferQueue {
/// Get the buffer at the beginning of the queue.
#[inline]
- pub fn pop_front(&mut self) -> Option<StrTendril> {
- self.buffers.pop_front()
+ pub fn pop_front(&mut self) -> Option<String> {
+ if let Some((i, s)) = self.buffers.pop_front() {
+ return Some(s[i..].into())
+ }
+ None
+ // self.buffers.pop_front().map(|(i, s)| &s[i..])
}
/// Add a buffer to the beginning of the queue.
///
/// If the buffer is empty, it will be skipped.
- pub fn push_front(&mut self, buf: StrTendril) {
- if buf.len32() == 0 {
+ pub fn push_front(&mut self, buf: String) {
+ if buf.len() == 0 {
return;
}
- self.buffers.push_front(buf);
+ self.buffers.push_front((0, buf));
}
/// Add a buffer to the end of the queue.
///
/// If the buffer is empty, it will be skipped.
- pub fn push_back(&mut self, buf: StrTendril) {
- if buf.len32() == 0 {
+ pub fn push_back(&mut self, buf: String) {
+ if buf.len() == 0 {
return;
}
- self.buffers.push_back(buf);
+ self.buffers.push_back((0, buf));
}
/// Look at the next available character without removing it, if the queue is not empty.
@@ -95,11 +97,11 @@ impl BufferQueue {
debug_assert!(
self.buffers
.iter()
- .find(|el| el.len32() == 0)
+ .find(|(i, s)| s[*i..].is_empty())
.is_none(),
"invariant \"all buffers in the queue are non-empty\" failed"
);
- self.buffers.front().map(|b| b.chars().next().unwrap())
+ self.buffers.front().map(|(i, s)| s[*i..].chars().next().unwrap())
}
/// Get the next character if one is available, removing it from the queue.
@@ -108,9 +110,10 @@ impl BufferQueue {
pub fn next(&mut self) -> Option<char> {
let (result, now_empty) = match self.buffers.front_mut() {
None => (None, false),
- Some(buf) => {
- let c = buf.pop_front_char().expect("empty buffer in queue");
- (Some(c), buf.is_empty())
+ Some((i, buf)) => {
+ let c = &buf[*i..].chars().next().expect("empty buffer in queue");
+ *i += c.len_utf8();
+ (Some(*c), buf[*i..].is_empty())
},
};
@@ -126,18 +129,15 @@ impl BufferQueue {
pub fn pop_except_from(&mut self, set: SmallCharSet) -> Option<SetResult> {
let (result, now_empty) = match self.buffers.front_mut() {
None => (None, false),
- Some(buf) => {
- let n = set.nonmember_prefix_len(&buf);
+ Some((i, buf)) => {
+ let n = set.nonmember_prefix_len(&buf[*i..]);
if n > 0 {
- let out;
- unsafe {
- out = buf.unsafe_subtendril(0, n);
- buf.unsafe_pop_front(n);
- }
- (Some(NotFromSet(out)), buf.is_empty())
+ let out = buf.drain(*i..*i + n).collect();
+ (Some(NotFromSet(out)), buf[*i..].is_empty())
} else {
- let c = buf.pop_front_char().expect("empty buffer in queue");
- (Some(FromSet(c)), buf.is_empty())
+ let c = &buf[*i..].chars().next().expect("empty buffer in queue");
+ *i += c.len_utf8();
+ (Some(FromSet(*c)), buf[*i..].is_empty())
}
},
};
@@ -166,9 +166,9 @@ impl BufferQueue {
if buffers_exhausted >= self.buffers.len() {
return None;
}
- let buf = &self.buffers[buffers_exhausted];
+ let (i, buf) = &self.buffers[buffers_exhausted];
- if !eq(&buf.as_bytes()[consumed_from_last], &pattern_byte) {
+ if !eq(&buf[*i..].as_bytes()[consumed_from_last], &pattern_byte) {
return Some(false);
}
@@ -186,7 +186,9 @@ impl BufferQueue {
match self.buffers.front_mut() {
None => assert_eq!(consumed_from_last, 0),
- Some(ref mut buf) => buf.pop_front(consumed_from_last as u32),
+ Some((i, _buf)) => {
+ *i += consumed_from_last;
+ },
}
Some(true)
@@ -196,8 +198,6 @@ impl BufferQueue {
#[cfg(test)]
#[allow(non_snake_case)]
mod test {
- use tendril::SliceExt;
-
use super::BufferQueue;
use super::SetResult::{FromSet, NotFromSet};
@@ -207,7 +207,7 @@ mod test {
assert_eq!(bq.peek(), None);
assert_eq!(bq.next(), None);
- bq.push_back("abc".to_tendril());
+ bq.push_back("abc".into());
assert_eq!(bq.peek(), Some('a'));
assert_eq!(bq.next(), Some('a'));
assert_eq!(bq.peek(), Some('b'));
@@ -222,10 +222,10 @@ mod test {
#[test]
fn can_unconsume() {
let mut bq = BufferQueue::new();
- bq.push_back("abc".to_tendril());
+ bq.push_back("abc".into());
assert_eq!(bq.next(), Some('a'));
- bq.push_front("xy".to_tendril());
+ bq.push_front("xy".into());
assert_eq!(bq.next(), Some('x'));
assert_eq!(bq.next(), Some('y'));
assert_eq!(bq.next(), Some('b'));
@@ -236,11 +236,11 @@ mod test {
#[test]
fn can_pop_except_set() {
let mut bq = BufferQueue::new();
- bq.push_back("abc&def".to_tendril());
+ bq.push_back("abc&def".into());
let mut pop = || bq.pop_except_from(small_char_set!('&'));
- assert_eq!(pop(), Some(NotFromSet("abc".to_tendril())));
+ assert_eq!(pop(), Some(NotFromSet("abc".into())));
assert_eq!(pop(), Some(FromSet('&')));
- assert_eq!(pop(), Some(NotFromSet("def".to_tendril())));
+ assert_eq!(pop(), Some(NotFromSet("def".into())));
assert_eq!(pop(), None);
}
@@ -250,8 +250,8 @@ mod test {
// integration tests for more thorough testing with many
// different input buffer splits.
let mut bq = BufferQueue::new();
- bq.push_back("a".to_tendril());
- bq.push_back("bc".to_tendril());
+ bq.push_back("a".into());
+ bq.push_back("bc".into());
assert_eq!(bq.eat("abcd", u8::eq_ignore_ascii_case), None);
assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false));
assert_eq!(bq.eat("ab", u8::eq_ignore_ascii_case), Some(true));
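
The structural change in this file is the one place where the port is not mechanical: a tendril could cheaply pop characters off its front, while a std String cannot, so each queued buffer now carries a byte offset recording how much of it has been consumed. A self-contained sketch of that representation (hypothetical names, logic mirroring next() above):

    use std::collections::VecDeque;

    /// Each entry pairs the number of bytes already consumed with the buffer.
    struct Queue {
        buffers: VecDeque<(usize, String)>,
    }

    impl Queue {
        fn next(&mut self) -> Option<char> {
            let (i, buf) = self.buffers.front_mut()?;
            let c = buf[*i..].chars().next()?;  // offset stays on a char boundary
            *i += c.len_utf8();                 // advance; no reallocation
            let exhausted = buf[*i..].is_empty();
            if exhausted {
                self.buffers.pop_front();       // drop fully consumed buffers
            }
            Some(c)
        }
    }

One consequence of this design: the consumed prefix of a buffer stays allocated until the whole buffer is exhausted, trading a little memory for avoiding a copy on every character.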
diff --git a/src/util/smallcharset.rs b/src/util/smallcharset.rs
index aeeb189..2bf8585 100644
--- a/src/util/smallcharset.rs
+++ b/src/util/smallcharset.rs
@@ -41,7 +41,7 @@ impl SmallCharSet {
/// Count the number of bytes of characters at the beginning of `buf` which are not in the set.
///
/// This functionality is used in [`BufferQueue::pop_except_from`].
- pub fn nonmember_prefix_len(&self, buf: &str) -> u32 {
+ pub fn nonmember_prefix_len(&self, buf: &str) -> usize {
let mut n = 0;
for b in buf.bytes() {
if b >= 64 || !self.contains(b) {
@@ -61,7 +61,7 @@ mod test {
#[test]
fn nonmember_prefix() {
for &c in ['&', '\0'].iter() {
- for x in 0..48u32 {
+ for x in 0..48 {
for y in 0..48u32 {
let mut s = repeat("x").take(x as usize).collect::<String>();
s.push(c);
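
Returning usize instead of u32 lets the result index directly into the &str slices used by the new BufferQueue without casts. Illustrative usage, in the style of the in-crate tests above:

    #[test]
    fn prefix_len_is_in_bytes() {
        let set = small_char_set!('&');
        // the three bytes of "abc" precede the first member of the set
        assert_eq!(set.nonmember_prefix_len("abc&def"), 3);
    }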