-rw-r--r--  src/lib.rs    | 58
-rw-r--r--  src/reader.rs | 41
2 files changed, 63 insertions(+), 36 deletions(-)
diff --git a/src/lib.rs b/src/lib.rs
index ef6c9a2..361ccd0 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -29,6 +29,37 @@ macro_rules! ctostr {
};
}
+// A stack that can hold zero, one, or two values of type T.
+#[derive(Debug, Default)]
+struct Stack2<T: Copy>(Option<(T, Option<T>)>);
+
+impl<T: Copy> Stack2<T> {
+ #[inline]
+ fn push(&mut self, c: T) {
+ self.0 = match self.0 {
+ None => Some((c, None)),
+ Some((c1, None)) => Some((c1, Some(c))),
+ Some((_c1, Some(_c2))) => panic!("stack full!"),
+ }
+ }
+
+ #[inline]
+ fn pop(&mut self) -> Option<T> {
+ let (new_self, rv) = match self.0 {
+ Some((c1, Some(c2))) => (Some((c1, None)), Some(c2)),
+ Some((c1, None)) => (None, Some(c1)),
+ None => (None, None),
+ };
+ self.0 = new_self;
+ rv
+ }
+
+ #[inline]
+ fn is_empty(&self) -> bool {
+ matches!(self.0, None)
+ }
+}
+
/// An HTML tokenizer. See crate-level docs for basic usage.
pub struct Tokenizer<R: Reader, E: Emitter = DefaultEmitter> {
eof: bool,
@@ -36,7 +67,7 @@ pub struct Tokenizer<R: Reader, E: Emitter = DefaultEmitter> {
emitter: E,
temporary_buffer: String,
reader: R,
- to_reconsume: Option<Option<char>>,
+ to_reconsume: Stack2<Option<char>>,
character_reference_code: u32,
return_state: Option<State>,
}
@@ -65,7 +96,7 @@ impl<R: Reader, E: Emitter> Tokenizer<R, E> {
state: State::Data,
emitter,
temporary_buffer: String::new(),
- to_reconsume: None,
+ to_reconsume: Stack2::default(),
reader: input.to_reader(),
character_reference_code: 0,
return_state: None,
@@ -98,7 +129,7 @@ impl<R: Reader, E: Emitter> Tokenizer<R, E> {
#[inline]
fn unread_char(&mut self, c: Option<char>) {
- self.to_reconsume = Some(c);
+ self.to_reconsume.push(c);
}
#[inline]
@@ -122,19 +153,30 @@ impl<R: Reader, E: Emitter> Tokenizer<R, E> {
}
fn read_char(&mut self) -> Option<char> {
- if let Some(c) = self.to_reconsume.take() {
- return c;
+ let (mut c, reconsumed) = match self.to_reconsume.pop() {
+ Some(c) => (c?, true),
+ None => (self.reader.read_char()?, false),
+ };
+
+ if c == '\r' {
+ c = '\n';
+ let c2 = self.reader.read_char();
+ if c2 != Some('\n') {
+ self.unread_char(c2);
+ }
+ }
+
+ if !reconsumed {
+ self.validate_char(c);
}
- let c = self.reader.read_char()?;
- self.validate_char(c);
Some(c)
}
#[inline]
fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> bool {
debug_assert!(!s.is_empty());
- debug_assert!(self.to_reconsume.is_none());
+ debug_assert!(self.to_reconsume.is_empty());
self.reader.try_read_string(s, case_sensitive)
}
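
With newline normalization moved into the tokenizer, read_char can push back a lookahead character (when a '\r' is not followed by '\n') while the state machine may independently push back the character it just consumed for reconsumption, so up to two characters can be pending at once; that is why the single-slot Option was replaced with the two-slot Stack2. Below is a minimal, self-contained sketch of that LIFO behaviour: Stack2 is copied from the patch above, and the scenario in main is purely illustrative.

    // Stack2 as introduced in src/lib.rs above; the push/pop order mirrors
    // what the tokenizer relies on.
    #[derive(Debug, Default)]
    struct Stack2<T: Copy>(Option<(T, Option<T>)>);

    impl<T: Copy> Stack2<T> {
        fn push(&mut self, c: T) {
            self.0 = match self.0 {
                None => Some((c, None)),
                Some((c1, None)) => Some((c1, Some(c))),
                Some((_, Some(_))) => panic!("stack full!"),
            }
        }

        fn pop(&mut self) -> Option<T> {
            let (new_self, rv) = match self.0 {
                Some((c1, Some(c2))) => (Some((c1, None)), Some(c2)),
                Some((c1, None)) => (None, Some(c1)),
                None => (None, None),
            };
            self.0 = new_self;
            rv
        }
    }

    fn main() {
        let mut to_reconsume: Stack2<Option<char>> = Stack2::default();
        // read_char saw '\r' followed by 'x': the lookahead 'x' is pushed back first...
        to_reconsume.push(Some('x'));
        // ...then the state machine reconsumes the normalized '\n' it was handed.
        to_reconsume.push(Some('\n'));
        // Characters come back in LIFO order: the reconsumed '\n', then the lookahead 'x'.
        assert_eq!(to_reconsume.pop(), Some(Some('\n')));
        assert_eq!(to_reconsume.pop(), Some(Some('x')));
        assert_eq!(to_reconsume.pop(), None);
    }
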
diff --git a/src/reader.rs b/src/reader.rs
index d2cae92..7246129 100644
--- a/src/reader.rs
+++ b/src/reader.rs
@@ -4,9 +4,8 @@
pub trait Reader {
/// Return a new character from the input stream.
///
- /// Newlines have to be normalized as described in [Preprocessing the input
- /// stream](https://html.spec.whatwg.org/#preprocessing-the-input-stream), however error
- /// emission is done within the tokenizer.
+/// The input stream does **not** have to be preprocessed in any way; it can contain standalone
+/// surrogates and have inconsistent newlines.
fn read_char(&mut self) -> Option<char>;
/// Attempt to read an entire string at once, either case-insensitively or not.
@@ -46,49 +45,35 @@ impl<'a, R: 'a + Reader> Readable<'a> for R {
/// from strings.
pub struct StringReader<'a> {
input: &'a str,
+ cursor: std::str::Chars<'a>,
pos: usize,
}
impl<'a> StringReader<'a> {
fn new(input: &'a str) -> Self {
- StringReader { input, pos: 0 }
- }
-
- fn peek_char(&self) -> Option<char> {
- self.input.get(self.pos..)?.chars().next()
+ let cursor = input.chars();
+ StringReader {
+ input,
+ cursor,
+ pos: 0,
+ }
}
}
impl<'a> Reader for StringReader<'a> {
fn read_char(&mut self) -> Option<char> {
- let mut r1 = match self.peek_char() {
- Some(x) => x,
- None => {
- self.pos += 1;
- return None;
- }
- };
-
- self.pos += r1.len_utf8();
-
- if r1 == '\r' {
- r1 = '\n';
- let r2 = self.peek_char();
- if r2 == Some('\n') {
- self.pos += r2.map(char::len_utf8).unwrap_or(0);
- }
- }
-
- Some(r1)
+ let c = self.cursor.next()?;
+ self.pos += c.len_utf8();
+ Some(c)
}
fn try_read_string(&mut self, s1: &str, case_sensitive: bool) -> bool {
// we do not need to call validate_char here because `s` hopefully does not contain invalid
// characters
-
if let Some(s2) = self.input.get(self.pos..self.pos + s1.len()) {
if s1 == s2 || (!case_sensitive && s1.eq_ignore_ascii_case(s2)) {
self.pos += s1.len();
+ self.cursor = self.input[self.pos..].chars();
return true;
}
}
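
For reference, this is the normalization contract that moved out of the readers: per the WHATWG "preprocessing the input stream" step the old doc comment linked to, "\r\n" collapses to "\n" and a lone "\r" becomes "\n". The sketch below is only illustrative; the function name is made up, and it normalizes a whole string eagerly, whereas the tokenizer's read_char above does the same thing lazily, one character at a time, unreading the lookahead onto its Stack2 when it is not '\n'.

    // Hypothetical helper that eagerly normalizes newlines the way the
    // tokenizer's read_char now does lazily, per character.
    fn normalize_newlines(input: &str) -> String {
        let mut out = String::with_capacity(input.len());
        let mut chars = input.chars().peekable();
        while let Some(c) = chars.next() {
            if c == '\r' {
                // Swallow a directly following '\n' so "\r\n" yields one newline.
                if chars.peek() == Some(&'\n') {
                    chars.next();
                }
                out.push('\n');
            } else {
                out.push(c);
            }
        }
        out
    }

    fn main() {
        assert_eq!(normalize_newlines("a\r\nb\rc\n"), "a\nb\nc\n");
    }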