diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/lib.rs | 58 | ||||
-rw-r--r-- | src/reader.rs | 41 |
2 files changed, 63 insertions, 36 deletions
@@ -29,6 +29,37 @@ macro_rules! ctostr { }; } +// this is a stack that can hold 0 to 2 Ts +#[derive(Debug, Default)] +struct Stack2<T: Copy>(Option<(T, Option<T>)>); + +impl<T: Copy> Stack2<T> { + #[inline] + fn push(&mut self, c: T) { + self.0 = match self.0 { + None => Some((c, None)), + Some((c1, None)) => Some((c1, Some(c))), + Some((_c1, Some(_c2))) => panic!("stack full!"), + } + } + + #[inline] + fn pop(&mut self) -> Option<T> { + let (new_self, rv) = match self.0 { + Some((c1, Some(c2))) => (Some((c1, None)), Some(c2)), + Some((c1, None)) => (None, Some(c1)), + None => (None, None), + }; + self.0 = new_self; + rv + } + + #[inline] + fn is_empty(&self) -> bool { + matches!(self.0, None) + } +} + /// A HTML tokenizer. See crate-level docs for basic usage. pub struct Tokenizer<R: Reader, E: Emitter = DefaultEmitter> { eof: bool, @@ -36,7 +67,7 @@ pub struct Tokenizer<R: Reader, E: Emitter = DefaultEmitter> { emitter: E, temporary_buffer: String, reader: R, - to_reconsume: Option<Option<char>>, + to_reconsume: Stack2<Option<char>>, character_reference_code: u32, return_state: Option<State>, } @@ -65,7 +96,7 @@ impl<R: Reader, E: Emitter> Tokenizer<R, E> { state: State::Data, emitter, temporary_buffer: String::new(), - to_reconsume: None, + to_reconsume: Stack2::default(), reader: input.to_reader(), character_reference_code: 0, return_state: None, @@ -98,7 +129,7 @@ impl<R: Reader, E: Emitter> Tokenizer<R, E> { #[inline] fn unread_char(&mut self, c: Option<char>) { - self.to_reconsume = Some(c); + self.to_reconsume.push(c); } #[inline] @@ -122,19 +153,30 @@ impl<R: Reader, E: Emitter> Tokenizer<R, E> { } fn read_char(&mut self) -> Option<char> { - if let Some(c) = self.to_reconsume.take() { - return c; + let (mut c, reconsumed) = match self.to_reconsume.pop() { + Some(c) => (c?, true), + None => (self.reader.read_char()?, false), + }; + + if c == '\r' { + c = '\n'; + let c2 = self.reader.read_char(); + if c2 != Some('\n') { + self.unread_char(c2); + } + } + + if !reconsumed { + self.validate_char(c); } - let c = self.reader.read_char()?; - self.validate_char(c); Some(c) } #[inline] fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> bool { debug_assert!(!s.is_empty()); - debug_assert!(self.to_reconsume.is_none()); + debug_assert!(self.to_reconsume.is_empty()); self.reader.try_read_string(s, case_sensitive) } diff --git a/src/reader.rs b/src/reader.rs index d2cae92..7246129 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -4,9 +4,8 @@ pub trait Reader { /// Return a new character from the input stream. /// - /// Newlines have to be normalized as described in [Preprocessing the input - /// stream](https://html.spec.whatwg.org/#preprocessing-the-input-stream), however error - /// emission is done within the tokenizer. + /// The input stream does **not** have to be preprocessed in any way, it can contain standalone + /// surrogates and have inconsistent newlines. fn read_char(&mut self) -> Option<char>; /// Attempt to read an entire string at once, either case-insensitively or not. @@ -46,49 +45,35 @@ impl<'a, R: 'a + Reader> Readable<'a> for R { /// from strings. pub struct StringReader<'a> { input: &'a str, + cursor: std::str::Chars<'a>, pos: usize, } impl<'a> StringReader<'a> { fn new(input: &'a str) -> Self { - StringReader { input, pos: 0 } - } - - fn peek_char(&self) -> Option<char> { - self.input.get(self.pos..)?.chars().next() + let cursor = input.chars(); + StringReader { + input, + cursor, + pos: 0, + } } } impl<'a> Reader for StringReader<'a> { fn read_char(&mut self) -> Option<char> { - let mut r1 = match self.peek_char() { - Some(x) => x, - None => { - self.pos += 1; - return None; - } - }; - - self.pos += r1.len_utf8(); - - if r1 == '\r' { - r1 = '\n'; - let r2 = self.peek_char(); - if r2 == Some('\n') { - self.pos += r2.map(char::len_utf8).unwrap_or(0); - } - } - - Some(r1) + let c = self.cursor.next()?; + self.pos += c.len_utf8(); + Some(c) } fn try_read_string(&mut self, s1: &str, case_sensitive: bool) -> bool { // we do not need to call validate_char here because `s` hopefully does not contain invalid // characters - if let Some(s2) = self.input.get(self.pos..self.pos + s1.len()) { if s1 == s2 || (!case_sensitive && s1.eq_ignore_ascii_case(s2)) { self.pos += s1.len(); + self.cursor = self.input[self.pos..].chars(); return true; } } |