diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/default_emitter.rs | 18 | ||||
-rw-r--r-- | src/emitter.rs | 4 | ||||
-rw-r--r-- | src/token.rs | 6 | ||||
-rw-r--r-- | src/tokenizer/machine/utils.rs | 13 |
4 files changed, 15 insertions, 26 deletions
diff --git a/src/default_emitter.rs b/src/default_emitter.rs index e89fa5e..5edf848 100644 --- a/src/default_emitter.rs +++ b/src/default_emitter.rs @@ -1,7 +1,6 @@ use std::collections::btree_map::Entry; use std::collections::BTreeSet; use std::collections::VecDeque; -use std::mem; use std::ops::Range; use crate::offset::NoopOffset; @@ -12,7 +11,6 @@ use crate::Error; /// The default implementation of [`Emitter`], used to produce tokens. pub struct DefaultEmitter<O = NoopOffset> { - current_characters: String, current_token: Option<Token<O>>, current_attribute: Option<(String, crate::token::AttrInternal<O>)>, seen_attributes: BTreeSet<String>, @@ -24,7 +22,6 @@ pub struct DefaultEmitter<O = NoopOffset> { impl<O> Default for DefaultEmitter<O> { fn default() -> Self { DefaultEmitter { - current_characters: String::new(), current_token: None, current_attribute: None, seen_attributes: BTreeSet::new(), @@ -56,11 +53,10 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> { } fn emit_eof(&mut self) { - self.flush_current_characters(); } - fn emit_string(&mut self, s: &str) { - self.current_characters.push_str(s); + fn emit_char(&mut self, c: char) { + self.emit_token(Token::Char(c)); } fn init_start_tag(&mut self, tag_offset: O, name_offset: O) { @@ -328,7 +324,6 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> { impl<O> DefaultEmitter<O> { fn emit_token(&mut self, token: Token<O>) { - self.flush_current_characters(); self.emitted_tokens.push_front(token); } @@ -358,15 +353,6 @@ impl<O> DefaultEmitter<O> { } } } - - fn flush_current_characters(&mut self) { - if self.current_characters.is_empty() { - return; - } - - let s = mem::take(&mut self.current_characters); - self.emit_token(Token::String(s)); - } } /// The majority of our testing of the [`DefaultEmitter`] is done against the diff --git a/src/emitter.rs b/src/emitter.rs index 2a47f40..7a567b4 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -30,8 +30,8 @@ pub trait Emitter<O> { /// The state machine has reached the end of the file. fn emit_eof(&mut self); - /// Emit a bunch of plain characters as character tokens. - fn emit_string(&mut self, c: &str); + /// Emits the given character as a character token. + fn emit_char(&mut self, c: char); /// Set the _current token_ to a start tag. fn init_start_tag(&mut self, tag_offset: O, name_offset: O); diff --git a/src/token.rs b/src/token.rs index c599cd5..cb584ff 100644 --- a/src/token.rs +++ b/src/token.rs @@ -9,12 +9,14 @@ use crate::offset::Offset; /// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer. #[derive(Debug, Eq, PartialEq)] pub enum Token<O> { + /// A literal character, a resolved character reference, + /// or part of a resolved character reference (since some + /// character references resolve to two `char`s). + Char(char), /// An HTML start tag. StartTag(StartTag<O>), /// An HTML end tag. EndTag(EndTag<O>), - /// A literal string. Character references have been resolved. - String(String), /// An HTML comment. Comment(Comment<O>), /// An HTML doctype declaration. diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs index d96e50b..ea4d697 100644 --- a/src/tokenizer/machine/utils.rs +++ b/src/tokenizer/machine/utils.rs @@ -16,7 +16,7 @@ where /// Emits the given character as a character token. #[inline] pub(super) fn emit_char(&mut self, c: char) { - self.emitter.emit_string(ctostr!(c)); + self.emitter.emit_char(c); } /// Emits every byte of the given byte slice as a character token. @@ -25,10 +25,9 @@ where /// since [`str::chars`] isn't `const`.) #[inline] pub(super) fn emit_chars(&mut self, s: &[u8]) { - self.emitter.emit_string( - // this unsafe block is only temporary and will be removed in the next commit - unsafe { std::str::from_utf8_unchecked(s) }, - ); + for c in s { + self.emit_char(*c as char); + } } #[inline] @@ -204,7 +203,9 @@ where } pub(super) fn flush_buffer_characters(&mut self) { - self.emitter.emit_string(&self.temporary_buffer); + for c in self.temporary_buffer.chars() { + self.emitter.emit_char(c); + } self.temporary_buffer.clear(); } } |