about | summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/default_emitter.rs 18
-rw-r--r-- src/emitter.rs 4
-rw-r--r-- src/token.rs 6
-rw-r--r-- src/tokenizer/machine/utils.rs 13
4 files changed, 15 insertions, 26 deletions
diff --git a/src/default_emitter.rs b/src/default_emitter.rs
index e89fa5e..5edf848 100644
--- a/src/default_emitter.rs
+++ b/src/default_emitter.rs
@@ -1,7 +1,6 @@
use std::collections::btree_map::Entry;
use std::collections::BTreeSet;
use std::collections::VecDeque;
-use std::mem;
use std::ops::Range;
use crate::offset::NoopOffset;
@@ -12,7 +11,6 @@ use crate::Error;
/// The default implementation of [`Emitter`], used to produce tokens.
pub struct DefaultEmitter<O = NoopOffset> {
- current_characters: String,
current_token: Option<Token<O>>,
current_attribute: Option<(String, crate::token::AttrInternal<O>)>,
seen_attributes: BTreeSet<String>,
@@ -24,7 +22,6 @@ pub struct DefaultEmitter<O = NoopOffset> {
impl<O> Default for DefaultEmitter<O> {
fn default() -> Self {
DefaultEmitter {
- current_characters: String::new(),
current_token: None,
current_attribute: None,
seen_attributes: BTreeSet::new(),
@@ -56,11 +53,10 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
}
fn emit_eof(&mut self) {
- self.flush_current_characters();
}
- fn emit_string(&mut self, s: &str) {
- self.current_characters.push_str(s);
+ fn emit_char(&mut self, c: char) {
+ self.emit_token(Token::Char(c));
}
fn init_start_tag(&mut self, tag_offset: O, name_offset: O) {
@@ -328,7 +324,6 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
impl<O> DefaultEmitter<O> {
fn emit_token(&mut self, token: Token<O>) {
- self.flush_current_characters();
self.emitted_tokens.push_front(token);
}
@@ -358,15 +353,6 @@ impl<O> DefaultEmitter<O> {
}
}
}
-
- fn flush_current_characters(&mut self) {
- if self.current_characters.is_empty() {
- return;
- }
-
- let s = mem::take(&mut self.current_characters);
- self.emit_token(Token::String(s));
- }
}
/// The majority of our testing of the [`DefaultEmitter`] is done against the
diff --git a/src/emitter.rs b/src/emitter.rs
index 2a47f40..7a567b4 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -30,8 +30,8 @@ pub trait Emitter<O> {
/// The state machine has reached the end of the file.
fn emit_eof(&mut self);
- /// Emit a bunch of plain characters as character tokens.
- fn emit_string(&mut self, c: &str);
+ /// Emits the given character as a character token.
+ fn emit_char(&mut self, c: char);
/// Set the _current token_ to a start tag.
fn init_start_tag(&mut self, tag_offset: O, name_offset: O);
diff --git a/src/token.rs b/src/token.rs
index c599cd5..cb584ff 100644
--- a/src/token.rs
+++ b/src/token.rs
@@ -9,12 +9,14 @@ use crate::offset::Offset;
/// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer.
#[derive(Debug, Eq, PartialEq)]
pub enum Token<O> {
+ /// A literal character, a resolved character reference,
+ /// or part of a resolved character reference (since some
+ /// character references resolve to two `char`s).
+ Char(char),
/// An HTML start tag.
StartTag(StartTag<O>),
/// An HTML end tag.
EndTag(EndTag<O>),
- /// A literal string. Character references have been resolved.
- String(String),
/// An HTML comment.
Comment(Comment<O>),
/// An HTML doctype declaration.
diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs
index d96e50b..ea4d697 100644
--- a/src/tokenizer/machine/utils.rs
+++ b/src/tokenizer/machine/utils.rs
@@ -16,7 +16,7 @@ where
/// Emits the given character as a character token.
#[inline]
pub(super) fn emit_char(&mut self, c: char) {
- self.emitter.emit_string(ctostr!(c));
+ self.emitter.emit_char(c);
}
/// Emits every byte of the given byte slice as a character token.
@@ -25,10 +25,9 @@ where
/// since [`str::chars`] isn't `const`.)
#[inline]
pub(super) fn emit_chars(&mut self, s: &[u8]) {
- self.emitter.emit_string(
- // this unsafe block is only temporary and will be removed in the next commit
- unsafe { std::str::from_utf8_unchecked(s) },
- );
+ for c in s {
+ self.emit_char(*c as char);
+ }
}
#[inline]
@@ -204,7 +203,9 @@ where
}
pub(super) fn flush_buffer_characters(&mut self) {
- self.emitter.emit_string(&self.temporary_buffer);
+ for c in self.temporary_buffer.chars() {
+ self.emitter.emit_char(c);
+ }
self.temporary_buffer.clear();
}
}