7 files changed, 30 insertions, 29 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c4acbb2..075373c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,9 @@
     (Errors now have to be queried separately with
     `DefaultEmitter::drain_errors`.)
 
+  * Replaced the `String` variant with a new `Char` variant.  
+    (The tokenizer now emits one `Char` token per character instead of coalescing adjacent characters into strings.)
+
 * `Emitter` trait
 
   * Removed `pop_token` method and `Token` associated type.
@@ -21,6 +24,8 @@
 
   * Renamed `emit_error` to `report_error`.
 
+  * Replaced `emit_string` with `emit_char`.
+
 ### 0.5.1 - 2023-09-03
 
 #### Features
diff --git a/README.md b/README.md
index 740d857..07d9e06 100644
--- a/README.md
+++ b/README.md
@@ -19,8 +19,8 @@ for token in NaiveParser::new(html).flatten() {
         Token::StartTag(tag) => {
             write!(new_html, "<{}>", tag.name).unwrap();
         }
-        Token::String(hello_world) => {
-            write!(new_html, "{}", hello_world).unwrap();
+        Token::Char(c) => {
+            write!(new_html, "{c}").unwrap();
         }
         Token::EndTag(tag) => {
             write!(new_html, "</{}>", tag.name).unwrap();
diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs
index 2e404c5..bede29a 100644
--- a/integration_tests/tests/test_html5lib.rs
+++ b/integration_tests/tests/test_html5lib.rs
@@ -137,7 +137,14 @@ fn run_test_inner<R: Reader>(
                 self_closing: tag.self_closing,
             }),
             Token::EndTag(tag) => actual_tokens.push(TestToken::EndTag { name: tag.name }),
-            Token::String(data) => actual_tokens.push(TestToken::Character(data)),
+            Token::Char(c) => {
+                // Coalesce all adjacent character tokens into a single string.
+                if let Some(TestToken::Character(s)) = actual_tokens.last_mut() {
+                    s.push(c);
+                } else {
+                    actual_tokens.push(TestToken::Character(c.into()));
+                }
+            }
             Token::Comment(comment) => actual_tokens.push(TestToken::Comment(comment.data)),
             Token::Doctype(doctype) => actual_tokens.push(TestToken::Doctype {
                 name: doctype.name,
diff --git a/src/default_emitter.rs b/src/default_emitter.rs
index e89fa5e..5edf848 100644
--- a/src/default_emitter.rs
+++ b/src/default_emitter.rs
@@ -1,7 +1,6 @@
 use std::collections::btree_map::Entry;
 use std::collections::BTreeSet;
 use std::collections::VecDeque;
-use std::mem;
 use std::ops::Range;
 
 use crate::offset::NoopOffset;
@@ -12,7 +11,6 @@ use crate::Error;
 
 /// The default implementation of [`Emitter`], used to produce tokens.
 pub struct DefaultEmitter<O = NoopOffset> {
-    current_characters: String,
     current_token: Option<Token<O>>,
     current_attribute: Option<(String, crate::token::AttrInternal<O>)>,
     seen_attributes: BTreeSet<String>,
@@ -24,7 +22,6 @@ pub struct DefaultEmitter<O = NoopOffset> {
 impl<O> Default for DefaultEmitter<O> {
     fn default() -> Self {
         DefaultEmitter {
-            current_characters: String::new(),
             current_token: None,
             current_attribute: None,
             seen_attributes: BTreeSet::new(),
@@ -56,11 +53,10 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
     }
 
     fn emit_eof(&mut self) {
-        self.flush_current_characters();
     }
 
-    fn emit_string(&mut self, s: &str) {
-        self.current_characters.push_str(s);
+    fn emit_char(&mut self, c: char) {
+        self.emit_token(Token::Char(c));
     }
 
     fn init_start_tag(&mut self, tag_offset: O, name_offset: O) {
@@ -328,7 +324,6 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
 
 impl<O> DefaultEmitter<O> {
     fn emit_token(&mut self, token: Token<O>) {
-        self.flush_current_characters();
         self.emitted_tokens.push_front(token);
     }
 
@@ -358,15 +353,6 @@ impl<O> DefaultEmitter<O> {
             }
         }
     }
-
-    fn flush_current_characters(&mut self) {
-        if self.current_characters.is_empty() {
-            return;
-        }
-
-        let s = mem::take(&mut self.current_characters);
-        self.emit_token(Token::String(s));
-    }
 }
 
 /// The majority of our testing of the [`DefaultEmitter`] is done against the
diff --git a/src/emitter.rs b/src/emitter.rs
index 2a47f40..7a567b4 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -30,8 +30,8 @@ pub trait Emitter<O> {
     /// The state machine has reached the end of the file.
     fn emit_eof(&mut self);
 
-    /// Emit a bunch of plain characters as character tokens.
-    fn emit_string(&mut self, c: &str);
+    /// Emits the given character as a character token.
+    fn emit_char(&mut self, c: char);
 
     /// Set the _current token_ to a start tag.
     fn init_start_tag(&mut self, tag_offset: O, name_offset: O);
diff --git a/src/token.rs b/src/token.rs
index c599cd5..cb584ff 100644
--- a/src/token.rs
+++ b/src/token.rs
@@ -9,12 +9,14 @@ use crate::offset::Offset;
 /// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer.
 #[derive(Debug, Eq, PartialEq)]
 pub enum Token<O> {
+    /// A literal character, a resolved character reference,
+    /// or part of a resolved character reference (since some
+    /// character references resolve to two `char`s).
+    Char(char),
     /// An HTML start tag.
     StartTag(StartTag<O>),
     /// An HTML end tag.
     EndTag(EndTag<O>),
-    /// A literal string. Character references have been resolved.
-    String(String),
     /// An HTML comment.
     Comment(Comment<O>),
     /// An HTML doctype declaration.
diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs
index d96e50b..ea4d697 100644
--- a/src/tokenizer/machine/utils.rs
+++ b/src/tokenizer/machine/utils.rs
@@ -16,7 +16,7 @@ where
     /// Emits the given character as a character token.
     #[inline]
     pub(super) fn emit_char(&mut self, c: char) {
-        self.emitter.emit_string(ctostr!(c));
+        self.emitter.emit_char(c);
     }
 
     /// Emits every byte of the given byte slice as a character token.
@@ -25,10 +25,9 @@ where
     /// since [`str::chars`] isn't `const`.)
     #[inline]
     pub(super) fn emit_chars(&mut self, s: &[u8]) {
-        self.emitter.emit_string(
-            // this unsafe block is only temporary and will be removed in the next commit
-            unsafe { std::str::from_utf8_unchecked(s) },
-        );
+        for &byte in s {
+            self.emit_char(char::from(byte));
+        }
     }
 
     #[inline]
@@ -204,7 +203,9 @@ where
     }
 
     pub(super) fn flush_buffer_characters(&mut self) {
-        self.emitter.emit_string(&self.temporary_buffer);
+        for c in self.temporary_buffer.chars() {
+            self.emitter.emit_char(c);
+        }
         self.temporary_buffer.clear();
     }
 }