summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- CHANGELOG.md 5
-rw-r--r-- README.md 4
-rw-r--r-- integration_tests/tests/test_html5lib.rs 9
-rw-r--r-- src/default_emitter.rs 18
-rw-r--r-- src/emitter.rs 4
-rw-r--r-- src/token.rs 6
-rw-r--r-- src/tokenizer/machine/utils.rs 13
7 files changed, 30 insertions, 29 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c4acbb2..075373c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,9 @@
(Errors now have to be queried separately with
`DefaultEmitter::drain_errors`.)
+ * Replaced the `String` variant with a new `Char` variant.
+ (The tokenizer now emits chars instead of strings.)
+
* `Emitter` trait
* Removed `pop_token` method and `Token` associated type.
@@ -21,6 +24,8 @@
* Renamed `emit_error` to `report_error`.
+ * Replaced `emit_string` with `emit_char`.
+
### 0.5.1 - 2023-09-03
#### Features
diff --git a/README.md b/README.md
index 740d857..07d9e06 100644
--- a/README.md
+++ b/README.md
@@ -19,8 +19,8 @@ for token in NaiveParser::new(html).flatten() {
Token::StartTag(tag) => {
write!(new_html, "<{}>", tag.name).unwrap();
}
- Token::String(hello_world) => {
- write!(new_html, "{}", hello_world).unwrap();
+ Token::Char(c) => {
+ write!(new_html, "{c}").unwrap();
}
Token::EndTag(tag) => {
write!(new_html, "</{}>", tag.name).unwrap();
diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs
index 2e404c5..bede29a 100644
--- a/integration_tests/tests/test_html5lib.rs
+++ b/integration_tests/tests/test_html5lib.rs
@@ -137,7 +137,14 @@ fn run_test_inner<R: Reader>(
self_closing: tag.self_closing,
}),
Token::EndTag(tag) => actual_tokens.push(TestToken::EndTag { name: tag.name }),
- Token::String(data) => actual_tokens.push(TestToken::Character(data)),
+ Token::Char(c) => {
+ // Coalesce all adjacent character tokens into a single string.
+ if let Some(TestToken::Character(s)) = actual_tokens.last_mut() {
+ s.push(c);
+ } else {
+ actual_tokens.push(TestToken::Character(c.into()));
+ }
+ }
Token::Comment(comment) => actual_tokens.push(TestToken::Comment(comment.data)),
Token::Doctype(doctype) => actual_tokens.push(TestToken::Doctype {
name: doctype.name,
diff --git a/src/default_emitter.rs b/src/default_emitter.rs
index e89fa5e..5edf848 100644
--- a/src/default_emitter.rs
+++ b/src/default_emitter.rs
@@ -1,7 +1,6 @@
use std::collections::btree_map::Entry;
use std::collections::BTreeSet;
use std::collections::VecDeque;
-use std::mem;
use std::ops::Range;
use crate::offset::NoopOffset;
@@ -12,7 +11,6 @@ use crate::Error;
/// The default implementation of [`Emitter`], used to produce tokens.
pub struct DefaultEmitter<O = NoopOffset> {
- current_characters: String,
current_token: Option<Token<O>>,
current_attribute: Option<(String, crate::token::AttrInternal<O>)>,
seen_attributes: BTreeSet<String>,
@@ -24,7 +22,6 @@ pub struct DefaultEmitter<O = NoopOffset> {
impl<O> Default for DefaultEmitter<O> {
fn default() -> Self {
DefaultEmitter {
- current_characters: String::new(),
current_token: None,
current_attribute: None,
seen_attributes: BTreeSet::new(),
@@ -56,11 +53,10 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
}
fn emit_eof(&mut self) {
- self.flush_current_characters();
}
- fn emit_string(&mut self, s: &str) {
- self.current_characters.push_str(s);
+ fn emit_char(&mut self, c: char) {
+ self.emit_token(Token::Char(c));
}
fn init_start_tag(&mut self, tag_offset: O, name_offset: O) {
@@ -328,7 +324,6 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
impl<O> DefaultEmitter<O> {
fn emit_token(&mut self, token: Token<O>) {
- self.flush_current_characters();
self.emitted_tokens.push_front(token);
}
@@ -358,15 +353,6 @@ impl<O> DefaultEmitter<O> {
}
}
}
-
- fn flush_current_characters(&mut self) {
- if self.current_characters.is_empty() {
- return;
- }
-
- let s = mem::take(&mut self.current_characters);
- self.emit_token(Token::String(s));
- }
}
/// The majority of our testing of the [`DefaultEmitter`] is done against the
diff --git a/src/emitter.rs b/src/emitter.rs
index 2a47f40..7a567b4 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -30,8 +30,8 @@ pub trait Emitter<O> {
/// The state machine has reached the end of the file.
fn emit_eof(&mut self);
- /// Emit a bunch of plain characters as character tokens.
- fn emit_string(&mut self, c: &str);
+ /// Emits the given character as a character token.
+ fn emit_char(&mut self, c: char);
/// Set the _current token_ to a start tag.
fn init_start_tag(&mut self, tag_offset: O, name_offset: O);
diff --git a/src/token.rs b/src/token.rs
index c599cd5..cb584ff 100644
--- a/src/token.rs
+++ b/src/token.rs
@@ -9,12 +9,14 @@ use crate::offset::Offset;
/// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer.
#[derive(Debug, Eq, PartialEq)]
pub enum Token<O> {
+ /// A literal character, a resolved character reference,
+ /// or part of a resolved character reference (since some
+ /// character references resolve to two `char`s).
+ Char(char),
/// An HTML start tag.
StartTag(StartTag<O>),
/// An HTML end tag.
EndTag(EndTag<O>),
- /// A literal string. Character references have been resolved.
- String(String),
/// An HTML comment.
Comment(Comment<O>),
/// An HTML doctype declaration.
diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs
index d96e50b..ea4d697 100644
--- a/src/tokenizer/machine/utils.rs
+++ b/src/tokenizer/machine/utils.rs
@@ -16,7 +16,7 @@ where
/// Emits the given character as a character token.
#[inline]
pub(super) fn emit_char(&mut self, c: char) {
- self.emitter.emit_string(ctostr!(c));
+ self.emitter.emit_char(c);
}
/// Emits every byte of the given byte slice as a character token.
@@ -25,10 +25,9 @@ where
/// since [`str::chars`] isn't `const`.)
#[inline]
pub(super) fn emit_chars(&mut self, s: &[u8]) {
- self.emitter.emit_string(
- // this unsafe block is only temporary and will be removed in the next commit
- unsafe { std::str::from_utf8_unchecked(s) },
- );
+ for c in s {
+ self.emit_char(*c as char);
+ }
}
#[inline]
@@ -204,7 +203,9 @@ where
}
pub(super) fn flush_buffer_characters(&mut self) {
- self.emitter.emit_string(&self.temporary_buffer);
+ for c in self.temporary_buffer.chars() {
+ self.emitter.emit_char(c);
+ }
self.temporary_buffer.clear();
}
}