diff options
-rw-r--r-- | CHANGELOG.md | 5 | ||||
-rw-r--r-- | README.md | 4 | ||||
-rw-r--r-- | integration_tests/tests/test_html5lib.rs | 9 | ||||
-rw-r--r-- | src/default_emitter.rs | 18 | ||||
-rw-r--r-- | src/emitter.rs | 4 | ||||
-rw-r--r-- | src/token.rs | 6 | ||||
-rw-r--r-- | src/tokenizer/machine/utils.rs | 13 |
7 files changed, 30 insertions, 29 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index c4acbb2..075373c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,9 @@ (Errors now have to be queried separately with `DefaultEmitter::drain_errors`.) + * Replaced the `String` variant with a new `Char` variant. + (The tokenizer now emits chars instead of strings.) + * `Emitter` trait * Removed `pop_token` method and `Token` associated type. @@ -21,6 +24,8 @@ * Renamed `emit_error` to `report_error`. + * Replaced `emit_string` with `emit_char`. + ### 0.5.1 - 2023-09-03 #### Features @@ -19,8 +19,8 @@ for token in NaiveParser::new(html).flatten() { Token::StartTag(tag) => { write!(new_html, "<{}>", tag.name).unwrap(); } - Token::String(hello_world) => { - write!(new_html, "{}", hello_world).unwrap(); + Token::Char(c) => { + write!(new_html, "{c}").unwrap(); } Token::EndTag(tag) => { write!(new_html, "</{}>", tag.name).unwrap(); diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs index 2e404c5..bede29a 100644 --- a/integration_tests/tests/test_html5lib.rs +++ b/integration_tests/tests/test_html5lib.rs @@ -137,7 +137,14 @@ fn run_test_inner<R: Reader>( self_closing: tag.self_closing, }), Token::EndTag(tag) => actual_tokens.push(TestToken::EndTag { name: tag.name }), - Token::String(data) => actual_tokens.push(TestToken::Character(data)), + Token::Char(c) => { + // Coalesce all adjacent character tokens into a single string. + if let Some(TestToken::Character(s)) = actual_tokens.last_mut() { + s.push(c); + } else { + actual_tokens.push(TestToken::Character(c.into())); + } + } Token::Comment(comment) => actual_tokens.push(TestToken::Comment(comment.data)), Token::Doctype(doctype) => actual_tokens.push(TestToken::Doctype { name: doctype.name, diff --git a/src/default_emitter.rs b/src/default_emitter.rs index e89fa5e..5edf848 100644 --- a/src/default_emitter.rs +++ b/src/default_emitter.rs @@ -1,7 +1,6 @@ use std::collections::btree_map::Entry; use std::collections::BTreeSet; use std::collections::VecDeque; -use std::mem; use std::ops::Range; use crate::offset::NoopOffset; @@ -12,7 +11,6 @@ use crate::Error; /// The default implementation of [`Emitter`], used to produce tokens. pub struct DefaultEmitter<O = NoopOffset> { - current_characters: String, current_token: Option<Token<O>>, current_attribute: Option<(String, crate::token::AttrInternal<O>)>, seen_attributes: BTreeSet<String>, @@ -24,7 +22,6 @@ pub struct DefaultEmitter<O = NoopOffset> { impl<O> Default for DefaultEmitter<O> { fn default() -> Self { DefaultEmitter { - current_characters: String::new(), current_token: None, current_attribute: None, seen_attributes: BTreeSet::new(), @@ -56,11 +53,10 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> { } fn emit_eof(&mut self) { - self.flush_current_characters(); } - fn emit_string(&mut self, s: &str) { - self.current_characters.push_str(s); + fn emit_char(&mut self, c: char) { + self.emit_token(Token::Char(c)); } fn init_start_tag(&mut self, tag_offset: O, name_offset: O) { @@ -328,7 +324,6 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> { impl<O> DefaultEmitter<O> { fn emit_token(&mut self, token: Token<O>) { - self.flush_current_characters(); self.emitted_tokens.push_front(token); } @@ -358,15 +353,6 @@ impl<O> DefaultEmitter<O> { } } } - - fn flush_current_characters(&mut self) { - if self.current_characters.is_empty() { - return; - } - - let s = mem::take(&mut self.current_characters); - self.emit_token(Token::String(s)); - } } /// The majority of our testing of the [`DefaultEmitter`] is done against the diff --git a/src/emitter.rs b/src/emitter.rs index 2a47f40..7a567b4 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -30,8 +30,8 @@ pub trait Emitter<O> { /// The state machine has reached the end of the file. fn emit_eof(&mut self); - /// Emit a bunch of plain characters as character tokens. - fn emit_string(&mut self, c: &str); + /// Emits the given character as a character token. + fn emit_char(&mut self, c: char); /// Set the _current token_ to a start tag. fn init_start_tag(&mut self, tag_offset: O, name_offset: O); diff --git a/src/token.rs b/src/token.rs index c599cd5..cb584ff 100644 --- a/src/token.rs +++ b/src/token.rs @@ -9,12 +9,14 @@ use crate::offset::Offset; /// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer. #[derive(Debug, Eq, PartialEq)] pub enum Token<O> { + /// A literal character, a resolved character reference, + /// or part of a resolved character reference (since some + /// character references resolve to two `char`s). + Char(char), /// An HTML start tag. StartTag(StartTag<O>), /// An HTML end tag. EndTag(EndTag<O>), - /// A literal string. Character references have been resolved. - String(String), /// An HTML comment. Comment(Comment<O>), /// An HTML doctype declaration. diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs index d96e50b..ea4d697 100644 --- a/src/tokenizer/machine/utils.rs +++ b/src/tokenizer/machine/utils.rs @@ -16,7 +16,7 @@ where /// Emits the given character as a character token. #[inline] pub(super) fn emit_char(&mut self, c: char) { - self.emitter.emit_string(ctostr!(c)); + self.emitter.emit_char(c); } /// Emits every byte of the given byte slice as a character token. @@ -25,10 +25,9 @@ where /// since [`str::chars`] isn't `const`.) #[inline] pub(super) fn emit_chars(&mut self, s: &[u8]) { - self.emitter.emit_string( - // this unsafe block is only temporary and will be removed in the next commit - unsafe { std::str::from_utf8_unchecked(s) }, - ); + for c in s { + self.emit_char(*c as char); + } } #[inline] @@ -204,7 +203,9 @@ where } pub(super) fn flush_buffer_characters(&mut self) { - self.emitter.emit_string(&self.temporary_buffer); + for c in self.temporary_buffer.chars() { + self.emitter.emit_char(c); + } self.temporary_buffer.clear(); } } |