diff options
| -rw-r--r-- | CHANGELOG.md | 5 | ||||
| -rw-r--r-- | README.md | 4 | ||||
| -rw-r--r-- | integration_tests/tests/test_html5lib.rs | 9 | ||||
| -rw-r--r-- | src/default_emitter.rs | 18 | ||||
| -rw-r--r-- | src/emitter.rs | 4 | ||||
| -rw-r--r-- | src/token.rs | 6 | ||||
| -rw-r--r-- | src/tokenizer/machine/utils.rs | 13 | 
7 files changed, 30 insertions, 29 deletions
| diff --git a/CHANGELOG.md b/CHANGELOG.md index c4acbb2..075373c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,9 @@      (Errors now have to be queried separately with      `DefaultEmitter::drain_errors`.) +  * Replaced the `String` variant with a new `Char` variant.   +    (The tokenizer now emits chars instead of strings.) +  * `Emitter` trait    * Removed `pop_token` method and `Token` associated type. @@ -21,6 +24,8 @@    * Renamed `emit_error` to `report_error`. +  * Replaced `emit_string` with `emit_char`. +  ### 0.5.1 - 2023-09-03  #### Features @@ -19,8 +19,8 @@ for token in NaiveParser::new(html).flatten() {          Token::StartTag(tag) => {              write!(new_html, "<{}>", tag.name).unwrap();          } -        Token::String(hello_world) => { -            write!(new_html, "{}", hello_world).unwrap(); +        Token::Char(c) => { +            write!(new_html, "{c}").unwrap();          }          Token::EndTag(tag) => {              write!(new_html, "</{}>", tag.name).unwrap(); diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs index 2e404c5..bede29a 100644 --- a/integration_tests/tests/test_html5lib.rs +++ b/integration_tests/tests/test_html5lib.rs @@ -137,7 +137,14 @@ fn run_test_inner<R: Reader>(                  self_closing: tag.self_closing,              }),              Token::EndTag(tag) => actual_tokens.push(TestToken::EndTag { name: tag.name }), -            Token::String(data) => actual_tokens.push(TestToken::Character(data)), +            Token::Char(c) => { +                // Coalesce all adjacent character tokens into a single string. +                if let Some(TestToken::Character(s)) = actual_tokens.last_mut() { +                    s.push(c); +                } else { +                    actual_tokens.push(TestToken::Character(c.into())); +                } +            }              Token::Comment(comment) => actual_tokens.push(TestToken::Comment(comment.data)),              Token::Doctype(doctype) => actual_tokens.push(TestToken::Doctype {                  name: doctype.name, diff --git a/src/default_emitter.rs b/src/default_emitter.rs index e89fa5e..5edf848 100644 --- a/src/default_emitter.rs +++ b/src/default_emitter.rs @@ -1,7 +1,6 @@  use std::collections::btree_map::Entry;  use std::collections::BTreeSet;  use std::collections::VecDeque; -use std::mem;  use std::ops::Range;  use crate::offset::NoopOffset; @@ -12,7 +11,6 @@ use crate::Error;  /// The default implementation of [`Emitter`], used to produce tokens.  pub struct DefaultEmitter<O = NoopOffset> { -    current_characters: String,      current_token: Option<Token<O>>,      current_attribute: Option<(String, crate::token::AttrInternal<O>)>,      seen_attributes: BTreeSet<String>, @@ -24,7 +22,6 @@ pub struct DefaultEmitter<O = NoopOffset> {  impl<O> Default for DefaultEmitter<O> {      fn default() -> Self {          DefaultEmitter { -            current_characters: String::new(),              current_token: None,              current_attribute: None,              seen_attributes: BTreeSet::new(), @@ -56,11 +53,10 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {      }      fn emit_eof(&mut self) { -        self.flush_current_characters();      } -    fn emit_string(&mut self, s: &str) { -        self.current_characters.push_str(s); +    fn emit_char(&mut self, c: char) { +        self.emit_token(Token::Char(c));      }      fn init_start_tag(&mut self, tag_offset: O, name_offset: O) { @@ -328,7 +324,6 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {  impl<O> DefaultEmitter<O> {      fn emit_token(&mut self, token: Token<O>) { -        self.flush_current_characters();          self.emitted_tokens.push_front(token);      } @@ -358,15 +353,6 @@ impl<O> DefaultEmitter<O> {              }          }      } - -    fn flush_current_characters(&mut self) { -        if self.current_characters.is_empty() { -            return; -        } - -        let s = mem::take(&mut self.current_characters); -        self.emit_token(Token::String(s)); -    }  }  /// The majority of our testing of the [`DefaultEmitter`] is done against the diff --git a/src/emitter.rs b/src/emitter.rs index 2a47f40..7a567b4 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -30,8 +30,8 @@ pub trait Emitter<O> {      /// The state machine has reached the end of the file.      fn emit_eof(&mut self); -    /// Emit a bunch of plain characters as character tokens. -    fn emit_string(&mut self, c: &str); +    /// Emits the given character as a character token. +    fn emit_char(&mut self, c: char);      /// Set the _current token_ to a start tag.      fn init_start_tag(&mut self, tag_offset: O, name_offset: O); diff --git a/src/token.rs b/src/token.rs index c599cd5..cb584ff 100644 --- a/src/token.rs +++ b/src/token.rs @@ -9,12 +9,14 @@ use crate::offset::Offset;  /// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer.  #[derive(Debug, Eq, PartialEq)]  pub enum Token<O> { +    /// A literal character, a resolved character reference, +    /// or part of a resolved character reference (since some +    /// character references resolve to two `char`s). +    Char(char),      /// An HTML start tag.      StartTag(StartTag<O>),      /// An HTML end tag.      EndTag(EndTag<O>), -    /// A literal string. Character references have been resolved. -    String(String),      /// An HTML comment.      Comment(Comment<O>),      /// An HTML doctype declaration. diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs index d96e50b..ea4d697 100644 --- a/src/tokenizer/machine/utils.rs +++ b/src/tokenizer/machine/utils.rs @@ -16,7 +16,7 @@ where      /// Emits the given character as a character token.      #[inline]      pub(super) fn emit_char(&mut self, c: char) { -        self.emitter.emit_string(ctostr!(c)); +        self.emitter.emit_char(c);      }      /// Emits every byte of the given byte slice as a character token. @@ -25,10 +25,9 @@ where      /// since [`str::chars`] isn't `const`.)      #[inline]      pub(super) fn emit_chars(&mut self, s: &[u8]) { -        self.emitter.emit_string( -            // this unsafe block is only temporary and will be removed in the next commit -            unsafe { std::str::from_utf8_unchecked(s) }, -        ); +        for c in s { +            self.emit_char(*c as char); +        }      }      #[inline] @@ -204,7 +203,9 @@ where      }      pub(super) fn flush_buffer_characters(&mut self) { -        self.emitter.emit_string(&self.temporary_buffer); +        for c in self.temporary_buffer.chars() { +            self.emitter.emit_char(c); +        }          self.temporary_buffer.clear();      }  } | 
