refactor!: remove current_is_appropriate_end_tag_token from Emitter

author: Martin Fischer <martin@push-f.com> 2023-08-13 22:55:20 +0200
committer: Martin Fischer <martin@push-f.com> 2023-08-19 11:40:49 +0200
commit: 821f773bd263829e361c5d8063fc3e59695bbd6a (patch)
tree: 8972175986a5b8bd85eec296ac3cf492139aae89
parent: 0a02268b163db620c9e3d2b4f22da6d4bb67adf2 (diff)
2 files changed, 27 insertions, 40 deletions
diff --git a/src/emitter.rs b/src/emitter.rs
index ac0f9d2..769d233 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -15,7 +15,6 @@ use crate::Error;
 ///
 /// An emitter is assumed to have these internal states:
 ///
-/// * _last start tag_: The most recently emitted start tag's name
 /// * _current token_: Can be a tag, doctype or comment token. There's only one current token.
 /// * _current attribute_: The currently processed HTML attribute, consisting of two strings for name and value.
 ///
@@ -27,9 +26,6 @@ use crate::Error;
 ///   checks that would emit errors.
 ///
 /// * If you don't care about attributes at all, you can make all related methods a noop.
-///
-/// The state machine needs to have a functional implementation of
-/// `current_is_appropriate_end_tag_token` to do correct transitions, however.
 pub trait Emitter<R> {
     /// The token type emitted by this emitter. This controls what type of values the [`Tokenizer`](crate::Tokenizer)
     /// yields when used as an iterator.
@@ -63,8 +59,6 @@ pub trait Emitter<R> {
     /// Also get the current attribute and append it to the to-be-emitted tag. See docstring for
     /// [`Emitter::init_attribute_name`] for how duplicates should be handled.
     ///
-    /// If a start tag is emitted, update the _last start tag_.
-    ///
     /// If an end tag is emitted with attributes, an [`Error::EndTagWithAttributes`]
     /// error should be emitted.
     ///
@@ -162,23 +156,12 @@ pub trait Emitter<R> {
     ///
     /// If the current token is not a doctype, this method may panic.
     fn push_doctype_system_identifier(&mut self, s: &str);
-
-    /// Return true if all of these hold. Return false otherwise.
-    ///
-    /// * the _current token_ is an end tag
-    /// * the _last start tag_ exists
-    /// * the current end tag token's name equals to the last start tag's name.
-    ///
-    /// See also [WHATWG's definition of "appropriate end tag
-    /// token"](https://html.spec.whatwg.org/#appropriate-end-tag-token).
-    fn current_is_appropriate_end_tag_token(&mut self) -> bool;
 }
 
 /// The default implementation of [`Emitter`], used to produce tokens.
 pub struct DefaultEmitter<R, S> {
     current_characters: String,
     current_token: Option<Token<S>>,
-    last_start_tag: String,
     current_attribute: Option<(String, Attribute<S>)>,
     seen_attributes: BTreeSet<String>,
     emitted_tokens: VecDeque<Token<S>>,
@@ -191,7 +174,6 @@ impl<R, S> Default for DefaultEmitter<R, S> {
         DefaultEmitter {
             current_characters: String::new(),
             current_token: None,
-            last_start_tag: String::new(),
             current_attribute: None,
             seen_attributes: BTreeSet::new(),
             emitted_tokens: VecDeque::new(),
@@ -245,12 +227,6 @@ impl<R, S: Span<R>> DefaultEmitter<R, S> {
         // that exact
         self.emitted_tokens.push_front(Token::Error { error, span });
     }
-
-    pub(crate) fn set_last_start_tag(&mut self, last_start_tag: Option<&str>) {
-        self.last_start_tag.clear();
-        self.last_start_tag
-            .push_str(last_start_tag.unwrap_or_default());
-    }
 }
 
 impl<R, S: Span<R>> Emitter<R> for DefaultEmitter<R, S> {
@@ -293,7 +269,7 @@ impl<R, S: Span<R>> Emitter<R> for DefaultEmitter<R, S> {
     }
     fn emit_current_tag(&mut self) {
         self.flush_current_attribute();
-        let mut token = self.current_token.take().unwrap();
+        let token = self.current_token.take().unwrap();
         match token {
             Token::EndTag(_) => {
                 if !self.seen_attributes.is_empty() {
@@ -302,9 +278,7 @@ impl<R, S: Span<R>> Emitter<R> for DefaultEmitter<R, S> {
                 }
                 self.seen_attributes.clear();
             }
-            Token::StartTag(ref mut _tag) => {
-                self.set_last_start_tag(Some(&_tag.name));
-            }
+            Token::StartTag(_) => {}
             _ => debug_assert!(false),
         }
         self.emit_token(token);
@@ -458,15 +432,6 @@ impl<R, S: Span<R>> Emitter<R> for DefaultEmitter<R, S> {
             debug_assert!(false);
         }
     }
-
-    fn current_is_appropriate_end_tag_token(&mut self) -> bool {
-        match self.current_token {
-            Some(Token::EndTag(ref tag)) => {
-                !self.last_start_tag.is_empty() && self.last_start_tag == tag.name
-            }
-            _ => false,
-        }
-    }
 }
 
 /// An HTML start tag, such as `<p>` or `<a>`.
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 78d4fc4..7768ee4 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -41,6 +41,9 @@ pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, ()>> {
     to_reconsume: Stack2<Option<char>>,
     pub(crate) character_reference_code: u32,
     pub(crate) return_state: Option<InternalState>,
+    current_tag_name: String,
+    last_start_tag_name: String,
+    is_start_tag: bool,
 }
 
 impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
@@ -57,6 +60,9 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
             temporary_buffer: String::new(),
             character_reference_code: 0,
             eof: false,
+            current_tag_name: String::new(),
+            last_start_tag_name: String::new(),
+            is_start_tag: false,
         }
     }
 }
@@ -115,29 +121,44 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
         self.emitter.emit_error(error, &self.reader);
     }
 
+    /// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise.
+    ///
+    /// * the _last start tag_ exists
+    /// * the current end tag token's name equals the last start tag's name.
+    ///
+    /// See also [WHATWG's definition of "appropriate end tag
+    /// token"](https://html.spec.whatwg.org/#appropriate-end-tag-token).
     #[inline]
     pub(crate) fn current_end_tag_is_appropriate(&mut self) -> bool {
-        self.emitter.current_is_appropriate_end_tag_token()
+        self.current_tag_name == self.last_start_tag_name
     }
 
     #[inline]
     pub(crate) fn init_start_tag(&mut self) {
         self.emitter.init_start_tag(&self.reader);
+        self.current_tag_name.clear();
+        self.is_start_tag = true;
     }
 
     #[inline]
     pub(crate) fn init_end_tag(&mut self) {
         self.emitter.init_end_tag(&self.reader);
+        self.current_tag_name.clear();
+        self.is_start_tag = false;
     }
 
     #[inline]
     pub(crate) fn push_tag_name(&mut self, s: &str) {
         self.emitter.push_tag_name(s);
+        self.current_tag_name.push_str(s);
     }
 
     #[inline]
     pub(crate) fn emit_current_tag(&mut self) {
         self.emitter.emit_current_tag();
+        if self.is_start_tag {
+            std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name);
+        }
     }
 
     #[inline]
@@ -271,12 +292,13 @@ impl<R: Reader, E: Emitter<R>> Iterator for Tokenizer<R, E> {
     }
 }
 
-impl<S: crate::spans::Span<R>, R: Reader> Tokenizer<R, DefaultEmitter<R, S>> {
+impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
     /// Test-internal function to override internal state.
     ///
     /// Only available with the `integration-tests` feature which is not public API.
     #[cfg(feature = "integration-tests")]
     pub fn set_last_start_tag(&mut self, last_start_tag: &str) {
-        self.emitter.set_last_start_tag(Some(last_start_tag));
+        self.last_start_tag_name.clear();
+        self.last_start_tag_name.push_str(last_start_tag);
     }
 }
author	Martin Fischer <martin@push-f.com>	2023-08-13 22:55:20 +0200
committer	Martin Fischer <martin@push-f.com>	2023-08-19 11:40:49 +0200
commit	821f773bd263829e361c5d8063fc3e59695bbd6a (patch)
tree	8972175986a5b8bd85eec296ac3cf492139aae89
parent	0a02268b163db620c9e3d2b4f22da6d4bb67adf2 (diff)