diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/emitter.rs | 39 |
-rw-r--r-- | src/tokenizer.rs | 28 |
2 files changed, 27 insertions, 40 deletions
diff --git a/src/emitter.rs b/src/emitter.rs index ac0f9d2..769d233 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -15,7 +15,6 @@ use crate::Error; /// /// An emitter is assumed to have these internal states: /// -/// * _last start tag_: The most recently emitted start tag's name /// * _current token_: Can be a tag, doctype or comment token. There's only one current token. /// * _current attribute_: The currently processed HTML attribute, consisting of two strings for name and value. /// @@ -27,9 +26,6 @@ use crate::Error; /// checks that would emit errors. /// /// * If you don't care about attributes at all, you can make all related methods a noop. -/// -/// The state machine needs to have a functional implementation of -/// `current_is_appropriate_end_tag_token` to do correct transitions, however. pub trait Emitter<R> { /// The token type emitted by this emitter. This controls what type of values the [`Tokenizer`](crate::Tokenizer) /// yields when used as an iterator. @@ -63,8 +59,6 @@ pub trait Emitter<R> { /// Also get the current attribute and append it to the to-be-emitted tag. See docstring for /// [`Emitter::init_attribute_name`] for how duplicates should be handled. /// - /// If a start tag is emitted, update the _last start tag_. - /// /// If an end tag is emitted with attributes, an [`Error::EndTagWithAttributes`] /// error should be emitted. /// @@ -162,23 +156,12 @@ pub trait Emitter<R> { /// /// If the current token is not a doctype, this method may panic. fn push_doctype_system_identifier(&mut self, s: &str); - - /// Return true if all of these hold. Return false otherwise. - /// - /// * the _current token_ is an end tag - /// * the _last start tag_ exists - /// * the current end tag token's name equals to the last start tag's name. - /// - /// See also [WHATWG's definition of "appropriate end tag - /// token"](https://html.spec.whatwg.org/#appropriate-end-tag-token). 
- fn current_is_appropriate_end_tag_token(&mut self) -> bool; } /// The default implementation of [`Emitter`], used to produce tokens. pub struct DefaultEmitter<R, S> { current_characters: String, current_token: Option<Token<S>>, - last_start_tag: String, current_attribute: Option<(String, Attribute<S>)>, seen_attributes: BTreeSet<String>, emitted_tokens: VecDeque<Token<S>>, @@ -191,7 +174,6 @@ impl<R, S> Default for DefaultEmitter<R, S> { DefaultEmitter { current_characters: String::new(), current_token: None, - last_start_tag: String::new(), current_attribute: None, seen_attributes: BTreeSet::new(), emitted_tokens: VecDeque::new(), @@ -245,12 +227,6 @@ impl<R, S: Span<R>> DefaultEmitter<R, S> { // that exact self.emitted_tokens.push_front(Token::Error { error, span }); } - - pub(crate) fn set_last_start_tag(&mut self, last_start_tag: Option<&str>) { - self.last_start_tag.clear(); - self.last_start_tag - .push_str(last_start_tag.unwrap_or_default()); - } } impl<R, S: Span<R>> Emitter<R> for DefaultEmitter<R, S> { @@ -293,7 +269,7 @@ impl<R, S: Span<R>> Emitter<R> for DefaultEmitter<R, S> { } fn emit_current_tag(&mut self) { self.flush_current_attribute(); - let mut token = self.current_token.take().unwrap(); + let token = self.current_token.take().unwrap(); match token { Token::EndTag(_) => { if !self.seen_attributes.is_empty() { @@ -302,9 +278,7 @@ impl<R, S: Span<R>> Emitter<R> for DefaultEmitter<R, S> { } self.seen_attributes.clear(); } - Token::StartTag(ref mut _tag) => { - self.set_last_start_tag(Some(&_tag.name)); - } + Token::StartTag(_) => {} _ => debug_assert!(false), } self.emit_token(token); @@ -458,15 +432,6 @@ impl<R, S: Span<R>> Emitter<R> for DefaultEmitter<R, S> { debug_assert!(false); } } - - fn current_is_appropriate_end_tag_token(&mut self) -> bool { - match self.current_token { - Some(Token::EndTag(ref tag)) => { - !self.last_start_tag.is_empty() && self.last_start_tag == tag.name - } - _ => false, - } - } } /// An HTML start tag, such as `<p>` 
or `<a>`. diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 78d4fc4..7768ee4 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -41,6 +41,9 @@ pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, ()>> { to_reconsume: Stack2<Option<char>>, pub(crate) character_reference_code: u32, pub(crate) return_state: Option<InternalState>, + current_tag_name: String, + last_start_tag_name: String, + is_start_tag: bool, } impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { @@ -57,6 +60,9 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { temporary_buffer: String::new(), character_reference_code: 0, eof: false, + current_tag_name: String::new(), + last_start_tag_name: String::new(), + is_start_tag: false, } } } @@ -115,29 +121,44 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { self.emitter.emit_error(error, &self.reader); } + /// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise. + /// + /// * the _last start tag_ exists + /// * the current end tag token's name equals to the last start tag's name. + /// + /// See also [WHATWG's definition of "appropriate end tag + /// token"](https://html.spec.whatwg.org/#appropriate-end-tag-token). 
#[inline] pub(crate) fn current_end_tag_is_appropriate(&mut self) -> bool { - self.emitter.current_is_appropriate_end_tag_token() + self.current_tag_name == self.last_start_tag_name } #[inline] pub(crate) fn init_start_tag(&mut self) { self.emitter.init_start_tag(&self.reader); + self.current_tag_name.clear(); + self.is_start_tag = true; } #[inline] pub(crate) fn init_end_tag(&mut self) { self.emitter.init_end_tag(&self.reader); + self.current_tag_name.clear(); + self.is_start_tag = false; } #[inline] pub(crate) fn push_tag_name(&mut self, s: &str) { self.emitter.push_tag_name(s); + self.current_tag_name.push_str(s); } #[inline] pub(crate) fn emit_current_tag(&mut self) { self.emitter.emit_current_tag(); + if self.is_start_tag { + std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name); + } } #[inline] @@ -271,12 +292,13 @@ impl<R: Reader, E: Emitter<R>> Iterator for Tokenizer<R, E> { } } -impl<S: crate::spans::Span<R>, R: Reader> Tokenizer<R, DefaultEmitter<R, S>> { +impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { /// Test-internal function to override internal state. /// /// Only available with the `integration-tests` feature which is not public API. #[cfg(feature = "integration-tests")] pub fn set_last_start_tag(&mut self, last_start_tag: &str) { - self.emitter.set_last_start_tag(Some(last_start_tag)); + self.last_start_tag_name.clear(); + self.last_start_tag_name.push_str(last_start_tag); } } |