summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- src/emitter.rs 39
-rw-r--r-- src/tokenizer.rs 28
2 files changed, 27 insertions, 40 deletions
diff --git a/src/emitter.rs b/src/emitter.rs
index ac0f9d2..769d233 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -15,7 +15,6 @@ use crate::Error;
///
/// An emitter is assumed to have these internal states:
///
-/// * _last start tag_: The most recently emitted start tag's name
/// * _current token_: Can be a tag, doctype or comment token. There's only one current token.
/// * _current attribute_: The currently processed HTML attribute, consisting of two strings for name and value.
///
@@ -27,9 +26,6 @@ use crate::Error;
/// checks that would emit errors.
///
/// * If you don't care about attributes at all, you can make all related methods a noop.
-///
-/// The state machine needs to have a functional implementation of
-/// `current_is_appropriate_end_tag_token` to do correct transitions, however.
pub trait Emitter<R> {
/// The token type emitted by this emitter. This controls what type of values the [`Tokenizer`](crate::Tokenizer)
/// yields when used as an iterator.
@@ -63,8 +59,6 @@ pub trait Emitter<R> {
/// Also get the current attribute and append it to the to-be-emitted tag. See docstring for
/// [`Emitter::init_attribute_name`] for how duplicates should be handled.
///
- /// If a start tag is emitted, update the _last start tag_.
- ///
/// If an end tag is emitted with attributes, an [`Error::EndTagWithAttributes`]
/// error should be emitted.
///
@@ -162,23 +156,12 @@ pub trait Emitter<R> {
///
/// If the current token is not a doctype, this method may panic.
fn push_doctype_system_identifier(&mut self, s: &str);
-
- /// Return true if all of these hold. Return false otherwise.
- ///
- /// * the _current token_ is an end tag
- /// * the _last start tag_ exists
- /// * the current end tag token's name equals to the last start tag's name.
- ///
- /// See also [WHATWG's definition of "appropriate end tag
- /// token"](https://html.spec.whatwg.org/#appropriate-end-tag-token).
- fn current_is_appropriate_end_tag_token(&mut self) -> bool;
}
/// The default implementation of [`Emitter`], used to produce tokens.
pub struct DefaultEmitter<R, S> {
current_characters: String,
current_token: Option<Token<S>>,
- last_start_tag: String,
current_attribute: Option<(String, Attribute<S>)>,
seen_attributes: BTreeSet<String>,
emitted_tokens: VecDeque<Token<S>>,
@@ -191,7 +174,6 @@ impl<R, S> Default for DefaultEmitter<R, S> {
DefaultEmitter {
current_characters: String::new(),
current_token: None,
- last_start_tag: String::new(),
current_attribute: None,
seen_attributes: BTreeSet::new(),
emitted_tokens: VecDeque::new(),
@@ -245,12 +227,6 @@ impl<R, S: Span<R>> DefaultEmitter<R, S> {
// that exact
self.emitted_tokens.push_front(Token::Error { error, span });
}
-
- pub(crate) fn set_last_start_tag(&mut self, last_start_tag: Option<&str>) {
- self.last_start_tag.clear();
- self.last_start_tag
- .push_str(last_start_tag.unwrap_or_default());
- }
}
impl<R, S: Span<R>> Emitter<R> for DefaultEmitter<R, S> {
@@ -293,7 +269,7 @@ impl<R, S: Span<R>> Emitter<R> for DefaultEmitter<R, S> {
}
fn emit_current_tag(&mut self) {
self.flush_current_attribute();
- let mut token = self.current_token.take().unwrap();
+ let token = self.current_token.take().unwrap();
match token {
Token::EndTag(_) => {
if !self.seen_attributes.is_empty() {
@@ -302,9 +278,7 @@ impl<R, S: Span<R>> Emitter<R> for DefaultEmitter<R, S> {
}
self.seen_attributes.clear();
}
- Token::StartTag(ref mut _tag) => {
- self.set_last_start_tag(Some(&_tag.name));
- }
+ Token::StartTag(_) => {}
_ => debug_assert!(false),
}
self.emit_token(token);
@@ -458,15 +432,6 @@ impl<R, S: Span<R>> Emitter<R> for DefaultEmitter<R, S> {
debug_assert!(false);
}
}
-
- fn current_is_appropriate_end_tag_token(&mut self) -> bool {
- match self.current_token {
- Some(Token::EndTag(ref tag)) => {
- !self.last_start_tag.is_empty() && self.last_start_tag == tag.name
- }
- _ => false,
- }
- }
}
/// An HTML start tag, such as `<p>` or `<a>`.
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 78d4fc4..7768ee4 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -41,6 +41,9 @@ pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, ()>> {
to_reconsume: Stack2<Option<char>>,
pub(crate) character_reference_code: u32,
pub(crate) return_state: Option<InternalState>,
+ current_tag_name: String,
+ last_start_tag_name: String,
+ is_start_tag: bool,
}
impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
@@ -57,6 +60,9 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
temporary_buffer: String::new(),
character_reference_code: 0,
eof: false,
+ current_tag_name: String::new(),
+ last_start_tag_name: String::new(),
+ is_start_tag: false,
}
}
}
@@ -115,29 +121,44 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
self.emitter.emit_error(error, &self.reader);
}
+ /// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise.
+ ///
+ /// * the _last start tag_ exists
+ /// * the current end tag token's name is equal to the last start tag's name.
+ ///
+ /// See also [WHATWG's definition of "appropriate end tag
+ /// token"](https://html.spec.whatwg.org/#appropriate-end-tag-token).
#[inline]
pub(crate) fn current_end_tag_is_appropriate(&mut self) -> bool {
- self.emitter.current_is_appropriate_end_tag_token()
+ self.current_tag_name == self.last_start_tag_name
}
#[inline]
pub(crate) fn init_start_tag(&mut self) {
self.emitter.init_start_tag(&self.reader);
+ self.current_tag_name.clear();
+ self.is_start_tag = true;
}
#[inline]
pub(crate) fn init_end_tag(&mut self) {
self.emitter.init_end_tag(&self.reader);
+ self.current_tag_name.clear();
+ self.is_start_tag = false;
}
#[inline]
pub(crate) fn push_tag_name(&mut self, s: &str) {
self.emitter.push_tag_name(s);
+ self.current_tag_name.push_str(s);
}
#[inline]
pub(crate) fn emit_current_tag(&mut self) {
self.emitter.emit_current_tag();
+ if self.is_start_tag {
+ std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name);
+ }
}
#[inline]
@@ -271,12 +292,13 @@ impl<R: Reader, E: Emitter<R>> Iterator for Tokenizer<R, E> {
}
}
-impl<S: crate::spans::Span<R>, R: Reader> Tokenizer<R, DefaultEmitter<R, S>> {
+impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
/// Test-internal function to override internal state.
///
/// Only available with the `integration-tests` feature which is not public API.
#[cfg(feature = "integration-tests")]
pub fn set_last_start_tag(&mut self, last_start_tag: &str) {
- self.emitter.set_last_start_tag(Some(last_start_tag));
+ self.last_start_tag_name.clear();
+ self.last_start_tag_name.push_str(last_start_tag);
}
}