summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Martin Fischer <martin@push-f.com> 2023-09-01 12:53:29 +0200
committer Martin Fischer <martin@push-f.com> 2023-09-03 23:00:05 +0200
commit f239037c1b960ba16c6c8b2184ac017c53c631bf (patch)
tree 1b40c7151f5f9270b26ba2a15088f90dca175a43
parent f588704c90f33fe27945d742762d016dea3e113c (diff)
fix!: make start/end tag name spans encoding-independent
-rw-r--r-- CHANGELOG.md 5
-rw-r--r-- src/emitter.rs 53
-rw-r--r-- src/machine.rs 3
-rw-r--r-- src/tokenizer.rs 6
-rw-r--r-- tests/test_spans.rs 8
5 files changed, 49 insertions, 26 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5025516..90e954b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,11 +22,16 @@
* `emit_error` now takes a span instead of an offset.
+ * Added a `name_offset` parameter to `init_start_tag` and `init_end_tag`.
+
* Several provided offsets have been changed to be more sensible.
Affected are: `set_self_closing`, `init_start_tag`, `init_end_tag`, `emit_current_tag`
* token types
+ * `StartTag`/`EndTag`: Added `name_span` fields
+ (and removed the same-named methods).
+
* `AttributeOwned`: The `value_offset` field has been replaced with `value_span`.
* Added required `len_of_char_in_current_encoding` method to `Reader` trait.
diff --git a/src/emitter.rs b/src/emitter.rs
index ff6e863..aa84215 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -49,10 +49,10 @@ pub trait Emitter<O> {
fn emit_string(&mut self, c: &str);
/// Set the _current token_ to a start tag.
- fn init_start_tag(&mut self, offset: O);
+ fn init_start_tag(&mut self, tag_offset: O, name_offset: O);
/// Set the _current token_ to an end tag.
- fn init_end_tag(&mut self, offset: O);
+ fn init_end_tag(&mut self, tag_offset: O, name_offset: O);
/// Set the _current token_ to a comment.
fn init_comment(&mut self, data_offset: O);
@@ -78,6 +78,11 @@ pub trait Emitter<O> {
/// If the current token is not a doctype, this method may panic.
fn emit_current_doctype(&mut self, offset: O);
+ /// Called after the last [`push_tag_name`] call for a tag name.
+ ///
+ /// [`push_tag_name`]: Self::push_tag_name
+ fn terminate_tag_name(&mut self, offset: O) {}
+
/// Called after the last [`push_attribute_value`] call for an attribute value.
///
/// [`push_attribute_value`]: Self::push_attribute_value
@@ -273,18 +278,20 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
self.current_characters.push_str(s);
}
- fn init_start_tag(&mut self, offset: O) {
+ fn init_start_tag(&mut self, tag_offset: O, name_offset: O) {
self.current_token = Some(Token::StartTag(StartTag {
- span: offset..O::default(),
+ span: tag_offset..O::default(),
self_closing: false,
name: String::new(),
attributes: Default::default(),
+ name_span: name_offset..O::default(),
}));
}
- fn init_end_tag(&mut self, offset: O) {
+ fn init_end_tag(&mut self, tag_offset: O, name_offset: O) {
self.current_token = Some(Token::EndTag(EndTag {
- span: offset..O::default(),
+ span: tag_offset..O::default(),
name: String::new(),
+ name_span: name_offset..O::default(),
}));
self.seen_attributes.clear();
}
@@ -367,6 +374,22 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
}
}
+ fn terminate_tag_name(&mut self, offset: O) {
+ match self.current_token {
+ Some(Token::StartTag(StartTag {
+ ref mut name_span, ..
+ })) => {
+ name_span.end = offset;
+ }
+ Some(Token::EndTag(EndTag {
+ ref mut name_span, ..
+ })) => {
+ name_span.end = offset;
+ }
+ _ => debug_assert!(false),
+ }
+ }
+
fn push_comment(&mut self, s: &str) {
match self.current_token {
Some(Token::Comment(Comment { ref mut data, .. })) => data.push_str(s),
@@ -483,14 +506,9 @@ pub struct StartTag<O> {
/// The source code span of the tag.
pub span: Range<O>,
-}
-impl<O: Offset> StartTag<O> {
- /// Calculates the span for the tag name and returns it.
- pub fn name_span(&self) -> Range<O> {
- let start = self.span.start + b"<".len();
- start..start + self.name.len()
- }
+ /// The span of the tag name.
+ pub name_span: Range<O>,
}
/// An HTML end/close tag, such as `</p>` or `</a>`.
@@ -502,14 +520,9 @@ pub struct EndTag<O> {
/// The source code span of the tag.
pub span: Range<O>,
-}
-impl<O: Offset> EndTag<O> {
- /// Calculates the span for the tag name and returns it.
- pub fn name_span(&self) -> Range<O> {
- let start = self.span.start + b"</".len();
- start..start + self.name.len()
- }
+ /// The span of the tag name.
+ pub name_span: Range<O>,
}
/// An HTML comment.
diff --git a/src/machine.rs b/src/machine.rs
index 5b36eee..c27708d 100644
--- a/src/machine.rs
+++ b/src/machine.rs
@@ -190,14 +190,17 @@ where
},
State::TagName => match slf.read_char()? {
Some(whitespace_pat!()) => {
+ slf.emitter.terminate_tag_name(slf.position_before_match);
slf.state = State::BeforeAttributeName;
Ok(ControlToken::Continue)
}
Some('/') => {
+ slf.emitter.terminate_tag_name(slf.position_before_match);
slf.state = State::SelfClosingStartTag;
Ok(ControlToken::Continue)
}
Some('>') => {
+ slf.emitter.terminate_tag_name(slf.position_before_match);
slf.state = State::Data;
slf.emit_current_tag();
Ok(ControlToken::Continue)
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 58f7b80..e0402b9 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -221,14 +221,16 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
#[inline]
pub(crate) fn init_start_tag(&mut self) {
- self.emitter.init_start_tag(self.some_offset);
+ self.emitter
+ .init_start_tag(self.some_offset, self.position_before_match);
self.current_tag_name.clear();
self.is_start_tag = true;
}
#[inline]
pub(crate) fn init_end_tag(&mut self) {
- self.emitter.init_end_tag(self.some_offset);
+ self.emitter
+ .init_end_tag(self.some_offset, self.position_before_match);
self.current_tag_name.clear();
self.is_start_tag = false;
}
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
index 8190f01..14d92b2 100644
--- a/tests/test_spans.rs
+++ b/tests/test_spans.rs
@@ -106,12 +106,12 @@ fn start_tag_name_span() {
let mut labels = Vec::new();
for token in tokens {
if let Token::StartTag(tag) = token {
- labels.push((tag.name_span(), ""));
+ labels.push((tag.name_span, ""));
}
}
labels
};
- assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
+ assert_char_encoding_independence(html, labeler);
assert_snapshot!(test_and_annotate(html, labeler), @r###"
<x> <xyz> <xyz > <xyz/>
^ ^^^ ^^^ ^^^
@@ -125,12 +125,12 @@ fn end_tag_name_span() {
let mut labels = Vec::new();
for token in tokens {
if let Token::EndTag(tag) = token {
- labels.push((tag.name_span(), ""));
+ labels.push((tag.name_span, ""));
}
}
labels
};
- assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
+ assert_char_encoding_independence(html, labeler);
assert_snapshot!(test_and_annotate(html, labeler), @r###"
</x> </xyz> </xyz > </xyz/>
^ ^^^ ^^^ ^^^