diff options
author | Martin Fischer <martin@push-f.com> | 2023-09-01 12:53:29 +0200 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2023-09-03 23:00:05 +0200 |
commit | f239037c1b960ba16c6c8b2184ac017c53c631bf (patch) | |
tree | 1b40c7151f5f9270b26ba2a15088f90dca175a43 | |
parent | f588704c90f33fe27945d742762d016dea3e113c (diff) |
fix!: make start/end tag name spans encoding-independent
-rw-r--r-- | CHANGELOG.md | 5 | ||||
-rw-r--r-- | src/emitter.rs | 53 | ||||
-rw-r--r-- | src/machine.rs | 3 | ||||
-rw-r--r-- | src/tokenizer.rs | 6 | ||||
-rw-r--r-- | tests/test_spans.rs | 8 |
5 files changed, 49 insertions, 26 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index 5025516..90e954b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,11 +22,16 @@ * `emit_error` now takes a span instead of an offset. + * Added a `name_offset` parameter to `init_start_tag` and `init_end_tag`. + * Several provided offsets have been changed to be more sensible. Affected are: `set_self_closing`, `init_start_tag`, `init_end_tag`, `emit_current_tag` * token types + * `StartTag`/`EndTag`: Added `name_span` fields + (and removed the same-named methods). + * `AttributeOwned`: The `value_offset` field has been replaced with `value_span`. * Added required `len_of_char_in_current_encoding` method to `Reader` trait. diff --git a/src/emitter.rs b/src/emitter.rs index ff6e863..aa84215 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -49,10 +49,10 @@ pub trait Emitter<O> { fn emit_string(&mut self, c: &str); /// Set the _current token_ to a start tag. - fn init_start_tag(&mut self, offset: O); + fn init_start_tag(&mut self, tag_offset: O, name_offset: O); /// Set the _current token_ to an end tag. - fn init_end_tag(&mut self, offset: O); + fn init_end_tag(&mut self, tag_offset: O, name_offset: O); /// Set the _current token_ to a comment. fn init_comment(&mut self, data_offset: O); @@ -78,6 +78,11 @@ pub trait Emitter<O> { /// If the current token is not a doctype, this method may panic. fn emit_current_doctype(&mut self, offset: O); + /// Called after the last [`push_tag_name`] call for a tag name. + /// + /// [`push_tag_name`]: Self::push_tag_name + fn terminate_tag_name(&mut self, offset: O) {} + /// Called after the last [`push_attribute_value`] call for an attribute value. /// /// [`push_attribute_value`]: Self::push_attribute_value @@ -273,18 +278,20 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> { self.current_characters.push_str(s); } - fn init_start_tag(&mut self, offset: O) { + fn init_start_tag(&mut self, tag_offset: O, name_offset: O) { self.current_token = Some(Token::StartTag(StartTag { - span: offset..O::default(), + span: tag_offset..O::default(), self_closing: false, name: String::new(), attributes: Default::default(), + name_span: name_offset..O::default(), })); } - fn init_end_tag(&mut self, offset: O) { + fn init_end_tag(&mut self, tag_offset: O, name_offset: O) { self.current_token = Some(Token::EndTag(EndTag { - span: offset..O::default(), + span: tag_offset..O::default(), name: String::new(), + name_span: name_offset..O::default(), })); self.seen_attributes.clear(); } @@ -367,6 +374,22 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> { } } + fn terminate_tag_name(&mut self, offset: O) { + match self.current_token { + Some(Token::StartTag(StartTag { + ref mut name_span, .. + })) => { + name_span.end = offset; + } + Some(Token::EndTag(EndTag { + ref mut name_span, .. + })) => { + name_span.end = offset; + } + _ => debug_assert!(false), + } + } + fn push_comment(&mut self, s: &str) { match self.current_token { Some(Token::Comment(Comment { ref mut data, .. })) => data.push_str(s), @@ -483,14 +506,9 @@ pub struct StartTag<O> { /// The source code span of the tag. pub span: Range<O>, -} -impl<O: Offset> StartTag<O> { - /// Calculates the span for the tag name and returns it. - pub fn name_span(&self) -> Range<O> { - let start = self.span.start + b"<".len(); - start..start + self.name.len() - } + /// The span of the tag name. + pub name_span: Range<O>, } /// An HTML end/close tag, such as `</p>` or `</a>`. @@ -502,14 +520,9 @@ pub struct EndTag<O> { /// The source code span of the tag. pub span: Range<O>, -} -impl<O: Offset> EndTag<O> { - /// Calculates the span for the tag name and returns it. - pub fn name_span(&self) -> Range<O> { - let start = self.span.start + b"</".len(); - start..start + self.name.len() - } + /// The span of the tag name. + pub name_span: Range<O>, } /// An HTML comment. diff --git a/src/machine.rs b/src/machine.rs index 5b36eee..c27708d 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -190,14 +190,17 @@ where }, State::TagName => match slf.read_char()? { Some(whitespace_pat!()) => { + slf.emitter.terminate_tag_name(slf.position_before_match); slf.state = State::BeforeAttributeName; Ok(ControlToken::Continue) } Some('/') => { + slf.emitter.terminate_tag_name(slf.position_before_match); slf.state = State::SelfClosingStartTag; Ok(ControlToken::Continue) } Some('>') => { + slf.emitter.terminate_tag_name(slf.position_before_match); slf.state = State::Data; slf.emit_current_tag(); Ok(ControlToken::Continue) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 58f7b80..e0402b9 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -221,14 +221,16 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { #[inline] pub(crate) fn init_start_tag(&mut self) { - self.emitter.init_start_tag(self.some_offset); + self.emitter + .init_start_tag(self.some_offset, self.position_before_match); self.current_tag_name.clear(); self.is_start_tag = true; } #[inline] pub(crate) fn init_end_tag(&mut self) { - self.emitter.init_end_tag(self.some_offset); + self.emitter + .init_end_tag(self.some_offset, self.position_before_match); self.current_tag_name.clear(); self.is_start_tag = false; } diff --git a/tests/test_spans.rs b/tests/test_spans.rs index 8190f01..14d92b2 100644 --- a/tests/test_spans.rs +++ b/tests/test_spans.rs @@ -106,12 +106,12 @@ fn start_tag_name_span() { let mut labels = Vec::new(); for token in tokens { if let Token::StartTag(tag) = token { - labels.push((tag.name_span(), "")); + labels.push((tag.name_span, "")); } } labels }; - assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME + assert_char_encoding_independence(html, labeler); assert_snapshot!(test_and_annotate(html, labeler), @r###" <x> <xyz> <xyz > <xyz/> ^ ^^^ ^^^ ^^^ @@ -125,12 +125,12 @@ fn end_tag_name_span() { let mut labels = Vec::new(); for token in tokens { if let Token::EndTag(tag) = token { - labels.push((tag.name_span(), "")); + labels.push((tag.name_span, "")); } } labels }; - assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME + assert_char_encoding_independence(html, labeler); assert_snapshot!(test_and_annotate(html, labeler), @r###" </x> </xyz> </xyz > </xyz/> ^ ^^^ ^^^ ^^^ |