diff options
author | Martin Fischer <martin@push-f.com> | 2023-08-17 16:12:51 +0200 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2023-08-19 13:41:55 +0200 |
commit | cb4e9cf7cfc1612c67f16bfabadbdf442cd380fe (patch) | |
tree | 5aa2bf7dccafe66d735f3609d446bfad083ab8a3 | |
parent | c15895d44d17984386d3684e2aa85aca386ba3bf (diff) |
fix: fix lots of position off-by-ones
Previously the PosTrackingReader always mysteriously subtracted 1
from the current position ... this wasn't sound at all ... the machine
just happens to often call `Tokenizer::unread_char` ... but not always.
E.g. for proper comments it didn't which resulted in their offset and
spans being off-by-one, which is fixed by this commit (see test_spans.rs).
-rw-r--r-- | src/emitter.rs | 7 | ||||
-rw-r--r-- | src/machine.rs | 8 | ||||
-rw-r--r-- | src/offset.rs | 2 | ||||
-rw-r--r-- | src/tokenizer.rs | 8 | ||||
-rw-r--r-- | tests/test_spans.rs | 5 |
5 files changed, 14 insertions, 16 deletions
diff --git a/src/emitter.rs b/src/emitter.rs index 1f60f70..90aa4db 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -310,7 +310,7 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> { *self_closing = true; } Token::EndTag(_) => { - self.emit_error(Error::EndTagWithTrailingSolidus, offset); + self.emit_error(Error::EndTagWithTrailingSolidus, offset - 1); } _ => { debug_assert!(false); @@ -378,9 +378,8 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> { }, )); } - fn init_attribute_value(&mut self, offset: O, quoted: bool) { - self.current_attribute.as_mut().unwrap().1.value_span = - offset + quoted as usize..offset + quoted as usize; + fn init_attribute_value(&mut self, offset: O, _quoted: bool) { + self.current_attribute.as_mut().unwrap().1.value_span = offset..offset; } fn push_attribute_name(&mut self, s: &str) { diff --git a/src/machine.rs b/src/machine.rs index deb3983..a58a754 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -694,7 +694,7 @@ where Ok(ControlToken::Continue) } Some(x) => { - slf.emitter.init_attribute_name(slf.reader.position()); + slf.emitter.init_attribute_name(slf.reader.position() - 1); slf.state = State::AttributeName; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -747,7 +747,7 @@ where Ok(ControlToken::Eof) } Some(x) => { - slf.emitter.init_attribute_name(slf.reader.position()); + slf.emitter.init_attribute_name(slf.reader.position() - 1); slf.state = State::AttributeName; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -775,7 +775,7 @@ where } c => { slf.emitter - .init_attribute_value(slf.reader.position(), false); + .init_attribute_value(slf.reader.position() - 1, false); slf.state = State::AttributeValueUnquoted; slf.unread_char(c); Ok(ControlToken::Continue) @@ -952,7 +952,7 @@ where } c => { slf.emit_error(Error::IncorrectlyOpenedComment); - slf.emitter.init_comment(slf.reader.position()); + slf.emitter.init_comment(slf.reader.position() - 1); slf.state = State::BogusComment; slf.unread_char(c); Ok(ControlToken::Continue) diff --git a/src/offset.rs b/src/offset.rs index 8809366..3152c78 100644 --- a/src/offset.rs +++ b/src/offset.rs @@ -68,7 +68,7 @@ impl<R> PosTrackingReader<R> { impl<R> Position<usize> for PosTrackingReader<R> { fn position(&self) -> usize { - self.position - 1 + self.position } } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 02a4d62..e8a8908 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -106,7 +106,7 @@ impl From<State> for InternalState { } } -impl<R: Reader + Position<O>, O, E: Emitter<O>> Tokenizer<R, O, E> { +impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { /// Test-internal function to override internal state. /// /// Only available with the `integration-tests` feature which is not public API. @@ -123,7 +123,7 @@ impl<R: Reader + Position<O>, O, E: Emitter<O>> Tokenizer<R, O, E> { /// Just a helper method for the machine. #[inline] pub(crate) fn emit_error(&mut self, error: Error) { - self.emitter.emit_error(error, self.reader.position()); + self.emitter.emit_error(error, self.reader.position() - 1); } /// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise. @@ -140,14 +140,14 @@ impl<R: Reader + Position<O>, O, E: Emitter<O>> Tokenizer<R, O, E> { #[inline] pub(crate) fn init_start_tag(&mut self) { - self.emitter.init_start_tag(self.reader.position()); + self.emitter.init_start_tag(self.reader.position() - 1); self.current_tag_name.clear(); self.is_start_tag = true; } #[inline] pub(crate) fn init_end_tag(&mut self) { - self.emitter.init_end_tag(self.reader.position()); + self.emitter.init_end_tag(self.reader.position() - 1); self.current_tag_name.clear(); self.is_start_tag = false; } diff --git a/tests/test_spans.rs b/tests/test_spans.rs index 970099a..a33c2b3 100644 --- a/tests/test_spans.rs +++ b/tests/test_spans.rs @@ -111,12 +111,11 @@ fn comment_proper_data_span() { let Token::Comment(comment) = tokenizer(html).next().unwrap() else { panic!("expected comment"); }; - // FIXME: this span is wrong (starts one byte too soon) - assert_eq!(comment.data, html[1..][comment.data_span()]); + assert_eq!(comment.data, html[comment.data_span()]); let labels = vec![(comment.data_span(), "")]; assert_snapshot!(annotate(html, labels), @r###" <!-- Why are you looking at the source code? --> - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ "###); } |