From cb4e9cf7cfc1612c67f16bfabadbdf442cd380fe Mon Sep 17 00:00:00 2001 From: Martin Fischer Date: Thu, 17 Aug 2023 16:12:51 +0200 Subject: fix: fix lots of position off-by-ones Previously the PosTrackingReader always mysteriously subtracted 1 from the current position ... this wasn't sound at all ... the machine just happens to often call `Tokenizer::unread_char` ... but not always. E.g. for proper comments it didn't which resulted in their offset and spans being off-by-one, which is fixed by this commit (see test_spans.rs). --- src/emitter.rs | 7 +++---- src/machine.rs | 8 ++++---- src/offset.rs | 2 +- src/tokenizer.rs | 8 ++++---- tests/test_spans.rs | 5 ++--- 5 files changed, 14 insertions(+), 16 deletions(-) diff --git a/src/emitter.rs b/src/emitter.rs index 1f60f70..90aa4db 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -310,7 +310,7 @@ impl Emitter for DefaultEmitter { *self_closing = true; } Token::EndTag(_) => { - self.emit_error(Error::EndTagWithTrailingSolidus, offset); + self.emit_error(Error::EndTagWithTrailingSolidus, offset - 1); } _ => { debug_assert!(false); @@ -378,9 +378,8 @@ impl Emitter for DefaultEmitter { }, )); } - fn init_attribute_value(&mut self, offset: O, quoted: bool) { - self.current_attribute.as_mut().unwrap().1.value_span = - offset + quoted as usize..offset + quoted as usize; + fn init_attribute_value(&mut self, offset: O, _quoted: bool) { + self.current_attribute.as_mut().unwrap().1.value_span = offset..offset; } fn push_attribute_name(&mut self, s: &str) { diff --git a/src/machine.rs b/src/machine.rs index deb3983..a58a754 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -694,7 +694,7 @@ where Ok(ControlToken::Continue) } Some(x) => { - slf.emitter.init_attribute_name(slf.reader.position()); + slf.emitter.init_attribute_name(slf.reader.position() - 1); slf.state = State::AttributeName; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -747,7 +747,7 @@ where Ok(ControlToken::Eof) } Some(x) => { - slf.emitter.init_attribute_name(slf.reader.position()); + slf.emitter.init_attribute_name(slf.reader.position() - 1); slf.state = State::AttributeName; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -775,7 +775,7 @@ where } c => { slf.emitter - .init_attribute_value(slf.reader.position(), false); + .init_attribute_value(slf.reader.position() - 1, false); slf.state = State::AttributeValueUnquoted; slf.unread_char(c); Ok(ControlToken::Continue) @@ -952,7 +952,7 @@ where } c => { slf.emit_error(Error::IncorrectlyOpenedComment); - slf.emitter.init_comment(slf.reader.position()); + slf.emitter.init_comment(slf.reader.position() - 1); slf.state = State::BogusComment; slf.unread_char(c); Ok(ControlToken::Continue) diff --git a/src/offset.rs b/src/offset.rs index 8809366..3152c78 100644 --- a/src/offset.rs +++ b/src/offset.rs @@ -68,7 +68,7 @@ impl PosTrackingReader { impl Position for PosTrackingReader { fn position(&self) -> usize { - self.position - 1 + self.position } } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 02a4d62..e8a8908 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -106,7 +106,7 @@ impl From for InternalState { } } -impl, O, E: Emitter> Tokenizer { +impl, O: Offset, E: Emitter> Tokenizer { /// Test-internal function to override internal state. /// /// Only available with the `integration-tests` feature which is not public API. @@ -123,7 +123,7 @@ impl, O, E: Emitter> Tokenizer { /// Just a helper method for the machine. #[inline] pub(crate) fn emit_error(&mut self, error: Error) { - self.emitter.emit_error(error, self.reader.position()); + self.emitter.emit_error(error, self.reader.position() - 1); } /// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise. @@ -140,14 +140,14 @@ impl, O, E: Emitter> Tokenizer { #[inline] pub(crate) fn init_start_tag(&mut self) { - self.emitter.init_start_tag(self.reader.position()); + self.emitter.init_start_tag(self.reader.position() - 1); self.current_tag_name.clear(); self.is_start_tag = true; } #[inline] pub(crate) fn init_end_tag(&mut self) { - self.emitter.init_end_tag(self.reader.position()); + self.emitter.init_end_tag(self.reader.position() - 1); self.current_tag_name.clear(); self.is_start_tag = false; } diff --git a/tests/test_spans.rs b/tests/test_spans.rs index 970099a..a33c2b3 100644 --- a/tests/test_spans.rs +++ b/tests/test_spans.rs @@ -111,12 +111,11 @@ fn comment_proper_data_span() { let Token::Comment(comment) = tokenizer(html).next().unwrap() else { panic!("expected comment"); }; - // FIXME: this span is wrong (starts one byte too soon) - assert_eq!(comment.data, html[1..][comment.data_span()]); + assert_eq!(comment.data, html[comment.data_span()]); let labels = vec![(comment.data_span(), "")]; assert_snapshot!(annotate(html, labels), @r###" - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ "###); } -- cgit v1.2.3