aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMartin Fischer <martin@push-f.com>2023-08-17 16:12:51 +0200
committerMartin Fischer <martin@push-f.com>2023-08-19 13:41:55 +0200
commitcb4e9cf7cfc1612c67f16bfabadbdf442cd380fe (patch)
tree5aa2bf7dccafe66d735f3609d446bfad083ab8a3
parentc15895d44d17984386d3684e2aa85aca386ba3bf (diff)
fix: fix lots of position off-by-ones
Previously the PosTrackingReader always mysteriously subtracted 1 from the current position ... this wasn't sound at all ... the machine just happens to often call `Tokenizer::unread_char` ... but not always. E.g. for proper comments it didn't which resulted in their offset and spans being off-by-one, which is fixed by this commit (see test_spans.rs).
-rw-r--r--src/emitter.rs7
-rw-r--r--src/machine.rs8
-rw-r--r--src/offset.rs2
-rw-r--r--src/tokenizer.rs8
-rw-r--r--tests/test_spans.rs5
5 files changed, 14 insertions, 16 deletions
diff --git a/src/emitter.rs b/src/emitter.rs
index 1f60f70..90aa4db 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -310,7 +310,7 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
*self_closing = true;
}
Token::EndTag(_) => {
- self.emit_error(Error::EndTagWithTrailingSolidus, offset);
+ self.emit_error(Error::EndTagWithTrailingSolidus, offset - 1);
}
_ => {
debug_assert!(false);
@@ -378,9 +378,8 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
},
));
}
- fn init_attribute_value(&mut self, offset: O, quoted: bool) {
- self.current_attribute.as_mut().unwrap().1.value_span =
- offset + quoted as usize..offset + quoted as usize;
+ fn init_attribute_value(&mut self, offset: O, _quoted: bool) {
+ self.current_attribute.as_mut().unwrap().1.value_span = offset..offset;
}
fn push_attribute_name(&mut self, s: &str) {
diff --git a/src/machine.rs b/src/machine.rs
index deb3983..a58a754 100644
--- a/src/machine.rs
+++ b/src/machine.rs
@@ -694,7 +694,7 @@ where
Ok(ControlToken::Continue)
}
Some(x) => {
- slf.emitter.init_attribute_name(slf.reader.position());
+ slf.emitter.init_attribute_name(slf.reader.position() - 1);
slf.state = State::AttributeName;
slf.unread_char(Some(x));
Ok(ControlToken::Continue)
@@ -747,7 +747,7 @@ where
Ok(ControlToken::Eof)
}
Some(x) => {
- slf.emitter.init_attribute_name(slf.reader.position());
+ slf.emitter.init_attribute_name(slf.reader.position() - 1);
slf.state = State::AttributeName;
slf.unread_char(Some(x));
Ok(ControlToken::Continue)
@@ -775,7 +775,7 @@ where
}
c => {
slf.emitter
- .init_attribute_value(slf.reader.position(), false);
+ .init_attribute_value(slf.reader.position() - 1, false);
slf.state = State::AttributeValueUnquoted;
slf.unread_char(c);
Ok(ControlToken::Continue)
@@ -952,7 +952,7 @@ where
}
c => {
slf.emit_error(Error::IncorrectlyOpenedComment);
- slf.emitter.init_comment(slf.reader.position());
+ slf.emitter.init_comment(slf.reader.position() - 1);
slf.state = State::BogusComment;
slf.unread_char(c);
Ok(ControlToken::Continue)
diff --git a/src/offset.rs b/src/offset.rs
index 8809366..3152c78 100644
--- a/src/offset.rs
+++ b/src/offset.rs
@@ -68,7 +68,7 @@ impl<R> PosTrackingReader<R> {
impl<R> Position<usize> for PosTrackingReader<R> {
fn position(&self) -> usize {
- self.position - 1
+ self.position
}
}
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 02a4d62..e8a8908 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -106,7 +106,7 @@ impl From<State> for InternalState {
}
}
-impl<R: Reader + Position<O>, O, E: Emitter<O>> Tokenizer<R, O, E> {
+impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
/// Test-internal function to override internal state.
///
/// Only available with the `integration-tests` feature which is not public API.
@@ -123,7 +123,7 @@ impl<R: Reader + Position<O>, O, E: Emitter<O>> Tokenizer<R, O, E> {
/// Just a helper method for the machine.
#[inline]
pub(crate) fn emit_error(&mut self, error: Error) {
- self.emitter.emit_error(error, self.reader.position());
+ self.emitter.emit_error(error, self.reader.position() - 1);
}
/// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise.
@@ -140,14 +140,14 @@ impl<R: Reader + Position<O>, O, E: Emitter<O>> Tokenizer<R, O, E> {
#[inline]
pub(crate) fn init_start_tag(&mut self) {
- self.emitter.init_start_tag(self.reader.position());
+ self.emitter.init_start_tag(self.reader.position() - 1);
self.current_tag_name.clear();
self.is_start_tag = true;
}
#[inline]
pub(crate) fn init_end_tag(&mut self) {
- self.emitter.init_end_tag(self.reader.position());
+ self.emitter.init_end_tag(self.reader.position() - 1);
self.current_tag_name.clear();
self.is_start_tag = false;
}
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
index 970099a..a33c2b3 100644
--- a/tests/test_spans.rs
+++ b/tests/test_spans.rs
@@ -111,12 +111,11 @@ fn comment_proper_data_span() {
let Token::Comment(comment) = tokenizer(html).next().unwrap() else {
panic!("expected comment");
};
- // FIXME: this span is wrong (starts one byte too soon)
- assert_eq!(comment.data, html[1..][comment.data_span()]);
+ assert_eq!(comment.data, html[comment.data_span()]);
let labels = vec![(comment.data_span(), "")];
assert_snapshot!(annotate(html, labels), @r###"
<!-- Why are you looking at the source code? -->
- ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
"###);
}