summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- CHANGELOG.md 4
-rw-r--r-- src/basic_emitter.rs 2
-rw-r--r-- src/emitter.rs 2
-rw-r--r-- src/tokenizer.rs 4
-rw-r--r-- src/tokenizer/machine/utils.rs 4
-rw-r--r-- src/trace.rs 2
-rw-r--r-- src/tracing_emitter.rs 4
-rw-r--r-- tests/test_spans.rs 18
8 files changed, 34 insertions, 6 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 52de087..5c6cc58 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,8 @@
#### Features
+* Added offsets for end-of-file tokens.
+
* Added a blanket implementation to implement `Reader` for boxed readers.
#### Breaking changes
@@ -35,6 +37,8 @@
* Replaced `emit_string` with `emit_char`.
+ * Added an offset parameter to `emit_eof`.
+
* `NaiveParser`: Removed `new_with_spans`.
### 0.5.1 - 2023-09-03
diff --git a/src/basic_emitter.rs b/src/basic_emitter.rs
index e67447b..0d37810 100644
--- a/src/basic_emitter.rs
+++ b/src/basic_emitter.rs
@@ -60,7 +60,7 @@ impl<O: Offset> Emitter<O> for BasicEmitter<O> {
self.emit_token(Token::Char(c));
}
- fn emit_eof(&mut self) {
+ fn emit_eof(&mut self, offset: O) {
self.emit_token(Token::EndOfFile);
}
diff --git a/src/emitter.rs b/src/emitter.rs
index d1e1dfe..264d2f1 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -31,7 +31,7 @@ pub trait Emitter<O> {
fn emit_char(&mut self, c: char);
/// The state machine has reached the end of the file.
- fn emit_eof(&mut self);
+ fn emit_eof(&mut self, offset: O);
/// Set the _current token_ to a start tag.
fn init_start_tag(&mut self, tag_offset: O, name_offset: O);
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 3359637..b41c208 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -171,7 +171,9 @@ where
Ok(ControlToken::Continue) => (),
Ok(ControlToken::Eof) => {
self.eof = true;
- self.machine.emitter.emit_eof();
+ self.machine
+ .emitter
+ .emit_eof(self.machine.reader_position());
}
Ok(ControlToken::CdataOpen) => return Some(Ok(Event::CdataOpen)),
}
diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs
index ea4d697..9752746 100644
--- a/src/tokenizer/machine/utils.rs
+++ b/src/tokenizer/machine/utils.rs
@@ -13,6 +13,10 @@ where
O: Offset,
E: Emitter<O>,
{
+ pub(crate) fn reader_position(&self) -> O {
+ self.reader.position()
+ }
+
/// Emits the given character as a character token.
#[inline]
pub(super) fn emit_char(&mut self, c: char) {
diff --git a/src/trace.rs b/src/trace.rs
index a816429..620d4f3 100644
--- a/src/trace.rs
+++ b/src/trace.rs
@@ -19,7 +19,7 @@ pub enum Trace {
EndTag(EndTagTrace),
Comment(CommentTrace),
Doctype(DoctypeTrace),
- EndOfFile,
+ EndOfFile(usize),
}
/// Provides byte offsets and syntax information for a [`StartTag`] token.
diff --git a/src/tracing_emitter.rs b/src/tracing_emitter.rs
index 408e832..819f909 100644
--- a/src/tracing_emitter.rs
+++ b/src/tracing_emitter.rs
@@ -66,8 +66,8 @@ impl Emitter<usize> for TracingEmitter {
self.emit_token(Token::Char(c), Trace::Char);
}
- fn emit_eof(&mut self) {
- self.emit_token(Token::EndOfFile, Trace::EndOfFile);
+ fn emit_eof(&mut self, offset: usize) {
+ self.emit_token(Token::EndOfFile, Trace::EndOfFile(offset));
}
fn init_start_tag(&mut self, tag_offset: usize, name_offset: usize) {
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
index 0e95be0..d19d6aa 100644
--- a/tests/test_spans.rs
+++ b/tests/test_spans.rs
@@ -351,6 +351,24 @@ fn doctype_id_spans() {
"###);
}
+#[test]
+fn eof_offset() {
+ let html = "Where does it end?";
+ let labeler = |parser: Parser| {
+ let mut labels = Vec::new();
+ for (_, trace) in parser.flatten() {
+ if let Trace::EndOfFile(offset) = trace {
+ labels.push((offset..offset, "here"));
+ }
+ }
+ labels
+ };
+ assert_snapshot!(test_and_annotate(html, labeler), @r###"
+ Where does it end?
+ ^ here
+ "###);
+}
+
fn annotate_errors(html: &'static str) -> String {
let mut parser = parser(html);
for _ in parser.by_ref() {}