diff options
-rw-r--r--  CHANGELOG.md                    |  4 ++++
-rw-r--r--  src/basic_emitter.rs            |  2 +-
-rw-r--r--  src/emitter.rs                  |  2 +-
-rw-r--r--  src/tokenizer.rs                |  4 +++-
-rw-r--r--  src/tokenizer/machine/utils.rs  |  4 ++++
-rw-r--r--  src/trace.rs                    |  2 +-
-rw-r--r--  src/tracing_emitter.rs          |  4 ++--
-rw-r--r--  tests/test_spans.rs             | 18 ++++++++++++++++++
8 files changed, 34 insertions, 6 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 52de087..5c6cc58 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,8 @@
 
 #### Features
 
+* Added offsets for end-of-file tokens.
+
 * Added a blanket implementation to implement `Reader` for boxed readers.
 
 #### Breaking changes
@@ -35,6 +37,8 @@
 
   * Replaced `emit_string` with `emit_char`.
 
+  * Added an offset parameter to `emit_eof`.
+
   * `NaiveParser`: Removed `new_with_spans`.
 
 ### 0.5.1 - 2023-09-03
diff --git a/src/basic_emitter.rs b/src/basic_emitter.rs
index e67447b..0d37810 100644
--- a/src/basic_emitter.rs
+++ b/src/basic_emitter.rs
@@ -60,7 +60,7 @@ impl<O: Offset> Emitter<O> for BasicEmitter<O> {
         self.emit_token(Token::Char(c));
     }
 
-    fn emit_eof(&mut self) {
+    fn emit_eof(&mut self, offset: O) {
         self.emit_token(Token::EndOfFile);
     }
 
diff --git a/src/emitter.rs b/src/emitter.rs
index d1e1dfe..264d2f1 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -31,7 +31,7 @@ pub trait Emitter<O> {
     fn emit_char(&mut self, c: char);
 
     /// The state machine has reached the end of the file.
-    fn emit_eof(&mut self);
+    fn emit_eof(&mut self, offset: O);
 
     /// Set the _current token_ to a start tag.
     fn init_start_tag(&mut self, tag_offset: O, name_offset: O);
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 3359637..b41c208 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -171,7 +171,9 @@ where
                 Ok(ControlToken::Continue) => (),
                 Ok(ControlToken::Eof) => {
                     self.eof = true;
-                    self.machine.emitter.emit_eof();
+                    self.machine
+                        .emitter
+                        .emit_eof(self.machine.reader_position());
                 }
                 Ok(ControlToken::CdataOpen) => return Some(Ok(Event::CdataOpen)),
             }
diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs
index ea4d697..9752746 100644
--- a/src/tokenizer/machine/utils.rs
+++ b/src/tokenizer/machine/utils.rs
@@ -13,6 +13,10 @@ where
     O: Offset,
     E: Emitter<O>,
 {
+    pub(crate) fn reader_position(&self) -> O {
+        self.reader.position()
+    }
+
     /// Emits the given character as a character token.
     #[inline]
     pub(super) fn emit_char(&mut self, c: char) {
diff --git a/src/trace.rs b/src/trace.rs
index a816429..620d4f3 100644
--- a/src/trace.rs
+++ b/src/trace.rs
@@ -19,7 +19,7 @@ pub enum Trace {
     EndTag(EndTagTrace),
     Comment(CommentTrace),
     Doctype(DoctypeTrace),
-    EndOfFile,
+    EndOfFile(usize),
 }
 
 /// Provides byte offsets and syntax information for a [`StartTag`] token.
diff --git a/src/tracing_emitter.rs b/src/tracing_emitter.rs
index 408e832..819f909 100644
--- a/src/tracing_emitter.rs
+++ b/src/tracing_emitter.rs
@@ -66,8 +66,8 @@ impl Emitter<usize> for TracingEmitter {
         self.emit_token(Token::Char(c), Trace::Char);
     }
 
-    fn emit_eof(&mut self) {
-        self.emit_token(Token::EndOfFile, Trace::EndOfFile);
+    fn emit_eof(&mut self, offset: usize) {
+        self.emit_token(Token::EndOfFile, Trace::EndOfFile(offset));
     }
 
     fn init_start_tag(&mut self, tag_offset: usize, name_offset: usize) {
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
index 0e95be0..d19d6aa 100644
--- a/tests/test_spans.rs
+++ b/tests/test_spans.rs
@@ -351,6 +351,24 @@ fn doctype_id_spans() {
     "###);
 }
 
+#[test]
+fn eof_offset() {
+    let html = "Where does it end?";
+    let labeler = |parser: Parser| {
+        let mut labels = Vec::new();
+        for (_, trace) in parser.flatten() {
+            if let Trace::EndOfFile(offset) = trace {
+                labels.push((offset..offset, "here"));
+            }
+        }
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+    Where does it end?
+                      ^ here
+    "###);
+}
+
 fn annotate_errors(html: &'static str) -> String {
     let mut parser = parser(html);
     for _ in parser.by_ref() {}