diff options
author | Martin Fischer <martin@push-f.com> | 2023-09-01 23:00:45 +0200 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2023-09-03 23:00:05 +0200 |
commit | f588704c90f33fe27945d742762d016dea3e113c (patch) | |
tree | 66716b8c01499f6b3848790c2185c9e06e48fb1e /src/tokenizer.rs | |
parent | fb3d757b2f756950c9a86681291e2817e4bd2975 (diff) |
fix: don't assume UTF-8 in machine/tokenizer
Diffstat (limited to 'src/tokenizer.rs')
-rw-r--r-- | src/tokenizer.rs | 5 |
1 file changed, 4 insertions, 1 deletion
diff --git a/src/tokenizer.rs b/src/tokenizer.rs index d59710d..58f7b80 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -49,6 +49,8 @@ pub struct Tokenizer<R: Reader, O, E: Emitter<O>> { current_tag_name: String, last_start_tag_name: String, is_start_tag: bool, + /// The reader position before the match block in [`machine::consume`]. + pub(crate) position_before_match: O, /// * Set to the offset of `<` in [`InternalState::Data`]. /// * Set to the offset of `&` in [`InternalState::CharacterReference`]. pub(crate) some_offset: O, @@ -77,6 +79,7 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { current_tag_name: String::new(), last_start_tag_name: String::new(), is_start_tag: false, + position_before_match: O::default(), some_offset: O::default(), naively_switch_state: false, } @@ -198,7 +201,7 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { | Error::ControlCharacterReference | Error::UnknownNamedCharacterReference => self.some_offset..self.reader.position(), - _ => self.reader.position() - 1..self.reader.position(), + _ => self.position_before_match..self.reader.position(), }; self.emitter.emit_error(error, span); } |