about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
author Martin Fischer <martin@push-f.com> 2023-09-02 12:27:14 +0200
committer Martin Fischer <martin@push-f.com> 2023-09-03 23:00:05 +0200
commit e993f19c2b8ef00b32f17f9ed32306f3ceb21bc3 (patch)
tree 4992456f36e6d012b4cc54a69811ec321cc4550c /src
parent c8a8bcb95b725d91a7c4b7bc7623171f2a04fc67 (diff)
fix!: make comment data spans encoding-independent
Diffstat (limited to 'src')
-rw-r--r-- src/emitter.rs 26
-rw-r--r-- src/machine.rs 27
-rw-r--r-- src/tokenizer.rs 1
3 files changed, 32 insertions, 22 deletions
diff --git a/src/emitter.rs b/src/emitter.rs
index 9fdf967..db3da78 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -55,7 +55,7 @@ pub trait Emitter<O> {
fn init_end_tag(&mut self, tag_offset: O, name_offset: O);
/// Set the _current token_ to a comment.
- fn init_comment(&mut self, data_offset: O);
+ fn init_comment(&mut self, data_start_offset: O);
/// Emit the _current token_, assuming it is a tag.
///
@@ -71,7 +71,7 @@ pub trait Emitter<O> {
/// Emit the _current token_, assuming it is a comment.
///
/// If the current token is not a comment, this method may panic.
- fn emit_current_comment(&mut self, offset: O);
+ fn emit_current_comment(&mut self, data_end_offset: O);
/// Emit the _current token_, assuming it is a doctype.
///
@@ -309,10 +309,10 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
self.seen_attributes.clear();
}
- fn init_comment(&mut self, data_offset: O) {
+ fn init_comment(&mut self, data_start_offset: O) {
self.current_token = Some(Token::Comment(Comment {
data: String::new(),
- data_offset,
+ data_span: data_start_offset..O::default(),
}));
}
fn emit_current_tag(&mut self, offset: O) {
@@ -334,10 +334,14 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
}
self.emit_token(token);
}
- fn emit_current_comment(&mut self, _offset: O) {
- let comment = self.current_token.take().unwrap();
- debug_assert!(matches!(comment, Token::Comment(_)));
- self.emit_token(comment);
+ fn emit_current_comment(&mut self, data_end_offset: O) {
+ let mut token = self.current_token.take().unwrap();
+ if let Token::Comment(comment) = &mut token {
+ comment.data_span.end = data_end_offset;
+ } else {
+ debug_assert!(false);
+ }
+ self.emit_token(token);
}
fn emit_current_doctype(&mut self, offset: O) {
@@ -572,13 +576,13 @@ pub struct Comment<O> {
/// The text within the comment.
pub data: String,
/// The source offset of the comment data.
- pub data_offset: O,
+ pub data_span: Range<O>,
}
impl<O: Offset> Comment<O> {
- /// Calculates the span for the comment data and returns it.
+ /// Returns the span for the comment data.
pub fn data_span(&self) -> Range<O> {
- self.data_offset..self.data_offset + self.data.len()
+ self.data_span.clone()
}
}
diff --git a/src/machine.rs b/src/machine.rs
index f00af0a..26e1652 100644
--- a/src/machine.rs
+++ b/src/machine.rs
@@ -950,11 +950,11 @@ where
State::BogusComment => match slf.read_char()? {
Some('>') => {
slf.state = State::Data;
- slf.emitter.emit_current_comment(slf.reader.position());
+ slf.emitter.emit_current_comment(slf.position_before_match);
Ok(ControlToken::Continue)
}
None => {
- slf.emitter.emit_current_comment(slf.reader.position());
+ slf.emitter.emit_current_comment(slf.position_before_match);
Ok(ControlToken::Eof)
}
Some('\0') => {
@@ -994,7 +994,7 @@ where
Some('>') => {
slf.emit_error(Error::AbruptClosingOfEmptyComment);
slf.state = State::Data;
- slf.emitter.emit_current_comment(slf.reader.position());
+ slf.emitter.emit_current_comment(slf.position_before_match);
Ok(ControlToken::Continue)
}
c => {
@@ -1008,15 +1008,19 @@ where
slf.state = State::CommentEnd;
Ok(ControlToken::Continue)
}
- Some('>') => {
+ Some(c @ '>') => {
slf.emit_error(Error::AbruptClosingOfEmptyComment);
slf.state = State::Data;
- slf.emitter.emit_current_comment(slf.reader.position());
+ slf.emitter.emit_current_comment(
+ slf.position_before_match - slf.reader.len_of_char_in_current_encoding(c),
+ );
Ok(ControlToken::Continue)
}
None => {
slf.emit_error(Error::EofInComment);
- slf.emitter.emit_current_comment(slf.reader.position());
+ slf.emitter.emit_current_comment(
+ slf.position_before_match - slf.reader.len_of_char_in_current_encoding('-'),
+ );
Ok(ControlToken::Eof)
}
c @ Some(_) => {
@@ -1033,6 +1037,7 @@ where
Ok(ControlToken::Continue)
}
Some('-') => {
+ slf.some_offset = slf.position_before_match;
slf.state = State::CommentEndDash;
Ok(ControlToken::Continue)
}
@@ -1109,7 +1114,7 @@ where
}
None => {
slf.emit_error(Error::EofInComment);
- slf.emitter.emit_current_comment(slf.reader.position());
+ slf.emitter.emit_current_comment(slf.some_offset);
Ok(ControlToken::Eof)
}
c => {
@@ -1122,7 +1127,7 @@ where
State::CommentEnd => match slf.read_char()? {
Some('>') => {
slf.state = State::Data;
- slf.emitter.emit_current_comment(slf.reader.position());
+ slf.emitter.emit_current_comment(slf.some_offset);
Ok(ControlToken::Continue)
}
Some('!') => {
@@ -1135,7 +1140,7 @@ where
}
None => {
slf.emit_error(Error::EofInComment);
- slf.emitter.emit_current_comment(slf.reader.position());
+ slf.emitter.emit_current_comment(slf.some_offset);
Ok(ControlToken::Eof)
}
c @ Some(_) => {
@@ -1157,12 +1162,12 @@ where
Some('>') => {
slf.emit_error(Error::IncorrectlyClosedComment);
slf.state = State::Data;
- slf.emitter.emit_current_comment(slf.reader.position());
+ slf.emitter.emit_current_comment(slf.some_offset);
Ok(ControlToken::Continue)
}
None => {
slf.emit_error(Error::EofInComment);
- slf.emitter.emit_current_comment(slf.reader.position());
+ slf.emitter.emit_current_comment(slf.some_offset);
Ok(ControlToken::Eof)
}
c @ Some(_) => {
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index e0402b9..cfd8eea 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -52,6 +52,7 @@ pub struct Tokenizer<R: Reader, O, E: Emitter<O>> {
/// The reader position before the match block in [`machine::consume`].
pub(crate) position_before_match: O,
/// * Set to the offset of `<` in [`InternalState::Data`].
+ /// * Set to the offset of `-` in [`InternalState::Comment`].
/// * Set to the offset of `&` in [`InternalState::CharacterReference`].
pub(crate) some_offset: O,
/// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`]