author | Martin Fischer <martin@push-f.com> | 2023-09-02 12:27:14 +0200
---|---|---
committer | Martin Fischer <martin@push-f.com> | 2023-09-03 23:00:05 +0200
commit | e993f19c2b8ef00b32f17f9ed32306f3ceb21bc3 (patch) |
tree | 4992456f36e6d012b4cc54a69811ec321cc4550c |
parent | c8a8bcb95b725d91a7c4b7bc7623171f2a04fc67 (diff) |
fix!: make comment data spans encoding-independent
-rw-r--r-- | CHANGELOG.md | 3
-rw-r--r-- | src/emitter.rs | 26
-rw-r--r-- | src/machine.rs | 27
-rw-r--r-- | src/tokenizer.rs | 1
-rw-r--r-- | tests/test_spans.rs | 20

5 files changed, 36 insertions(+), 41 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b7ae1d7..6e67319 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -25,7 +25,8 @@
 * Added a `name_offset` parameter to `init_start_tag` and `init_end_tag`.
 * Several provided offsets have been changed to be more sensible.
-  Affected are: `init_start_tag`, `init_end_tag`, `emit_current_tag`
+  Affected are: `init_start_tag`, `init_end_tag`, `emit_current_tag`,
+  `emit_current_comment`
 * token types
diff --git a/src/emitter.rs b/src/emitter.rs
index 9fdf967..db3da78 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -55,7 +55,7 @@ pub trait Emitter<O> {
     fn init_end_tag(&mut self, tag_offset: O, name_offset: O);
 
     /// Set the _current token_ to a comment.
-    fn init_comment(&mut self, data_offset: O);
+    fn init_comment(&mut self, data_start_offset: O);
 
     /// Emit the _current token_, assuming it is a tag.
     ///
@@ -71,7 +71,7 @@
     /// Emit the _current token_, assuming it is a comment.
     ///
     /// If the current token is not a comment, this method may panic.
-    fn emit_current_comment(&mut self, offset: O);
+    fn emit_current_comment(&mut self, data_end_offset: O);
 
     /// Emit the _current token_, assuming it is a doctype.
     ///
@@ -309,10 +309,10 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
         self.seen_attributes.clear();
     }
-    fn init_comment(&mut self, data_offset: O) {
+    fn init_comment(&mut self, data_start_offset: O) {
         self.current_token = Some(Token::Comment(Comment {
             data: String::new(),
-            data_offset,
+            data_span: data_start_offset..O::default(),
         }));
     }
 
     fn emit_current_tag(&mut self, offset: O) {
@@ -334,10 +334,14 @@
         }
         self.emit_token(token);
     }
-    fn emit_current_comment(&mut self, _offset: O) {
-        let comment = self.current_token.take().unwrap();
-        debug_assert!(matches!(comment, Token::Comment(_)));
-        self.emit_token(comment);
+    fn emit_current_comment(&mut self, data_end_offset: O) {
+        let mut token = self.current_token.take().unwrap();
+        if let Token::Comment(comment) = &mut token {
+            comment.data_span.end = data_end_offset;
+        } else {
+            debug_assert!(false);
+        }
+        self.emit_token(token);
     }
 
     fn emit_current_doctype(&mut self, offset: O) {
@@ -572,13 +576,13 @@ pub struct Comment<O> {
     /// The text within the comment.
     pub data: String,
     /// The source offset of the comment data.
-    pub data_offset: O,
+    pub data_span: Range<O>,
 }
 
 impl<O: Offset> Comment<O> {
-    /// Calculates the span for the comment data and returns it.
+    /// Returns the span for the comment data.
     pub fn data_span(&self) -> Range<O> {
-        self.data_offset..self.data_offset + self.data.len()
+        self.data_span.clone()
     }
 }
diff --git a/src/machine.rs b/src/machine.rs
index f00af0a..26e1652 100644
--- a/src/machine.rs
+++ b/src/machine.rs
@@ -950,11 +950,11 @@ where
         State::BogusComment => match slf.read_char()? {
             Some('>') => {
                 slf.state = State::Data;
-                slf.emitter.emit_current_comment(slf.reader.position());
+                slf.emitter.emit_current_comment(slf.position_before_match);
                 Ok(ControlToken::Continue)
             }
             None => {
-                slf.emitter.emit_current_comment(slf.reader.position());
+                slf.emitter.emit_current_comment(slf.position_before_match);
                 Ok(ControlToken::Eof)
             }
             Some('\0') => {
@@ -994,7 +994,7 @@ where
             Some('>') => {
                 slf.emit_error(Error::AbruptClosingOfEmptyComment);
                 slf.state = State::Data;
-                slf.emitter.emit_current_comment(slf.reader.position());
+                slf.emitter.emit_current_comment(slf.position_before_match);
                 Ok(ControlToken::Continue)
             }
             c => {
@@ -1008,15 +1008,19 @@ where
                 slf.state = State::CommentEnd;
                 Ok(ControlToken::Continue)
             }
-            Some('>') => {
+            Some(c @ '>') => {
                 slf.emit_error(Error::AbruptClosingOfEmptyComment);
                 slf.state = State::Data;
-                slf.emitter.emit_current_comment(slf.reader.position());
+                slf.emitter.emit_current_comment(
+                    slf.position_before_match - slf.reader.len_of_char_in_current_encoding(c),
+                );
                 Ok(ControlToken::Continue)
             }
             None => {
                 slf.emit_error(Error::EofInComment);
-                slf.emitter.emit_current_comment(slf.reader.position());
+                slf.emitter.emit_current_comment(
+                    slf.position_before_match - slf.reader.len_of_char_in_current_encoding('-'),
+                );
                 Ok(ControlToken::Eof)
             }
             c @ Some(_) => {
@@ -1033,6 +1037,7 @@ where
                 Ok(ControlToken::Continue)
             }
             Some('-') => {
+                slf.some_offset = slf.position_before_match;
                 slf.state = State::CommentEndDash;
                 Ok(ControlToken::Continue)
             }
@@ -1109,7 +1114,7 @@ where
             }
             None => {
                 slf.emit_error(Error::EofInComment);
-                slf.emitter.emit_current_comment(slf.reader.position());
+                slf.emitter.emit_current_comment(slf.some_offset);
                 Ok(ControlToken::Eof)
             }
             c => {
@@ -1122,7 +1127,7 @@
         State::CommentEnd => match slf.read_char()? {
             Some('>') => {
                 slf.state = State::Data;
-                slf.emitter.emit_current_comment(slf.reader.position());
+                slf.emitter.emit_current_comment(slf.some_offset);
                 Ok(ControlToken::Continue)
             }
             Some('!') => {
@@ -1135,7 +1140,7 @@
             }
             None => {
                 slf.emit_error(Error::EofInComment);
-                slf.emitter.emit_current_comment(slf.reader.position());
+                slf.emitter.emit_current_comment(slf.some_offset);
                 Ok(ControlToken::Eof)
             }
             c @ Some(_) => {
@@ -1157,12 +1162,12 @@
             Some('>') => {
                 slf.emit_error(Error::IncorrectlyClosedComment);
                 slf.state = State::Data;
-                slf.emitter.emit_current_comment(slf.reader.position());
+                slf.emitter.emit_current_comment(slf.some_offset);
                 Ok(ControlToken::Continue)
             }
             None => {
                 slf.emit_error(Error::EofInComment);
-                slf.emitter.emit_current_comment(slf.reader.position());
+                slf.emitter.emit_current_comment(slf.some_offset);
                 Ok(ControlToken::Eof)
             }
             c @ Some(_) => {
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index e0402b9..cfd8eea 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -52,6 +52,7 @@ pub struct Tokenizer<R: Reader, O, E: Emitter<O>> {
     /// The reader position before the match block in [`machine::consume`].
     pub(crate) position_before_match: O,
     /// * Set to the offset of `<` in [`InternalState::Data`].
+    /// * Set to the offset of `-` in [`InternalState::Comment`].
     /// * Set to the offset of `&` in [`InternalState::CharacterReference`].
     pub(crate) some_offset: O,
     /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`]
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
index e856a19..ca23141 100644
--- a/tests/test_spans.rs
+++ b/tests/test_spans.rs
@@ -54,13 +54,6 @@ fn annotate(html: &str, labels: Vec<(Range<usize>, impl AsRef<str>)>) -> String
         .join("")
 }
 
-fn assert_panics_but_should_not(f: impl FnOnce() + std::panic::UnwindSafe) {
-    assert!(
-        std::panic::catch_unwind(f).is_err(),
-        "congrats! you made some span test support UTF-16, please stop calling assert_panics_but_should_not for this test"
-    );
-}
-
 #[test]
 fn start_tag_span() {
     let html = "<x> <xyz> <xyz > <xyz/>";
@@ -219,7 +212,7 @@ fn comment_data_span() {
     ];
 
     let mut annotated = String::new();
-    for (idx, case) in cases.iter().enumerate() {
+    for case in cases {
         let labeler = |tokens: TokenIter| {
             let Token::Comment(comment) = tokens
                 .filter(|t| !matches!(t, Token::Error { .. }))
@@ -231,16 +224,7 @@
             vec![(comment.data_span(), "")]
         };
 
-        println!("{idx}");
-        if [
-            0, 1, 2, 3, 8, 9, 10, 11, // FIXME
-        ]
-        .contains(&idx)
-        {
-            assert_panics_but_should_not(|| assert_char_encoding_independence(case, labeler));
-        } else {
-            assert_char_encoding_independence(case, labeler);
-        }
+        assert_char_encoding_independence(case, labeler);
         annotated.push_str(&test_and_annotate(case, labeler));
     }
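Why the old spans were encoding-dependent: before this commit, `Comment::data_span()` derived its end as `data_offset + self.data.len()`, and `data.len()` is the UTF-8 byte length of the already-decoded comment text, which only lines up with the reader's offsets when those offsets are UTF-8 byte positions. The sketch below is a standalone illustration of that mismatch, not this crate's API (the example string and variable names are made up): the same text occupies a different number of bytes in UTF-8 and UTF-16, so only an end offset reported by the tokenizer, as now stored in `data_span`, is correct for every encoding.

```rust
// Standalone illustration, not part of this crate: the decoded comment text
// has a different byte length depending on the encoding the offsets count in.
fn main() {
    let data = "cömment"; // hypothetical decoded comment data
    let utf8_len = data.len(); // 8 bytes when offsets are UTF-8 byte positions
    let utf16_len = data.encode_utf16().count() * 2; // 14 bytes in UTF-16
    assert_ne!(utf8_len, utf16_len);
    // The old `data_offset..data_offset + data.len()` end was therefore only
    // right for UTF-8 readers; keeping the end offset the tokenizer reports
    // (the new `data_span`) works regardless of the input encoding.
    println!("UTF-8 length: {utf8_len}, UTF-16 length: {utf16_len}");
}
```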