diff options
author | Martin Fischer <martin@push-f.com> | 2023-09-01 23:00:45 +0200 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2023-09-03 23:00:05 +0200 |
commit | f588704c90f33fe27945d742762d016dea3e113c (patch) | |
tree | 66716b8c01499f6b3848790c2185c9e06e48fb1e | |
parent | fb3d757b2f756950c9a86681291e2817e4bd2975 (diff) |
fix: don't assume UTF-8 in machine/tokenizer
-rw-r--r-- | src/machine.rs | 47 | ||||
-rw-r--r-- | src/tokenizer.rs | 5 | ||||
-rw-r--r-- | tests/test_spans.rs | 18 |
3 files changed, 43 insertions, 27 deletions
diff --git a/src/machine.rs b/src/machine.rs index f7f5ac6..5b36eee 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -38,6 +38,8 @@ where }; } + slf.position_before_match = slf.reader.position(); + match slf.state { State::Data => match slf.read_char()? { Some('&') => { @@ -46,7 +48,7 @@ where Ok(ControlToken::Continue) } Some('<') => { - slf.some_offset = slf.reader.position() - 1; + slf.some_offset = slf.position_before_match; slf.state = State::TagOpen; Ok(ControlToken::Continue) } @@ -702,7 +704,7 @@ where Ok(ControlToken::Continue) } Some(x) => { - slf.emitter.init_attribute_name(slf.reader.position() - 1); + slf.emitter.init_attribute_name(slf.position_before_match); slf.state = State::AttributeName; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -755,7 +757,7 @@ where Ok(ControlToken::Eof) } Some(x) => { - slf.emitter.init_attribute_name(slf.reader.position() - 1); + slf.emitter.init_attribute_name(slf.position_before_match); slf.state = State::AttributeName; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -783,7 +785,7 @@ where } c => { slf.emitter - .init_attribute_value(AttrValueSyntax::Unquoted, slf.reader.position() - 1); + .init_attribute_value(AttrValueSyntax::Unquoted, slf.position_before_match); slf.state = State::AttributeValueUnquoted; slf.unread_char(c); Ok(ControlToken::Continue) @@ -791,8 +793,13 @@ where }, State::AttributeValueDoubleQuoted => match slf.read_char()? { Some('"') => { - slf.emitter - .terminate_attribute_value(slf.reader.position() - 1); + slf.emitter.terminate_attribute_value( + // We cannot simply pass slf.position_before_match because + // State::NamedCharacterReference calls Tokenizer::unread_char + // which Reader::position doesn't account for. + // TODO: pass slf.position_before_match once CharacterReference has been converted to a function call + slf.reader.position() - slf.reader.len_of_char_in_current_encoding('"'), + ); slf.state = State::AfterAttributeValueQuoted; Ok(ControlToken::Continue) } @@ -817,8 +824,13 @@ where }, State::AttributeValueSingleQuoted => match slf.read_char()? { Some('\'') => { - slf.emitter - .terminate_attribute_value(slf.reader.position() - 1); + slf.emitter.terminate_attribute_value( + // We cannot simply pass slf.position_before_match because + // State::NamedCharacterReference calls Tokenizer::unread_char + // which Reader::position doesn't account for. + // TODO: pass slf.position_before_match once CharacterReference has been converted to a function call + slf.reader.position() - slf.reader.len_of_char_in_current_encoding('\''), + ); slf.state = State::AfterAttributeValueQuoted; Ok(ControlToken::Continue) } @@ -843,8 +855,13 @@ where }, State::AttributeValueUnquoted => match slf.read_char()? { Some(whitespace_pat!()) => { - slf.emitter - .terminate_attribute_value(slf.reader.position() - 1); + slf.emitter.terminate_attribute_value( + // We cannot simply pass slf.position_before_match because + // State::NamedCharacterReference calls Tokenizer::unread_char + // which Reader::position doesn't account for. + // TODO: pass slf.position_before_match once CharacterReference has been converted to a function call + slf.reader.position() - slf.reader.len_of_char_in_current_encoding(' '), + ); slf.state = State::BeforeAttributeName; Ok(ControlToken::Continue) } @@ -904,7 +921,9 @@ where }, State::SelfClosingStartTag => match slf.read_char()? { Some('>') => { - slf.emitter.set_self_closing(slf.reader.position() - 2); + slf.emitter.set_self_closing( + slf.position_before_match - slf.reader.len_of_char_in_current_encoding('/'), + ); slf.state = State::Data; slf.emit_current_tag(); Ok(ControlToken::Continue) @@ -953,7 +972,7 @@ where Some('[') if slf.try_read_string("CDATA[", true)? => Ok(ControlToken::CdataOpen), c => { slf.emit_error(Error::IncorrectlyOpenedComment); - slf.emitter.init_comment(slf.reader.position() - 1); + slf.emitter.init_comment(slf.position_before_match); slf.state = State::BogusComment; slf.unread_char(c); Ok(ControlToken::Continue) @@ -1659,7 +1678,9 @@ where } }, State::CharacterReference => { - slf.some_offset = slf.reader.position() - "&".len(); + // TODO: we can avoid these Reader method calls by changing CharacterReference to be a function instead of a state + slf.some_offset = + slf.reader.position() - slf.reader.len_of_char_in_current_encoding('&'); slf.temporary_buffer.clear(); slf.temporary_buffer.push('&'); match slf.read_char()? { diff --git a/src/tokenizer.rs b/src/tokenizer.rs index d59710d..58f7b80 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -49,6 +49,8 @@ pub struct Tokenizer<R: Reader, O, E: Emitter<O>> { current_tag_name: String, last_start_tag_name: String, is_start_tag: bool, + /// The reader position before the match block in [`machine::consume`]. + pub(crate) position_before_match: O, /// * Set to the offset of `<` in [`InternalState::Data`]. /// * Set to the offset of `&` in [`InternalState::CharacterReference`]. pub(crate) some_offset: O, @@ -77,6 +79,7 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { current_tag_name: String::new(), last_start_tag_name: String::new(), is_start_tag: false, + position_before_match: O::default(), some_offset: O::default(), naively_switch_state: false, } @@ -198,7 +201,7 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> { | Error::ControlCharacterReference | Error::UnknownNamedCharacterReference => self.some_offset..self.reader.position(), - _ => self.reader.position() - 1..self.reader.position(), + _ => self.position_before_match..self.reader.position(), }; self.emitter.emit_error(error, span); } diff --git a/tests/test_spans.rs b/tests/test_spans.rs index c58616d..8190f01 100644 --- a/tests/test_spans.rs +++ b/tests/test_spans.rs @@ -73,7 +73,7 @@ fn start_tag_span() { } labels }; - assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME + assert_char_encoding_independence(html, labeler); assert_snapshot!(test_and_annotate(html, labeler), @r###" <x> <xyz> <xyz > <xyz/> ^^^ ^^^^^ ^^^^^^^ ^^^^^^ @@ -92,7 +92,7 @@ fn end_tag_span() { } labels }; - assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME + assert_char_encoding_independence(html, labeler); assert_snapshot!(test_and_annotate(html, labeler), @r###" </x> </xyz> </xyz > </xyz/> ^^^^ ^^^^^^ ^^^^^^^^ ^^^^^^^ @@ -170,7 +170,7 @@ fn attribute_value_span() { } labels }; - assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME + assert_char_encoding_independence(html, labeler); assert_snapshot!(test_and_annotate(html, labeler), @r###" <test x=unquoted y = unquoted z='single-quoted' zz="double-quoted" empty=''> ^^^^^^^^ ^^^^^^^^ ^^^^^^^^^^^^^ ^^^^^^^^^^^^^ ^ @@ -190,7 +190,7 @@ fn attribute_value_with_char_ref() { } labels }; - assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME + assert_char_encoding_independence(html, labeler); assert_snapshot!(test_and_annotate(html, labeler), @r###" <test x=& y='&' z="&"> ^^^^^ ^^^^^ ^^^^^ @@ -261,7 +261,7 @@ fn doctype_span() { }; vec![(doctype.span, "")] }; - assert_panics_but_should_not(|| assert_char_encoding_independence(case, labeler)); // FIXME + assert_char_encoding_independence(case, labeler); annotated.push_str(&test_and_annotate(case, labeler)); } @@ -341,17 +341,9 @@ fn annotate_errors(html: &'static str) -> String { *doesnt_support_utf16.lock().unwrap() = matches!( error, - | Error::AbsenceOfDigitsInNumericCharacterReference // FIXME - | Error::CharacterReferenceOutsideUnicodeRange // FIXME - | Error::ControlCharacterReference // FIXME | Error::DuplicateAttribute // FIXME | Error::EndTagWithAttributes // FIXME | Error::EndTagWithTrailingSolidus // FIXME - | Error::InvalidFirstCharacterOfTagName // FIXME - | Error::NoncharacterCharacterReference // FIXME - | Error::NullCharacterReference // FIXME - | Error::SurrogateCharacterReference // FIXME - | Error::UnknownNamedCharacterReference // FIXME ); } labels |