aboutsummaryrefslogtreecommitdiff
path: root/src/machine.rs
diff options
context:
space:
mode:
author Martin Fischer <martin@push-f.com> 2023-09-01 23:00:45 +0200
committer Martin Fischer <martin@push-f.com> 2023-09-03 23:00:05 +0200
commit f588704c90f33fe27945d742762d016dea3e113c (patch)
tree 66716b8c01499f6b3848790c2185c9e06e48fb1e /src/machine.rs
parent fb3d757b2f756950c9a86681291e2817e4bd2975 (diff)
fix: don't assume UTF-8 in machine/tokenizer
Diffstat (limited to 'src/machine.rs')
-rw-r--r-- src/machine.rs 47
1 files changed, 34 insertions, 13 deletions
diff --git a/src/machine.rs b/src/machine.rs
index f7f5ac6..5b36eee 100644
--- a/src/machine.rs
+++ b/src/machine.rs
@@ -38,6 +38,8 @@ where
};
}
+ slf.position_before_match = slf.reader.position();
+
match slf.state {
State::Data => match slf.read_char()? {
Some('&') => {
@@ -46,7 +48,7 @@ where
Ok(ControlToken::Continue)
}
Some('<') => {
- slf.some_offset = slf.reader.position() - 1;
+ slf.some_offset = slf.position_before_match;
slf.state = State::TagOpen;
Ok(ControlToken::Continue)
}
@@ -702,7 +704,7 @@ where
Ok(ControlToken::Continue)
}
Some(x) => {
- slf.emitter.init_attribute_name(slf.reader.position() - 1);
+ slf.emitter.init_attribute_name(slf.position_before_match);
slf.state = State::AttributeName;
slf.unread_char(Some(x));
Ok(ControlToken::Continue)
@@ -755,7 +757,7 @@ where
Ok(ControlToken::Eof)
}
Some(x) => {
- slf.emitter.init_attribute_name(slf.reader.position() - 1);
+ slf.emitter.init_attribute_name(slf.position_before_match);
slf.state = State::AttributeName;
slf.unread_char(Some(x));
Ok(ControlToken::Continue)
@@ -783,7 +785,7 @@ where
}
c => {
slf.emitter
- .init_attribute_value(AttrValueSyntax::Unquoted, slf.reader.position() - 1);
+ .init_attribute_value(AttrValueSyntax::Unquoted, slf.position_before_match);
slf.state = State::AttributeValueUnquoted;
slf.unread_char(c);
Ok(ControlToken::Continue)
@@ -791,8 +793,13 @@ where
},
State::AttributeValueDoubleQuoted => match slf.read_char()? {
Some('"') => {
- slf.emitter
- .terminate_attribute_value(slf.reader.position() - 1);
+ slf.emitter.terminate_attribute_value(
+ // We cannot simply pass slf.position_before_match because
+ // State::NamedCharacterReference calls Tokenizer::unread_char
+ // which Reader::position doesn't account for.
+ // TODO: pass slf.position_before_match once CharacterReference has been converted to a function call
+ slf.reader.position() - slf.reader.len_of_char_in_current_encoding('"'),
+ );
slf.state = State::AfterAttributeValueQuoted;
Ok(ControlToken::Continue)
}
@@ -817,8 +824,13 @@ where
},
State::AttributeValueSingleQuoted => match slf.read_char()? {
Some('\'') => {
- slf.emitter
- .terminate_attribute_value(slf.reader.position() - 1);
+ slf.emitter.terminate_attribute_value(
+ // We cannot simply pass slf.position_before_match because
+ // State::NamedCharacterReference calls Tokenizer::unread_char
+ // which Reader::position doesn't account for.
+ // TODO: pass slf.position_before_match once CharacterReference has been converted to a function call
+ slf.reader.position() - slf.reader.len_of_char_in_current_encoding('\''),
+ );
slf.state = State::AfterAttributeValueQuoted;
Ok(ControlToken::Continue)
}
@@ -843,8 +855,13 @@ where
},
State::AttributeValueUnquoted => match slf.read_char()? {
Some(whitespace_pat!()) => {
- slf.emitter
- .terminate_attribute_value(slf.reader.position() - 1);
+ slf.emitter.terminate_attribute_value(
+ // We cannot simply pass slf.position_before_match because
+ // State::NamedCharacterReference calls Tokenizer::unread_char
+ // which Reader::position doesn't account for.
+ // TODO: pass slf.position_before_match once CharacterReference has been converted to a function call
+ slf.reader.position() - slf.reader.len_of_char_in_current_encoding(' '),
+ );
slf.state = State::BeforeAttributeName;
Ok(ControlToken::Continue)
}
@@ -904,7 +921,9 @@ where
},
State::SelfClosingStartTag => match slf.read_char()? {
Some('>') => {
- slf.emitter.set_self_closing(slf.reader.position() - 2);
+ slf.emitter.set_self_closing(
+ slf.position_before_match - slf.reader.len_of_char_in_current_encoding('/'),
+ );
slf.state = State::Data;
slf.emit_current_tag();
Ok(ControlToken::Continue)
@@ -953,7 +972,7 @@ where
Some('[') if slf.try_read_string("CDATA[", true)? => Ok(ControlToken::CdataOpen),
c => {
slf.emit_error(Error::IncorrectlyOpenedComment);
- slf.emitter.init_comment(slf.reader.position() - 1);
+ slf.emitter.init_comment(slf.position_before_match);
slf.state = State::BogusComment;
slf.unread_char(c);
Ok(ControlToken::Continue)
@@ -1659,7 +1678,9 @@ where
}
},
State::CharacterReference => {
- slf.some_offset = slf.reader.position() - "&".len();
+ // TODO: we can avoid these Reader method calls by changing CharacterReference to be a function instead of a state
+ slf.some_offset =
+ slf.reader.position() - slf.reader.len_of_char_in_current_encoding('&');
slf.temporary_buffer.clear();
slf.temporary_buffer.push('&');
match slf.read_char()? {