aboutsummaryrefslogtreecommitdiff
path: root/src/tokenizer/machine.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/tokenizer/machine.rs')
-rw-r--r--src/tokenizer/machine.rs68
1 files changed, 50 insertions, 18 deletions
diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs
index ff72a7c..d5a1f87 100644
--- a/src/tokenizer/machine.rs
+++ b/src/tokenizer/machine.rs
@@ -26,6 +26,7 @@ pub(super) struct Machine<R, O, E> {
position_before_match: O,
/// * Set to the offset of `<` in [`State::Data`].
/// * Set to the offset of `-` in [`State::Comment`].
+ /// * Set to the offset of `]` in [`State::CdataSectionBracket`].
/// * Set to the offset of `&` in [`State::CharacterReference`].
some_offset: O,
/// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`]
@@ -126,7 +127,7 @@ where
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
Some(x) => {
@@ -142,7 +143,7 @@ where
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
Some(x) => {
@@ -158,7 +159,7 @@ where
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
Some(x) => {
@@ -170,7 +171,7 @@ where
State::PlainText => match slf.read_char()? {
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
Some(x) => {
@@ -475,7 +476,7 @@ where
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
None => {
@@ -500,7 +501,7 @@ where
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
slf.state = State::ScriptDataEscaped;
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
None => {
@@ -530,7 +531,7 @@ where
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
slf.state = State::ScriptDataEscaped;
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
None => {
@@ -638,7 +639,7 @@ where
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
None => {
@@ -664,7 +665,7 @@ where
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
slf.state = State::ScriptDataDoubleEscaped;
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
None => {
@@ -695,7 +696,7 @@ where
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
slf.state = State::ScriptDataDoubleEscaped;
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
None => {
@@ -1748,6 +1749,7 @@ where
State::CdataSectionBracket => match slf.read_char()? {
Some(']') => {
slf.state = State::CdataSectionEnd;
+ slf.some_offset = slf.position_before_match;
Ok(ControlToken::Continue)
}
c => {
@@ -1805,7 +1807,20 @@ where
try_read_character_reference(first_char, |x| slf.try_read_string(x, true))?
else {
slf.unread_char(Some(first_char));
- slf.flush_code_points_consumed_as_character_reference();
+
+ debug_assert_eq!(slf.temporary_buffer, "&");
+ slf.temporary_buffer.clear();
+
+ if slf.is_consumed_as_part_of_an_attribute() {
+ slf.emitter.push_attribute_value("&");
+ } else {
+ slf.emitter.emit_char(
+ '&',
+ slf.some_offset
+ ..slf.some_offset + slf.reader.len_of_char_in_current_encoding('&'),
+ );
+ }
+
slf.state = State::AmbiguousAmpersand;
return Ok(ControlToken::Continue);
};
@@ -1829,9 +1844,20 @@ where
slf.emit_error(Error::MissingSemicolonAfterCharacterReference);
}
- slf.temporary_buffer.clear();
- slf.temporary_buffer.push_str(char_ref.characters);
- slf.flush_code_points_consumed_as_character_reference();
+ if slf.is_consumed_as_part_of_an_attribute() {
+ slf.temporary_buffer.clear();
+ slf.temporary_buffer.push_str(char_ref.characters);
+ slf.emitter.push_attribute_value(&slf.temporary_buffer);
+ } else {
+ for c in char_ref.characters.chars() {
+ slf.emitter.emit_char(
+ c,
+ slf.some_offset
+ ..slf.reader.position()
+ - slf.reader.len_of_char_in_current_encoding(c),
+ );
+ }
+ }
slf.state = slf.return_state.take().unwrap();
Ok(ControlToken::Continue)
}
@@ -1998,10 +2024,16 @@ where
_ => (),
}
- slf.temporary_buffer.clear();
- slf.temporary_buffer
- .push(std::char::from_u32(slf.character_reference_code).unwrap());
- slf.flush_code_points_consumed_as_character_reference();
+ let char = std::char::from_u32(slf.character_reference_code).unwrap();
+
+ if slf.is_consumed_as_part_of_an_attribute() {
+ slf.temporary_buffer.clear();
+ slf.temporary_buffer.push(char);
+ slf.emitter.push_attribute_value(&slf.temporary_buffer);
+ } else {
+ slf.emitter
+ .emit_char(char, slf.some_offset..slf.reader.position());
+ }
slf.state = slf.return_state.take().unwrap();
Ok(ControlToken::Continue)
}