diff options
Diffstat (limited to 'src/tokenizer/machine.rs')
-rw-r--r-- | src/tokenizer/machine.rs | 68 |
1 files changed, 50 insertions, 18 deletions
diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs index ff72a7c..d5a1f87 100644 --- a/src/tokenizer/machine.rs +++ b/src/tokenizer/machine.rs @@ -26,6 +26,7 @@ pub(super) struct Machine<R, O, E> { position_before_match: O, /// * Set to the offset of `<` in [`State::Data`]. /// * Set to the offset of `-` in [`State::Comment`]. + /// * Set to the offset of `[` in [`State::CdataSectionBracket`]. /// * Set to the offset of `&` in [`State::CharacterReference`]. some_offset: O, /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`] @@ -126,7 +127,7 @@ where } Some('\0') => { slf.emit_error(Error::UnexpectedNullCharacter); - slf.emit_char('\u{fffd}'); + slf.emit_char_for_source_char('\u{fffd}', '\0'); Ok(ControlToken::Continue) } Some(x) => { @@ -142,7 +143,7 @@ where } Some('\0') => { slf.emit_error(Error::UnexpectedNullCharacter); - slf.emit_char('\u{fffd}'); + slf.emit_char_for_source_char('\u{fffd}', '\0'); Ok(ControlToken::Continue) } Some(x) => { @@ -158,7 +159,7 @@ where } Some('\0') => { slf.emit_error(Error::UnexpectedNullCharacter); - slf.emit_char('\u{fffd}'); + slf.emit_char_for_source_char('\u{fffd}', '\0'); Ok(ControlToken::Continue) } Some(x) => { @@ -170,7 +171,7 @@ where State::PlainText => match slf.read_char()? { Some('\0') => { slf.emit_error(Error::UnexpectedNullCharacter); - slf.emit_char('\u{fffd}'); + slf.emit_char_for_source_char('\u{fffd}', '\0'); Ok(ControlToken::Continue) } Some(x) => { @@ -475,7 +476,7 @@ where } Some('\0') => { slf.emit_error(Error::UnexpectedNullCharacter); - slf.emit_char('\u{fffd}'); + slf.emit_char_for_source_char('\u{fffd}', '\0'); Ok(ControlToken::Continue) } None => { @@ -500,7 +501,7 @@ where Some('\0') => { slf.emit_error(Error::UnexpectedNullCharacter); slf.state = State::ScriptDataEscaped; - slf.emit_char('\u{fffd}'); + slf.emit_char_for_source_char('\u{fffd}', '\0'); Ok(ControlToken::Continue) } None => { @@ -530,7 +531,7 @@ where Some('\0') => { slf.emit_error(Error::UnexpectedNullCharacter); slf.state = State::ScriptDataEscaped; - slf.emit_char('\u{fffd}'); + slf.emit_char_for_source_char('\u{fffd}', '\0'); Ok(ControlToken::Continue) } None => { @@ -638,7 +639,7 @@ where } Some('\0') => { slf.emit_error(Error::UnexpectedNullCharacter); - slf.emit_char('\u{fffd}'); + slf.emit_char_for_source_char('\u{fffd}', '\0'); Ok(ControlToken::Continue) } None => { @@ -664,7 +665,7 @@ where Some('\0') => { slf.emit_error(Error::UnexpectedNullCharacter); slf.state = State::ScriptDataDoubleEscaped; - slf.emit_char('\u{fffd}'); + slf.emit_char_for_source_char('\u{fffd}', '\0'); Ok(ControlToken::Continue) } None => { @@ -695,7 +696,7 @@ where Some('\0') => { slf.emit_error(Error::UnexpectedNullCharacter); slf.state = State::ScriptDataDoubleEscaped; - slf.emit_char('\u{fffd}'); + slf.emit_char_for_source_char('\u{fffd}', '\0'); Ok(ControlToken::Continue) } None => { @@ -1748,6 +1749,7 @@ where State::CdataSectionBracket => match slf.read_char()? { Some(']') => { slf.state = State::CdataSectionEnd; + slf.some_offset = slf.position_before_match; Ok(ControlToken::Continue) } c => { @@ -1805,7 +1807,20 @@ where try_read_character_reference(first_char, |x| slf.try_read_string(x, true))? else { slf.unread_char(Some(first_char)); - slf.flush_code_points_consumed_as_character_reference(); + + debug_assert_eq!(slf.temporary_buffer, "&"); + slf.temporary_buffer.clear(); + + if slf.is_consumed_as_part_of_an_attribute() { + slf.emitter.push_attribute_value("&"); + } else { + slf.emitter.emit_char( + '&', + slf.some_offset + ..slf.some_offset + slf.reader.len_of_char_in_current_encoding('&'), + ); + } + slf.state = State::AmbiguousAmpersand; return Ok(ControlToken::Continue); }; @@ -1829,9 +1844,20 @@ where slf.emit_error(Error::MissingSemicolonAfterCharacterReference); } - slf.temporary_buffer.clear(); - slf.temporary_buffer.push_str(char_ref.characters); - slf.flush_code_points_consumed_as_character_reference(); + if slf.is_consumed_as_part_of_an_attribute() { + slf.temporary_buffer.clear(); + slf.temporary_buffer.push_str(char_ref.characters); + slf.emitter.push_attribute_value(&slf.temporary_buffer); + } else { + for c in char_ref.characters.chars() { + slf.emitter.emit_char( + c, + slf.some_offset + ..slf.reader.position() + - slf.reader.len_of_char_in_current_encoding(c), + ); + } + } slf.state = slf.return_state.take().unwrap(); Ok(ControlToken::Continue) } @@ -1998,10 +2024,16 @@ where _ => (), } - slf.temporary_buffer.clear(); - slf.temporary_buffer - .push(std::char::from_u32(slf.character_reference_code).unwrap()); - slf.flush_code_points_consumed_as_character_reference(); + let char = std::char::from_u32(slf.character_reference_code).unwrap(); + + if slf.is_consumed_as_part_of_an_attribute() { + slf.temporary_buffer.clear(); + slf.temporary_buffer.push(char); + slf.emitter.push_attribute_value(&slf.temporary_buffer); + } else { + slf.emitter + .emit_char(char, slf.some_offset..slf.reader.position()); + } slf.state = slf.return_state.take().unwrap(); Ok(ControlToken::Continue) } |