diff options
Diffstat (limited to 'src/tokenizer')
-rw-r--r-- | src/tokenizer/machine.rs | 68 | ||||
-rw-r--r-- | src/tokenizer/machine/utils.rs | 30 |
2 files changed, 74 insertions, 24 deletions
diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs index ff72a7c..d5a1f87 100644 --- a/src/tokenizer/machine.rs +++ b/src/tokenizer/machine.rs @@ -26,6 +26,7 @@ pub(super) struct Machine<R, O, E> { position_before_match: O, /// * Set to the offset of `<` in [`State::Data`]. /// * Set to the offset of `-` in [`State::Comment`]. + /// * Set to the offset of `[` in [`State::CdataSectionBracket`]. /// * Set to the offset of `&` in [`State::CharacterReference`]. some_offset: O, /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`] @@ -126,7 +127,7 @@ where } Some('\0') => { slf.emit_error(Error::UnexpectedNullCharacter); - slf.emit_char('\u{fffd}'); + slf.emit_char_for_source_char('\u{fffd}', '\0'); Ok(ControlToken::Continue) } Some(x) => { @@ -142,7 +143,7 @@ where } Some('\0') => { slf.emit_error(Error::UnexpectedNullCharacter); - slf.emit_char('\u{fffd}'); + slf.emit_char_for_source_char('\u{fffd}', '\0'); Ok(ControlToken::Continue) } Some(x) => { @@ -158,7 +159,7 @@ where } Some('\0') => { slf.emit_error(Error::UnexpectedNullCharacter); - slf.emit_char('\u{fffd}'); + slf.emit_char_for_source_char('\u{fffd}', '\0'); Ok(ControlToken::Continue) } Some(x) => { @@ -170,7 +171,7 @@ where State::PlainText => match slf.read_char()? { Some('\0') => { slf.emit_error(Error::UnexpectedNullCharacter); - slf.emit_char('\u{fffd}'); + slf.emit_char_for_source_char('\u{fffd}', '\0'); Ok(ControlToken::Continue) } Some(x) => { @@ -475,7 +476,7 @@ where } Some('\0') => { slf.emit_error(Error::UnexpectedNullCharacter); - slf.emit_char('\u{fffd}'); + slf.emit_char_for_source_char('\u{fffd}', '\0'); Ok(ControlToken::Continue) } None => { @@ -500,7 +501,7 @@ where Some('\0') => { slf.emit_error(Error::UnexpectedNullCharacter); slf.state = State::ScriptDataEscaped; - slf.emit_char('\u{fffd}'); + slf.emit_char_for_source_char('\u{fffd}', '\0'); Ok(ControlToken::Continue) } None => { @@ -530,7 +531,7 @@ where Some('\0') => { slf.emit_error(Error::UnexpectedNullCharacter); slf.state = State::ScriptDataEscaped; - slf.emit_char('\u{fffd}'); + slf.emit_char_for_source_char('\u{fffd}', '\0'); Ok(ControlToken::Continue) } None => { @@ -638,7 +639,7 @@ where } Some('\0') => { slf.emit_error(Error::UnexpectedNullCharacter); - slf.emit_char('\u{fffd}'); + slf.emit_char_for_source_char('\u{fffd}', '\0'); Ok(ControlToken::Continue) } None => { @@ -664,7 +665,7 @@ where Some('\0') => { slf.emit_error(Error::UnexpectedNullCharacter); slf.state = State::ScriptDataDoubleEscaped; - slf.emit_char('\u{fffd}'); + slf.emit_char_for_source_char('\u{fffd}', '\0'); Ok(ControlToken::Continue) } None => { @@ -695,7 +696,7 @@ where Some('\0') => { slf.emit_error(Error::UnexpectedNullCharacter); slf.state = State::ScriptDataDoubleEscaped; - slf.emit_char('\u{fffd}'); + slf.emit_char_for_source_char('\u{fffd}', '\0'); Ok(ControlToken::Continue) } None => { @@ -1748,6 +1749,7 @@ where State::CdataSectionBracket => match slf.read_char()? { Some(']') => { slf.state = State::CdataSectionEnd; + slf.some_offset = slf.position_before_match; Ok(ControlToken::Continue) } c => { @@ -1805,7 +1807,20 @@ where try_read_character_reference(first_char, |x| slf.try_read_string(x, true))? else { slf.unread_char(Some(first_char)); - slf.flush_code_points_consumed_as_character_reference(); + + debug_assert_eq!(slf.temporary_buffer, "&"); + slf.temporary_buffer.clear(); + + if slf.is_consumed_as_part_of_an_attribute() { + slf.emitter.push_attribute_value("&"); + } else { + slf.emitter.emit_char( + '&', + slf.some_offset + ..slf.some_offset + slf.reader.len_of_char_in_current_encoding('&'), + ); + } + slf.state = State::AmbiguousAmpersand; return Ok(ControlToken::Continue); }; @@ -1829,9 +1844,20 @@ where slf.emit_error(Error::MissingSemicolonAfterCharacterReference); } - slf.temporary_buffer.clear(); - slf.temporary_buffer.push_str(char_ref.characters); - slf.flush_code_points_consumed_as_character_reference(); + if slf.is_consumed_as_part_of_an_attribute() { + slf.temporary_buffer.clear(); + slf.temporary_buffer.push_str(char_ref.characters); + slf.emitter.push_attribute_value(&slf.temporary_buffer); + } else { + for c in char_ref.characters.chars() { + slf.emitter.emit_char( + c, + slf.some_offset + ..slf.reader.position() + - slf.reader.len_of_char_in_current_encoding(c), + ); + } + } slf.state = slf.return_state.take().unwrap(); Ok(ControlToken::Continue) } @@ -1998,10 +2024,16 @@ where _ => (), } - slf.temporary_buffer.clear(); - slf.temporary_buffer - .push(std::char::from_u32(slf.character_reference_code).unwrap()); - slf.flush_code_points_consumed_as_character_reference(); + let char = std::char::from_u32(slf.character_reference_code).unwrap(); + + if slf.is_consumed_as_part_of_an_attribute() { + slf.temporary_buffer.clear(); + slf.temporary_buffer.push(char); + slf.emitter.push_attribute_value(&slf.temporary_buffer); + } else { + slf.emitter + .emit_char(char, slf.some_offset..slf.reader.position()); + } slf.state = slf.return_state.take().unwrap(); Ok(ControlToken::Continue) } diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs index 9752746..4d59282 100644 --- a/src/tokenizer/machine/utils.rs +++ b/src/tokenizer/machine/utils.rs @@ -17,20 +17,40 @@ where self.reader.position() } + /// Emits the given character as a character token, with its span set according to the given source character. + /// + /// This method should only be used if `c != source_char`, otherwise [`Machine::emit_char`] should be used instead. + #[inline] + pub(super) fn emit_char_for_source_char(&mut self, c: char, source_char: char) { + let pos = self.reader.position(); + self.emitter.emit_char( + c, + pos - self.reader.len_of_char_in_current_encoding(source_char)..pos, + ); + } + /// Emits the given character as a character token. + /// + /// The character MUST have been present literally in the read input. #[inline] pub(super) fn emit_char(&mut self, c: char) { - self.emitter.emit_char(c); + self.emit_char_for_source_char(c, c); } /// Emits every byte of the given byte slice as a character token. /// + /// Every byte MUST have been literally present as a character in the read input. + /// /// (We're operating on bytes to enable compiler optimization, /// since [`str::chars`] isn't `const`.) #[inline] pub(super) fn emit_chars(&mut self, s: &[u8]) { + let mut start = self.some_offset; + for c in s { - self.emit_char(*c as char); + let end = start + self.reader.len_of_char_in_current_encoding(*c as char); + self.emitter.emit_char(*c as char, start..end); + start = end; } } @@ -207,10 +227,8 @@ where } pub(super) fn flush_buffer_characters(&mut self) { - for c in self.temporary_buffer.chars() { - self.emitter.emit_char(c); - } - self.temporary_buffer.clear(); + let temporary_buffer = std::mem::take(&mut self.temporary_buffer); + self.emit_chars(temporary_buffer.as_bytes()); } } |