about summary refs log tree commit diff
path: root/src/tokenizer
diff options
context:
space:
mode:
author Martin Fischer <martin@push-f.com> 2023-09-15 09:40:55 +0200
committer Martin Fischer <martin@push-f.com> 2023-09-28 10:36:08 +0200
commit d46de6ab592e57a31fef13cfc015c4ce818e8f47 (patch)
tree 814654b579f0a1754193a40786f09a711adc021a /src/tokenizer
parent 2e986862920b438b253fd6e6f11d8f4e5d6f4e27 (diff)
feat: add span to Trace::Char
Diffstat (limited to 'src/tokenizer')
-rw-r--r-- src/tokenizer/machine.rs 68
-rw-r--r-- src/tokenizer/machine/utils.rs 30
2 files changed, 74 insertions, 24 deletions
diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs
index ff72a7c..d5a1f87 100644
--- a/src/tokenizer/machine.rs
+++ b/src/tokenizer/machine.rs
@@ -26,6 +26,7 @@ pub(super) struct Machine<R, O, E> {
position_before_match: O,
/// * Set to the offset of `<` in [`State::Data`].
/// * Set to the offset of `-` in [`State::Comment`].
+ /// * Set to the offset of `[` in [`State::CdataSectionBracket`].
/// * Set to the offset of `&` in [`State::CharacterReference`].
some_offset: O,
/// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`]
@@ -126,7 +127,7 @@ where
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
Some(x) => {
@@ -142,7 +143,7 @@ where
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
Some(x) => {
@@ -158,7 +159,7 @@ where
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
Some(x) => {
@@ -170,7 +171,7 @@ where
State::PlainText => match slf.read_char()? {
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
Some(x) => {
@@ -475,7 +476,7 @@ where
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
None => {
@@ -500,7 +501,7 @@ where
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
slf.state = State::ScriptDataEscaped;
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
None => {
@@ -530,7 +531,7 @@ where
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
slf.state = State::ScriptDataEscaped;
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
None => {
@@ -638,7 +639,7 @@ where
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
None => {
@@ -664,7 +665,7 @@ where
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
slf.state = State::ScriptDataDoubleEscaped;
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
None => {
@@ -695,7 +696,7 @@ where
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
slf.state = State::ScriptDataDoubleEscaped;
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
None => {
@@ -1748,6 +1749,7 @@ where
State::CdataSectionBracket => match slf.read_char()? {
Some(']') => {
slf.state = State::CdataSectionEnd;
+ slf.some_offset = slf.position_before_match;
Ok(ControlToken::Continue)
}
c => {
@@ -1805,7 +1807,20 @@ where
try_read_character_reference(first_char, |x| slf.try_read_string(x, true))?
else {
slf.unread_char(Some(first_char));
- slf.flush_code_points_consumed_as_character_reference();
+
+ debug_assert_eq!(slf.temporary_buffer, "&");
+ slf.temporary_buffer.clear();
+
+ if slf.is_consumed_as_part_of_an_attribute() {
+ slf.emitter.push_attribute_value("&");
+ } else {
+ slf.emitter.emit_char(
+ '&',
+ slf.some_offset
+ ..slf.some_offset + slf.reader.len_of_char_in_current_encoding('&'),
+ );
+ }
+
slf.state = State::AmbiguousAmpersand;
return Ok(ControlToken::Continue);
};
@@ -1829,9 +1844,20 @@ where
slf.emit_error(Error::MissingSemicolonAfterCharacterReference);
}
- slf.temporary_buffer.clear();
- slf.temporary_buffer.push_str(char_ref.characters);
- slf.flush_code_points_consumed_as_character_reference();
+ if slf.is_consumed_as_part_of_an_attribute() {
+ slf.temporary_buffer.clear();
+ slf.temporary_buffer.push_str(char_ref.characters);
+ slf.emitter.push_attribute_value(&slf.temporary_buffer);
+ } else {
+ for c in char_ref.characters.chars() {
+ slf.emitter.emit_char(
+ c,
+ slf.some_offset
+ ..slf.reader.position()
+ - slf.reader.len_of_char_in_current_encoding(c),
+ );
+ }
+ }
slf.state = slf.return_state.take().unwrap();
Ok(ControlToken::Continue)
}
@@ -1998,10 +2024,16 @@ where
_ => (),
}
- slf.temporary_buffer.clear();
- slf.temporary_buffer
- .push(std::char::from_u32(slf.character_reference_code).unwrap());
- slf.flush_code_points_consumed_as_character_reference();
+ let char = std::char::from_u32(slf.character_reference_code).unwrap();
+
+ if slf.is_consumed_as_part_of_an_attribute() {
+ slf.temporary_buffer.clear();
+ slf.temporary_buffer.push(char);
+ slf.emitter.push_attribute_value(&slf.temporary_buffer);
+ } else {
+ slf.emitter
+ .emit_char(char, slf.some_offset..slf.reader.position());
+ }
slf.state = slf.return_state.take().unwrap();
Ok(ControlToken::Continue)
}
diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs
index 9752746..4d59282 100644
--- a/src/tokenizer/machine/utils.rs
+++ b/src/tokenizer/machine/utils.rs
@@ -17,20 +17,40 @@ where
self.reader.position()
}
+ /// Emits the given character as a character token, with its span set according to the given source character.
+ ///
+ /// This method should only be used if `c != source_char`, otherwise [`Machine::emit_char`] should be used instead.
+ #[inline]
+ pub(super) fn emit_char_for_source_char(&mut self, c: char, source_char: char) {
+ let pos = self.reader.position();
+ self.emitter.emit_char(
+ c,
+ pos - self.reader.len_of_char_in_current_encoding(source_char)..pos,
+ );
+ }
+
/// Emits the given character as a character token.
+ ///
+ /// The character MUST have been present literally in the read input.
#[inline]
pub(super) fn emit_char(&mut self, c: char) {
- self.emitter.emit_char(c);
+ self.emit_char_for_source_char(c, c);
}
/// Emits every byte of the given byte slice as a character token.
///
+ /// Every byte MUST have been literally present as a character in the read input.
+ ///
/// (We're operating on bytes to enable compiler optimization,
/// since [`str::chars`] isn't `const`.)
#[inline]
pub(super) fn emit_chars(&mut self, s: &[u8]) {
+ let mut start = self.some_offset;
+
for c in s {
- self.emit_char(*c as char);
+ let end = start + self.reader.len_of_char_in_current_encoding(*c as char);
+ self.emitter.emit_char(*c as char, start..end);
+ start = end;
}
}
@@ -207,10 +227,8 @@ where
}
pub(super) fn flush_buffer_characters(&mut self) {
- for c in self.temporary_buffer.chars() {
- self.emitter.emit_char(c);
- }
- self.temporary_buffer.clear();
+ let temporary_buffer = std::mem::take(&mut self.temporary_buffer);
+ self.emit_chars(temporary_buffer.as_bytes());
}
}