diff options
Diffstat (limited to 'src/tokenizer')
| -rw-r--r-- | src/tokenizer/machine.rs | 68 | ||||
| -rw-r--r-- | src/tokenizer/machine/utils.rs | 30 | 
2 files changed, 74 insertions, 24 deletions
diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs index ff72a7c..d5a1f87 100644 --- a/src/tokenizer/machine.rs +++ b/src/tokenizer/machine.rs @@ -26,6 +26,7 @@ pub(super) struct Machine<R, O, E> {      position_before_match: O,      /// * Set to the offset of `<` in [`State::Data`].      /// * Set to the offset of `-` in [`State::Comment`]. +    /// * Set to the offset of `[` in [`State::CdataSectionBracket`].      /// * Set to the offset of `&` in [`State::CharacterReference`].      some_offset: O,      /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`] @@ -126,7 +127,7 @@ where              }              Some('\0') => {                  slf.emit_error(Error::UnexpectedNullCharacter); -                slf.emit_char('\u{fffd}'); +                slf.emit_char_for_source_char('\u{fffd}', '\0');                  Ok(ControlToken::Continue)              }              Some(x) => { @@ -142,7 +143,7 @@ where              }              Some('\0') => {                  slf.emit_error(Error::UnexpectedNullCharacter); -                slf.emit_char('\u{fffd}'); +                slf.emit_char_for_source_char('\u{fffd}', '\0');                  Ok(ControlToken::Continue)              }              Some(x) => { @@ -158,7 +159,7 @@ where              }              Some('\0') => {                  slf.emit_error(Error::UnexpectedNullCharacter); -                slf.emit_char('\u{fffd}'); +                slf.emit_char_for_source_char('\u{fffd}', '\0');                  Ok(ControlToken::Continue)              }              Some(x) => { @@ -170,7 +171,7 @@ where          State::PlainText => match slf.read_char()? {              Some('\0') => {                  slf.emit_error(Error::UnexpectedNullCharacter); -                slf.emit_char('\u{fffd}'); +                slf.emit_char_for_source_char('\u{fffd}', '\0');                  Ok(ControlToken::Continue)              }              Some(x) => { @@ -475,7 +476,7 @@ where              }              Some('\0') => {                  slf.emit_error(Error::UnexpectedNullCharacter); -                slf.emit_char('\u{fffd}'); +                slf.emit_char_for_source_char('\u{fffd}', '\0');                  Ok(ControlToken::Continue)              }              None => { @@ -500,7 +501,7 @@ where              Some('\0') => {                  slf.emit_error(Error::UnexpectedNullCharacter);                  slf.state = State::ScriptDataEscaped; -                slf.emit_char('\u{fffd}'); +                slf.emit_char_for_source_char('\u{fffd}', '\0');                  Ok(ControlToken::Continue)              }              None => { @@ -530,7 +531,7 @@ where              Some('\0') => {                  slf.emit_error(Error::UnexpectedNullCharacter);                  slf.state = State::ScriptDataEscaped; -                slf.emit_char('\u{fffd}'); +                slf.emit_char_for_source_char('\u{fffd}', '\0');                  Ok(ControlToken::Continue)              }              None => { @@ -638,7 +639,7 @@ where              }              Some('\0') => {                  slf.emit_error(Error::UnexpectedNullCharacter); -                slf.emit_char('\u{fffd}'); +                slf.emit_char_for_source_char('\u{fffd}', '\0');                  Ok(ControlToken::Continue)              }              None => { @@ -664,7 +665,7 @@ where              Some('\0') => {                  slf.emit_error(Error::UnexpectedNullCharacter);                  slf.state = State::ScriptDataDoubleEscaped; -                slf.emit_char('\u{fffd}'); +                slf.emit_char_for_source_char('\u{fffd}', '\0');                  Ok(ControlToken::Continue)              }              None => { @@ -695,7 +696,7 @@ where              Some('\0') => {                  slf.emit_error(Error::UnexpectedNullCharacter);                  slf.state = State::ScriptDataDoubleEscaped; -                slf.emit_char('\u{fffd}'); +                slf.emit_char_for_source_char('\u{fffd}', '\0');                  Ok(ControlToken::Continue)              }              None => { @@ -1748,6 +1749,7 @@ where          State::CdataSectionBracket => match slf.read_char()? {              Some(']') => {                  slf.state = State::CdataSectionEnd; +                slf.some_offset = slf.position_before_match;                  Ok(ControlToken::Continue)              }              c => { @@ -1805,7 +1807,20 @@ where                  try_read_character_reference(first_char, |x| slf.try_read_string(x, true))?              else {                  slf.unread_char(Some(first_char)); -                slf.flush_code_points_consumed_as_character_reference(); + +                debug_assert_eq!(slf.temporary_buffer, "&"); +                slf.temporary_buffer.clear(); + +                if slf.is_consumed_as_part_of_an_attribute() { +                    slf.emitter.push_attribute_value("&"); +                } else { +                    slf.emitter.emit_char( +                        '&', +                        slf.some_offset +                            ..slf.some_offset + slf.reader.len_of_char_in_current_encoding('&'), +                    ); +                } +                  slf.state = State::AmbiguousAmpersand;                  return Ok(ControlToken::Continue);              }; @@ -1829,9 +1844,20 @@ where                      slf.emit_error(Error::MissingSemicolonAfterCharacterReference);                  } -                slf.temporary_buffer.clear(); -                slf.temporary_buffer.push_str(char_ref.characters); -                slf.flush_code_points_consumed_as_character_reference(); +                if slf.is_consumed_as_part_of_an_attribute() { +                    slf.temporary_buffer.clear(); +                    slf.temporary_buffer.push_str(char_ref.characters); +                    slf.emitter.push_attribute_value(&slf.temporary_buffer); +                } else { +                    for c in char_ref.characters.chars() { +                        slf.emitter.emit_char( +                            c, +                            slf.some_offset +                                ..slf.reader.position() +                                    - slf.reader.len_of_char_in_current_encoding(c), +                        ); +                    } +                }                  slf.state = slf.return_state.take().unwrap();                  Ok(ControlToken::Continue)              } @@ -1998,10 +2024,16 @@ where                  _ => (),              } -            slf.temporary_buffer.clear(); -            slf.temporary_buffer -                .push(std::char::from_u32(slf.character_reference_code).unwrap()); -            slf.flush_code_points_consumed_as_character_reference(); +            let char = std::char::from_u32(slf.character_reference_code).unwrap(); + +            if slf.is_consumed_as_part_of_an_attribute() { +                slf.temporary_buffer.clear(); +                slf.temporary_buffer.push(char); +                slf.emitter.push_attribute_value(&slf.temporary_buffer); +            } else { +                slf.emitter +                    .emit_char(char, slf.some_offset..slf.reader.position()); +            }              slf.state = slf.return_state.take().unwrap();              Ok(ControlToken::Continue)          } diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs index 9752746..4d59282 100644 --- a/src/tokenizer/machine/utils.rs +++ b/src/tokenizer/machine/utils.rs @@ -17,20 +17,40 @@ where          self.reader.position()      } +    /// Emits the given character as a character token, with its span set according to the given source character. +    /// +    /// This method should only be used if `c != source_char`, otherwise [`Machine::emit_char`] should be used instead. +    #[inline] +    pub(super) fn emit_char_for_source_char(&mut self, c: char, source_char: char) { +        let pos = self.reader.position(); +        self.emitter.emit_char( +            c, +            pos - self.reader.len_of_char_in_current_encoding(source_char)..pos, +        ); +    } +      /// Emits the given character as a character token. +    /// +    /// The character MUST have been present literally in the read input.      #[inline]      pub(super) fn emit_char(&mut self, c: char) { -        self.emitter.emit_char(c); +        self.emit_char_for_source_char(c, c);      }      /// Emits every byte of the given byte slice as a character token.      /// +    /// Every byte MUST have been literally present as a character in the read input. +    ///      /// (We're operating on bytes to enable compiler optimization,      /// since [`str::chars`] isn't `const`.)      #[inline]      pub(super) fn emit_chars(&mut self, s: &[u8]) { +        let mut start = self.some_offset; +          for c in s { -            self.emit_char(*c as char); +            let end = start + self.reader.len_of_char_in_current_encoding(*c as char); +            self.emitter.emit_char(*c as char, start..end); +            start = end;          }      } @@ -207,10 +227,8 @@ where      }      pub(super) fn flush_buffer_characters(&mut self) { -        for c in self.temporary_buffer.chars() { -            self.emitter.emit_char(c); -        } -        self.temporary_buffer.clear(); +        let temporary_buffer = std::mem::take(&mut self.temporary_buffer); +        self.emit_chars(temporary_buffer.as_bytes());      }  }  | 
