summary | refs | log | tree | commit | diff
path: root/src/tokenizer/machine
diff options
context:
space:
mode:
author: Martin Fischer <martin@push-f.com> 2023-09-09 21:53:22 +0200
committer: Martin Fischer <martin@push-f.com> 2023-09-28 10:36:08 +0200
commit: 5aa3b82fbe62882da8007b0a4548b979c845aa97 (patch)
tree: 9788640728ea7894a7ff53c561ed10bff3a611c1 /src/tokenizer/machine
parent: 2c73901944e2d22747a2a4ebcc11881b3f8c2ad3 (diff)
refactor: move machine impl details to machine module
This commit separates the public API (the "Tokenizer") from the internal implementation (the "Machine") to make the code more readable.
Diffstat (limited to 'src/tokenizer/machine')
-rw-r--r-- src/tokenizer/machine/utils.rs 193
1 file changed, 193 insertions, 0 deletions
diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs
index 7d220cf..6e45f4d 100644
--- a/src/tokenizer/machine/utils.rs
+++ b/src/tokenizer/machine/utils.rs
@@ -1,3 +1,196 @@
+use crate::{
+ naive_parser::naive_next_state,
+ offset::{Offset, Position},
+ reader::Reader,
+ Emitter, Error,
+};
+
+use super::Machine;
+
+impl<R, O, E> Machine<R, O, E>
+where
+ R: Reader + Position<O>,
+ O: Offset,
+ E: Emitter<O>,
+{
+ #[inline]
+ pub(crate) fn emit_error(&mut self, error: Error) {
+ let span = match error {
+ Error::EofBeforeTagName
+ | Error::EofInCdata
+ | Error::EofInComment
+ | Error::EofInDoctype
+ | Error::EofInScriptHtmlCommentLikeText
+ | Error::EofInTag
+ | Error::MissingSemicolonAfterCharacterReference => {
+ self.reader.position()..self.reader.position()
+ }
+ Error::AbsenceOfDigitsInNumericCharacterReference
+ | Error::NullCharacterReference
+ | Error::CharacterReferenceOutsideUnicodeRange
+ | Error::SurrogateCharacterReference
+ | Error::NoncharacterCharacterReference
+ | Error::ControlCharacterReference
+ | Error::UnknownNamedCharacterReference => self.some_offset..self.reader.position(),
+
+ _ => self.position_before_match..self.reader.position(),
+ };
+ self.emitter.report_error(error, span);
+ }
+
+ /// Returns `true` if the name collected for the _current tag_ is equal to
+ /// the name recorded for the _last start tag_, and `false` otherwise.
+ ///
+ /// This is meant to be called while the current token is an end tag.
+ /// Only the two stored names are compared; whether a start tag has been
+ /// emitted at all is not checked separately.
+ ///
+ /// See also WHATWG's definition of [appropriate end tag token].
+ ///
+ /// [appropriate end tag token]: https://html.spec.whatwg.org/multipage/parsing.html#appropriate-end-tag-token
+ #[inline]
+ pub(super) fn current_end_tag_is_appropriate(&mut self) -> bool {
+     let end_tag_name = &self.current_tag_name;
+     let start_tag_name = &self.last_start_tag_name;
+     end_tag_name == start_tag_name
+ }
+
+ /// Begins a new start tag.
+ ///
+ /// Forwards `self.some_offset` and `self.position_before_match` to the
+ /// emitter's `init_start_tag`, empties the locally tracked tag name and
+ /// marks the current tag as a start tag.
+ #[inline]
+ pub(super) fn init_start_tag(&mut self) {
+     let (some_offset, before_match) = (self.some_offset, self.position_before_match);
+     self.emitter.init_start_tag(some_offset, before_match);
+     self.is_start_tag = true;
+     self.current_tag_name.clear();
+ }
+
+ /// Begins a new end tag.
+ ///
+ /// Forwards `self.some_offset` and `self.position_before_match` to the
+ /// emitter's `init_end_tag`, empties the locally tracked tag name and
+ /// marks the current tag as not being a start tag.
+ #[inline]
+ pub(super) fn init_end_tag(&mut self) {
+     let (some_offset, before_match) = (self.some_offset, self.position_before_match);
+     self.emitter.init_end_tag(some_offset, before_match);
+     self.is_start_tag = false;
+     self.current_tag_name.clear();
+ }
+
+ /// Begins a new doctype by passing `self.some_offset` to the emitter's
+ /// `init_doctype`.
+ #[inline]
+ pub(super) fn init_doctype(&mut self) {
+     let some_offset = self.some_offset;
+     self.emitter.init_doctype(some_offset);
+ }
+
+ /// Appends `s` to the name of the current tag.
+ ///
+ /// The text is recorded twice: in the machine's own `current_tag_name`
+ /// (which the machine later compares and inspects itself) and in the
+ /// emitter via `push_tag_name`.
+ #[inline]
+ pub(super) fn push_tag_name(&mut self, s: &str) {
+     self.current_tag_name.push_str(s);
+     self.emitter.push_tag_name(s);
+ }
+
+ /// Asks the emitter to emit the current tag, passing the reader's current
+ /// position.
+ ///
+ /// For start tags two more things happen afterwards:
+ ///
+ /// * if `naively_switch_state` is set, the machine state is replaced by
+ ///   whatever `naive_next_state` returns for the tag name,
+ /// * `current_tag_name` and `last_start_tag_name` are swapped, so that
+ ///   `last_start_tag_name` now holds the name of the tag just emitted.
+ ///
+ /// End tags only trigger the emitter call.
+ #[inline]
+ pub(super) fn emit_current_tag(&mut self) {
+     let position = self.reader.position();
+     self.emitter.emit_current_tag(position);
+
+     if !self.is_start_tag {
+         return;
+     }
+
+     if self.naively_switch_state {
+         let next_state = naive_next_state(&self.current_tag_name);
+         self.state = next_state.into();
+     }
+     // Swapping (rather than copying) hands the name over without touching
+     // its contents.
+     std::mem::swap(&mut self.current_tag_name, &mut self.last_start_tag_name);
+ }
+
+ /// Pushes `c` onto `to_reconsume`, from where `read_char` pops entries
+ /// before it asks the reader for more input.
+ ///
+ /// `c` is an `Option`, so `None` can be pushed back as well.
+ #[inline]
+ pub(super) fn unread_char(&mut self, c: Option<char>) {
+     let pending = &mut self.to_reconsume;
+     pending.push(c);
+ }
+
+ #[inline]
+ fn validate_char(&mut self, c: char) {
+ match c as u32 {
+ surrogate_pat!() => {
+ self.emit_error(Error::SurrogateInInputStream);
+ }
+ noncharacter_pat!() => {
+ self.emit_error(Error::NoncharacterInInputStream);
+ }
+ // control without whitespace or nul
+ x @ control_pat!()
+ if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) =>
+ {
+ self.emit_error(Error::ControlCharacterInInputStream);
+ }
+ _ => (),
+ }
+ }
+
+ /// Returns the next character of the input, or `None` if there is none.
+ ///
+ /// Entries queued in `to_reconsume` are served first; only when that is
+ /// empty is the reader asked for a character.
+ ///
+ /// A `'\r'` is returned as `'\n'`. To treat `"\r\n"` as a single newline
+ /// the character after the `'\r'` is read from the reader right away: a
+ /// `'\n'` is dropped, anything else is pushed back via `unread_char`.
+ ///
+ /// Characters that came from the reader in this call are passed through
+ /// `validate_char`; entries popped from `to_reconsume` are not.
+ ///
+ /// NOTE(review): the lookahead character pushed back after a `'\r'` is
+ /// later popped as a "reconsumed" entry, so it appears to never reach
+ /// `validate_char` -- confirm whether that is intended.
+ ///
+ /// # Errors
+ ///
+ /// Propagates any error returned by the reader.
+ pub(super) fn read_char(&mut self) -> Result<Option<char>, R::Error> {
+     let (next, reconsumed) = match self.to_reconsume.pop() {
+         Some(pending) => (pending, true),
+         None => (self.reader.read_char()?, false),
+     };
+
+     let raw = match next {
+         Some(raw) => raw,
+         None => return Ok(None),
+     };
+
+     let c = if raw == '\r' {
+         let lookahead = self.reader.read_char()?;
+         if lookahead != Some('\n') {
+             self.unread_char(lookahead);
+         }
+         '\n'
+     } else {
+         raw
+     };
+
+     if !reconsumed {
+         self.validate_char(c);
+     }
+
+     Ok(Some(c))
+ }
+
+ /// Tries to consume the string `s` from the input and reports whether
+ /// that succeeded.
+ ///
+ /// Entries in `to_reconsume` are matched against the beginning of `s`
+ /// first, one character at a time. With `case_sensitive` set to `false`
+ /// the comparison ignores ASCII case. Once `to_reconsume` is drained, the
+ /// part of `s` that has not been matched yet is handed to the reader's
+ /// `try_read_string`.
+ ///
+ /// If a queued entry does not match (including a queued `None` or `s`
+ /// running out of characters), `to_reconsume` is restored to the value it
+ /// had on entry and `Ok(false)` is returned without consulting the reader.
+ ///
+ /// `s` must not be empty (checked with a `debug_assert!`).
+ ///
+ /// # Errors
+ ///
+ /// Propagates any error returned by the reader.
+ #[inline]
+ pub(super) fn try_read_string(
+     &mut self,
+     mut s: &str,
+     case_sensitive: bool,
+ ) -> Result<bool, R::Error> {
+     debug_assert!(!s.is_empty());
+
+     // Snapshot so that a partial match can be rolled back.
+     let snapshot = self.to_reconsume;
+     let mut expected = s.chars();
+
+     while let Some(pending) = self.to_reconsume.pop() {
+         let matched = match (pending, expected.next()) {
+             (Some(got), Some(want))
+                 if got == want || (!case_sensitive && got.eq_ignore_ascii_case(&want)) =>
+             {
+                 got
+             }
+             _ => {
+                 self.to_reconsume = snapshot;
+                 return Ok(false);
+             }
+         };
+         // Drop the matched character from what the reader still has to match.
+         s = &s[matched.len_utf8()..];
+     }
+
+     self.reader.try_read_string(s, case_sensitive)
+ }
+
+ /// Returns `true` if `return_state` is set to one of the three attribute
+ /// value states (double-quoted, single-quoted or unquoted), and `false`
+ /// for any other state or when no return state is set.
+ pub(super) fn is_consumed_as_part_of_an_attribute(&self) -> bool {
+     match self.return_state {
+         Some(State::AttributeValueDoubleQuoted)
+         | Some(State::AttributeValueSingleQuoted)
+         | Some(State::AttributeValueUnquoted) => true,
+         _ => false,
+     }
+ }
+
+ /// Flushes `temporary_buffer` to wherever its contents belong and leaves
+ /// the buffer empty.
+ ///
+ /// If `is_consumed_as_part_of_an_attribute` returns `true`, the buffer is
+ /// appended to the current attribute value through the emitter;
+ /// otherwise it is flushed via `flush_buffer_characters`.
+ pub(super) fn flush_code_points_consumed_as_character_reference(&mut self) {
+     if !self.is_consumed_as_part_of_an_attribute() {
+         self.flush_buffer_characters();
+         return;
+     }
+
+     self.emitter.push_attribute_value(&self.temporary_buffer);
+     self.temporary_buffer.clear();
+ }
+
+ /// Passes the contents of `temporary_buffer` to the emitter's
+ /// `emit_string` and then clears the buffer.
+ pub(super) fn flush_buffer_characters(&mut self) {
+     let Self {
+         emitter,
+         temporary_buffer,
+         ..
+     } = self;
+     emitter.emit_string(&*temporary_buffer);
+     temporary_buffer.clear();
+ }
+}
+
macro_rules! surrogate_pat {
() => {
0xd800..=0xdfff