summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- src/naive_parser.rs 6
-rw-r--r-- src/tokenizer.rs 263
-rw-r--r-- src/tokenizer/machine.rs 90
-rw-r--r-- src/tokenizer/machine/utils.rs 193
4 files changed, 293 insertions, 259 deletions
diff --git a/src/naive_parser.rs b/src/naive_parser.rs
index 5bf002b..c5e9568 100644
--- a/src/naive_parser.rs
+++ b/src/naive_parser.rs
@@ -35,7 +35,7 @@ impl<R: Reader + Position<O>, O: Offset> NaiveParser<R, O, DefaultEmitter<O>> {
// TODO: add example for NaiveParser::new
pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self {
let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default());
- tokenizer.naively_switch_state = true;
+ tokenizer.enable_naive_state_switching();
NaiveParser { tokenizer }
}
}
@@ -45,7 +45,7 @@ impl<R: Reader + Position<usize>> NaiveParser<R, usize, DefaultEmitter<usize>> {
// TODO: add example for NaiveParser::new_with_spans
pub fn new_with_spans<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self {
let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default());
- tokenizer.naively_switch_state = true;
+ tokenizer.enable_naive_state_switching();
NaiveParser { tokenizer }
}
}
@@ -55,7 +55,7 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> NaiveParser<R, O, E> {
// TODO: add example for NaiveParser::new_with_emitter
pub fn new_with_emitter<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self {
let mut tokenizer = Tokenizer::new(reader, emitter);
- tokenizer.naively_switch_state = true;
+ tokenizer.enable_naive_state_switching();
NaiveParser { tokenizer }
}
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 6f698f6..7c38e49 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1,41 +1,13 @@
-mod machine;
+pub(crate) mod machine;
-use crate::naive_parser::naive_next_state;
use crate::offset::{Offset, Position};
use crate::reader::{IntoReader, Reader};
-use crate::{Emitter, Error};
-use machine::utils::{control_pat, noncharacter_pat, surrogate_pat};
+use crate::Emitter;
use machine::ControlToken;
#[cfg(feature = "integration-tests")]
pub use machine::State as InternalState;
-// this is a stack that can hold 0 to 2 Ts
-#[derive(Debug, Default, Clone, Copy)]
-struct Stack2<T: Copy>(Option<(T, Option<T>)>);
-
-impl<T: Copy> Stack2<T> {
- #[inline]
- fn push(&mut self, c: T) {
- self.0 = match self.0 {
- None => Some((c, None)),
- Some((c1, None)) => Some((c1, Some(c))),
- Some((_c1, Some(_c2))) => panic!("stack full!"),
- }
- }
-
- #[inline]
- fn pop(&mut self) -> Option<T> {
- let (new_self, rv) = match self.0 {
- Some((c1, Some(c2))) => (Some((c1, None)), Some(c2)),
- Some((c1, None)) => (None, Some(c1)),
- None => (None, None),
- };
- self.0 = new_self;
- rv
- }
-}
-
/// An HTML tokenizer.
///
/// # Warning
@@ -56,27 +28,9 @@ impl<T: Copy> Stack2<T> {
///
/// [`NaiveParser`]: crate::NaiveParser
/// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
-pub struct Tokenizer<R: Reader, O, E: Emitter<O>> {
+pub struct Tokenizer<R, O, E> {
+ machine: machine::Machine<R, O, E>,
eof: bool,
- pub(crate) state: machine::State,
- pub(crate) emitter: E,
- pub(crate) temporary_buffer: String,
- pub(crate) reader: R,
- to_reconsume: Stack2<Option<char>>,
- pub(crate) character_reference_code: u32,
- pub(crate) return_state: Option<machine::State>,
- current_tag_name: String,
- last_start_tag_name: String,
- is_start_tag: bool,
- /// The reader position before the match block in [`machine::consume`].
- pub(crate) position_before_match: O,
- /// * Set to the offset of `<` in [`machine::State::Data`].
- /// * Set to the offset of `-` in [`machine::State::Comment`].
- /// * Set to the offset of `&` in [`machine::State::CharacterReference`].
- pub(crate) some_offset: O,
- /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`]
- /// (it cannot call [`Tokenizer::set_state`] using the emitted start tags since they can be of an arbitrary type).
- pub(crate) naively_switch_state: bool,
}
impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
@@ -88,20 +42,8 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
/// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self {
Tokenizer {
- reader: reader.into_reader(),
- emitter,
- state: machine::State::Data,
- to_reconsume: Stack2::default(),
- return_state: None,
- temporary_buffer: String::new(),
- character_reference_code: 0,
+ machine: machine::Machine::new(reader.into_reader(), emitter),
eof: false,
- current_tag_name: String::new(),
- last_start_tag_name: String::new(),
- is_start_tag: false,
- position_before_match: O::default(),
- some_offset: O::default(),
- naively_switch_state: false,
}
}
@@ -114,12 +56,12 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
///
/// [Markup declaration open state]: https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
pub fn handle_cdata_open(&mut self, action: CdataAction) {
- machine::handle_cdata_open(self, action);
+ machine::handle_cdata_open(&mut self.machine, action);
}
/// Returns a mutable reference to the emitter.
pub fn emitter_mut(&mut self) -> &mut E {
- &mut self.emitter
+ &mut self.machine.emitter
}
}
@@ -197,185 +139,12 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
/// Only available with the `integration-tests` feature which is not public API.
#[cfg(feature = "integration-tests")]
pub fn set_internal_state(&mut self, state: InternalState) {
- self.state = state;
+ self.machine.state = state;
}
/// Set the statemachine to start/continue in the given state.
pub fn set_state(&mut self, state: State) {
- self.state = state.into();
- }
-
- /// Just a helper method for the machine.
- #[inline]
- pub(crate) fn emit_error(&mut self, error: Error) {
- let span = match error {
- Error::EofBeforeTagName
- | Error::EofInCdata
- | Error::EofInComment
- | Error::EofInDoctype
- | Error::EofInScriptHtmlCommentLikeText
- | Error::EofInTag
- | Error::MissingSemicolonAfterCharacterReference => {
- self.reader.position()..self.reader.position()
- }
- Error::AbsenceOfDigitsInNumericCharacterReference
- | Error::NullCharacterReference
- | Error::CharacterReferenceOutsideUnicodeRange
- | Error::SurrogateCharacterReference
- | Error::NoncharacterCharacterReference
- | Error::ControlCharacterReference
- | Error::UnknownNamedCharacterReference => self.some_offset..self.reader.position(),
-
- _ => self.position_before_match..self.reader.position(),
- };
- self.emitter.report_error(error, span);
- }
-
- /// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise.
- ///
- /// * the _last start tag_ exists
- /// * the current end tag token's name equals to the last start tag's name.
- ///
- /// See also WHATWG's definition of [appropriate end tag token].
- ///
- /// [appropriate end tag token]: https://html.spec.whatwg.org/multipage/parsing.html#appropriate-end-tag-token
- #[inline]
- pub(crate) fn current_end_tag_is_appropriate(&mut self) -> bool {
- self.current_tag_name == self.last_start_tag_name
- }
-
- #[inline]
- pub(crate) fn init_start_tag(&mut self) {
- self.emitter
- .init_start_tag(self.some_offset, self.position_before_match);
- self.current_tag_name.clear();
- self.is_start_tag = true;
- }
-
- #[inline]
- pub(crate) fn init_end_tag(&mut self) {
- self.emitter
- .init_end_tag(self.some_offset, self.position_before_match);
- self.current_tag_name.clear();
- self.is_start_tag = false;
- }
-
- #[inline]
- pub(crate) fn push_tag_name(&mut self, s: &str) {
- self.emitter.push_tag_name(s);
- self.current_tag_name.push_str(s);
- }
-
- #[inline]
- pub(crate) fn emit_current_tag(&mut self) {
- self.emitter.emit_current_tag(self.reader.position());
- if self.is_start_tag {
- if self.naively_switch_state {
- self.state = naive_next_state(&self.current_tag_name).into();
- }
- std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name);
- }
- }
-
- #[inline]
- pub(crate) fn unread_char(&mut self, c: Option<char>) {
- self.to_reconsume.push(c);
- }
-
- #[inline]
- fn validate_char(&mut self, c: char) {
- match c as u32 {
- surrogate_pat!() => {
- self.emit_error(Error::SurrogateInInputStream);
- }
- noncharacter_pat!() => {
- self.emit_error(Error::NoncharacterInInputStream);
- }
- // control without whitespace or nul
- x @ control_pat!()
- if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) =>
- {
- self.emit_error(Error::ControlCharacterInInputStream);
- }
- _ => (),
- }
- }
-
- pub(crate) fn read_char(&mut self) -> Result<Option<char>, R::Error> {
- let (c_res, reconsumed) = match self.to_reconsume.pop() {
- Some(c) => (Ok(c), true),
- None => (self.reader.read_char(), false),
- };
-
- let mut c = match c_res {
- Ok(Some(c)) => c,
- res => return res,
- };
-
- if c == '\r' {
- c = '\n';
- let c2 = self.reader.read_char()?;
- if c2 != Some('\n') {
- self.unread_char(c2);
- }
- }
-
- if !reconsumed {
- self.validate_char(c);
- }
-
- Ok(Some(c))
- }
-
- #[inline]
- pub(crate) fn try_read_string(
- &mut self,
- mut s: &str,
- case_sensitive: bool,
- ) -> Result<bool, R::Error> {
- debug_assert!(!s.is_empty());
-
- let to_reconsume_bak = self.to_reconsume;
- let mut chars = s.chars();
- while let Some(c) = self.to_reconsume.pop() {
- if let (Some(x), Some(x2)) = (c, chars.next()) {
- if x == x2 || (!case_sensitive && x.to_ascii_lowercase() == x2.to_ascii_lowercase())
- {
- s = &s[x.len_utf8()..];
- continue;
- }
- }
-
- self.to_reconsume = to_reconsume_bak;
- return Ok(false);
- }
-
- self.reader.try_read_string(s, case_sensitive)
- }
-
- pub(crate) fn is_consumed_as_part_of_an_attribute(&self) -> bool {
- matches!(
- self.return_state,
- Some(
- machine::State::AttributeValueDoubleQuoted
- | machine::State::AttributeValueSingleQuoted
- | machine::State::AttributeValueUnquoted
- )
- )
- }
-
- pub(crate) fn flush_code_points_consumed_as_character_reference(&mut self) {
- if self.is_consumed_as_part_of_an_attribute() {
- self.emitter.push_attribute_value(&self.temporary_buffer);
- self.temporary_buffer.clear();
- } else {
- self.flush_buffer_characters();
- }
- }
-
- pub(crate) fn flush_buffer_characters(&mut self) {
- self.emitter.emit_string(&self.temporary_buffer);
- self.temporary_buffer.clear();
+ self.machine.state = state.into();
}
}
@@ -389,7 +158,7 @@ where
fn next(&mut self) -> Option<Self::Item> {
loop {
- if let Some(token) = self.emitter.next() {
+ if let Some(token) = self.machine.emitter.next() {
return Some(Ok(Event::Token(token)));
}
@@ -397,12 +166,12 @@ where
return None;
}
- match machine::consume(self) {
+ match machine::consume(&mut self.machine) {
Err(e) => return Some(Err(e)),
Ok(ControlToken::Continue) => (),
Ok(ControlToken::Eof) => {
self.eof = true;
- self.emitter.emit_eof();
+ self.machine.emitter.emit_eof();
}
Ok(ControlToken::CdataOpen) => return Some(Ok(Event::CdataOpen)),
}
@@ -411,12 +180,16 @@ where
}
impl<R: Reader, O, E: Emitter<O>> Tokenizer<R, O, E> {
+ pub(crate) fn enable_naive_state_switching(&mut self) {
+ self.machine.naively_switch_state = true;
+ }
+
/// Test-internal function to override internal state.
///
/// Only available with the `integration-tests` feature which is not public API.
#[cfg(feature = "integration-tests")]
pub fn set_last_start_tag(&mut self, last_start_tag: &str) {
- self.last_start_tag_name.clear();
- self.last_start_tag_name.push_str(last_start_tag);
+ self.machine.last_start_tag_name.clear();
+ self.machine.last_start_tag_name.push_str(last_start_tag);
}
}
diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs
index fc31a42..e9a3e68 100644
--- a/src/tokenizer/machine.rs
+++ b/src/tokenizer/machine.rs
@@ -1,16 +1,65 @@
-pub(super) mod utils;
+mod utils;
use crate::entities::try_read_character_reference;
use crate::offset::{Offset, Position};
use crate::token::AttrValueSyntax;
use crate::tokenizer::CdataAction;
-use crate::{reader::Reader, Emitter, Error, Tokenizer};
+use crate::{reader::Reader, Emitter, Error};
use utils::{
ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat,
};
pub use utils::State;
+pub(super) struct Machine<R, O, E> {
+ pub(super) state: State,
+ pub(super) emitter: E,
+ temporary_buffer: String,
+ reader: R,
+ to_reconsume: Stack2<Option<char>>,
+ character_reference_code: u32,
+ return_state: Option<State>,
+ current_tag_name: String,
+ pub(super) last_start_tag_name: String,
+ is_start_tag: bool,
+ /// The reader position before the match block in [`consume`].
+ position_before_match: O,
+ /// * Set to the offset of `<` in [`State::Data`].
+ /// * Set to the offset of `-` in [`State::Comment`].
+ /// * Set to the offset of `&` in [`State::CharacterReference`].
+ some_offset: O,
+ /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`]
+ /// (it cannot call [`Tokenizer::set_state`] using the emitted start tags since they can be of an arbitrary type).
+ ///
+ /// [`Tokenizer::set_state`]: super::Tokenizer::set_state
+ pub(crate) naively_switch_state: bool,
+}
+
+impl<R, O, E> Machine<R, O, E>
+where
+ R: Reader + Position<O>,
+ O: Offset,
+ E: Emitter<O>,
+{
+ pub fn new(reader: R, emitter: E) -> Self {
+ Self {
+ reader,
+ emitter,
+ state: State::Data,
+ to_reconsume: Stack2::default(),
+ return_state: None,
+ temporary_buffer: String::new(),
+ character_reference_code: 0,
+ current_tag_name: String::new(),
+ last_start_tag_name: String::new(),
+ is_start_tag: false,
+ position_before_match: O::default(),
+ some_offset: O::default(),
+ naively_switch_state: false,
+ }
+ }
+}
+
pub enum ControlToken {
Eof,
Continue,
@@ -18,7 +67,7 @@ pub enum ControlToken {
}
#[inline]
-pub fn consume<O, R, E>(slf: &mut Tokenizer<R, O, E>) -> Result<ControlToken, R::Error>
+pub(super) fn consume<O, R, E>(slf: &mut Machine<R, O, E>) -> Result<ControlToken, R::Error>
where
O: Offset,
R: Reader + Position<O>,
@@ -1964,15 +2013,8 @@ where
}
}
-impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
- #[inline]
- fn init_doctype(&mut self) {
- self.emitter.init_doctype(self.some_offset);
- }
-}
-
#[inline]
-pub fn handle_cdata_open<O, R, E>(slf: &mut Tokenizer<R, O, E>, action: CdataAction)
+pub(super) fn handle_cdata_open<O, R, E>(slf: &mut Machine<R, O, E>, action: CdataAction)
where
O: Offset,
R: Reader + Position<O>,
@@ -1989,3 +2031,29 @@ where
}
}
}
+
+// this is a stack that can hold 0 to 2 Ts
+#[derive(Debug, Default, Clone, Copy)]
+struct Stack2<T: Copy>(Option<(T, Option<T>)>);
+
+impl<T: Copy> Stack2<T> {
+ #[inline]
+ fn push(&mut self, c: T) {
+ self.0 = match self.0 {
+ None => Some((c, None)),
+ Some((c1, None)) => Some((c1, Some(c))),
+ Some((_c1, Some(_c2))) => panic!("stack full!"),
+ }
+ }
+
+ #[inline]
+ fn pop(&mut self) -> Option<T> {
+ let (new_self, rv) = match self.0 {
+ Some((c1, Some(c2))) => (Some((c1, None)), Some(c2)),
+ Some((c1, None)) => (None, Some(c1)),
+ None => (None, None),
+ };
+ self.0 = new_self;
+ rv
+ }
+}
diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs
index 7d220cf..6e45f4d 100644
--- a/src/tokenizer/machine/utils.rs
+++ b/src/tokenizer/machine/utils.rs
@@ -1,3 +1,196 @@
+use crate::{
+ naive_parser::naive_next_state,
+ offset::{Offset, Position},
+ reader::Reader,
+ Emitter, Error,
+};
+
+use super::Machine;
+
+impl<R, O, E> Machine<R, O, E>
+where
+ R: Reader + Position<O>,
+ O: Offset,
+ E: Emitter<O>,
+{
+ #[inline]
+ pub(crate) fn emit_error(&mut self, error: Error) {
+ let span = match error {
+ Error::EofBeforeTagName
+ | Error::EofInCdata
+ | Error::EofInComment
+ | Error::EofInDoctype
+ | Error::EofInScriptHtmlCommentLikeText
+ | Error::EofInTag
+ | Error::MissingSemicolonAfterCharacterReference => {
+ self.reader.position()..self.reader.position()
+ }
+ Error::AbsenceOfDigitsInNumericCharacterReference
+ | Error::NullCharacterReference
+ | Error::CharacterReferenceOutsideUnicodeRange
+ | Error::SurrogateCharacterReference
+ | Error::NoncharacterCharacterReference
+ | Error::ControlCharacterReference
+ | Error::UnknownNamedCharacterReference => self.some_offset..self.reader.position(),
+
+ _ => self.position_before_match..self.reader.position(),
+ };
+ self.emitter.report_error(error, span);
+ }
+
+ /// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise.
+ ///
+ /// * the _last start tag_ exists
+ /// * the current end tag token's name equals to the last start tag's name.
+ ///
+ /// See also WHATWG's definition of [appropriate end tag token].
+ ///
+ /// [appropriate end tag token]: https://html.spec.whatwg.org/multipage/parsing.html#appropriate-end-tag-token
+ #[inline]
+ pub(super) fn current_end_tag_is_appropriate(&mut self) -> bool {
+ self.current_tag_name == self.last_start_tag_name
+ }
+
+ #[inline]
+ pub(super) fn init_start_tag(&mut self) {
+ self.emitter
+ .init_start_tag(self.some_offset, self.position_before_match);
+ self.current_tag_name.clear();
+ self.is_start_tag = true;
+ }
+
+ #[inline]
+ pub(super) fn init_end_tag(&mut self) {
+ self.emitter
+ .init_end_tag(self.some_offset, self.position_before_match);
+ self.current_tag_name.clear();
+ self.is_start_tag = false;
+ }
+
+ #[inline]
+ pub(super) fn init_doctype(&mut self) {
+ self.emitter.init_doctype(self.some_offset);
+ }
+
+ #[inline]
+ pub(super) fn push_tag_name(&mut self, s: &str) {
+ self.emitter.push_tag_name(s);
+ self.current_tag_name.push_str(s);
+ }
+
+ #[inline]
+ pub(super) fn emit_current_tag(&mut self) {
+ self.emitter.emit_current_tag(self.reader.position());
+ if self.is_start_tag {
+ if self.naively_switch_state {
+ self.state = naive_next_state(&self.current_tag_name).into();
+ }
+ std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name);
+ }
+ }
+
+ #[inline]
+ pub(super) fn unread_char(&mut self, c: Option<char>) {
+ self.to_reconsume.push(c);
+ }
+
+ #[inline]
+ fn validate_char(&mut self, c: char) {
+ match c as u32 {
+ surrogate_pat!() => {
+ self.emit_error(Error::SurrogateInInputStream);
+ }
+ noncharacter_pat!() => {
+ self.emit_error(Error::NoncharacterInInputStream);
+ }
+ // control without whitespace or nul
+ x @ control_pat!()
+ if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) =>
+ {
+ self.emit_error(Error::ControlCharacterInInputStream);
+ }
+ _ => (),
+ }
+ }
+
+ pub(super) fn read_char(&mut self) -> Result<Option<char>, R::Error> {
+ let (c_res, reconsumed) = match self.to_reconsume.pop() {
+ Some(c) => (Ok(c), true),
+ None => (self.reader.read_char(), false),
+ };
+
+ let mut c = match c_res {
+ Ok(Some(c)) => c,
+ res => return res,
+ };
+
+ if c == '\r' {
+ c = '\n';
+ let c2 = self.reader.read_char()?;
+ if c2 != Some('\n') {
+ self.unread_char(c2);
+ }
+ }
+
+ if !reconsumed {
+ self.validate_char(c);
+ }
+
+ Ok(Some(c))
+ }
+
+ #[inline]
+ pub(super) fn try_read_string(
+ &mut self,
+ mut s: &str,
+ case_sensitive: bool,
+ ) -> Result<bool, R::Error> {
+ debug_assert!(!s.is_empty());
+
+ let to_reconsume_bak = self.to_reconsume;
+ let mut chars = s.chars();
+ while let Some(c) = self.to_reconsume.pop() {
+ if let (Some(x), Some(x2)) = (c, chars.next()) {
+ if x == x2 || (!case_sensitive && x.to_ascii_lowercase() == x2.to_ascii_lowercase())
+ {
+ s = &s[x.len_utf8()..];
+ continue;
+ }
+ }
+
+ self.to_reconsume = to_reconsume_bak;
+ return Ok(false);
+ }
+
+ self.reader.try_read_string(s, case_sensitive)
+ }
+
+ pub(super) fn is_consumed_as_part_of_an_attribute(&self) -> bool {
+ matches!(
+ self.return_state,
+ Some(
+ State::AttributeValueDoubleQuoted
+ | State::AttributeValueSingleQuoted
+ | State::AttributeValueUnquoted
+ )
+ )
+ }
+
+ pub(super) fn flush_code_points_consumed_as_character_reference(&mut self) {
+ if self.is_consumed_as_part_of_an_attribute() {
+ self.emitter.push_attribute_value(&self.temporary_buffer);
+ self.temporary_buffer.clear();
+ } else {
+ self.flush_buffer_characters();
+ }
+ }
+
+ pub(super) fn flush_buffer_characters(&mut self) {
+ self.emitter.emit_string(&self.temporary_buffer);
+ self.temporary_buffer.clear();
+ }
+}
+
macro_rules! surrogate_pat {
() => {
0xd800..=0xdfff