summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/tokenizer.rs 39
-rw-r--r-- src/tokenizer/machine.rs 4
2 files changed, 24 insertions, 19 deletions
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index b46bf45..7e05477 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -3,10 +3,13 @@ mod machine;
use crate::naive_parser::naive_next_state;
use crate::offset::{Offset, Position};
use crate::reader::{IntoReader, Reader};
-use crate::utils::{control_pat, noncharacter_pat, surrogate_pat, State as InternalState};
+use crate::utils::{control_pat, noncharacter_pat, surrogate_pat};
use crate::{Emitter, Error};
use machine::ControlToken;
+#[cfg(feature = "integration-tests")]
+use crate::utils::State as InternalState;
+
// this is a stack that can hold 0 to 2 Ts
#[derive(Debug, Default, Clone, Copy)]
struct Stack2<T: Copy>(Option<(T, Option<T>)>);
@@ -55,21 +58,21 @@ impl<T: Copy> Stack2<T> {
/// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
pub struct Tokenizer<R: Reader, O, E: Emitter<O>> {
eof: bool,
- pub(crate) state: InternalState,
+ pub(crate) state: machine::State,
pub(crate) emitter: E,
pub(crate) temporary_buffer: String,
pub(crate) reader: R,
to_reconsume: Stack2<Option<char>>,
pub(crate) character_reference_code: u32,
- pub(crate) return_state: Option<InternalState>,
+ pub(crate) return_state: Option<machine::State>,
current_tag_name: String,
last_start_tag_name: String,
is_start_tag: bool,
/// The reader position before the match block in [`machine::consume`].
pub(crate) position_before_match: O,
- /// * Set to the offset of `<` in [`InternalState::Data`].
- /// * Set to the offset of `-` in [`InternalState::Comment`].
- /// * Set to the offset of `&` in [`InternalState::CharacterReference`].
+ /// * Set to the offset of `<` in [`machine::State::Data`].
+ /// * Set to the offset of `-` in [`machine::State::Comment`].
+ /// * Set to the offset of `&` in [`machine::State::CharacterReference`].
pub(crate) some_offset: O,
/// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`]
/// (it cannot call [`Tokenizer::set_state`] using the emitted start tags since they can be of an arbitrary type).
@@ -87,7 +90,7 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
Tokenizer {
reader: reader.into_reader(),
emitter,
- state: InternalState::Data,
+ state: machine::State::Data,
to_reconsume: Stack2::default(),
return_state: None,
temporary_buffer: String::new(),
@@ -174,16 +177,16 @@ pub enum State {
ScriptDataDoubleEscaped,
}
-impl From<State> for InternalState {
+impl From<State> for machine::State {
fn from(state: State) -> Self {
match state {
- State::Data => InternalState::Data,
- State::PlainText => InternalState::PlainText,
- State::RcData => InternalState::RcData,
- State::RawText => InternalState::RawText,
- State::ScriptData => InternalState::ScriptData,
- State::ScriptDataEscaped => InternalState::ScriptDataEscaped,
- State::ScriptDataDoubleEscaped => InternalState::ScriptDataDoubleEscaped,
+ State::Data => machine::State::Data,
+ State::PlainText => machine::State::PlainText,
+ State::RcData => machine::State::RcData,
+ State::RawText => machine::State::RawText,
+ State::ScriptData => machine::State::ScriptData,
+ State::ScriptDataEscaped => machine::State::ScriptDataEscaped,
+ State::ScriptDataDoubleEscaped => machine::State::ScriptDataDoubleEscaped,
}
}
}
@@ -354,9 +357,9 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
matches!(
self.return_state,
Some(
- InternalState::AttributeValueDoubleQuoted
- | InternalState::AttributeValueSingleQuoted
- | InternalState::AttributeValueUnquoted
+ machine::State::AttributeValueDoubleQuoted
+ | machine::State::AttributeValueSingleQuoted
+ | machine::State::AttributeValueUnquoted
)
)
}
diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs
index fd4b36b..07d4c05 100644
--- a/src/tokenizer/machine.rs
+++ b/src/tokenizer/machine.rs
@@ -3,10 +3,12 @@ use crate::offset::{Offset, Position};
use crate::token::AttrValueSyntax;
use crate::tokenizer::CdataAction;
use crate::utils::{
- ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat, State,
+ ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat,
};
use crate::{reader::Reader, Emitter, Error, Tokenizer};
+pub use crate::utils::State;
+
pub enum ControlToken {
Eof,
Continue,