summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Martin Fischer <martin@push-f.com> 2021-11-30 16:00:47 +0100
committer: Martin Fischer <martin@push-f.com> 2021-12-05 02:39:51 +0100
commit: 2a6e3bf05c419eb21cb7a4db141ed6a319e98622 (patch)
tree: d97f6949190d3d0955b376d9b099c871f1e33ce6
parent: 4892172b629590ac4362f7506c14e993fd1ddd2b (diff)
allow setting the Tokenizer to Data, PlainText, RcData, RawText, ScriptData, ScriptDataEscaped and ScriptDataDoubleEscaped states
-rw-r--r-- src/lib.rs 2
-rw-r--r-- src/tokenizer.rs 59
2 files changed, 54 insertions, 7 deletions
diff --git a/src/lib.rs b/src/lib.rs
index 2125776..b222193 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -19,4 +19,4 @@ pub use emitter::{DefaultEmitter, Doctype, Emitter, EndTag, StartTag, Token};
pub use error::Error;
pub use never::Never;
pub use reader::{BufReadReader, Readable, Reader, StringReader};
-pub use tokenizer::{InfallibleTokenizer, Tokenizer};
+pub use tokenizer::{InfallibleTokenizer, State, Tokenizer};
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index b09e030..b5a2edf 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1,3 +1,5 @@
+use std::ops::{Deref, DerefMut};
+
use crate::machine;
use crate::utils::{
control_pat, noncharacter_pat, surrogate_pat, ControlToken, State as InternalState,
@@ -55,6 +57,40 @@ impl<R: Reader> Tokenizer<R> {
}
}
+/// The tokenizer states a caller may select via [`Tokenizer::set_state`].
+#[derive(Debug)]
+#[non_exhaustive] // downstream `match`es need a wildcard arm, so variants can be added without breaking them
+pub enum State {
+ /// The [Data state](https://html.spec.whatwg.org/#data-state).
+ Data,
+ /// The [PLAINTEXT state](https://html.spec.whatwg.org/#plaintext-state).
+ PlainText,
+ /// The [RCDATA state](https://html.spec.whatwg.org/#rcdata-state).
+ RcData,
+ /// The [RAWTEXT state](https://html.spec.whatwg.org/#rawtext-state).
+ RawText,
+ /// The [Script data state](https://html.spec.whatwg.org/#script-data-state).
+ ScriptData,
+ /// The [Script data escaped state](https://html.spec.whatwg.org/#script-data-escaped-state).
+ ScriptDataEscaped,
+ /// The [Script data double escaped state](https://html.spec.whatwg.org/#script-data-double-escaped-state).
+ ScriptDataDoubleEscaped,
+}
+
+impl From<State> for InternalState { // lets `state.into()` turn a public `State` into the internal one
+ fn from(state: State) -> Self {
+ match state { // one-to-one by name; no wildcard arm, so an unmapped `State` variant is a compile error
+ State::Data => InternalState::Data,
+ State::PlainText => InternalState::PlainText,
+ State::RcData => InternalState::RcData,
+ State::RawText => InternalState::RawText,
+ State::ScriptData => InternalState::ScriptData,
+ State::ScriptDataEscaped => InternalState::ScriptDataEscaped,
+ State::ScriptDataDoubleEscaped => InternalState::ScriptDataDoubleEscaped,
+ }
+ }
+}
+
impl<R: Reader, E: Emitter> Tokenizer<R, E> {
/// Construct a new tokenizer from some input and a custom emitter.
///
@@ -81,12 +117,9 @@ impl<R: Reader, E: Emitter> Tokenizer<R, E> {
self.state = state;
}
- /// Set the statemachine to start/continue in [plaintext
- /// state](https://html.spec.whatwg.org/#plaintext-state).
- ///
- /// This tokenizer never gets into that state naturally.
- pub fn set_plaintext_state(&mut self) {
- self.state = InternalState::PlainText;
+ /// Set the state machine to start/continue in the given state.
+ pub fn set_state(&mut self, state: State) {
+ self.state = state.into(); // converts the public `State` via `From<State> for InternalState`
}
/// Test-internal function to override internal state.
@@ -254,3 +287,17 @@ impl<R: Reader<Error = Never>, E: Emitter> Iterator for InfallibleTokenizer<R, E
}
}
}
+
+impl<R: Reader<Error = Never>, E: Emitter> Deref for InfallibleTokenizer<R, E> {
+ type Target = Tokenizer<R, E>; // dereferences to the wrapped tokenizer, so its `&self` methods work on the wrapper
+
+ fn deref(&self) -> &Self::Target {
+ &self.0 // field 0 is the wrapped `Tokenizer`
+ }
+}
+
+impl<R: Reader<Error = Never>, E: Emitter> DerefMut for InfallibleTokenizer<R, E> {
+ fn deref_mut(&mut self) -> &mut Self::Target { // mutable access lets `&mut self` methods such as `set_state` be called on the wrapper
+ &mut self.0
+ }
+}