diff options
-rw-r--r-- | examples/switch-state.rs | 20 | ||||
-rw-r--r-- | src/emitter.rs | 18 |
2 files changed, 38 insertions, 0 deletions
diff --git a/examples/switch-state.rs b/examples/switch-state.rs new file mode 100644 index 0000000..e966687 --- /dev/null +++ b/examples/switch-state.rs @@ -0,0 +1,20 @@ +//! Lets you easily try out the tokenizer with e.g. +//! printf '<style><b>Hello world!</b></style>' | cargo run --example=switch-state +use html5gum::{BufReadReader, Token, Tokenizer}; +use std::io::stdin; + +fn main() { + let stdin = stdin(); + let mut tokenizer = Tokenizer::new(BufReadReader::new(stdin.lock())); + + while let Some(token) = tokenizer.next() { + let token = token.unwrap(); + println!("{:?}", token); + + if let Token::StartTag(start_tag) = token { + // take care of switching parser state for e.g. <script> & <style> + // this is not strictly spec-compliant but good enough most of the time + tokenizer.set_state(start_tag.next_state(false)); + } + } +} diff --git a/src/emitter.rs b/src/emitter.rs index 8c8976d..0a80544 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -4,6 +4,7 @@ use std::collections::VecDeque; use std::mem; use crate::Error; +use crate::State; /// An emitter is an object providing methods to the tokenizer to produce tokens. /// @@ -431,6 +432,23 @@ pub struct StartTag { pub attributes: BTreeMap<String, String>, } +impl StartTag { + /// Returns the next tokenizer state according to + /// [Parsing HTML fragments](https://html.spec.whatwg.org/multipage/parsing.html#concept-frag-parse-context). + /// If `scripting` is set to true, [`State::RawText`] is returned if this is a `<noscript>` tag, + /// otherwise [`State::Data`] is returned (as with any other regular tag). + pub fn next_state(&self, scripting: bool) -> State { + match self.name.as_str() { + "title" | "textarea" => State::RcData, + "style" | "xmp" | "iframe" | "noembed" | "noframes" => State::RawText, + "script" => State::ScriptData, + "noscript" if scripting => State::RawText, + "plaintext" => State::PlainText, + _other => State::Data, + } + } +} + /// A HTML end/close tag, such as `</p>` or `</a>`. 
#[derive(Debug, Default, Eq, PartialEq)] pub struct EndTag { |