aboutsummaryrefslogtreecommitdiff
path: root/examples/tokenize.rs
diff options
context:
space:
mode:
authorMartin Fischer <martin@push-f.com>2021-04-08 08:42:01 +0200
committerMartin Fischer <martin@push-f.com>2021-04-08 15:40:37 +0200
commit57e7eefcbe6fb8c3dc4b01c707be9de4c34963a7 (patch)
tree6a9d296389bf3023396592c8514ed6712e011c7f /examples/tokenize.rs
import https://github.com/servo/html5ever
commit d1206daa740305f55a5fa159e43eb33afc359cb4
Diffstat (limited to 'examples/tokenize.rs')
-rw-r--r--examples/tokenize.rs103
1 files changed, 103 insertions, 0 deletions
diff --git a/examples/tokenize.rs b/examples/tokenize.rs
new file mode 100644
index 0000000..039ffb7
--- /dev/null
+++ b/examples/tokenize.rs
@@ -0,0 +1,103 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+extern crate html5ever;
+
+use std::default::Default;
+use std::io;
+
+use html5ever::tendril::*;
+use html5ever::tokenizer::BufferQueue;
+use html5ever::tokenizer::{CharacterTokens, EndTag, NullCharacterToken, StartTag, TagToken};
+use html5ever::tokenizer::{
+ ParseError, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
+};
+
/// A `TokenSink` helper state: prints tokens to stdout, coalescing
/// consecutive character tokens into a single quoted "CHAR" run.
#[derive(Copy, Clone)]
struct TokenPrinter {
    in_char_run: bool,
}

impl TokenPrinter {
    /// Records whether we are inside a run of character tokens, emitting
    /// the opening or closing quote exactly on the transition.
    fn is_char(&mut self, is_char: bool) {
        if is_char && !self.in_char_run {
            // Entering a character run: open the quoted region.
            print!("CHAR : \"");
        } else if self.in_char_run && !is_char {
            // Leaving a character run: close the quoted region.
            println!("\"");
        }
        self.in_char_run = is_char;
    }

    /// Prints one character, escaped, as part of a character run.
    fn do_char(&mut self, c: char) {
        self.is_char(true);
        // `EscapeDefault` implements `Display`, so no intermediate
        // String is needed.
        print!("{}", c.escape_default());
    }
}
+
impl TokenSink for TokenPrinter {
    type Handle = ();

    /// Receives each token from the tokenizer and pretty-prints it to
    /// stdout. `_line_number` is the source line the token came from
    /// (unused here). Always returns `Continue` so tokenization never
    /// suspends.
    fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
        match token {
            // A buffer of text characters: print each one, letting
            // `do_char` open the quoted "CHAR" run on the first.
            CharacterTokens(b) => {
                for c in b.chars() {
                    self.do_char(c);
                }
            },
            NullCharacterToken => self.do_char('\0'),
            TagToken(tag) => {
                // Any pending character run must be closed first.
                self.is_char(false);
                // This is not proper HTML serialization, of course.
                match tag.kind {
                    // ANSI colors: green for start tags, red for end tags.
                    StartTag => print!("TAG : <\x1b[32m{}\x1b[0m", tag.name),
                    EndTag => print!("TAG : <\x1b[31m/{}\x1b[0m", tag.name),
                }
                // Attribute name in cyan, value in blue.
                for attr in tag.attrs.iter() {
                    print!(
                        " \x1b[36m{}\x1b[0m='\x1b[34m{}\x1b[0m'",
                        attr.name.local, attr.value
                    );
                }
                if tag.self_closing {
                    print!(" \x1b[31m/\x1b[0m");
                }
                println!(">");
            },
            ParseError(err) => {
                self.is_char(false);
                println!("ERROR: {}", err);
            },
            // Remaining variants (doctype, comment, EOF) are dumped
            // via their Debug representation.
            _ => {
                self.is_char(false);
                println!("OTHER: {:?}", token);
            },
        }
        TokenSinkResult::Continue
    }
}
+
+fn main() {
+ let mut sink = TokenPrinter { in_char_run: false };
+ let mut chunk = ByteTendril::new();
+ io::stdin().read_to_tendril(&mut chunk).unwrap();
+ let mut input = BufferQueue::new();
+ input.push_back(chunk.try_reinterpret().unwrap());
+
+ let mut tok = Tokenizer::new(
+ sink,
+ TokenizerOpts {
+ profile: true,
+ ..Default::default()
+ },
+ );
+ let _ = tok.feed(&mut input);
+ assert!(input.is_empty());
+ tok.end();
+ sink.is_char(false);
+}