-rw-r--r--   examples/tokenize.rs            |  12
-rw-r--r--   src/lib.rs                      |   4
-rw-r--r--   src/macros.rs                   |   2
-rw-r--r--   src/tokenizer/char_ref/mod.rs   |  30
-rw-r--r--   src/tokenizer/mod.rs            | 160
-rw-r--r--   src/util/buffer_queue.rs        |  12
-rw-r--r--   src/util/str.rs                 |   2
7 files changed, 109 insertions(+), 113 deletions(-)
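
Almost all of the churn in this commit appears to be mechanical rustfmt cleanup rather than a behavior change: trailing commas after block-bodied match arms are dropped (so "}," becomes "}"), continuation lines of multi-line pattern alternations now lead with "|" instead of ending with it, and a few over-wrapped imports and method chains are joined onto single lines. A minimal standalone sketch of the two match-related conventions (illustrative code, not from this crate; the alternation is wrapped here only for demonstration, since rustfmt wraps patterns only when a line is too long):

    fn describe(n: u32) -> &'static str {
        match n {
            // Block-bodied arm: rustfmt drops the trailing comma after the brace.
            0 => {
                "zero"
            }
            // Multi-line alternation: each continuation line starts with `|`.
            1
            | 2
            | 3 => "small",
            _ => "large",
        }
    }

    fn main() {
        assert_eq!(describe(0), "zero");
        assert_eq!(describe(2), "small");
        assert_eq!(describe(9), "large");
    }
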
diff --git a/examples/tokenize.rs b/examples/tokenize.rs
index dc3b476..bea6a84 100644
--- a/examples/tokenize.rs
+++ b/examples/tokenize.rs
@@ -14,9 +14,7 @@ use std::io;
 
 use html5tokenizer::BufferQueue;
 use html5tokenizer::{CharacterTokens, EndTag, NullCharacterToken, StartTag, TagToken};
-use html5tokenizer::{
-    ParseError, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
-};
+use html5tokenizer::{ParseError, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
 use io::Read;
 
 #[derive(Copy, Clone)]
@@ -49,7 +47,7 @@ impl TokenSink for TokenPrinter {
                 for c in b.chars() {
                     self.do_char(c);
                 }
-            },
+            }
             NullCharacterToken => self.do_char('\0'),
             TagToken(tag) => {
                 self.is_char(false);
@@ -68,15 +66,15 @@ impl TokenSink for TokenPrinter {
                     print!(" \x1b[31m/\x1b[0m");
                 }
                 println!(">");
-            },
+            }
             ParseError(err) => {
                 self.is_char(false);
                 println!("ERROR: {}", err);
-            },
+            }
             _ => {
                 self.is_char(false);
                 println!("OTHER: {:?}", token);
-            },
+            }
         }
         TokenSinkResult::Continue
     }
diff --git a/src/lib.rs b/src/lib.rs
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -29,12 +29,12 @@ macro_rules! small_char_set ( ($($e:expr)+) => (
 ));
 
 mod util {
-    pub mod str;
     pub mod buffer_queue;
     pub mod smallcharset;
+    pub mod str;
 }
 
 mod tokenizer;
 
 #[doc(inline)]
-pub use tokenizer::*;
\ No newline at end of file
+pub use tokenizer::*;
diff --git a/src/macros.rs b/src/macros.rs
index 643e754..d87ea98 100644
--- a/src/macros.rs
+++ b/src/macros.rs
@@ -50,4 +50,4 @@ macro_rules! format_if {
             ::std::borrow::Cow::Borrowed($borrowed)
         }
     }
-}
\ No newline at end of file
+}
diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs
index 7b27bff..41f4c13 100644
--- a/src/tokenizer/char_ref/mod.rs
+++ b/src/tokenizer/char_ref/mod.rs
@@ -145,13 +145,13 @@ impl CharRefTokenizer {
                 tokenizer.discard_char(input);
                 self.state = Octothorpe;
                 Progress
-            },
+            }
 
             _ => {
                 self.state = Named;
                 self.name_buf_opt = Some(String::new());
                 Progress
-            },
+            }
         }
     }
@@ -166,12 +166,12 @@ impl CharRefTokenizer {
                 tokenizer.discard_char(input);
                 self.hex_marker = Some(c);
                 self.state = Numeric(16);
-            },
+            }
 
             _ => {
                 self.hex_marker = None;
                 self.state = Numeric(10);
-            },
+            }
         }
         Progress
     }
@@ -195,14 +195,14 @@ impl CharRefTokenizer {
                 self.num = self.num.wrapping_add(n);
                 self.seen_digit = true;
                 Progress
-            },
+            }
 
             None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
 
             None => {
                 self.state = NumericSemicolon;
                 Progress
-            },
+            }
         }
     }
@@ -287,7 +287,7 @@ impl CharRefTokenizer {
                 }
                 // Otherwise we just have a prefix match.
                 Progress
-            },
+            }
 
             // Can't continue the match.
             None => self.finish_named(tokenizer, input, Some(c)),
@@ -322,7 +322,7 @@ impl CharRefTokenizer {
                 // we emit a parse error.
                 self.state = BogusName;
                 return Progress;
-            },
+            }
 
             // Check length because &; is not a parse error.
             Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),
@@ -331,7 +331,7 @@ impl CharRefTokenizer {
                 }
                 self.unconsume_name(input);
                 self.finish_none()
-            },
+            }
 
             Some((c1, c2)) => {
                 // We have a complete match, but we may have consumed
@@ -371,14 +371,14 @@ impl CharRefTokenizer {
                         "Equals sign after character reference in attribute",
                     ));
                     true
-                },
+                }
                 (Some(_), _, Some(c)) if is_ascii_alnum(c) => true,
                 _ => {
                     tokenizer.emit_error(Borrowed(
                         "Character reference does not end with semicolon",
                     ));
                     false
-                },
+                }
             };
 
             if unconsume_all {
@@ -392,7 +392,7 @@ impl CharRefTokenizer {
                 });
                 Done
             }
-        },
+        }
         }
     }
@@ -426,20 +426,20 @@ impl CharRefTokenizer {
             Numeric(_) | NumericSemicolon => {
                 tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
                 self.finish_numeric(tokenizer);
-            },
+            }
 
             Named => drop(self.finish_named(tokenizer, input, None)),
 
             BogusName => {
                 self.unconsume_name(input);
                 self.finish_none();
-            },
+            }
 
             Octothorpe => {
                 input.push_front(String::from("#"));
                 tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
                 self.finish_none();
-            },
+            }
         }
     }
 }
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index a58e388..bcbc6b7 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -9,9 +9,9 @@
 
 //! The HTML5 tokenizer.
 
+pub use self::interface::{Attribute, Doctype, EndTag, StartTag, Tag, TagKind};
 pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
 pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token};
-pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind, Attribute};
 pub use self::interface::{TokenSink, TokenSinkResult};
 
 use self::states::{DoctypeIdKind, Public, System};
@@ -168,9 +168,7 @@ pub struct Tokenizer<Sink> {
 impl<Sink: TokenSink> Tokenizer<Sink> {
     /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
     pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
-        let start_tag_name = opts
-            .last_start_tag_name
-            .take();
+        let start_tag_name = opts.last_start_tag_name.take();
         let state = opts.initial_state.unwrap_or(states::Data);
         let discard_bom = opts.discard_bom;
         Tokenizer {
@@ -259,8 +257,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             self.current_line += 1;
         }
 
-        if self.opts.exact_errors &&
-            match c as u32 {
+        if self.opts.exact_errors
+            && match c as u32 {
                 0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
                 n if (n & 0xFFFE) == 0xFFFE => true,
                 _ => false,
@@ -326,7 +324,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                     self.temp_buf.push(c);
                 }
                 None
-            },
+            }
             Some(matched) => Some(matched),
         }
     }
@@ -343,7 +341,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             Some(x) => {
                 *x += dt;
                 false
-            },
+            }
             None => true,
         };
         if new {
@@ -410,7 +408,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
         match self.current_tag_kind {
             StartTag => {
                 self.last_start_tag_name = Some(name.clone());
-            },
+            }
             EndTag => {
                 if !self.current_tag_attrs.is_empty() {
                     self.emit_error(Borrowed("Attributes on an end tag"));
@@ -418,7 +416,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 if self.current_tag_self_closing {
                     self.emit_error(Borrowed("Self-closing end tag"));
                 }
-            },
+            }
         }
 
         let token = TagToken(Tag {
@@ -433,15 +431,15 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             TokenSinkResult::Plaintext => {
                 self.state = states::Plaintext;
                 ProcessResult::Continue
-            },
+            }
             TokenSinkResult::Script(node) => {
                 self.state = states::Data;
                 ProcessResult::Script(node)
-            },
+            }
             TokenSinkResult::RawData(kind) => {
                 self.state = states::RawData(kind);
                 ProcessResult::Continue
-            },
+            }
         }
     }
@@ -496,9 +494,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
         // FIXME: linear time search, do we care?
         let dup = {
             let name = &*self.current_attr_name;
-            self.current_tag_attrs
-                .iter()
-                .any(|a| &*a.name == name)
+            self.current_tag_attrs.iter().any(|a| &*a.name == name)
         };
 
         if dup {
@@ -740,7 +736,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped),
                 FromSet('<') => {
                     go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped)
-                },
+                }
                 FromSet(c) => go!(self: emit c),
                 NotFromSet(b) => self.emit_chars(b),
             }
@@ -774,7 +770,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 '>' => go!(self: error; to Data),
                 '\0' => {
                     go!(self: error; clear_comment; push_comment '\u{fffd}'; to BogusComment)
-                },
+                }
                 c => match lower_ascii_letter(c) {
                     Some(cl) => go!(self: create_tag EndTag cl; to TagName),
                     None => go!(self: error; clear_comment; push_comment c; to BogusComment),
@@ -820,7 +816,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 '/' => go!(self: clear_temp; to RawEndTagOpen kind),
                 '!' if kind == ScriptData => {
                     go!(self: emit '<'; emit '!'; to ScriptDataEscapeStart Escaped)
-                },
+                }
                 _ => go!(self: emit '<'; reconsume RawData kind),
             }
         },
@@ -850,7 +846,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                     Some(cl) => go!(self: push_tag cl; push_temp c),
                     None => {
                         go!(self: discard_tag; emit '<'; emit '/'; emit_temp; reconsume RawData kind)
-                    },
+                    }
                 }
             },
@@ -865,7 +861,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                         Escaped
                     };
                     go!(self: emit c; to RawData ScriptDataEscaped esc);
-                },
+                }
                 _ => match lower_ascii_letter(c) {
                     Some(cl) => go!(self: push_temp cl; emit c),
                     None => go!(self: reconsume RawData ScriptDataEscaped Escaped),
@@ -898,7 +894,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                         go!(self: emit '<');
                     }
                     go!(self: to RawLessThanSign ScriptDataEscaped kind);
-                },
+                }
                 '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
                 c => go!(self: emit c; to RawData ScriptDataEscaped kind),
             }
@@ -913,7 +909,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                        go!(self: emit '<');
                     }
                     go!(self: to RawLessThanSign ScriptDataEscaped kind);
-                },
+                }
                 '>' => go!(self: emit '>'; to RawData ScriptData),
                 '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
                 c => go!(self: emit c; to RawData ScriptDataEscaped kind),
@@ -931,7 +927,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                         DoubleEscaped
                     };
                     go!(self: emit c; to RawData ScriptDataEscaped esc);
-                },
+                }
                 _ => match lower_ascii_letter(c) {
                     Some(cl) => go!(self: push_temp cl; emit c),
                     None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
@@ -952,7 +948,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                         go_match!(self: c,
                             '"' , '\'' , '<' , '=' => error);
                         go!(self: create_attr c; to AttributeName);
-                    },
+                    }
                 },
             }
         },
@@ -971,7 +967,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                         go_match!(self: c,
                             '"' , '\'' , '<' => error);
                         go!(self: push_name c);
-                    },
+                    }
                 },
             }
         },
@@ -990,7 +986,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                         go_match!(self: c,
                             '"' , '\'' , '<' => error);
                         go!(self: create_attr c; to AttributeName);
-                    },
+                    }
                 },
             }
         },
@@ -1005,7 +1001,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
                 '\0' => {
                     go!(self: discard_char input; error; push_value '\u{fffd}'; to AttributeValue Unquoted)
-                },
+                }
                 '>' => go!(self: discard_char input; error; emit_tag Data),
                 _ => go!(self: to AttributeValue Unquoted),
             }
@@ -1042,7 +1038,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             ) {
                 FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
                     go!(self: to BeforeAttributeName)
-                },
+                }
                 FromSet('&') => go!(self: consume_char_ref '>'),
                 FromSet('>') => go!(self: emit_tag Data),
                 FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
@@ -1050,7 +1046,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                     go_match!(self: c,
                         '"' , '\'' , '<' , '=' , '`' => error);
                     go!(self: push_value c);
-                },
+                }
                 NotFromSet(ref b) => go!(self: append_value b),
             }
         },
@@ -1071,7 +1067,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 '>' => {
                     self.current_tag_self_closing = true;
                     go!(self: emit_tag Data);
-                },
+                }
                 _ => go!(self: error; reconsume BeforeAttributeName),
             }
         },
@@ -1149,7 +1145,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 '\t' | '\n' | '\x0C' | ' ' => (),
                 '\0' => {
                     go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName)
-                },
+                }
                 '>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data),
                 c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
                     to DoctypeName),
@@ -1187,10 +1183,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind),
                 '"' => {
                     go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind)
-                },
+                }
                 '\'' => {
                     go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind)
-                },
+                }
                 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
                 _ => go!(self: error; force_quirks; to BogusDoctype),
             }
@@ -1232,14 +1228,14 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             match get_char!(self, input) {
                 '\t' | '\n' | '\x0C' | ' ' => {
                     go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
-                },
+                }
                 '>' => go!(self: emit_doctype; to Data),
                 '"' => {
                     go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
-                },
+                }
                 '\'' => {
                     go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
-                },
+                }
                 _ => go!(self: error; force_quirks; to BogusDoctype),
             }
         },
@@ -1260,10 +1256,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 '>' => go!(self: emit_doctype; to Data),
                 '"' => {
                     go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
-                },
+                }
                 '\'' => {
                     go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
-                },
+                }
                 _ => go!(self: error; force_quirks; to BogusDoctype),
             }
         },
@@ -1341,7 +1337,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             char_ref::Done => {
                 self.process_char_ref(tok.get_result());
                 return ProcessResult::Continue;
-            },
+            }
             char_ref::Stuck => ProcessResult::Suspend,
             char_ref::Progress => ProcessResult::Continue,
@@ -1387,7 +1383,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             Some(mut tok) => {
                 tok.end_of_file(self, &mut input);
                 self.process_char_ref(tok.get_result());
-            },
+            }
         }
 
         // Process all remaining buffered input.
@@ -1432,23 +1428,23 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
 
     fn eof_step(&mut self) -> ProcessResult<Sink::Handle> {
         match self.state {
-            states::Data |
-            states::RawData(Rcdata) |
-            states::RawData(Rawtext) |
-            states::RawData(ScriptData) |
-            states::Plaintext => go!(self: eof),
-
-            states::TagName |
-            states::RawData(ScriptDataEscaped(_)) |
-            states::BeforeAttributeName |
-            states::AttributeName |
-            states::AfterAttributeName |
-            states::BeforeAttributeValue |
-            states::AttributeValue(_) |
-            states::AfterAttributeValueQuoted |
-            states::SelfClosingStartTag |
-            states::ScriptDataEscapedDash(_) |
-            states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data),
+            states::Data
+            | states::RawData(Rcdata)
+            | states::RawData(Rawtext)
+            | states::RawData(ScriptData)
+            | states::Plaintext => go!(self: eof),
+
+            states::TagName
+            | states::RawData(ScriptDataEscaped(_))
+            | states::BeforeAttributeName
+            | states::AttributeName
+            | states::AfterAttributeName
+            | states::BeforeAttributeValue
+            | states::AttributeValue(_)
+            | states::AfterAttributeValueQuoted
+            | states::SelfClosingStartTag
+            | states::ScriptDataEscapedDash(_)
+            | states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data),
 
             states::TagOpen => go!(self: error_eof; emit '<'; to Data),
@@ -1456,7 +1452,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
 
             states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
                 go!(self: to RawData ScriptDataEscaped DoubleEscaped)
-            },
+            }
 
             states::RawLessThanSign(kind) => go!(self: emit '<'; to RawData kind),
@@ -1464,7 +1460,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
 
             states::RawEndTagName(kind) => {
                 go!(self: emit '<'; emit '/'; emit_temp; to RawData kind)
-            },
+            }
 
             states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),
@@ -1472,29 +1468,29 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             states::ScriptDataDoubleEscapeEnd => {
                 go!(self: to RawData ScriptDataEscaped DoubleEscaped)
-            },
+            }
 
-            states::CommentStart |
-            states::CommentStartDash |
-            states::Comment |
-            states::CommentEndDash |
-            states::CommentEnd |
-            states::CommentEndBang => go!(self: error_eof; emit_comment; to Data),
+            states::CommentStart
+            | states::CommentStartDash
+            | states::Comment
+            | states::CommentEndDash
+            | states::CommentEnd
+            | states::CommentEndBang => go!(self: error_eof; emit_comment; to Data),
 
             states::Doctype | states::BeforeDoctypeName => {
                 go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data)
-            },
+            }
 
-            states::DoctypeName |
-            states::AfterDoctypeName |
-            states::AfterDoctypeKeyword(_) |
-            states::BeforeDoctypeIdentifier(_) |
-            states::DoctypeIdentifierDoubleQuoted(_) |
-            states::DoctypeIdentifierSingleQuoted(_) |
-            states::AfterDoctypeIdentifier(_) |
-            states::BetweenDoctypePublicAndSystemIdentifiers => {
+            states::DoctypeName
+            | states::AfterDoctypeName
+            | states::AfterDoctypeKeyword(_)
+            | states::BeforeDoctypeIdentifier(_)
+            | states::DoctypeIdentifierDoubleQuoted(_)
+            | states::DoctypeIdentifierSingleQuoted(_)
+            | states::AfterDoctypeIdentifier(_)
+            | states::BetweenDoctypePublicAndSystemIdentifiers => {
                 go!(self: error_eof; force_quirks; emit_doctype; to Data)
-            },
+            }
 
             states::BogusDoctype => go!(self: emit_doctype; to Data),
@@ -1567,15 +1563,15 @@ mod test {
             match token {
                 CharacterTokens(b) => {
                     self.current_str.push_str(&b);
-                },
+                }
 
                 NullCharacterToken => {
                     self.current_str.push('\0');
-                },
+                }
 
                 ParseError(_) => {
                     panic!("unexpected parse error");
-                },
+                }
 
                 TagToken(mut t) => {
                     // The spec seems to indicate that one can emit
@@ -1585,11 +1581,11 @@ mod test {
                         EndTag => {
                             t.self_closing = false;
                             t.attrs = vec![];
-                        },
+                        }
                         _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
                     }
                     self.push(TagToken(t), line_number);
-                },
+                }
 
                 EOFToken => (),
diff --git a/src/util/buffer_queue.rs b/src/util/buffer_queue.rs
index 5201a57..b741b36 100644
--- a/src/util/buffer_queue.rs
+++ b/src/util/buffer_queue.rs
@@ -66,7 +66,7 @@ impl BufferQueue {
     #[inline]
     pub fn pop_front(&mut self) -> Option<String> {
         if let Some((i, s)) = self.buffers.pop_front() {
-            return Some(s[i..].into())
+            return Some(s[i..].into());
         }
         None
         // self.buffers.pop_front().map(|(i, s)| &s[i..])
@@ -101,7 +101,9 @@ impl BufferQueue {
                 .is_none(),
             "invariant \"all buffers in the queue are non-empty\" failed"
         );
-        self.buffers.front().map(|(i, s)| s[*i..].chars().next().unwrap())
+        self.buffers
+            .front()
+            .map(|(i, s)| s[*i..].chars().next().unwrap())
     }
 
     /// Get the next character if one is available, removing it from the queue.
@@ -114,7 +116,7 @@ impl BufferQueue {
                 let c = &buf[*i..].chars().next().expect("empty buffer in queue");
                 *i += c.len_utf8();
                 (Some(*c), buf[*i..].is_empty())
-            },
+            }
         };
 
         if now_empty {
@@ -139,7 +141,7 @@ impl BufferQueue {
                     *i += c.len_utf8();
                     (Some(FromSet(*c)), buf[*i..].is_empty())
                 }
-            },
+            }
         };
 
         // Unborrow self for this part.
@@ -188,7 +190,7 @@ impl BufferQueue {
             None => assert_eq!(consumed_from_last, 0),
             Some((i, _buf)) => {
                 *i += consumed_from_last;
-            },
+            }
         }
 
         Some(true)
diff --git a/src/util/str.rs b/src/util/str.rs
index c3185a0..bf38fe1 100644
--- a/src/util/str.rs
+++ b/src/util/str.rs
@@ -35,7 +35,7 @@ mod test {
         fn $name() {
             assert_eq!($left, $right);
         }
-    }
+    };
 }
 
 test_eq!(lower_letter_a_is_a, lower_ascii_letter('a'), Some('a'));
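
For context on the buffer_queue.rs hunks above: the queue stores its input as a VecDeque of (usize, String) pairs, where the usize is the byte offset of the first not-yet-consumed character in that buffer, and the asserted invariant is that a fully consumed buffer never stays in the queue. A self-contained sketch of that scheme (illustrative only; the names and API here are not this crate's actual ones):

    use std::collections::VecDeque;

    // Sketch of a consumed-prefix buffer queue. The usize tracks how many
    // bytes of the front of each buffer have already been consumed.
    struct SketchQueue {
        buffers: VecDeque<(usize, String)>,
    }

    impl SketchQueue {
        fn new() -> Self {
            SketchQueue { buffers: VecDeque::new() }
        }

        // Only non-empty buffers enter the queue, preserving the invariant.
        fn push_back(&mut self, s: String) {
            if !s.is_empty() {
                self.buffers.push_back((0, s));
            }
        }

        // Pop the next char: advance the offset by the char's UTF-8 width,
        // then drop the front buffer once it is exhausted.
        fn next(&mut self) -> Option<char> {
            let (c, now_empty) = {
                let (i, buf) = self.buffers.front_mut()?;
                let c = buf[*i..].chars().next().expect("empty buffer in queue");
                *i += c.len_utf8();
                (c, buf[*i..].is_empty())
            };
            if now_empty {
                self.buffers.pop_front();
            }
            Some(c)
        }
    }

    fn main() {
        let mut q = SketchQueue::new();
        q.push_back("hé".to_string());
        q.push_back("y".to_string());
        assert_eq!(q.next(), Some('h'));
        assert_eq!(q.next(), Some('é')); // a two-byte char advances the offset by 2
        assert_eq!(q.next(), Some('y'));
        assert_eq!(q.next(), None);
    }

Tracking a byte offset instead of re-slicing the stored String keeps consumption allocation-free, and advancing by len_utf8() keeps the offset on a character boundary.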