Diffstat (limited to 'src/tokenizer/mod.rs')
-rw-r--r--  src/tokenizer/mod.rs | 160
1 file changed, 78 insertions(+), 82 deletions(-)
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index a58e388..bcbc6b7 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -9,9 +9,9 @@
 
 //! The HTML5 tokenizer.
 
+pub use self::interface::{Attribute, Doctype, EndTag, StartTag, Tag, TagKind};
 pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
 pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token};
-pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind, Attribute};
 pub use self::interface::{TokenSink, TokenSinkResult};
 
 use self::states::{DoctypeIdKind, Public, System};
@@ -168,9 +168,7 @@ pub struct Tokenizer<Sink> {
 impl<Sink: TokenSink> Tokenizer<Sink> {
     /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
     pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
-        let start_tag_name = opts
-            .last_start_tag_name
-            .take();
+        let start_tag_name = opts.last_start_tag_name.take();
         let state = opts.initial_state.unwrap_or(states::Data);
         let discard_bom = opts.discard_bom;
         Tokenizer {
@@ -259,8 +257,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             self.current_line += 1;
         }
 
-        if self.opts.exact_errors &&
-            match c as u32 {
+        if self.opts.exact_errors
+            && match c as u32 {
                 0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
                 n if (n & 0xFFFE) == 0xFFFE => true,
                 _ => false,
@@ -326,7 +324,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                     self.temp_buf.push(c);
                 }
                 None
-            },
+            }
             Some(matched) => Some(matched),
         }
     }
@@ -343,7 +341,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             Some(x) => {
                 *x += dt;
                 false
-            },
+            }
             None => true,
         };
         if new {
@@ -410,7 +408,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
         match self.current_tag_kind {
            StartTag => {
                 self.last_start_tag_name = Some(name.clone());
-            },
+            }
             EndTag => {
                 if !self.current_tag_attrs.is_empty() {
                     self.emit_error(Borrowed("Attributes on an end tag"));
@@ -418,7 +416,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 if self.current_tag_self_closing {
                     self.emit_error(Borrowed("Self-closing end tag"));
                 }
-            },
+            }
         }
 
         let token = TagToken(Tag {
@@ -433,15 +431,15 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             TokenSinkResult::Plaintext => {
                 self.state = states::Plaintext;
                 ProcessResult::Continue
-            },
+            }
             TokenSinkResult::Script(node) => {
                 self.state = states::Data;
                 ProcessResult::Script(node)
-            },
+            }
             TokenSinkResult::RawData(kind) => {
                 self.state = states::RawData(kind);
                 ProcessResult::Continue
-            },
+            }
         }
     }
 
@@ -496,9 +494,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
         // FIXME: linear time search, do we care?
         let dup = {
             let name = &*self.current_attr_name;
-            self.current_tag_attrs
-                .iter()
-                .any(|a| &*a.name == name)
+            self.current_tag_attrs.iter().any(|a| &*a.name == name)
         };
 
         if dup {
@@ -740,7 +736,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped),
                 FromSet('<') => {
                     go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped)
-                },
+                }
                 FromSet(c) => go!(self: emit c),
                 NotFromSet(b) => self.emit_chars(b),
             }
         },
@@ -774,7 +770,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 '>' => go!(self: error; to Data),
                 '\0' => {
                     go!(self: error; clear_comment; push_comment '\u{fffd}'; to BogusComment)
-                },
+                }
                 c => match lower_ascii_letter(c) {
                     Some(cl) => go!(self: create_tag EndTag cl; to TagName),
                     None => go!(self: error; clear_comment; push_comment c; to BogusComment),
@@ -820,7 +816,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 '/' => go!(self: clear_temp; to RawEndTagOpen kind),
                 '!' if kind == ScriptData => {
                     go!(self: emit '<'; emit '!'; to ScriptDataEscapeStart Escaped)
-                },
+                }
                 _ => go!(self: emit '<'; reconsume RawData kind),
             }
         },
@@ -850,7 +846,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 Some(cl) => go!(self: push_tag cl; push_temp c),
                 None => {
                     go!(self: discard_tag; emit '<'; emit '/'; emit_temp; reconsume RawData kind)
-                },
+                }
             }
         },
 
@@ -865,7 +861,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                         Escaped
                     };
                     go!(self: emit c; to RawData ScriptDataEscaped esc);
-                },
+                }
                 _ => match lower_ascii_letter(c) {
                     Some(cl) => go!(self: push_temp cl; emit c),
                     None => go!(self: reconsume RawData ScriptDataEscaped Escaped),
@@ -898,7 +894,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                         go!(self: emit '<');
                     }
                     go!(self: to RawLessThanSign ScriptDataEscaped kind);
-                },
+                }
                 '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
                 c => go!(self: emit c; to RawData ScriptDataEscaped kind),
             }
@@ -913,7 +909,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                        go!(self: emit '<');
                     }
                     go!(self: to RawLessThanSign ScriptDataEscaped kind);
-                },
+                }
                 '>' => go!(self: emit '>'; to RawData ScriptData),
                 '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
                 c => go!(self: emit c; to RawData ScriptDataEscaped kind),
@@ -931,7 +927,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                        DoubleEscaped
                     };
                     go!(self: emit c; to RawData ScriptDataEscaped esc);
-                },
+                }
                 _ => match lower_ascii_letter(c) {
                     Some(cl) => go!(self: push_temp cl; emit c),
                     None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
@@ -952,7 +948,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                         go_match!(self: c,
                            '"' , '\'' , '<' , '=' => error);
                         go!(self: create_attr c; to AttributeName);
-                    },
+                    }
                 },
             }
         },
@@ -971,7 +967,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                         go_match!(self: c,
                            '"' , '\'' , '<' => error);
                         go!(self: push_name c);
-                    },
+                    }
                 },
             }
         },
@@ -990,7 +986,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                         go_match!(self: c,
                            '"' , '\'' , '<' => error);
                         go!(self: create_attr c; to AttributeName);
-                    },
+                    }
                 },
             }
         },
@@ -1005,7 +1001,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
                 '\0' => {
                     go!(self: discard_char input; error; push_value '\u{fffd}'; to AttributeValue Unquoted)
-                },
+                }
                 '>' => go!(self: discard_char input; error; emit_tag Data),
                 _ => go!(self: to AttributeValue Unquoted),
            }
@@ -1042,7 +1038,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             ) {
                 FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
                     go!(self: to BeforeAttributeName)
-                },
+                }
                 FromSet('&') => go!(self: consume_char_ref '>'),
                 FromSet('>') => go!(self: emit_tag Data),
                 FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
@@ -1050,7 +1046,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                     go_match!(self: c,
                        '"' , '\'' , '<' , '=' , '`' => error);
                     go!(self: push_value c);
-                },
+                }
                 NotFromSet(ref b) => go!(self: append_value b),
             }
         },
@@ -1071,7 +1067,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 '>' => {
                     self.current_tag_self_closing = true;
                     go!(self: emit_tag Data);
-                },
+                }
                 _ => go!(self: error; reconsume BeforeAttributeName),
             }
         },
@@ -1149,7 +1145,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 '\t' | '\n' | '\x0C' | ' ' => (),
                 '\0' => {
                     go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName)
-                },
+                }
                 '>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data),
                 c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
                     to DoctypeName),
@@ -1187,10 +1183,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind),
                 '"' => {
                     go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind)
-                },
+                }
                 '\'' => {
                     go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind)
-                },
+                }
                 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
                 _ => go!(self: error; force_quirks; to BogusDoctype),
             }
@@ -1232,14 +1228,14 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             match get_char!(self, input) {
                 '\t' | '\n' | '\x0C' | ' ' => {
                     go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
-                },
+                }
                 '>' => go!(self: emit_doctype; to Data),
                 '"' => {
                     go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
-                },
+                }
                 '\'' => {
                     go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
-                },
+                }
                 _ => go!(self: error; force_quirks; to BogusDoctype),
             }
         },
@@ -1260,10 +1256,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 '>' => go!(self: emit_doctype; to Data),
                 '"' => {
                     go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
-                },
+                }
                 '\'' => {
                     go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
-                },
+                }
                 _ => go!(self: error; force_quirks; to BogusDoctype),
             }
         },
@@ -1341,7 +1337,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             char_ref::Done => {
                 self.process_char_ref(tok.get_result());
                 return ProcessResult::Continue;
-            },
+            }
 
             char_ref::Stuck => ProcessResult::Suspend,
             char_ref::Progress => ProcessResult::Continue,
@@ -1387,7 +1383,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             Some(mut tok) => {
                 tok.end_of_file(self, &mut input);
                 self.process_char_ref(tok.get_result());
-            },
+            }
         }
 
         // Process all remaining buffered input.
@@ -1432,23 +1428,23 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
     fn eof_step(&mut self) -> ProcessResult<Sink::Handle> {
         match self.state {
-            states::Data |
-            states::RawData(Rcdata) |
-            states::RawData(Rawtext) |
-            states::RawData(ScriptData) |
-            states::Plaintext => go!(self: eof),
-
-            states::TagName |
-            states::RawData(ScriptDataEscaped(_)) |
-            states::BeforeAttributeName |
-            states::AttributeName |
-            states::AfterAttributeName |
-            states::BeforeAttributeValue |
-            states::AttributeValue(_) |
-            states::AfterAttributeValueQuoted |
-            states::SelfClosingStartTag |
-            states::ScriptDataEscapedDash(_) |
-            states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data),
+            states::Data
+            | states::RawData(Rcdata)
+            | states::RawData(Rawtext)
+            | states::RawData(ScriptData)
+            | states::Plaintext => go!(self: eof),
+
+            states::TagName
+            | states::RawData(ScriptDataEscaped(_))
+            | states::BeforeAttributeName
+            | states::AttributeName
+            | states::AfterAttributeName
+            | states::BeforeAttributeValue
+            | states::AttributeValue(_)
+            | states::AfterAttributeValueQuoted
+            | states::SelfClosingStartTag
+            | states::ScriptDataEscapedDash(_)
+            | states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data),
 
             states::TagOpen => go!(self: error_eof; emit '<'; to Data),
@@ -1456,7 +1452,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
 
             states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
                 go!(self: to RawData ScriptDataEscaped DoubleEscaped)
-            },
+            }
 
             states::RawLessThanSign(kind) => go!(self: emit '<'; to RawData kind),
@@ -1464,7 +1460,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
 
             states::RawEndTagName(kind) => {
                 go!(self: emit '<'; emit '/'; emit_temp; to RawData kind)
-            },
+            }
 
             states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),
@@ -1472,29 +1468,29 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
 
             states::ScriptDataDoubleEscapeEnd => {
                 go!(self: to RawData ScriptDataEscaped DoubleEscaped)
-            },
+            }
 
-            states::CommentStart |
-            states::CommentStartDash |
-            states::Comment |
-            states::CommentEndDash |
-            states::CommentEnd |
-            states::CommentEndBang => go!(self: error_eof; emit_comment; to Data),
+            states::CommentStart
+            | states::CommentStartDash
+            | states::Comment
+            | states::CommentEndDash
+            | states::CommentEnd
+            | states::CommentEndBang => go!(self: error_eof; emit_comment; to Data),
 
             states::Doctype | states::BeforeDoctypeName => {
                 go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data)
-            },
+            }
 
-            states::DoctypeName |
-            states::AfterDoctypeName |
-            states::AfterDoctypeKeyword(_) |
-            states::BeforeDoctypeIdentifier(_) |
-            states::DoctypeIdentifierDoubleQuoted(_) |
-            states::DoctypeIdentifierSingleQuoted(_) |
-            states::AfterDoctypeIdentifier(_) |
-            states::BetweenDoctypePublicAndSystemIdentifiers => {
+            states::DoctypeName
+            | states::AfterDoctypeName
+            | states::AfterDoctypeKeyword(_)
+            | states::BeforeDoctypeIdentifier(_)
+            | states::DoctypeIdentifierDoubleQuoted(_)
+            | states::DoctypeIdentifierSingleQuoted(_)
+            | states::AfterDoctypeIdentifier(_)
+            | states::BetweenDoctypePublicAndSystemIdentifiers => {
                 go!(self: error_eof; force_quirks; emit_doctype; to Data)
-            },
+            }
 
             states::BogusDoctype => go!(self: emit_doctype; to Data),
@@ -1567,15 +1563,15 @@ mod test {
             match token {
                 CharacterTokens(b) => {
                     self.current_str.push_str(&b);
-                },
+                }
 
                 NullCharacterToken => {
                     self.current_str.push('\0');
-                },
+                }
 
                 ParseError(_) => {
                     panic!("unexpected parse error");
-                },
+                }
 
                 TagToken(mut t) => {
                     // The spec seems to indicate that one can emit
@@ -1585,11 +1581,11 @@ mod test {
                     EndTag => {
                         t.self_closing = false;
                         t.attrs = vec![];
-                    },
+                    }
                    _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
                 }
                 self.push(TagToken(t), line_number);
-            },
+            }
 
             EOFToken => (),
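
Every hunk above is formatting-only: the import list is reordered alphabetically, a three-line method chain is joined onto one line, block-bodied match arms lose their trailing comma, and multi-line or-patterns and boolean chains move the `|` and `&&` operators from the end of a line to the start of the next. Nothing behavioral changes, which fits the near-balanced diffstat (78 insertions against 82 deletions). As a minimal, self-contained sketch of the conventions being adopted (the function and names below are invented for illustration and appear nowhere in the tokenizer), the new style looks like this:

    // Invented example; not part of src/tokenizer/mod.rs.
    fn classify(c: char, strict: bool) -> &'static str {
        match c {
            // Or-pattern: `|` now leads each continuation line instead of
            // trailing at the end of the previous one.
            '\t'
            | '\n'
            | '\x0C'
            | ' ' => "whitespace",

            // Block-bodied arm: closed by a bare `}` with no trailing comma
            // (the old style wrote `},`).
            '<' => {
                "tag-open"
            }

            // Boolean chain in a guard: `&&` now starts the continuation
            // line, mirroring the `exact_errors` hunk above.
            ch if strict
                && ch.is_control() =>
            {
                "control"
            }

            _ => "other",
        }
    }

    fn main() {
        assert_eq!(classify(' ', false), "whitespace");
        assert_eq!(classify('<', false), "tag-open");
        assert_eq!(classify('\u{1}', true), "control");
        assert_eq!(classify('x', true), "other");
    }

This appears consistent with rustfmt's defaults (no trailing comma after brace-closed arms, binary operators placed at the front of continuation lines), so the patch was most likely produced by running rustfmt rather than edited by hand.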