summary | refs | log | tree | commit | diff
path: root/src/tokenizer
diff options
context:
space:
mode:
Diffstat (limited to 'src/tokenizer')
-rw-r--r-- src/tokenizer/char_ref/mod.rs 30
-rw-r--r-- src/tokenizer/mod.rs 160
2 files changed, 93 insertions, 97 deletions
diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs
index 7b27bff..41f4c13 100644
--- a/src/tokenizer/char_ref/mod.rs
+++ b/src/tokenizer/char_ref/mod.rs
@@ -145,13 +145,13 @@ impl CharRefTokenizer {
tokenizer.discard_char(input);
self.state = Octothorpe;
Progress
- },
+ }
_ => {
self.state = Named;
self.name_buf_opt = Some(String::new());
Progress
- },
+ }
}
}
@@ -166,12 +166,12 @@ impl CharRefTokenizer {
tokenizer.discard_char(input);
self.hex_marker = Some(c);
self.state = Numeric(16);
- },
+ }
_ => {
self.hex_marker = None;
self.state = Numeric(10);
- },
+ }
}
Progress
}
@@ -195,14 +195,14 @@ impl CharRefTokenizer {
self.num = self.num.wrapping_add(n);
self.seen_digit = true;
Progress
- },
+ }
None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
None => {
self.state = NumericSemicolon;
Progress
- },
+ }
}
}
@@ -287,7 +287,7 @@ impl CharRefTokenizer {
}
// Otherwise we just have a prefix match.
Progress
- },
+ }
// Can't continue the match.
None => self.finish_named(tokenizer, input, Some(c)),
@@ -322,7 +322,7 @@ impl CharRefTokenizer {
// we emit a parse error.
self.state = BogusName;
return Progress;
- },
+ }
// Check length because &; is not a parse error.
Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),
@@ -331,7 +331,7 @@ impl CharRefTokenizer {
}
self.unconsume_name(input);
self.finish_none()
- },
+ }
Some((c1, c2)) => {
// We have a complete match, but we may have consumed
@@ -371,14 +371,14 @@ impl CharRefTokenizer {
"Equals sign after character reference in attribute",
));
true
- },
+ }
(Some(_), _, Some(c)) if is_ascii_alnum(c) => true,
_ => {
tokenizer.emit_error(Borrowed(
"Character reference does not end with semicolon",
));
false
- },
+ }
};
if unconsume_all {
@@ -392,7 +392,7 @@ impl CharRefTokenizer {
});
Done
}
- },
+ }
}
}
@@ -426,20 +426,20 @@ impl CharRefTokenizer {
Numeric(_) | NumericSemicolon => {
tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
self.finish_numeric(tokenizer);
- },
+ }
Named => drop(self.finish_named(tokenizer, input, None)),
BogusName => {
self.unconsume_name(input);
self.finish_none();
- },
+ }
Octothorpe => {
input.push_front(String::from("#"));
tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
self.finish_none();
- },
+ }
}
}
}
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index a58e388..bcbc6b7 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -9,9 +9,9 @@
//! The HTML5 tokenizer.
+pub use self::interface::{Attribute, Doctype, EndTag, StartTag, Tag, TagKind};
pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token};
-pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind, Attribute};
pub use self::interface::{TokenSink, TokenSinkResult};
use self::states::{DoctypeIdKind, Public, System};
@@ -168,9 +168,7 @@ pub struct Tokenizer<Sink> {
impl<Sink: TokenSink> Tokenizer<Sink> {
/// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
- let start_tag_name = opts
- .last_start_tag_name
- .take();
+ let start_tag_name = opts.last_start_tag_name.take();
let state = opts.initial_state.unwrap_or(states::Data);
let discard_bom = opts.discard_bom;
Tokenizer {
@@ -259,8 +257,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
self.current_line += 1;
}
- if self.opts.exact_errors &&
- match c as u32 {
+ if self.opts.exact_errors
+ && match c as u32 {
0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
n if (n & 0xFFFE) == 0xFFFE => true,
_ => false,
@@ -326,7 +324,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
self.temp_buf.push(c);
}
None
- },
+ }
Some(matched) => Some(matched),
}
}
@@ -343,7 +341,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
Some(x) => {
*x += dt;
false
- },
+ }
None => true,
};
if new {
@@ -410,7 +408,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
match self.current_tag_kind {
StartTag => {
self.last_start_tag_name = Some(name.clone());
- },
+ }
EndTag => {
if !self.current_tag_attrs.is_empty() {
self.emit_error(Borrowed("Attributes on an end tag"));
@@ -418,7 +416,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
if self.current_tag_self_closing {
self.emit_error(Borrowed("Self-closing end tag"));
}
- },
+ }
}
let token = TagToken(Tag {
@@ -433,15 +431,15 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
TokenSinkResult::Plaintext => {
self.state = states::Plaintext;
ProcessResult::Continue
- },
+ }
TokenSinkResult::Script(node) => {
self.state = states::Data;
ProcessResult::Script(node)
- },
+ }
TokenSinkResult::RawData(kind) => {
self.state = states::RawData(kind);
ProcessResult::Continue
- },
+ }
}
}
@@ -496,9 +494,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
// FIXME: linear time search, do we care?
let dup = {
let name = &*self.current_attr_name;
- self.current_tag_attrs
- .iter()
- .any(|a| &*a.name == name)
+ self.current_tag_attrs.iter().any(|a| &*a.name == name)
};
if dup {
@@ -740,7 +736,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped),
FromSet('<') => {
go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped)
- },
+ }
FromSet(c) => go!(self: emit c),
NotFromSet(b) => self.emit_chars(b),
}
@@ -774,7 +770,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
'>' => go!(self: error; to Data),
'\0' => {
go!(self: error; clear_comment; push_comment '\u{fffd}'; to BogusComment)
- },
+ }
c => match lower_ascii_letter(c) {
Some(cl) => go!(self: create_tag EndTag cl; to TagName),
None => go!(self: error; clear_comment; push_comment c; to BogusComment),
@@ -820,7 +816,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
'/' => go!(self: clear_temp; to RawEndTagOpen kind),
'!' if kind == ScriptData => {
go!(self: emit '<'; emit '!'; to ScriptDataEscapeStart Escaped)
- },
+ }
_ => go!(self: emit '<'; reconsume RawData kind),
}
},
@@ -850,7 +846,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
Some(cl) => go!(self: push_tag cl; push_temp c),
None => {
go!(self: discard_tag; emit '<'; emit '/'; emit_temp; reconsume RawData kind)
- },
+ }
}
},
@@ -865,7 +861,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
Escaped
};
go!(self: emit c; to RawData ScriptDataEscaped esc);
- },
+ }
_ => match lower_ascii_letter(c) {
Some(cl) => go!(self: push_temp cl; emit c),
None => go!(self: reconsume RawData ScriptDataEscaped Escaped),
@@ -898,7 +894,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
go!(self: emit '<');
}
go!(self: to RawLessThanSign ScriptDataEscaped kind);
- },
+ }
'\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
c => go!(self: emit c; to RawData ScriptDataEscaped kind),
}
@@ -913,7 +909,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
go!(self: emit '<');
}
go!(self: to RawLessThanSign ScriptDataEscaped kind);
- },
+ }
'>' => go!(self: emit '>'; to RawData ScriptData),
'\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
c => go!(self: emit c; to RawData ScriptDataEscaped kind),
@@ -931,7 +927,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
DoubleEscaped
};
go!(self: emit c; to RawData ScriptDataEscaped esc);
- },
+ }
_ => match lower_ascii_letter(c) {
Some(cl) => go!(self: push_temp cl; emit c),
None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
@@ -952,7 +948,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
go_match!(self: c,
'"' , '\'' , '<' , '=' => error);
go!(self: create_attr c; to AttributeName);
- },
+ }
},
}
},
@@ -971,7 +967,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
go_match!(self: c,
'"' , '\'' , '<' => error);
go!(self: push_name c);
- },
+ }
},
}
},
@@ -990,7 +986,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
go_match!(self: c,
'"' , '\'' , '<' => error);
go!(self: create_attr c; to AttributeName);
- },
+ }
},
}
},
@@ -1005,7 +1001,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
'\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
'\0' => {
go!(self: discard_char input; error; push_value '\u{fffd}'; to AttributeValue Unquoted)
- },
+ }
'>' => go!(self: discard_char input; error; emit_tag Data),
_ => go!(self: to AttributeValue Unquoted),
}
@@ -1042,7 +1038,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
) {
FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
go!(self: to BeforeAttributeName)
- },
+ }
FromSet('&') => go!(self: consume_char_ref '>'),
FromSet('>') => go!(self: emit_tag Data),
FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
@@ -1050,7 +1046,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
go_match!(self: c,
'"' , '\'' , '<' , '=' , '`' => error);
go!(self: push_value c);
- },
+ }
NotFromSet(ref b) => go!(self: append_value b),
}
},
@@ -1071,7 +1067,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
'>' => {
self.current_tag_self_closing = true;
go!(self: emit_tag Data);
- },
+ }
_ => go!(self: error; reconsume BeforeAttributeName),
}
},
@@ -1149,7 +1145,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
'\t' | '\n' | '\x0C' | ' ' => (),
'\0' => {
go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName)
- },
+ }
'>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data),
c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
to DoctypeName),
@@ -1187,10 +1183,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
'\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind),
'"' => {
go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind)
- },
+ }
'\'' => {
go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind)
- },
+ }
'>' => go!(self: error; force_quirks; emit_doctype; to Data),
_ => go!(self: error; force_quirks; to BogusDoctype),
}
@@ -1232,14 +1228,14 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
match get_char!(self, input) {
'\t' | '\n' | '\x0C' | ' ' => {
go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
- },
+ }
'>' => go!(self: emit_doctype; to Data),
'"' => {
go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
- },
+ }
'\'' => {
go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
- },
+ }
_ => go!(self: error; force_quirks; to BogusDoctype),
}
},
@@ -1260,10 +1256,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
'>' => go!(self: emit_doctype; to Data),
'"' => {
go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
- },
+ }
'\'' => {
go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
- },
+ }
_ => go!(self: error; force_quirks; to BogusDoctype),
}
},
@@ -1341,7 +1337,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
char_ref::Done => {
self.process_char_ref(tok.get_result());
return ProcessResult::Continue;
- },
+ }
char_ref::Stuck => ProcessResult::Suspend,
char_ref::Progress => ProcessResult::Continue,
@@ -1387,7 +1383,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
Some(mut tok) => {
tok.end_of_file(self, &mut input);
self.process_char_ref(tok.get_result());
- },
+ }
}
// Process all remaining buffered input.
@@ -1432,23 +1428,23 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
fn eof_step(&mut self) -> ProcessResult<Sink::Handle> {
match self.state {
- states::Data |
- states::RawData(Rcdata) |
- states::RawData(Rawtext) |
- states::RawData(ScriptData) |
- states::Plaintext => go!(self: eof),
-
- states::TagName |
- states::RawData(ScriptDataEscaped(_)) |
- states::BeforeAttributeName |
- states::AttributeName |
- states::AfterAttributeName |
- states::BeforeAttributeValue |
- states::AttributeValue(_) |
- states::AfterAttributeValueQuoted |
- states::SelfClosingStartTag |
- states::ScriptDataEscapedDash(_) |
- states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data),
+ states::Data
+ | states::RawData(Rcdata)
+ | states::RawData(Rawtext)
+ | states::RawData(ScriptData)
+ | states::Plaintext => go!(self: eof),
+
+ states::TagName
+ | states::RawData(ScriptDataEscaped(_))
+ | states::BeforeAttributeName
+ | states::AttributeName
+ | states::AfterAttributeName
+ | states::BeforeAttributeValue
+ | states::AttributeValue(_)
+ | states::AfterAttributeValueQuoted
+ | states::SelfClosingStartTag
+ | states::ScriptDataEscapedDash(_)
+ | states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data),
states::TagOpen => go!(self: error_eof; emit '<'; to Data),
@@ -1456,7 +1452,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
go!(self: to RawData ScriptDataEscaped DoubleEscaped)
- },
+ }
states::RawLessThanSign(kind) => go!(self: emit '<'; to RawData kind),
@@ -1464,7 +1460,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
states::RawEndTagName(kind) => {
go!(self: emit '<'; emit '/'; emit_temp; to RawData kind)
- },
+ }
states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),
@@ -1472,29 +1468,29 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
states::ScriptDataDoubleEscapeEnd => {
go!(self: to RawData ScriptDataEscaped DoubleEscaped)
- },
+ }
- states::CommentStart |
- states::CommentStartDash |
- states::Comment |
- states::CommentEndDash |
- states::CommentEnd |
- states::CommentEndBang => go!(self: error_eof; emit_comment; to Data),
+ states::CommentStart
+ | states::CommentStartDash
+ | states::Comment
+ | states::CommentEndDash
+ | states::CommentEnd
+ | states::CommentEndBang => go!(self: error_eof; emit_comment; to Data),
states::Doctype | states::BeforeDoctypeName => {
go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data)
- },
+ }
- states::DoctypeName |
- states::AfterDoctypeName |
- states::AfterDoctypeKeyword(_) |
- states::BeforeDoctypeIdentifier(_) |
- states::DoctypeIdentifierDoubleQuoted(_) |
- states::DoctypeIdentifierSingleQuoted(_) |
- states::AfterDoctypeIdentifier(_) |
- states::BetweenDoctypePublicAndSystemIdentifiers => {
+ states::DoctypeName
+ | states::AfterDoctypeName
+ | states::AfterDoctypeKeyword(_)
+ | states::BeforeDoctypeIdentifier(_)
+ | states::DoctypeIdentifierDoubleQuoted(_)
+ | states::DoctypeIdentifierSingleQuoted(_)
+ | states::AfterDoctypeIdentifier(_)
+ | states::BetweenDoctypePublicAndSystemIdentifiers => {
go!(self: error_eof; force_quirks; emit_doctype; to Data)
- },
+ }
states::BogusDoctype => go!(self: emit_doctype; to Data),
@@ -1567,15 +1563,15 @@ mod test {
match token {
CharacterTokens(b) => {
self.current_str.push_str(&b);
- },
+ }
NullCharacterToken => {
self.current_str.push('\0');
- },
+ }
ParseError(_) => {
panic!("unexpected parse error");
- },
+ }
TagToken(mut t) => {
// The spec seems to indicate that one can emit
@@ -1585,11 +1581,11 @@ mod test {
EndTag => {
t.self_closing = false;
t.attrs = vec![];
- },
+ }
_ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
}
self.push(TagToken(t), line_number);
- },
+ }
EOFToken => (),