-rw-r--r--  examples/tokenize.rs           12
-rw-r--r--  src/lib.rs                      4
-rw-r--r--  src/macros.rs                   2
-rw-r--r--  src/tokenizer/char_ref/mod.rs  30
-rw-r--r--  src/tokenizer/mod.rs          160
-rw-r--r--  src/util/buffer_queue.rs       12
-rw-r--r--  src/util/str.rs                 2
7 files changed, 109 insertions(+), 113 deletions(-)
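
Every hunk below is a mechanical formatting change; none alters behavior. The recurring edits are: dropping the trailing comma after block-bodied match arms (`},` becomes `}`), moving the `|` of multi-line or-patterns and binary operators such as `&&` to the start of the continuation line, reflowing short expressions and method chains, reordering imports and `mod` declarations alphabetically, and adding missing final newlines. A minimal sketch of the match and operator conventions, using hypothetical names rather than code from this crate:

// Illustrative sketch only: hypothetical names, not code from this crate.
enum State {
    Data,
    Plaintext,
    TagOpen,
    Eof,
}

fn describe(state: State, exact_errors: bool) -> &'static str {
    match state {
        // Or-patterns now put `|` at the start of each continuation line
        // (previously the `|` trailed the line above).
        State::Data
        | State::Plaintext => "character data",
        // Arms whose body is a block lose the trailing comma (was `},`).
        State::TagOpen => {
            "tag open"
        }
        // Broken binary expressions lead with the operator
        // (previously `exact_errors &&` ended the first line).
        State::Eof if exact_errors
            && cfg!(debug_assertions) => "eof, exact errors",
        State::Eof => "eof",
    }
}

fn main() {
    for state in [State::Data, State::Plaintext, State::TagOpen, State::Eof] {
        println!("{}", describe(state, true));
    }
}

Both the before and after forms compile identically; the diff only moves tokens.
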
diff --git a/examples/tokenize.rs b/examples/tokenize.rs
index dc3b476..bea6a84 100644
--- a/examples/tokenize.rs
+++ b/examples/tokenize.rs
@@ -14,9 +14,7 @@ use std::io;
use html5tokenizer::BufferQueue;
use html5tokenizer::{CharacterTokens, EndTag, NullCharacterToken, StartTag, TagToken};
-use html5tokenizer::{
- ParseError, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
-};
+use html5tokenizer::{ParseError, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
use io::Read;
#[derive(Copy, Clone)]
@@ -49,7 +47,7 @@ impl TokenSink for TokenPrinter {
for c in b.chars() {
self.do_char(c);
}
- },
+ }
NullCharacterToken => self.do_char('\0'),
TagToken(tag) => {
self.is_char(false);
@@ -68,15 +66,15 @@ impl TokenSink for TokenPrinter {
print!(" \x1b[31m/\x1b[0m");
}
println!(">");
- },
+ }
ParseError(err) => {
self.is_char(false);
println!("ERROR: {}", err);
- },
+ }
_ => {
self.is_char(false);
println!("OTHER: {:?}", token);
- },
+ }
}
TokenSinkResult::Continue
}
diff --git a/src/lib.rs b/src/lib.rs
index 5e6a620..69557b9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -29,12 +29,12 @@ macro_rules! small_char_set ( ($($e:expr)+) => (
));
mod util {
- pub mod str;
pub mod buffer_queue;
pub mod smallcharset;
+ pub mod str;
}
mod tokenizer;
#[doc(inline)]
-pub use tokenizer::*;
\ No newline at end of file
+pub use tokenizer::*;
diff --git a/src/macros.rs b/src/macros.rs
index 643e754..d87ea98 100644
--- a/src/macros.rs
+++ b/src/macros.rs
@@ -50,4 +50,4 @@ macro_rules! format_if {
::std::borrow::Cow::Borrowed($borrowed)
}
}
-}
\ No newline at end of file
+}
diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs
index 7b27bff..41f4c13 100644
--- a/src/tokenizer/char_ref/mod.rs
+++ b/src/tokenizer/char_ref/mod.rs
@@ -145,13 +145,13 @@ impl CharRefTokenizer {
tokenizer.discard_char(input);
self.state = Octothorpe;
Progress
- },
+ }
_ => {
self.state = Named;
self.name_buf_opt = Some(String::new());
Progress
- },
+ }
}
}
@@ -166,12 +166,12 @@ impl CharRefTokenizer {
tokenizer.discard_char(input);
self.hex_marker = Some(c);
self.state = Numeric(16);
- },
+ }
_ => {
self.hex_marker = None;
self.state = Numeric(10);
- },
+ }
}
Progress
}
@@ -195,14 +195,14 @@ impl CharRefTokenizer {
self.num = self.num.wrapping_add(n);
self.seen_digit = true;
Progress
- },
+ }
None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
None => {
self.state = NumericSemicolon;
Progress
- },
+ }
}
}
@@ -287,7 +287,7 @@ impl CharRefTokenizer {
}
// Otherwise we just have a prefix match.
Progress
- },
+ }
// Can't continue the match.
None => self.finish_named(tokenizer, input, Some(c)),
@@ -322,7 +322,7 @@ impl CharRefTokenizer {
// we emit a parse error.
self.state = BogusName;
return Progress;
- },
+ }
// Check length because &; is not a parse error.
Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),
@@ -331,7 +331,7 @@ impl CharRefTokenizer {
}
self.unconsume_name(input);
self.finish_none()
- },
+ }
Some((c1, c2)) => {
// We have a complete match, but we may have consumed
@@ -371,14 +371,14 @@ impl CharRefTokenizer {
"Equals sign after character reference in attribute",
));
true
- },
+ }
(Some(_), _, Some(c)) if is_ascii_alnum(c) => true,
_ => {
tokenizer.emit_error(Borrowed(
"Character reference does not end with semicolon",
));
false
- },
+ }
};
if unconsume_all {
@@ -392,7 +392,7 @@ impl CharRefTokenizer {
});
Done
}
- },
+ }
}
}
@@ -426,20 +426,20 @@ impl CharRefTokenizer {
Numeric(_) | NumericSemicolon => {
tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
self.finish_numeric(tokenizer);
- },
+ }
Named => drop(self.finish_named(tokenizer, input, None)),
BogusName => {
self.unconsume_name(input);
self.finish_none();
- },
+ }
Octothorpe => {
input.push_front(String::from("#"));
tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
self.finish_none();
- },
+ }
}
}
}
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index a58e388..bcbc6b7 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -9,9 +9,9 @@
//! The HTML5 tokenizer.
+pub use self::interface::{Attribute, Doctype, EndTag, StartTag, Tag, TagKind};
pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token};
-pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind, Attribute};
pub use self::interface::{TokenSink, TokenSinkResult};
use self::states::{DoctypeIdKind, Public, System};
@@ -168,9 +168,7 @@ pub struct Tokenizer<Sink> {
impl<Sink: TokenSink> Tokenizer<Sink> {
/// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
- let start_tag_name = opts
- .last_start_tag_name
- .take();
+ let start_tag_name = opts.last_start_tag_name.take();
let state = opts.initial_state.unwrap_or(states::Data);
let discard_bom = opts.discard_bom;
Tokenizer {
@@ -259,8 +257,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
self.current_line += 1;
}
- if self.opts.exact_errors &&
- match c as u32 {
+ if self.opts.exact_errors
+ && match c as u32 {
0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
n if (n & 0xFFFE) == 0xFFFE => true,
_ => false,
@@ -326,7 +324,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
self.temp_buf.push(c);
}
None
- },
+ }
Some(matched) => Some(matched),
}
}
@@ -343,7 +341,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
Some(x) => {
*x += dt;
false
- },
+ }
None => true,
};
if new {
@@ -410,7 +408,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
match self.current_tag_kind {
StartTag => {
self.last_start_tag_name = Some(name.clone());
- },
+ }
EndTag => {
if !self.current_tag_attrs.is_empty() {
self.emit_error(Borrowed("Attributes on an end tag"));
@@ -418,7 +416,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
if self.current_tag_self_closing {
self.emit_error(Borrowed("Self-closing end tag"));
}
- },
+ }
}
let token = TagToken(Tag {
@@ -433,15 +431,15 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
TokenSinkResult::Plaintext => {
self.state = states::Plaintext;
ProcessResult::Continue
- },
+ }
TokenSinkResult::Script(node) => {
self.state = states::Data;
ProcessResult::Script(node)
- },
+ }
TokenSinkResult::RawData(kind) => {
self.state = states::RawData(kind);
ProcessResult::Continue
- },
+ }
}
}
@@ -496,9 +494,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
// FIXME: linear time search, do we care?
let dup = {
let name = &*self.current_attr_name;
- self.current_tag_attrs
- .iter()
- .any(|a| &*a.name == name)
+ self.current_tag_attrs.iter().any(|a| &*a.name == name)
};
if dup {
@@ -740,7 +736,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped),
FromSet('<') => {
go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped)
- },
+ }
FromSet(c) => go!(self: emit c),
NotFromSet(b) => self.emit_chars(b),
}
@@ -774,7 +770,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
'>' => go!(self: error; to Data),
'\0' => {
go!(self: error; clear_comment; push_comment '\u{fffd}'; to BogusComment)
- },
+ }
c => match lower_ascii_letter(c) {
Some(cl) => go!(self: create_tag EndTag cl; to TagName),
None => go!(self: error; clear_comment; push_comment c; to BogusComment),
@@ -820,7 +816,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
'/' => go!(self: clear_temp; to RawEndTagOpen kind),
'!' if kind == ScriptData => {
go!(self: emit '<'; emit '!'; to ScriptDataEscapeStart Escaped)
- },
+ }
_ => go!(self: emit '<'; reconsume RawData kind),
}
},
@@ -850,7 +846,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
Some(cl) => go!(self: push_tag cl; push_temp c),
None => {
go!(self: discard_tag; emit '<'; emit '/'; emit_temp; reconsume RawData kind)
- },
+ }
}
},
@@ -865,7 +861,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
Escaped
};
go!(self: emit c; to RawData ScriptDataEscaped esc);
- },
+ }
_ => match lower_ascii_letter(c) {
Some(cl) => go!(self: push_temp cl; emit c),
None => go!(self: reconsume RawData ScriptDataEscaped Escaped),
@@ -898,7 +894,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
go!(self: emit '<');
}
go!(self: to RawLessThanSign ScriptDataEscaped kind);
- },
+ }
'\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
c => go!(self: emit c; to RawData ScriptDataEscaped kind),
}
@@ -913,7 +909,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
go!(self: emit '<');
}
go!(self: to RawLessThanSign ScriptDataEscaped kind);
- },
+ }
'>' => go!(self: emit '>'; to RawData ScriptData),
'\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
c => go!(self: emit c; to RawData ScriptDataEscaped kind),
@@ -931,7 +927,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
DoubleEscaped
};
go!(self: emit c; to RawData ScriptDataEscaped esc);
- },
+ }
_ => match lower_ascii_letter(c) {
Some(cl) => go!(self: push_temp cl; emit c),
None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
@@ -952,7 +948,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
go_match!(self: c,
'"' , '\'' , '<' , '=' => error);
go!(self: create_attr c; to AttributeName);
- },
+ }
},
}
},
@@ -971,7 +967,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
go_match!(self: c,
'"' , '\'' , '<' => error);
go!(self: push_name c);
- },
+ }
},
}
},
@@ -990,7 +986,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
go_match!(self: c,
'"' , '\'' , '<' => error);
go!(self: create_attr c; to AttributeName);
- },
+ }
},
}
},
@@ -1005,7 +1001,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
'\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
'\0' => {
go!(self: discard_char input; error; push_value '\u{fffd}'; to AttributeValue Unquoted)
- },
+ }
'>' => go!(self: discard_char input; error; emit_tag Data),
_ => go!(self: to AttributeValue Unquoted),
}
@@ -1042,7 +1038,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
) {
FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
go!(self: to BeforeAttributeName)
- },
+ }
FromSet('&') => go!(self: consume_char_ref '>'),
FromSet('>') => go!(self: emit_tag Data),
FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
@@ -1050,7 +1046,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
go_match!(self: c,
'"' , '\'' , '<' , '=' , '`' => error);
go!(self: push_value c);
- },
+ }
NotFromSet(ref b) => go!(self: append_value b),
}
},
@@ -1071,7 +1067,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
'>' => {
self.current_tag_self_closing = true;
go!(self: emit_tag Data);
- },
+ }
_ => go!(self: error; reconsume BeforeAttributeName),
}
},
@@ -1149,7 +1145,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
'\t' | '\n' | '\x0C' | ' ' => (),
'\0' => {
go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName)
- },
+ }
'>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data),
c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
to DoctypeName),
@@ -1187,10 +1183,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
'\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind),
'"' => {
go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind)
- },
+ }
'\'' => {
go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind)
- },
+ }
'>' => go!(self: error; force_quirks; emit_doctype; to Data),
_ => go!(self: error; force_quirks; to BogusDoctype),
}
@@ -1232,14 +1228,14 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
match get_char!(self, input) {
'\t' | '\n' | '\x0C' | ' ' => {
go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
- },
+ }
'>' => go!(self: emit_doctype; to Data),
'"' => {
go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
- },
+ }
'\'' => {
go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
- },
+ }
_ => go!(self: error; force_quirks; to BogusDoctype),
}
},
@@ -1260,10 +1256,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
'>' => go!(self: emit_doctype; to Data),
'"' => {
go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
- },
+ }
'\'' => {
go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
- },
+ }
_ => go!(self: error; force_quirks; to BogusDoctype),
}
},
@@ -1341,7 +1337,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
char_ref::Done => {
self.process_char_ref(tok.get_result());
return ProcessResult::Continue;
- },
+ }
char_ref::Stuck => ProcessResult::Suspend,
char_ref::Progress => ProcessResult::Continue,
@@ -1387,7 +1383,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
Some(mut tok) => {
tok.end_of_file(self, &mut input);
self.process_char_ref(tok.get_result());
- },
+ }
}
// Process all remaining buffered input.
@@ -1432,23 +1428,23 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
fn eof_step(&mut self) -> ProcessResult<Sink::Handle> {
match self.state {
- states::Data |
- states::RawData(Rcdata) |
- states::RawData(Rawtext) |
- states::RawData(ScriptData) |
- states::Plaintext => go!(self: eof),
-
- states::TagName |
- states::RawData(ScriptDataEscaped(_)) |
- states::BeforeAttributeName |
- states::AttributeName |
- states::AfterAttributeName |
- states::BeforeAttributeValue |
- states::AttributeValue(_) |
- states::AfterAttributeValueQuoted |
- states::SelfClosingStartTag |
- states::ScriptDataEscapedDash(_) |
- states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data),
+ states::Data
+ | states::RawData(Rcdata)
+ | states::RawData(Rawtext)
+ | states::RawData(ScriptData)
+ | states::Plaintext => go!(self: eof),
+
+ states::TagName
+ | states::RawData(ScriptDataEscaped(_))
+ | states::BeforeAttributeName
+ | states::AttributeName
+ | states::AfterAttributeName
+ | states::BeforeAttributeValue
+ | states::AttributeValue(_)
+ | states::AfterAttributeValueQuoted
+ | states::SelfClosingStartTag
+ | states::ScriptDataEscapedDash(_)
+ | states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data),
states::TagOpen => go!(self: error_eof; emit '<'; to Data),
@@ -1456,7 +1452,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
go!(self: to RawData ScriptDataEscaped DoubleEscaped)
- },
+ }
states::RawLessThanSign(kind) => go!(self: emit '<'; to RawData kind),
@@ -1464,7 +1460,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
states::RawEndTagName(kind) => {
go!(self: emit '<'; emit '/'; emit_temp; to RawData kind)
- },
+ }
states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),
@@ -1472,29 +1468,29 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
states::ScriptDataDoubleEscapeEnd => {
go!(self: to RawData ScriptDataEscaped DoubleEscaped)
- },
+ }
- states::CommentStart |
- states::CommentStartDash |
- states::Comment |
- states::CommentEndDash |
- states::CommentEnd |
- states::CommentEndBang => go!(self: error_eof; emit_comment; to Data),
+ states::CommentStart
+ | states::CommentStartDash
+ | states::Comment
+ | states::CommentEndDash
+ | states::CommentEnd
+ | states::CommentEndBang => go!(self: error_eof; emit_comment; to Data),
states::Doctype | states::BeforeDoctypeName => {
go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data)
- },
+ }
- states::DoctypeName |
- states::AfterDoctypeName |
- states::AfterDoctypeKeyword(_) |
- states::BeforeDoctypeIdentifier(_) |
- states::DoctypeIdentifierDoubleQuoted(_) |
- states::DoctypeIdentifierSingleQuoted(_) |
- states::AfterDoctypeIdentifier(_) |
- states::BetweenDoctypePublicAndSystemIdentifiers => {
+ states::DoctypeName
+ | states::AfterDoctypeName
+ | states::AfterDoctypeKeyword(_)
+ | states::BeforeDoctypeIdentifier(_)
+ | states::DoctypeIdentifierDoubleQuoted(_)
+ | states::DoctypeIdentifierSingleQuoted(_)
+ | states::AfterDoctypeIdentifier(_)
+ | states::BetweenDoctypePublicAndSystemIdentifiers => {
go!(self: error_eof; force_quirks; emit_doctype; to Data)
- },
+ }
states::BogusDoctype => go!(self: emit_doctype; to Data),
@@ -1567,15 +1563,15 @@ mod test {
match token {
CharacterTokens(b) => {
self.current_str.push_str(&b);
- },
+ }
NullCharacterToken => {
self.current_str.push('\0');
- },
+ }
ParseError(_) => {
panic!("unexpected parse error");
- },
+ }
TagToken(mut t) => {
// The spec seems to indicate that one can emit
@@ -1585,11 +1581,11 @@ mod test {
EndTag => {
t.self_closing = false;
t.attrs = vec![];
- },
+ }
_ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
}
self.push(TagToken(t), line_number);
- },
+ }
EOFToken => (),
diff --git a/src/util/buffer_queue.rs b/src/util/buffer_queue.rs
index 5201a57..b741b36 100644
--- a/src/util/buffer_queue.rs
+++ b/src/util/buffer_queue.rs
@@ -66,7 +66,7 @@ impl BufferQueue {
#[inline]
pub fn pop_front(&mut self) -> Option<String> {
if let Some((i, s)) = self.buffers.pop_front() {
- return Some(s[i..].into())
+ return Some(s[i..].into());
}
None
// self.buffers.pop_front().map(|(i, s)| &s[i..])
@@ -101,7 +101,9 @@ impl BufferQueue {
.is_none(),
"invariant \"all buffers in the queue are non-empty\" failed"
);
- self.buffers.front().map(|(i, s)| s[*i..].chars().next().unwrap())
+ self.buffers
+ .front()
+ .map(|(i, s)| s[*i..].chars().next().unwrap())
}
/// Get the next character if one is available, removing it from the queue.
@@ -114,7 +116,7 @@ impl BufferQueue {
let c = &buf[*i..].chars().next().expect("empty buffer in queue");
*i += c.len_utf8();
(Some(*c), buf[*i..].is_empty())
- },
+ }
};
if now_empty {
@@ -139,7 +141,7 @@ impl BufferQueue {
*i += c.len_utf8();
(Some(FromSet(*c)), buf[*i..].is_empty())
}
- },
+ }
};
// Unborrow self for this part.
@@ -188,7 +190,7 @@ impl BufferQueue {
None => assert_eq!(consumed_from_last, 0),
Some((i, _buf)) => {
*i += consumed_from_last;
- },
+ }
}
Some(true)
diff --git a/src/util/str.rs b/src/util/str.rs
index c3185a0..bf38fe1 100644
--- a/src/util/str.rs
+++ b/src/util/str.rs
@@ -35,7 +35,7 @@ mod test {
fn $name() {
assert_eq!($left, $right);
}
- }
+ };
}
test_eq!(lower_letter_a_is_a, lower_ascii_letter('a'), Some('a'));
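
The pattern across all seven files — block arms without trailing commas, leading `|` and `&&`, alphabetized imports and modules, restored final newlines — matches rustfmt's default output (`match_block_trailing_comma = false`, with `reorder_imports` and `reorder_modules` on by default), so this commit is most plausibly a plain `cargo fmt` pass; that is an inference from the hunks, since the commit message is not shown here.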