diff options
author | Martin Fischer <martin@push-f.com> | 2021-11-28 10:07:00 +0100 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2021-11-29 15:11:01 +0100 |
commit | 2b84e92f4f0981b099f30759330d00a73c90dee8 (patch) | |
tree | 31d39077aa50944ff9bbdd784f316b5811d9c31c /src/tokenizer/mod.rs | |
parent | 8c3be2ac471edc6f24059d552e82fb5acc3d7cc2 (diff) |
refactor: split up go! macro calls
#!/usr/bin/env python3
r"""Split combined ``go!`` macro calls in src/tokenizer/mod.rs into one call per action.

Rewrites e.g. ``go!(self: error; emit '\0')`` into a braced block of separate
``go!(self: ...);`` statements. A ``reconsume <State>`` action is desugared into
an explicit ``self.reconsume = true;`` followed by ``go!(self: to <State>);``.
After rewriting, ``cargo fmt`` is invoked to normalize the result.
"""
import re
import subprocess

# Raw string: '\(' in a plain literal is an invalid escape sequence (SyntaxError
# in future Python). Compiled once instead of re-parsing the pattern per line.
# Greedy (.*) is intentional — it must extend to the LAST ')' on the line so
# nested parens such as `push_tag (c.to_ascii_lowercase())` are captured whole.
GO_CALL = re.compile(r'go!\(self:(.*)\)')


def split_go(match):
    """Expand one matched combined go! call into a braced block of single calls.

    ``match`` is a regex match whose group(1) is the semicolon-separated
    action list (each action keeps its leading space from the source).
    Returns the replacement text, e.g. ``{go!(self: error);go!(self: emit c);}``.
    """
    inner = match.group(1)
    stmts = inner.split(';')
    text = '{'
    for stmt in stmts:
        # ` reconsume <State>` becomes the flag assignment plus a plain `to` transition.
        if stmt.startswith(' reconsume '):
            text += 'self.reconsume = true;'
            stmt = ' to' + stmt[len(' reconsume'):]
        text += 'go!(self:{});'.format(stmt)
    return text + '}'


def main():
    """Rewrite the tokenizer source in place, then reformat it with cargo fmt."""
    text = ''
    with open('src/tokenizer/mod.rs') as f:
        for line in f:
            if '$me:ident' in line:
                # Leave the macro_rules! definition of go! itself untouched.
                text += line
            else:
                text += GO_CALL.sub(split_go, line)
    with open('src/tokenizer/mod.rs', 'w') as f:
        f.write(text)
    subprocess.run(['cargo', 'fmt'])


if __name__ == '__main__':
    main()
Diffstat (limited to 'src/tokenizer/mod.rs')
-rw-r--r-- | src/tokenizer/mod.rs | 1062 |
1 files changed, 841 insertions, 221 deletions
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index a8ec39f..fc8bd7f 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -676,10 +676,19 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ data-state states::Data => loop { match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) { - FromSet('\0') => go!(self: error; emit '\0'), - FromSet('&') => go!(self: consume_char_ref), - FromSet('<') => go!(self: to TagOpen), - FromSet(c) => go!(self: emit c), + FromSet('\0') => { + go!(self: error); + go!(self: emit '\0'); + } + FromSet('&') => { + go!(self: consume_char_ref); + } + FromSet('<') => { + go!(self: to TagOpen); + } + FromSet(c) => { + go!(self: emit c); + } NotFromSet(b) => self.emit_chars(b), } }, @@ -687,10 +696,19 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ rcdata-state states::RawData(Rcdata) => loop { match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) { - FromSet('\0') => go!(self: error; emit '\u{fffd}'), - FromSet('&') => go!(self: consume_char_ref), - FromSet('<') => go!(self: to RawLessThanSign Rcdata), - FromSet(c) => go!(self: emit c), + FromSet('\0') => { + go!(self: error); + go!(self: emit '\u{fffd}'); + } + FromSet('&') => { + go!(self: consume_char_ref); + } + FromSet('<') => { + go!(self: to RawLessThanSign Rcdata); + } + FromSet(c) => { + go!(self: emit c); + } NotFromSet(b) => self.emit_chars(b), } }, @@ -698,9 +716,16 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ rawtext-state states::RawData(Rawtext) => loop { match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) { - FromSet('\0') => go!(self: error; emit '\u{fffd}'), - FromSet('<') => go!(self: to RawLessThanSign Rawtext), - FromSet(c) => go!(self: emit c), + FromSet('\0') => { + go!(self: error); + go!(self: emit '\u{fffd}'); + } + FromSet('<') => { + go!(self: to RawLessThanSign Rawtext); + } + FromSet(c) => { + go!(self: emit c); + } NotFromSet(b) => self.emit_chars(b), } }, @@ -708,9 +733,16 @@ 
impl<Sink: TokenSink> Tokenizer<Sink> { //§ script-data-state states::RawData(ScriptData) => loop { match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) { - FromSet('\0') => go!(self: error; emit '\u{fffd}'), - FromSet('<') => go!(self: to RawLessThanSign ScriptData), - FromSet(c) => go!(self: emit c), + FromSet('\0') => { + go!(self: error); + go!(self: emit '\u{fffd}'); + } + FromSet('<') => { + go!(self: to RawLessThanSign ScriptData); + } + FromSet(c) => { + go!(self: emit c); + } NotFromSet(b) => self.emit_chars(b), } }, @@ -718,10 +750,20 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ script-data-escaped-state states::RawData(ScriptDataEscaped(Escaped)) => loop { match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) { - FromSet('\0') => go!(self: error; emit '\u{fffd}'), - FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash Escaped), - FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped), - FromSet(c) => go!(self: emit c), + FromSet('\0') => { + go!(self: error); + go!(self: emit '\u{fffd}'); + } + FromSet('-') => { + go!(self: emit '-'); + go!(self: to ScriptDataEscapedDash Escaped); + } + FromSet('<') => { + go!(self: to RawLessThanSign ScriptDataEscaped Escaped); + } + FromSet(c) => { + go!(self: emit c); + } NotFromSet(b) => self.emit_chars(b), } }, @@ -729,12 +771,21 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ script-data-double-escaped-state states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop { match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) { - FromSet('\0') => go!(self: error; emit '\u{fffd}'), - FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped), + FromSet('\0') => { + go!(self: error); + go!(self: emit '\u{fffd}'); + } + FromSet('-') => { + go!(self: emit '-'); + go!(self: to ScriptDataEscapedDash DoubleEscaped); + } FromSet('<') => { - go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped) + 
go!(self: emit '<'); + go!(self: to RawLessThanSign ScriptDataEscaped DoubleEscaped); + } + FromSet(c) => { + go!(self: emit c); } - FromSet(c) => go!(self: emit c), NotFromSet(b) => self.emit_chars(b), } }, @@ -742,8 +793,13 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ plaintext-state states::Plaintext => loop { match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) { - FromSet('\0') => go!(self: error; emit '\u{fffd}'), - FromSet(c) => go!(self: emit c), + FromSet('\0') => { + go!(self: error); + go!(self: emit '\u{fffd}'); + } + FromSet(c) => { + go!(self: emit c); + } NotFromSet(b) => self.emit_chars(b), } }, @@ -751,12 +807,30 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ tag-open-state states::TagOpen => loop { match get_char!(self, input) { - '!' => go!(self: clear_temp; to MarkupDeclarationOpen), - '/' => go!(self: to EndTagOpen), - '?' => go!(self: error; clear_comment; push_comment '?'; to BogusComment), + '!' => { + go!(self: clear_temp); + go!(self: to MarkupDeclarationOpen); + } + '/' => { + go!(self: to EndTagOpen); + } + '?' 
=> { + go!(self: error); + go!(self: clear_comment); + go!(self: push_comment '?'); + go!(self: to BogusComment); + } c => match lower_ascii_letter(c) { - Some(cl) => go!(self: create_tag StartTag cl; to TagName), - None => go!(self: error; emit '<'; reconsume Data), + Some(cl) => { + go!(self: create_tag StartTag cl); + go!(self: to TagName); + } + None => { + go!(self: error); + go!(self: emit '<'); + self.reconsume = true; + go!(self: to Data); + } }, } }, @@ -764,13 +838,27 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ end-tag-open-state states::EndTagOpen => loop { match get_char!(self, input) { - '>' => go!(self: error; to Data), + '>' => { + go!(self: error); + go!(self: to Data); + } '\0' => { - go!(self: error; clear_comment; push_comment '\u{fffd}'; to BogusComment) + go!(self: error); + go!(self: clear_comment); + go!(self: push_comment '\u{fffd}'); + go!(self: to BogusComment); } c => match lower_ascii_letter(c) { - Some(cl) => go!(self: create_tag EndTag cl; to TagName), - None => go!(self: error; clear_comment; push_comment c; to BogusComment), + Some(cl) => { + go!(self: create_tag EndTag cl); + go!(self: to TagName); + } + None => { + go!(self: error); + go!(self: clear_comment); + go!(self: push_comment c); + go!(self: to BogusComment); + } }, } }, @@ -778,23 +866,45 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ tag-name-state states::TagName => loop { match get_char!(self, input) { - '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName), - '/' => go!(self: to SelfClosingStartTag), - '>' => go!(self: emit_tag Data), - '\0' => go!(self: error; push_tag '\u{fffd}'), - c => go!(self: push_tag (c.to_ascii_lowercase())), + '\t' | '\n' | '\x0C' | ' ' => { + go!(self: to BeforeAttributeName); + } + '/' => { + go!(self: to SelfClosingStartTag); + } + '>' => { + go!(self: emit_tag Data); + } + '\0' => { + go!(self: error); + go!(self: push_tag '\u{fffd}'); + } + c => { + go!(self: push_tag (c.to_ascii_lowercase())); + } } }, //§ 
script-data-escaped-less-than-sign-state states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop { match get_char!(self, input) { - '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped), + '/' => { + go!(self: clear_temp); + go!(self: to RawEndTagOpen ScriptDataEscaped Escaped); + } c => match lower_ascii_letter(c) { Some(cl) => { - go!(self: clear_temp; push_temp cl; emit '<'; emit c; to ScriptDataEscapeStart DoubleEscaped) + go!(self: clear_temp); + go!(self: push_temp cl); + go!(self: emit '<'); + go!(self: emit c); + go!(self: to ScriptDataEscapeStart DoubleEscaped); + } + None => { + go!(self: emit '<'); + self.reconsume = true; + go!(self: to RawData ScriptDataEscaped Escaped); } - None => go!(self: emit '<'; reconsume RawData ScriptDataEscaped Escaped), }, } }, @@ -802,8 +912,15 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ script-data-double-escaped-less-than-sign-state states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop { match get_char!(self, input) { - '/' => go!(self: clear_temp; emit '/'; to ScriptDataDoubleEscapeEnd), - _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped), + '/' => { + go!(self: clear_temp); + go!(self: emit '/'); + go!(self: to ScriptDataDoubleEscapeEnd); + } + _ => { + self.reconsume = true; + go!(self: to RawData ScriptDataEscaped DoubleEscaped); + } } }, @@ -811,11 +928,20 @@ impl<Sink: TokenSink> Tokenizer<Sink> { // otherwise states::RawLessThanSign(kind) => loop { match get_char!(self, input) { - '/' => go!(self: clear_temp; to RawEndTagOpen kind), + '/' => { + go!(self: clear_temp); + go!(self: to RawEndTagOpen kind); + } '!' 
if kind == ScriptData => { - go!(self: emit '<'; emit '!'; to ScriptDataEscapeStart Escaped) + go!(self: emit '<'); + go!(self: emit '!'); + go!(self: to ScriptDataEscapeStart Escaped); + } + _ => { + go!(self: emit '<'); + self.reconsume = true; + go!(self: to RawData kind); } - _ => go!(self: emit '<'; reconsume RawData kind), } }, @@ -823,8 +949,17 @@ impl<Sink: TokenSink> Tokenizer<Sink> { states::RawEndTagOpen(kind) => loop { let c = get_char!(self, input); match lower_ascii_letter(c) { - Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind), - None => go!(self: emit '<'; emit '/'; reconsume RawData kind), + Some(cl) => { + go!(self: create_tag EndTag cl); + go!(self: push_temp c); + go!(self: to RawEndTagName kind); + } + None => { + go!(self: emit '<'); + go!(self: emit '/'); + self.reconsume = true; + go!(self: to RawData kind); + } } }, @@ -833,17 +968,31 @@ impl<Sink: TokenSink> Tokenizer<Sink> { let c = get_char!(self, input); if self.have_appropriate_end_tag() { match c { - '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName), - '/' => go!(self: to SelfClosingStartTag), - '>' => go!(self: emit_tag Data), + '\t' | '\n' | '\x0C' | ' ' => { + go!(self: to BeforeAttributeName); + } + '/' => { + go!(self: to SelfClosingStartTag); + } + '>' => { + go!(self: emit_tag Data); + } _ => (), } } match lower_ascii_letter(c) { - Some(cl) => go!(self: push_tag cl; push_temp c), + Some(cl) => { + go!(self: push_tag cl); + go!(self: push_temp c); + } None => { - go!(self: discard_tag; emit '<'; emit '/'; emit_temp; reconsume RawData kind) + go!(self: discard_tag); + go!(self: emit '<'); + go!(self: emit '/'); + go!(self: emit_temp); + self.reconsume = true; + go!(self: to RawData kind); } } }, @@ -858,11 +1007,20 @@ impl<Sink: TokenSink> Tokenizer<Sink> { } else { Escaped }; - go!(self: emit c; to RawData ScriptDataEscaped esc); + { + go!(self: emit c); + go!(self: to RawData ScriptDataEscaped esc); + }; } _ => match 
lower_ascii_letter(c) { - Some(cl) => go!(self: push_temp cl; emit c), - None => go!(self: reconsume RawData ScriptDataEscaped Escaped), + Some(cl) => { + go!(self: push_temp cl); + go!(self: emit c); + } + None => { + self.reconsume = true; + go!(self: to RawData ScriptDataEscaped Escaped); + } }, } }, @@ -870,47 +1028,89 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ script-data-escape-start-state states::ScriptDataEscapeStart(Escaped) => loop { match get_char!(self, input) { - '-' => go!(self: emit '-'; to ScriptDataEscapeStartDash), - _ => go!(self: reconsume RawData ScriptData), + '-' => { + go!(self: emit '-'); + go!(self: to ScriptDataEscapeStartDash); + } + _ => { + self.reconsume = true; + go!(self: to RawData ScriptData); + } } }, //§ script-data-escape-start-dash-state states::ScriptDataEscapeStartDash => loop { match get_char!(self, input) { - '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash Escaped), - _ => go!(self: reconsume RawData ScriptData), + '-' => { + go!(self: emit '-'); + go!(self: to ScriptDataEscapedDashDash Escaped); + } + _ => { + self.reconsume = true; + go!(self: to RawData ScriptData); + } } }, //§ script-data-escaped-dash-state script-data-double-escaped-dash-state states::ScriptDataEscapedDash(kind) => loop { match get_char!(self, input) { - '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash kind), + '-' => { + go!(self: emit '-'); + go!(self: to ScriptDataEscapedDashDash kind); + } '<' => { if kind == DoubleEscaped { - go!(self: emit '<'); + { + go!(self: emit '<'); + }; } - go!(self: to RawLessThanSign ScriptDataEscaped kind); + { + go!(self: to RawLessThanSign ScriptDataEscaped kind); + }; + } + '\0' => { + go!(self: error); + go!(self: emit '\u{fffd}'); + go!(self: to RawData ScriptDataEscaped kind); + } + c => { + go!(self: emit c); + go!(self: to RawData ScriptDataEscaped kind); } - '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind), - c => go!(self: emit c; to RawData ScriptDataEscaped 
kind), } }, //§ script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state states::ScriptDataEscapedDashDash(kind) => loop { match get_char!(self, input) { - '-' => go!(self: emit '-'), + '-' => { + go!(self: emit '-'); + } '<' => { if kind == DoubleEscaped { - go!(self: emit '<'); + { + go!(self: emit '<'); + }; } - go!(self: to RawLessThanSign ScriptDataEscaped kind); + { + go!(self: to RawLessThanSign ScriptDataEscaped kind); + }; + } + '>' => { + go!(self: emit '>'); + go!(self: to RawData ScriptData); + } + '\0' => { + go!(self: error); + go!(self: emit '\u{fffd}'); + go!(self: to RawData ScriptDataEscaped kind); + } + c => { + go!(self: emit c); + go!(self: to RawData ScriptDataEscaped kind); } - '>' => go!(self: emit '>'; to RawData ScriptData), - '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind), - c => go!(self: emit c; to RawData ScriptDataEscaped kind), } }, @@ -924,11 +1124,20 @@ impl<Sink: TokenSink> Tokenizer<Sink> { } else { DoubleEscaped }; - go!(self: emit c; to RawData ScriptDataEscaped esc); + { + go!(self: emit c); + go!(self: to RawData ScriptDataEscaped esc); + }; } _ => match lower_ascii_letter(c) { - Some(cl) => go!(self: push_temp cl; emit c), - None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped), + Some(cl) => { + go!(self: push_temp cl); + go!(self: emit c); + } + None => { + self.reconsume = true; + go!(self: to RawData ScriptDataEscaped DoubleEscaped); + } }, } }, @@ -937,15 +1146,29 @@ impl<Sink: TokenSink> Tokenizer<Sink> { states::BeforeAttributeName => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => (), - '/' => go!(self: to SelfClosingStartTag), - '>' => go!(self: emit_tag Data), - '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName), + '/' => { + go!(self: to SelfClosingStartTag); + } + '>' => { + go!(self: emit_tag Data); + } + '\0' => { + go!(self: error); + go!(self: create_attr '\u{fffd}'); + go!(self: to AttributeName); + } c => 
match lower_ascii_letter(c) { - Some(cl) => go!(self: create_attr cl; to AttributeName), + Some(cl) => { + go!(self: create_attr cl); + go!(self: to AttributeName); + } None => { go_match!(self: c, '"' , '\'' , '<' , '=' => error); - go!(self: create_attr c; to AttributeName); + { + go!(self: create_attr c); + go!(self: to AttributeName); + }; } }, } @@ -954,17 +1177,32 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ attribute-name-state states::AttributeName => loop { match get_char!(self, input) { - '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName), - '/' => go!(self: to SelfClosingStartTag), - '=' => go!(self: to BeforeAttributeValue), - '>' => go!(self: emit_tag Data), - '\0' => go!(self: error; push_name '\u{fffd}'), + '\t' | '\n' | '\x0C' | ' ' => { + go!(self: to AfterAttributeName); + } + '/' => { + go!(self: to SelfClosingStartTag); + } + '=' => { + go!(self: to BeforeAttributeValue); + } + '>' => { + go!(self: emit_tag Data); + } + '\0' => { + go!(self: error); + go!(self: push_name '\u{fffd}'); + } c => match lower_ascii_letter(c) { - Some(cl) => go!(self: push_name cl), + Some(cl) => { + go!(self: push_name cl); + } None => { go_match!(self: c, '"' , '\'' , '<' => error); - go!(self: push_name c); + { + go!(self: push_name c); + }; } }, } @@ -974,16 +1212,32 @@ impl<Sink: TokenSink> Tokenizer<Sink> { states::AfterAttributeName => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => (), - '/' => go!(self: to SelfClosingStartTag), - '=' => go!(self: to BeforeAttributeValue), - '>' => go!(self: emit_tag Data), - '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName), + '/' => { + go!(self: to SelfClosingStartTag); + } + '=' => { + go!(self: to BeforeAttributeValue); + } + '>' => { + go!(self: emit_tag Data); + } + '\0' => { + go!(self: error); + go!(self: create_attr '\u{fffd}'); + go!(self: to AttributeName); + } c => match lower_ascii_letter(c) { - Some(cl) => go!(self: create_attr cl; to AttributeName), + Some(cl) 
=> { + go!(self: create_attr cl); + go!(self: to AttributeName); + } None => { go_match!(self: c, '"' , '\'' , '<' => error); - go!(self: create_attr c; to AttributeName); + { + go!(self: create_attr c); + go!(self: to AttributeName); + }; } }, } @@ -994,36 +1248,75 @@ impl<Sink: TokenSink> Tokenizer<Sink> { // hopefully in the same zero-copy buffer. states::BeforeAttributeValue => loop { match peek!(self, input) { - '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input), - '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted), - '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted), + '\t' | '\n' | '\r' | '\x0C' | ' ' => { + go!(self: discard_char input); + } + '"' => { + go!(self: discard_char input); + go!(self: to AttributeValue DoubleQuoted); + } + '\'' => { + go!(self: discard_char input); + go!(self: to AttributeValue SingleQuoted); + } '\0' => { - go!(self: discard_char input; error; push_value '\u{fffd}'; to AttributeValue Unquoted) + go!(self: discard_char input); + go!(self: error); + go!(self: push_value '\u{fffd}'); + go!(self: to AttributeValue Unquoted); + } + '>' => { + go!(self: discard_char input); + go!(self: error); + go!(self: emit_tag Data); + } + _ => { + go!(self: to AttributeValue Unquoted); } - '>' => go!(self: discard_char input; error; emit_tag Data), - _ => go!(self: to AttributeValue Unquoted), } }, //§ attribute-value-(double-quoted)-state states::AttributeValue(DoubleQuoted) => loop { match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) { - FromSet('"') => go!(self: to AfterAttributeValueQuoted), - FromSet('&') => go!(self: consume_char_ref '"'), - FromSet('\0') => go!(self: error; push_value '\u{fffd}'), - FromSet(c) => go!(self: push_value c), - NotFromSet(ref b) => go!(self: append_value b), + FromSet('"') => { + go!(self: to AfterAttributeValueQuoted); + } + FromSet('&') => { + go!(self: consume_char_ref '"'); + } + FromSet('\0') => { + go!(self: error); + 
go!(self: push_value '\u{fffd}'); + } + FromSet(c) => { + go!(self: push_value c); + } + NotFromSet(ref b) => { + go!(self: append_value b); + } } }, //§ attribute-value-(single-quoted)-state states::AttributeValue(SingleQuoted) => loop { match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) { - FromSet('\'') => go!(self: to AfterAttributeValueQuoted), - FromSet('&') => go!(self: consume_char_ref '\''), - FromSet('\0') => go!(self: error; push_value '\u{fffd}'), - FromSet(c) => go!(self: push_value c), - NotFromSet(ref b) => go!(self: append_value b), + FromSet('\'') => { + go!(self: to AfterAttributeValueQuoted); + } + FromSet('&') => { + go!(self: consume_char_ref '\''); + } + FromSet('\0') => { + go!(self: error); + go!(self: push_value '\u{fffd}'); + } + FromSet(c) => { + go!(self: push_value c); + } + NotFromSet(ref b) => { + go!(self: append_value b); + } } }, @@ -1035,27 +1328,48 @@ impl<Sink: TokenSink> Tokenizer<Sink> { small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0') ) { FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => { - go!(self: to BeforeAttributeName) + go!(self: to BeforeAttributeName); + } + FromSet('&') => { + go!(self: consume_char_ref '>'); + } + FromSet('>') => { + go!(self: emit_tag Data); + } + FromSet('\0') => { + go!(self: error); + go!(self: push_value '\u{fffd}'); } - FromSet('&') => go!(self: consume_char_ref '>'), - FromSet('>') => go!(self: emit_tag Data), - FromSet('\0') => go!(self: error; push_value '\u{fffd}'), FromSet(c) => { go_match!(self: c, '"' , '\'' , '<' , '=' , '`' => error); - go!(self: push_value c); + { + go!(self: push_value c); + }; + } + NotFromSet(ref b) => { + go!(self: append_value b); } - NotFromSet(ref b) => go!(self: append_value b), } }, //§ after-attribute-value-(quoted)-state states::AfterAttributeValueQuoted => loop { match get_char!(self, input) { - '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName), - '/' => go!(self: to SelfClosingStartTag), - 
'>' => go!(self: emit_tag Data), - _ => go!(self: error; reconsume BeforeAttributeName), + '\t' | '\n' | '\x0C' | ' ' => { + go!(self: to BeforeAttributeName); + } + '/' => { + go!(self: to SelfClosingStartTag); + } + '>' => { + go!(self: emit_tag Data); + } + _ => { + go!(self: error); + self.reconsume = true; + go!(self: to BeforeAttributeName); + } } }, @@ -1064,76 +1378,164 @@ impl<Sink: TokenSink> Tokenizer<Sink> { match get_char!(self, input) { '>' => { self.current_tag_self_closing = true; - go!(self: emit_tag Data); + { + go!(self: emit_tag Data); + }; + } + _ => { + go!(self: error); + self.reconsume = true; + go!(self: to BeforeAttributeName); } - _ => go!(self: error; reconsume BeforeAttributeName), } }, //§ comment-start-state states::CommentStart => loop { match get_char!(self, input) { - '-' => go!(self: to CommentStartDash), - '\0' => go!(self: error; push_comment '\u{fffd}'; to Comment), - '>' => go!(self: error; emit_comment; to Data), - c => go!(self: push_comment c; to Comment), + '-' => { + go!(self: to CommentStartDash); + } + '\0' => { + go!(self: error); + go!(self: push_comment '\u{fffd}'); + go!(self: to Comment); + } + '>' => { + go!(self: error); + go!(self: emit_comment); + go!(self: to Data); + } + c => { + go!(self: push_comment c); + go!(self: to Comment); + } } }, //§ comment-start-dash-state states::CommentStartDash => loop { match get_char!(self, input) { - '-' => go!(self: to CommentEnd), - '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment), - '>' => go!(self: error; emit_comment; to Data), - c => go!(self: push_comment '-'; push_comment c; to Comment), + '-' => { + go!(self: to CommentEnd); + } + '\0' => { + go!(self: error); + go!(self: append_comment "-\u{fffd}"); + go!(self: to Comment); + } + '>' => { + go!(self: error); + go!(self: emit_comment); + go!(self: to Data); + } + c => { + go!(self: push_comment '-'); + go!(self: push_comment c); + go!(self: to Comment); + } } }, //§ comment-state states::Comment => 
loop { match get_char!(self, input) { - '-' => go!(self: to CommentEndDash), - '\0' => go!(self: error; push_comment '\u{fffd}'), - c => go!(self: push_comment c), + '-' => { + go!(self: to CommentEndDash); + } + '\0' => { + go!(self: error); + go!(self: push_comment '\u{fffd}'); + } + c => { + go!(self: push_comment c); + } } }, //§ comment-end-dash-state states::CommentEndDash => loop { match get_char!(self, input) { - '-' => go!(self: to CommentEnd), - '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment), - c => go!(self: push_comment '-'; push_comment c; to Comment), + '-' => { + go!(self: to CommentEnd); + } + '\0' => { + go!(self: error); + go!(self: append_comment "-\u{fffd}"); + go!(self: to Comment); + } + c => { + go!(self: push_comment '-'); + go!(self: push_comment c); + go!(self: to Comment); + } } }, //§ comment-end-state states::CommentEnd => loop { match get_char!(self, input) { - '>' => go!(self: emit_comment; to Data), - '\0' => go!(self: error; append_comment "--\u{fffd}"; to Comment), - '!' => go!(self: error; to CommentEndBang), - '-' => go!(self: error; push_comment '-'), - c => go!(self: error; append_comment "--"; push_comment c; to Comment), + '>' => { + go!(self: emit_comment); + go!(self: to Data); + } + '\0' => { + go!(self: error); + go!(self: append_comment "--\u{fffd}"); + go!(self: to Comment); + } + '!' 
=> { + go!(self: error); + go!(self: to CommentEndBang); + } + '-' => { + go!(self: error); + go!(self: push_comment '-'); + } + c => { + go!(self: error); + go!(self: append_comment "--"); + go!(self: push_comment c); + go!(self: to Comment); + } } }, //§ comment-end-bang-state states::CommentEndBang => loop { match get_char!(self, input) { - '-' => go!(self: append_comment "--!"; to CommentEndDash), - '>' => go!(self: emit_comment; to Data), - '\0' => go!(self: error; append_comment "--!\u{fffd}"; to Comment), - c => go!(self: append_comment "--!"; push_comment c; to Comment), + '-' => { + go!(self: append_comment "--!"); + go!(self: to CommentEndDash); + } + '>' => { + go!(self: emit_comment); + go!(self: to Data); + } + '\0' => { + go!(self: error); + go!(self: append_comment "--!\u{fffd}"); + go!(self: to Comment); + } + c => { + go!(self: append_comment "--!"); + go!(self: push_comment c); + go!(self: to Comment); + } } }, //§ doctype-state states::Doctype => loop { match get_char!(self, input) { - '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName), - _ => go!(self: error; reconsume BeforeDoctypeName), + '\t' | '\n' | '\x0C' | ' ' => { + go!(self: to BeforeDoctypeName); + } + _ => { + go!(self: error); + self.reconsume = true; + go!(self: to BeforeDoctypeName); + } } }, @@ -1142,11 +1544,22 @@ impl<Sink: TokenSink> Tokenizer<Sink> { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => (), '\0' => { - go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName) + go!(self: error); + go!(self: create_doctype); + go!(self: push_doctype_name '\u{fffd}'); + go!(self: to DoctypeName); + } + '>' => { + go!(self: error); + go!(self: create_doctype); + go!(self: force_quirks); + go!(self: emit_doctype); + go!(self: to Data); } - '>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data), c => { - go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase()); to DoctypeName) + go!(self: create_doctype); + 
go!(self: push_doctype_name (c.to_ascii_lowercase())); + go!(self: to DoctypeName); } } }, @@ -1154,24 +1567,46 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ doctype-name-state states::DoctypeName => loop { match get_char!(self, input) { - '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName), - '>' => go!(self: emit_doctype; to Data), - '\0' => go!(self: error; push_doctype_name '\u{fffd}'), - c => go!(self: push_doctype_name (c.to_ascii_lowercase())), + '\t' | '\n' | '\x0C' | ' ' => { + go!(self: clear_temp); + go!(self: to AfterDoctypeName); + } + '>' => { + go!(self: emit_doctype); + go!(self: to Data); + } + '\0' => { + go!(self: error); + go!(self: push_doctype_name '\u{fffd}'); + } + c => { + go!(self: push_doctype_name (c.to_ascii_lowercase())); + } } }, //§ after-doctype-name-state states::AfterDoctypeName => loop { if eat!(self, input, "public") { - go!(self: to AfterDoctypeKeyword Public); + { + go!(self: to AfterDoctypeKeyword Public); + }; } else if eat!(self, input, "system") { - go!(self: to AfterDoctypeKeyword System); + { + go!(self: to AfterDoctypeKeyword System); + }; } else { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => (), - '>' => go!(self: emit_doctype; to Data), - _ => go!(self: error; force_quirks; to BogusDoctype), + '>' => { + go!(self: emit_doctype); + go!(self: to Data); + } + _ => { + go!(self: error); + go!(self: force_quirks); + go!(self: to BogusDoctype); + } } } }, @@ -1179,15 +1614,30 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ after-doctype-public-keyword-state after-doctype-system-keyword-state states::AfterDoctypeKeyword(kind) => loop { match get_char!(self, input) { - '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind), + '\t' | '\n' | '\x0C' | ' ' => { + go!(self: to BeforeDoctypeIdentifier kind); + } '"' => { - go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind) + go!(self: error); + go!(self: clear_doctype_id kind); + go!(self: to 
DoctypeIdentifierDoubleQuoted kind); } '\'' => { - go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind) + go!(self: error); + go!(self: clear_doctype_id kind); + go!(self: to DoctypeIdentifierSingleQuoted kind); + } + '>' => { + go!(self: error); + go!(self: force_quirks); + go!(self: emit_doctype); + go!(self: to Data); + } + _ => { + go!(self: error); + go!(self: force_quirks); + go!(self: to BogusDoctype); } - '>' => go!(self: error; force_quirks; emit_doctype; to Data), - _ => go!(self: error; force_quirks; to BogusDoctype), } }, @@ -1195,30 +1645,69 @@ impl<Sink: TokenSink> Tokenizer<Sink> { states::BeforeDoctypeIdentifier(kind) => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => (), - '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind), - '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind), - '>' => go!(self: error; force_quirks; emit_doctype; to Data), - _ => go!(self: error; force_quirks; to BogusDoctype), + '"' => { + go!(self: clear_doctype_id kind); + go!(self: to DoctypeIdentifierDoubleQuoted kind); + } + '\'' => { + go!(self: clear_doctype_id kind); + go!(self: to DoctypeIdentifierSingleQuoted kind); + } + '>' => { + go!(self: error); + go!(self: force_quirks); + go!(self: emit_doctype); + go!(self: to Data); + } + _ => { + go!(self: error); + go!(self: force_quirks); + go!(self: to BogusDoctype); + } } }, //§ doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state states::DoctypeIdentifierDoubleQuoted(kind) => loop { match get_char!(self, input) { - '"' => go!(self: to AfterDoctypeIdentifier kind), - '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'), - '>' => go!(self: error; force_quirks; emit_doctype; to Data), - c => go!(self: push_doctype_id kind c), + '"' => { + go!(self: to AfterDoctypeIdentifier kind); + } + '\0' => { + go!(self: error); + go!(self: push_doctype_id kind '\u{fffd}'); + } + '>' => { + 
go!(self: error); + go!(self: force_quirks); + go!(self: emit_doctype); + go!(self: to Data); + } + c => { + go!(self: push_doctype_id kind c); + } } }, //§ doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state states::DoctypeIdentifierSingleQuoted(kind) => loop { match get_char!(self, input) { - '\'' => go!(self: to AfterDoctypeIdentifier kind), - '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'), - '>' => go!(self: error; force_quirks; emit_doctype; to Data), - c => go!(self: push_doctype_id kind c), + '\'' => { + go!(self: to AfterDoctypeIdentifier kind); + } + '\0' => { + go!(self: error); + go!(self: push_doctype_id kind '\u{fffd}'); + } + '>' => { + go!(self: error); + go!(self: force_quirks); + go!(self: emit_doctype); + go!(self: to Data); + } + c => { + go!(self: push_doctype_id kind c); + } } }, @@ -1226,16 +1715,27 @@ impl<Sink: TokenSink> Tokenizer<Sink> { states::AfterDoctypeIdentifier(Public) => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => { - go!(self: to BetweenDoctypePublicAndSystemIdentifiers) + go!(self: to BetweenDoctypePublicAndSystemIdentifiers); + } + '>' => { + go!(self: emit_doctype); + go!(self: to Data); } - '>' => go!(self: emit_doctype; to Data), '"' => { - go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System) + go!(self: error); + go!(self: clear_doctype_id System); + go!(self: to DoctypeIdentifierDoubleQuoted System); } '\'' => { - go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System) + go!(self: error); + go!(self: clear_doctype_id System); + go!(self: to DoctypeIdentifierSingleQuoted System); + } + _ => { + go!(self: error); + go!(self: force_quirks); + go!(self: to BogusDoctype); } - _ => go!(self: error; force_quirks; to BogusDoctype), } }, @@ -1243,8 +1743,14 @@ impl<Sink: TokenSink> Tokenizer<Sink> { states::AfterDoctypeIdentifier(System) => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' 
| ' ' => (), - '>' => go!(self: emit_doctype; to Data), - _ => go!(self: error; to BogusDoctype), + '>' => { + go!(self: emit_doctype); + go!(self: to Data); + } + _ => { + go!(self: error); + go!(self: to BogusDoctype); + } } }, @@ -1252,21 +1758,33 @@ impl<Sink: TokenSink> Tokenizer<Sink> { states::BetweenDoctypePublicAndSystemIdentifiers => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => (), - '>' => go!(self: emit_doctype; to Data), + '>' => { + go!(self: emit_doctype); + go!(self: to Data); + } '"' => { - go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System) + go!(self: clear_doctype_id System); + go!(self: to DoctypeIdentifierDoubleQuoted System); } '\'' => { - go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System) + go!(self: clear_doctype_id System); + go!(self: to DoctypeIdentifierSingleQuoted System); + } + _ => { + go!(self: error); + go!(self: force_quirks); + go!(self: to BogusDoctype); } - _ => go!(self: error; force_quirks; to BogusDoctype), } }, //§ bogus-doctype-state states::BogusDoctype => loop { match get_char!(self, input) { - '>' => go!(self: emit_doctype; to Data), + '>' => { + go!(self: emit_doctype); + go!(self: to Data); + } _ => (), } }, @@ -1274,52 +1792,93 @@ impl<Sink: TokenSink> Tokenizer<Sink> { //§ bogus-comment-state states::BogusComment => loop { match get_char!(self, input) { - '>' => go!(self: emit_comment; to Data), - '\0' => go!(self: push_comment '\u{fffd}'), - c => go!(self: push_comment c), + '>' => { + go!(self: emit_comment); + go!(self: to Data); + } + '\0' => { + go!(self: push_comment '\u{fffd}'); + } + c => { + go!(self: push_comment c); + } } }, //§ markup-declaration-open-state states::MarkupDeclarationOpen => loop { if eat_exact!(self, input, "--") { - go!(self: clear_comment; to CommentStart); + { + go!(self: clear_comment); + go!(self: to CommentStart); + }; } else if eat!(self, input, "doctype") { - go!(self: to Doctype); + { + go!(self: to Doctype); 
+ }; } else { if self .sink .adjusted_current_node_present_but_not_in_html_namespace() { if eat_exact!(self, input, "[CDATA[") { - go!(self: clear_temp; to CdataSection); + { + go!(self: clear_temp); + go!(self: to CdataSection); + }; } } - go!(self: error; to BogusComment); + { + go!(self: error); + go!(self: to BogusComment); + }; } }, //§ cdata-section-state states::CdataSection => loop { match get_char!(self, input) { - ']' => go!(self: to CdataSectionBracket), - '\0' => go!(self: emit_temp; emit '\0'), - c => go!(self: push_temp c), + ']' => { + go!(self: to CdataSectionBracket); + } + '\0' => { + go!(self: emit_temp); + go!(self: emit '\0'); + } + c => { + go!(self: push_temp c); + } } }, //§ cdata-section-bracket states::CdataSectionBracket => match get_char!(self, input) { - ']' => go!(self: to CdataSectionEnd), - _ => go!(self: push_temp ']'; reconsume CdataSection), + ']' => { + go!(self: to CdataSectionEnd); + } + _ => { + go!(self: push_temp ']'); + self.reconsume = true; + go!(self: to CdataSection); + } }, //§ cdata-section-end states::CdataSectionEnd => loop { match get_char!(self, input) { - ']' => go!(self: push_temp ']'), - '>' => go!(self: emit_temp; to Data), - _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection), + ']' => { + go!(self: push_temp ']'); + } + '>' => { + go!(self: emit_temp); + go!(self: to Data); + } + _ => { + go!(self: push_temp ']'); + go!(self: push_temp ']'); + self.reconsume = true; + go!(self: to CdataSection); + } } }, //§ END @@ -1360,9 +1919,13 @@ impl<Sink: TokenSink> Tokenizer<Sink> { for i in 0..num_chars { let c = chars[i as usize]; match self.state { - states::Data | states::RawData(states::Rcdata) => go!(self: emit c), + states::Data | states::RawData(states::Rcdata) => { + go!(self: emit c); + } - states::AttributeValue(_) => go!(self: push_value c), + states::AttributeValue(_) => { + go!(self: push_value c); + } _ => panic!( "state {:?} should not be reachable in process_char_ref", @@ -1446,30 
+2009,56 @@ impl<Sink: TokenSink> Tokenizer<Sink> { | states::AfterAttributeValueQuoted | states::SelfClosingStartTag | states::ScriptDataEscapedDash(_) - | states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data), + | states::ScriptDataEscapedDashDash(_) => { + go!(self: error_eof); + go!(self: to Data); + } - states::TagOpen => go!(self: error_eof; emit '<'; to Data), + states::TagOpen => { + go!(self: error_eof); + go!(self: emit '<'); + go!(self: to Data); + } - states::EndTagOpen => go!(self: error_eof; emit '<'; emit '/'; to Data), + states::EndTagOpen => { + go!(self: error_eof); + go!(self: emit '<'); + go!(self: emit '/'); + go!(self: to Data); + } states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => { - go!(self: to RawData ScriptDataEscaped DoubleEscaped) + go!(self: to RawData ScriptDataEscaped DoubleEscaped); } - states::RawLessThanSign(kind) => go!(self: emit '<'; to RawData kind), + states::RawLessThanSign(kind) => { + go!(self: emit '<'); + go!(self: to RawData kind); + } - states::RawEndTagOpen(kind) => go!(self: emit '<'; emit '/'; to RawData kind), + states::RawEndTagOpen(kind) => { + go!(self: emit '<'); + go!(self: emit '/'); + go!(self: to RawData kind); + } states::RawEndTagName(kind) => { - go!(self: emit '<'; emit '/'; emit_temp; to RawData kind) + go!(self: emit '<'); + go!(self: emit '/'); + go!(self: emit_temp); + go!(self: to RawData kind); } - states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind), + states::ScriptDataEscapeStart(kind) => { + go!(self: to RawData ScriptDataEscaped kind); + } - states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData), + states::ScriptDataEscapeStartDash => { + go!(self: to RawData ScriptData); + } states::ScriptDataDoubleEscapeEnd => { - go!(self: to RawData ScriptDataEscaped DoubleEscaped) + go!(self: to RawData ScriptDataEscaped DoubleEscaped); } states::CommentStart @@ -1477,10 +2066,18 @@ impl<Sink: TokenSink> Tokenizer<Sink> { | 
states::Comment | states::CommentEndDash | states::CommentEnd - | states::CommentEndBang => go!(self: error_eof; emit_comment; to Data), + | states::CommentEndBang => { + go!(self: error_eof); + go!(self: emit_comment); + go!(self: to Data); + } states::Doctype | states::BeforeDoctypeName => { - go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data) + go!(self: error_eof); + go!(self: create_doctype); + go!(self: force_quirks); + go!(self: emit_doctype); + go!(self: to Data); } states::DoctypeName @@ -1491,20 +2088,43 @@ impl<Sink: TokenSink> Tokenizer<Sink> { | states::DoctypeIdentifierSingleQuoted(_) | states::AfterDoctypeIdentifier(_) | states::BetweenDoctypePublicAndSystemIdentifiers => { - go!(self: error_eof; force_quirks; emit_doctype; to Data) + go!(self: error_eof); + go!(self: force_quirks); + go!(self: emit_doctype); + go!(self: to Data); } - states::BogusDoctype => go!(self: emit_doctype; to Data), + states::BogusDoctype => { + go!(self: emit_doctype); + go!(self: to Data); + } - states::BogusComment => go!(self: emit_comment; to Data), + states::BogusComment => { + go!(self: emit_comment); + go!(self: to Data); + } - states::MarkupDeclarationOpen => go!(self: error; to BogusComment), + states::MarkupDeclarationOpen => { + go!(self: error); + go!(self: to BogusComment); + } - states::CdataSection => go!(self: emit_temp; error_eof; to Data), + states::CdataSection => { + go!(self: emit_temp); + go!(self: error_eof); + go!(self: to Data); + } - states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection), + states::CdataSectionBracket => { + go!(self: push_temp ']'); + go!(self: to CdataSection); + } - states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection), + states::CdataSectionEnd => { + go!(self: push_temp ']'); + go!(self: push_temp ']'); + go!(self: to CdataSection); + } } } } |