summary refs log tree commit diff
path: root/src/tokenizer
diff options
context:
space:
mode:
author Martin Fischer <martin@push-f.com> 2023-09-09 22:15:54 +0200
committer Martin Fischer <martin@push-f.com> 2023-09-28 10:36:08 +0200
commit 394c52260e861e911e2d8706d4904136a920da87 (patch)
tree b7511677c6f6a0ca03526991a770ef0b6752e963 /src/tokenizer
parent 5aa3b82fbe62882da8007b0a4548b979c845aa97 (diff)
refactor: proxy emit_string calls through utils
This is done separately so that the next commit has a cleaner diff.
Diffstat (limited to 'src/tokenizer')
-rw-r--r-- src/tokenizer/machine.rs 124
-rw-r--r-- src/tokenizer/machine/utils.rs 18
2 files changed, 80 insertions, 62 deletions
diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs
index e9a3e68..8b09aa7 100644
--- a/src/tokenizer/machine.rs
+++ b/src/tokenizer/machine.rs
@@ -105,11 +105,11 @@ where
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emitter.emit_string("\0");
+ slf.emit_char('\0');
Ok(ControlToken::Continue)
}
Some(x) => {
- slf.emitter.emit_string(ctostr!(x));
+ slf.emit_char(x);
Ok(ControlToken::Continue)
}
None => Ok(ControlToken::Eof),
@@ -126,11 +126,11 @@ where
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emitter.emit_string("\u{fffd}");
+ slf.emit_char('\u{fffd}');
Ok(ControlToken::Continue)
}
Some(x) => {
- slf.emitter.emit_string(ctostr!(x));
+ slf.emit_char(x);
Ok(ControlToken::Continue)
}
None => Ok(ControlToken::Eof),
@@ -142,11 +142,11 @@ where
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emitter.emit_string("\u{fffd}");
+ slf.emit_char('\u{fffd}');
Ok(ControlToken::Continue)
}
Some(x) => {
- slf.emitter.emit_string(ctostr!(x));
+ slf.emit_char(x);
Ok(ControlToken::Continue)
}
None => Ok(ControlToken::Eof),
@@ -158,11 +158,11 @@ where
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emitter.emit_string("\u{fffd}");
+ slf.emit_char('\u{fffd}');
Ok(ControlToken::Continue)
}
Some(x) => {
- slf.emitter.emit_string(ctostr!(x));
+ slf.emit_char(x);
Ok(ControlToken::Continue)
}
None => Ok(ControlToken::Eof),
@@ -170,11 +170,11 @@ where
State::PlainText => match slf.read_char()? {
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emitter.emit_string("\u{fffd}");
+ slf.emit_char('\u{fffd}');
Ok(ControlToken::Continue)
}
Some(x) => {
- slf.emitter.emit_string(ctostr!(x));
+ slf.emit_char(x);
Ok(ControlToken::Continue)
}
None => Ok(ControlToken::Eof),
@@ -203,13 +203,13 @@ where
}
None => {
slf.emit_error(Error::EofBeforeTagName);
- slf.emitter.emit_string("<");
+ slf.emit_char('<');
Ok(ControlToken::Eof)
}
c @ Some(_) => {
slf.emit_error(Error::InvalidFirstCharacterOfTagName);
slf.state = State::Data;
- slf.emitter.emit_string("<");
+ slf.emit_char('<');
slf.unread_char(c);
Ok(ControlToken::Continue)
}
@@ -228,7 +228,7 @@ where
}
None => {
slf.emit_error(Error::EofBeforeTagName);
- slf.emitter.emit_string("</");
+ slf.emit_chars(b"</");
Ok(ControlToken::Eof)
}
Some(x) => {
@@ -277,7 +277,7 @@ where
Ok(ControlToken::Continue)
}
c => {
- slf.emitter.emit_string("<");
+ slf.emit_char('<');
slf.state = State::RcData;
slf.unread_char(c);
Ok(ControlToken::Continue)
@@ -291,7 +291,7 @@ where
Ok(ControlToken::Continue)
}
c => {
- slf.emitter.emit_string("</");
+ slf.emit_chars(b"</");
slf.state = State::RcData;
slf.unread_char(c);
Ok(ControlToken::Continue)
@@ -317,7 +317,7 @@ where
Ok(ControlToken::Continue)
}
c => {
- slf.emitter.emit_string("</");
+ slf.emit_chars(b"</");
slf.flush_buffer_characters();
slf.state = State::RcData;
@@ -332,7 +332,7 @@ where
Ok(ControlToken::Continue)
}
c => {
- slf.emitter.emit_string("<");
+ slf.emit_char('<');
slf.state = State::RawText;
slf.unread_char(c);
Ok(ControlToken::Continue)
@@ -346,7 +346,7 @@ where
Ok(ControlToken::Continue)
}
c => {
- slf.emitter.emit_string("</");
+ slf.emit_chars(b"</");
slf.state = State::RawText;
slf.unread_char(c);
Ok(ControlToken::Continue)
@@ -372,7 +372,7 @@ where
Ok(ControlToken::Continue)
}
c => {
- slf.emitter.emit_string("</");
+ slf.emit_chars(b"</");
slf.flush_buffer_characters();
slf.state = State::RawText;
@@ -388,11 +388,11 @@ where
}
Some('!') => {
slf.state = State::ScriptDataEscapeStart;
- slf.emitter.emit_string("<!");
+ slf.emit_chars(b"<!");
Ok(ControlToken::Continue)
}
c => {
- slf.emitter.emit_string("<");
+ slf.emit_char('<');
slf.state = State::ScriptData;
slf.unread_char(c);
Ok(ControlToken::Continue)
@@ -406,7 +406,7 @@ where
Ok(ControlToken::Continue)
}
c => {
- slf.emitter.emit_string("</");
+ slf.emit_chars(b"</");
slf.state = State::ScriptData;
slf.unread_char(c);
Ok(ControlToken::Continue)
@@ -432,7 +432,7 @@ where
Ok(ControlToken::Continue)
}
c => {
- slf.emitter.emit_string("</");
+ slf.emit_chars(b"</");
slf.flush_buffer_characters();
slf.state = State::Data;
slf.unread_char(c);
@@ -442,7 +442,7 @@ where
State::ScriptDataEscapeStart => match slf.read_char()? {
Some('-') => {
slf.state = State::ScriptDataEscapeStartDash;
- slf.emitter.emit_string("-");
+ slf.emit_char('-');
Ok(ControlToken::Continue)
}
c => {
@@ -454,7 +454,7 @@ where
State::ScriptDataEscapeStartDash => match slf.read_char()? {
Some('-') => {
slf.state = State::ScriptDataEscapedDashDash;
- slf.emitter.emit_string("-");
+ slf.emit_char('-');
Ok(ControlToken::Continue)
}
c => {
@@ -466,7 +466,7 @@ where
State::ScriptDataEscaped => match slf.read_char()? {
Some('-') => {
slf.state = State::ScriptDataEscapedDash;
- slf.emitter.emit_string("-");
+ slf.emit_char('-');
Ok(ControlToken::Continue)
}
Some('<') => {
@@ -475,7 +475,7 @@ where
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emitter.emit_string("\u{fffd}");
+ slf.emit_char('\u{fffd}');
Ok(ControlToken::Continue)
}
None => {
@@ -483,14 +483,14 @@ where
Ok(ControlToken::Eof)
}
Some(x) => {
- slf.emitter.emit_string(ctostr!(x));
+ slf.emit_char(x);
Ok(ControlToken::Continue)
}
},
State::ScriptDataEscapedDash => match slf.read_char()? {
Some('-') => {
slf.state = State::ScriptDataEscapedDashDash;
- slf.emitter.emit_string("-");
+ slf.emit_char('-');
Ok(ControlToken::Continue)
}
Some('<') => {
@@ -500,7 +500,7 @@ where
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
slf.state = State::ScriptDataEscaped;
- slf.emitter.emit_string("\u{fffd}");
+ slf.emit_char('\u{fffd}');
Ok(ControlToken::Continue)
}
None => {
@@ -509,13 +509,13 @@ where
}
Some(x) => {
slf.state = State::ScriptDataEscaped;
- slf.emitter.emit_string(ctostr!(x));
+ slf.emit_char(x);
Ok(ControlToken::Continue)
}
},
State::ScriptDataEscapedDashDash => match slf.read_char()? {
Some('-') => {
- slf.emitter.emit_string("-");
+ slf.emit_char('-');
Ok(ControlToken::Continue)
}
Some('<') => {
@@ -524,13 +524,13 @@ where
}
Some('>') => {
slf.state = State::ScriptData;
- slf.emitter.emit_string(">");
+ slf.emit_char('>');
Ok(ControlToken::Continue)
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
slf.state = State::ScriptDataEscaped;
- slf.emitter.emit_string("\u{fffd}");
+ slf.emit_char('\u{fffd}');
Ok(ControlToken::Continue)
}
None => {
@@ -539,7 +539,7 @@ where
}
Some(x) => {
slf.state = State::ScriptDataEscaped;
- slf.emitter.emit_string(ctostr!(x));
+ slf.emit_char(x);
Ok(ControlToken::Continue)
}
},
@@ -551,13 +551,13 @@ where
}
Some(x) if x.is_ascii_alphabetic() => {
slf.temporary_buffer.clear();
- slf.emitter.emit_string("<");
+ slf.emit_char('<');
slf.state = State::ScriptDataDoubleEscapeStart;
slf.unread_char(Some(x));
Ok(ControlToken::Continue)
}
c => {
- slf.emitter.emit_string("<");
+ slf.emit_char('<');
slf.state = State::ScriptDataEscaped;
slf.unread_char(c);
Ok(ControlToken::Continue)
@@ -571,7 +571,7 @@ where
Ok(ControlToken::Continue)
}
c => {
- slf.emitter.emit_string("</");
+ slf.emit_chars(b"</");
slf.unread_char(c);
slf.state = State::ScriptDataEscaped;
Ok(ControlToken::Continue)
@@ -597,7 +597,7 @@ where
Ok(ControlToken::Continue)
}
c => {
- slf.emitter.emit_string("</");
+ slf.emit_chars(b"</");
slf.flush_buffer_characters();
slf.state = State::ScriptDataEscaped;
slf.unread_char(c);
@@ -611,12 +611,12 @@ where
} else {
slf.state = State::ScriptDataEscaped;
}
- slf.emitter.emit_string(ctostr!(x));
+ slf.emit_char(x);
Ok(ControlToken::Continue)
}
Some(x) if x.is_ascii_alphabetic() => {
slf.temporary_buffer.push(x.to_ascii_lowercase());
- slf.emitter.emit_string(ctostr!(x));
+ slf.emit_char(x);
Ok(ControlToken::Continue)
}
c => {
@@ -628,17 +628,17 @@ where
State::ScriptDataDoubleEscaped => match slf.read_char()? {
Some('-') => {
slf.state = State::ScriptDataDoubleEscapedDash;
- slf.emitter.emit_string("-");
+ slf.emit_char('-');
Ok(ControlToken::Continue)
}
Some('<') => {
slf.state = State::ScriptDataDoubleEscapedLessThanSign;
- slf.emitter.emit_string("<");
+ slf.emit_char('<');
Ok(ControlToken::Continue)
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emitter.emit_string("\u{fffd}");
+ slf.emit_char('\u{fffd}');
Ok(ControlToken::Continue)
}
None => {
@@ -646,25 +646,25 @@ where
Ok(ControlToken::Eof)
}
Some(x) => {
- slf.emitter.emit_string(ctostr!(x));
+ slf.emit_char(x);
Ok(ControlToken::Continue)
}
},
State::ScriptDataDoubleEscapedDash => match slf.read_char()? {
Some('-') => {
slf.state = State::ScriptDataDoubleEscapedDashDash;
- slf.emitter.emit_string("-");
+ slf.emit_char('-');
Ok(ControlToken::Continue)
}
Some('<') => {
slf.state = State::ScriptDataDoubleEscapedLessThanSign;
- slf.emitter.emit_string("<");
+ slf.emit_char('<');
Ok(ControlToken::Continue)
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
slf.state = State::ScriptDataDoubleEscaped;
- slf.emitter.emit_string("\u{fffd}");
+ slf.emit_char('\u{fffd}');
Ok(ControlToken::Continue)
}
None => {
@@ -673,29 +673,29 @@ where
}
Some(x) => {
slf.state = State::ScriptDataDoubleEscaped;
- slf.emitter.emit_string(ctostr!(x));
+ slf.emit_char(x);
Ok(ControlToken::Continue)
}
},
State::ScriptDataDoubleEscapedDashDash => match slf.read_char()? {
Some('-') => {
- slf.emitter.emit_string("-");
+ slf.emit_char('-');
Ok(ControlToken::Continue)
}
Some('<') => {
- slf.emitter.emit_string("<");
+ slf.emit_char('<');
slf.state = State::ScriptDataDoubleEscapedLessThanSign;
Ok(ControlToken::Continue)
}
Some('>') => {
- slf.emitter.emit_string(">");
+ slf.emit_char('>');
slf.state = State::ScriptData;
Ok(ControlToken::Continue)
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
slf.state = State::ScriptDataDoubleEscaped;
- slf.emitter.emit_string("\u{fffd}");
+ slf.emit_char('\u{fffd}');
Ok(ControlToken::Continue)
}
None => {
@@ -704,7 +704,7 @@ where
}
Some(x) => {
slf.state = State::ScriptDataDoubleEscaped;
- slf.emitter.emit_string(ctostr!(x));
+ slf.emit_char(x);
Ok(ControlToken::Continue)
}
},
@@ -712,7 +712,7 @@ where
Some('/') => {
slf.temporary_buffer.clear();
slf.state = State::ScriptDataDoubleEscapeEnd;
- slf.emitter.emit_string("/");
+ slf.emit_char('/');
Ok(ControlToken::Continue)
}
c => {
@@ -729,12 +729,12 @@ where
slf.state = State::ScriptDataDoubleEscaped;
}
- slf.emitter.emit_string(ctostr!(x));
+ slf.emit_char(x);
Ok(ControlToken::Continue)
}
Some(x) if x.is_ascii_alphabetic() => {
slf.temporary_buffer.push(x.to_ascii_lowercase());
- slf.emitter.emit_string(ctostr!(x));
+ slf.emit_char(x);
Ok(ControlToken::Continue)
}
c => {
@@ -1741,7 +1741,7 @@ where
Ok(ControlToken::Eof)
}
Some(x) => {
- slf.emitter.emit_string(ctostr!(x));
+ slf.emit_char(x);
Ok(ControlToken::Continue)
}
},
@@ -1751,7 +1751,7 @@ where
Ok(ControlToken::Continue)
}
c => {
- slf.emitter.emit_string("]");
+ slf.emit_char(']');
slf.state = State::CdataSection;
slf.unread_char(c);
Ok(ControlToken::Continue)
@@ -1759,7 +1759,7 @@ where
},
State::CdataSectionEnd => match slf.read_char()? {
Some(']') => {
- slf.emitter.emit_string("]");
+ slf.emit_char(']');
Ok(ControlToken::Continue)
}
Some('>') => {
@@ -1767,7 +1767,7 @@ where
Ok(ControlToken::Continue)
}
c => {
- slf.emitter.emit_string("]]");
+ slf.emit_chars(b"]]");
slf.unread_char(c);
slf.state = State::CdataSection;
Ok(ControlToken::Continue)
@@ -1846,7 +1846,7 @@ where
if slf.is_consumed_as_part_of_an_attribute() {
slf.emitter.push_attribute_value(ctostr!(x));
} else {
- slf.emitter.emit_string(ctostr!(x));
+ slf.emit_char(x);
}
Ok(ControlToken::Continue)
diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs
index 6e45f4d..d96e50b 100644
--- a/src/tokenizer/machine/utils.rs
+++ b/src/tokenizer/machine/utils.rs
@@ -13,6 +13,24 @@ where
O: Offset,
E: Emitter<O>,
{
+ /// Emits the given character as a character token.
+ #[inline]
+ pub(super) fn emit_char(&mut self, c: char) {
+ self.emitter.emit_string(ctostr!(c));
+ }
+
+ /// Emits every byte of the given byte slice as a character token.
+ ///
+ /// (We're operating on bytes to enable compiler optimization,
+ /// since [`str::chars`] isn't `const`.)
+ #[inline]
+ pub(super) fn emit_chars(&mut self, s: &[u8]) {
+ self.emitter.emit_string(
+ // this unsafe block is only temporary and will be removed in the next commit
+ unsafe { std::str::from_utf8_unchecked(s) },
+ );
+ }
+
#[inline]
pub(crate) fn emit_error(&mut self, error: Error) {
let span = match error {