summary | refs | log | tree | commit | diff
path: root/src/tokenizer
diff options
context:
space:
mode:
Diffstat (limited to 'src/tokenizer')
-rw-r--r-- src/tokenizer/machine.rs 8
-rw-r--r-- src/tokenizer/machine/utils.rs 167
2 files changed, 172 insertions, 3 deletions
diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs
index 07d4c05..fc31a42 100644
--- a/src/tokenizer/machine.rs
+++ b/src/tokenizer/machine.rs
@@ -1,13 +1,15 @@
+pub(super) mod utils;
+
use crate::entities::try_read_character_reference;
use crate::offset::{Offset, Position};
use crate::token::AttrValueSyntax;
use crate::tokenizer::CdataAction;
-use crate::utils::{
+use crate::{reader::Reader, Emitter, Error, Tokenizer};
+use utils::{
ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat,
};
-use crate::{reader::Reader, Emitter, Error, Tokenizer};
-pub use crate::utils::State;
+pub use utils::State;
pub enum ControlToken {
Eof,
diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs
new file mode 100644
index 0000000..7d220cf
--- /dev/null
+++ b/src/tokenizer/machine/utils.rs
@@ -0,0 +1,167 @@
+macro_rules! surrogate_pat { // zero-argument macro; named as a pattern, expands to a single integer range
+ () => {
+ 0xd800..=0xdfff // inclusive range 0xD800..=0xDFFF, i.e. the Unicode surrogate code points
+ };
+}
+
+pub(crate) use surrogate_pat;
+
+macro_rules! control_pat { // zero-argument macro; expands to an alternation of integer values and ranges
+ () => (0x0d | 0x0000..=0x001f | 0x007f..=0x009f) // 0x00..=0x1f plus 0x7f..=0x9f; the leading 0x0d already lies inside the first range
+}
+
+pub(crate) use control_pat;
+
+macro_rules! ascii_digit_pat { // zero-argument macro; expands to a single `char` range
+ () => {
+ '0'..='9' // inclusive `char` range covering the ten ASCII decimal digits
+ };
+}
+
+pub(crate) use ascii_digit_pat;
+
+macro_rules! whitespace_pat { // zero-argument macro; expands to an alternation of four `char` literals
+ () => {
+ '\t' | '\u{0A}' | '\u{0C}' | ' ' // tab, line feed, form feed, space; carriage return (U+000D) is not listed
+ };
+}
+
+pub(crate) use whitespace_pat;
+
+macro_rules! noncharacter_pat { // zero-argument macro; expands to an alternation of integer code-point values
+ () => {
+ 0xfdd0 // start of the contiguous range 0xfdd0..=0xfdef (its end is on the next line)
+ ..=0xfdef
+ | 0xfffe // from here on: the pair 0x..fffe / 0x..ffff for each of the 17 planes 0x00 through 0x10
+ | 0xffff
+ | 0x1fffe
+ | 0x1ffff
+ | 0x2fffe
+ | 0x2ffff
+ | 0x3fffe
+ | 0x3ffff
+ | 0x4fffe
+ | 0x4ffff
+ | 0x5fffe
+ | 0x5ffff
+ | 0x6fffe
+ | 0x6ffff
+ | 0x7fffe
+ | 0x7ffff
+ | 0x8fffe
+ | 0x8ffff
+ | 0x9fffe
+ | 0x9ffff
+ | 0xafffe
+ | 0xaffff
+ | 0xbfffe
+ | 0xbffff
+ | 0xcfffe
+ | 0xcffff
+ | 0xdfffe
+ | 0xdffff
+ | 0xefffe
+ | 0xeffff
+ | 0xffffe
+ | 0xfffff
+ | 0x10fffe
+ | 0x10ffff // 0x10ffff is the last value listed
+ };
+}
+
+pub(crate) use noncharacter_pat;
+
+// When integration tests are running, this enum is public, which triggers warnings about missing docs.
+// It is not actually part of the public API, so the lint is silenced below instead of documenting every variant.
+#[allow(missing_docs)]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)] // every variant is a unit variant, so values are plain copyable, comparable tags
+pub enum State { // NOTE(review): variant names appear to mirror the HTML tokenizer state names — confirm against the state machine in machine.rs
+ Data,
+ RcData,
+ RawText,
+ ScriptData,
+ PlainText,
+ TagOpen,
+ EndTagOpen,
+ TagName,
+ RcDataLessThanSign,
+ RcDataEndTagOpen,
+ RcDataEndTagName,
+ RawTextLessThanSign,
+ RawTextEndTagOpen,
+ RawTextEndTagName,
+ ScriptDataLessThanSign,
+ ScriptDataEndTagOpen,
+ ScriptDataEndTagName,
+ ScriptDataEscapeStart,
+ ScriptDataEscapeStartDash,
+ ScriptDataEscaped,
+ ScriptDataEscapedDash,
+ ScriptDataEscapedDashDash,
+ ScriptDataEscapedLessThanSign,
+ ScriptDataEscapedEndTagOpen,
+ ScriptDataEscapedEndTagName,
+ ScriptDataDoubleEscapeStart,
+ ScriptDataDoubleEscaped,
+ ScriptDataDoubleEscapedDash,
+ ScriptDataDoubleEscapedDashDash,
+ ScriptDataDoubleEscapedLessThanSign,
+ ScriptDataDoubleEscapeEnd,
+ BeforeAttributeName,
+ AttributeName,
+ AfterAttributeName,
+ BeforeAttributeValue,
+ AttributeValueDoubleQuoted,
+ AttributeValueSingleQuoted,
+ AttributeValueUnquoted,
+ AfterAttributeValueQuoted,
+ SelfClosingStartTag,
+ BogusComment,
+ MarkupDeclarationOpen,
+ CommentStart,
+ CommentStartDash,
+ Comment,
+ CommentLessThanSign,
+ CommentLessThanSignBang,
+ CommentLessThanSignBangDash,
+ CommentLessThanSignBangDashDash,
+ CommentEndDash,
+ CommentEnd,
+ CommentEndBang,
+ Doctype,
+ BeforeDoctypeName,
+ DoctypeName,
+ AfterDoctypeName,
+ AfterDoctypePublicKeyword,
+ BeforeDoctypePublicIdentifier,
+ DoctypePublicIdentifierDoubleQuoted,
+ DoctypePublicIdentifierSingleQuoted,
+ AfterDoctypePublicIdentifier,
+ BetweenDoctypePublicAndSystemIdentifiers,
+ AfterDoctypeSystemKeyword,
+ BeforeDoctypeSystemIdentifier,
+ DoctypeSystemIdentifierDoubleQuoted,
+ DoctypeSystemIdentifierSingleQuoted,
+ AfterDoctypeSystemIdentifier,
+ BogusDoctype,
+ CdataSection,
+ CdataSectionBracket,
+ CdataSectionEnd,
+ CharacterReference,
+ NamedCharacterReference,
+ AmbiguousAmpersand,
+ NumericCharacterReference,
+ HexadecimalCharacterReferenceStart,
+ DecimalCharacterReferenceStart,
+ HexadecimalCharacterReference,
+ DecimalCharacterReference,
+ NumericCharacterReferenceEnd,
+}
+
+macro_rules! ctostr { // takes one expression `$c` (presumably a `char` — confirm at call sites) and yields a borrowed string view of it
+ ($c:expr) => {
+ &*$c.encode_utf8(&mut [0; 4]) // encodes into a temporary 4-byte buffer; `&*` reborrows whatever `encode_utf8` returns as a shared reference, which cannot outlive that temporary
+ };
+}
+
+pub(crate) use ctostr;