From 2c73901944e2d22747a2a4ebcc11881b3f8c2ad3 Mon Sep 17 00:00:00 2001
From: Martin Fischer <martin@push-f.com>
Date: Sat, 9 Sep 2023 21:42:17 +0200
Subject: refactor: move utils module under tokenizer::machine

---
 src/tokenizer/machine/utils.rs | 167 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 167 insertions(+)
 create mode 100644 src/tokenizer/machine/utils.rs

(limited to 'src/tokenizer/machine')

diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs
new file mode 100644
index 0000000..7d220cf
--- /dev/null
+++ b/src/tokenizer/machine/utils.rs
@@ -0,0 +1,167 @@
+macro_rules! surrogate_pat {
+    () => {
+        0xd800..=0xdfff
+    };
+}
+
+pub(crate) use surrogate_pat;
+
+macro_rules! control_pat {
+    () => (0x0d | 0x0000..=0x001f | 0x007f..=0x009f)
+}
+
+pub(crate) use control_pat;
+
+macro_rules! ascii_digit_pat {
+    () => {
+        '0'..='9'
+    };
+}
+
+pub(crate) use ascii_digit_pat;
+
+macro_rules! whitespace_pat {
+    () => {
+        '\t' | '\u{0A}' | '\u{0C}' | ' '
+    };
+}
+
+pub(crate) use whitespace_pat;
+
+macro_rules! noncharacter_pat {
+    () => {
+        0xfdd0
+            ..=0xfdef
+                | 0xfffe
+                | 0xffff
+                | 0x1fffe
+                | 0x1ffff
+                | 0x2fffe
+                | 0x2ffff
+                | 0x3fffe
+                | 0x3ffff
+                | 0x4fffe
+                | 0x4ffff
+                | 0x5fffe
+                | 0x5ffff
+                | 0x6fffe
+                | 0x6ffff
+                | 0x7fffe
+                | 0x7ffff
+                | 0x8fffe
+                | 0x8ffff
+                | 0x9fffe
+                | 0x9ffff
+                | 0xafffe
+                | 0xaffff
+                | 0xbfffe
+                | 0xbffff
+                | 0xcfffe
+                | 0xcffff
+                | 0xdfffe
+                | 0xdffff
+                | 0xefffe
+                | 0xeffff
+                | 0xffffe
+                | 0xfffff
+                | 0x10fffe
+                | 0x10ffff
+    };
+}
+
+pub(crate) use noncharacter_pat;
+
+// When integration tests are running, this enum is public and we get warnings about missing docs.
+// However, it's not actually part of public API.
+#[allow(missing_docs)]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum State {
+    Data,
+    RcData,
+    RawText,
+    ScriptData,
+    PlainText,
+    TagOpen,
+    EndTagOpen,
+    TagName,
+    RcDataLessThanSign,
+    RcDataEndTagOpen,
+    RcDataEndTagName,
+    RawTextLessThanSign,
+    RawTextEndTagOpen,
+    RawTextEndTagName,
+    ScriptDataLessThanSign,
+    ScriptDataEndTagOpen,
+    ScriptDataEndTagName,
+    ScriptDataEscapeStart,
+    ScriptDataEscapeStartDash,
+    ScriptDataEscaped,
+    ScriptDataEscapedDash,
+    ScriptDataEscapedDashDash,
+    ScriptDataEscapedLessThanSign,
+    ScriptDataEscapedEndTagOpen,
+    ScriptDataEscapedEndTagName,
+    ScriptDataDoubleEscapeStart,
+    ScriptDataDoubleEscaped,
+    ScriptDataDoubleEscapedDash,
+    ScriptDataDoubleEscapedDashDash,
+    ScriptDataDoubleEscapedLessThanSign,
+    ScriptDataDoubleEscapeEnd,
+    BeforeAttributeName,
+    AttributeName,
+    AfterAttributeName,
+    BeforeAttributeValue,
+    AttributeValueDoubleQuoted,
+    AttributeValueSingleQuoted,
+    AttributeValueUnquoted,
+    AfterAttributeValueQuoted,
+    SelfClosingStartTag,
+    BogusComment,
+    MarkupDeclarationOpen,
+    CommentStart,
+    CommentStartDash,
+    Comment,
+    CommentLessThanSign,
+    CommentLessThanSignBang,
+    CommentLessThanSignBangDash,
+    CommentLessThanSignBangDashDash,
+    CommentEndDash,
+    CommentEnd,
+    CommentEndBang,
+    Doctype,
+    BeforeDoctypeName,
+    DoctypeName,
+    AfterDoctypeName,
+    AfterDoctypePublicKeyword,
+    BeforeDoctypePublicIdentifier,
+    DoctypePublicIdentifierDoubleQuoted,
+    DoctypePublicIdentifierSingleQuoted,
+    AfterDoctypePublicIdentifier,
+    BetweenDoctypePublicAndSystemIdentifiers,
+    AfterDoctypeSystemKeyword,
+    BeforeDoctypeSystemIdentifier,
+    DoctypeSystemIdentifierDoubleQuoted,
+    DoctypeSystemIdentifierSingleQuoted,
+    AfterDoctypeSystemIdentifier,
+    BogusDoctype,
+    CdataSection,
+    CdataSectionBracket,
+    CdataSectionEnd,
+    CharacterReference,
+    NamedCharacterReference,
+    AmbiguousAmpersand,
+    NumericCharacterReference,
+    HexadecimalCharacterReferenceStart,
+    DecimalCharacterReferenceStart,
+    HexadecimalCharacterReference,
+    DecimalCharacterReference,
+    NumericCharacterReferenceEnd,
+}
+
+macro_rules! ctostr {
+    ($c:expr) => {
+        &*$c.encode_utf8(&mut [0; 4])
+    };
+}
+
+pub(crate) use ctostr;
-- 
cgit v1.2.3