diff options
author | Martin Fischer <martin@push-f.com> | 2023-09-09 21:42:17 +0200 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2023-09-28 10:36:08 +0200 |
commit | 2c73901944e2d22747a2a4ebcc11881b3f8c2ad3 (patch) | |
tree | 310726d807df6f6ae6911033dd31e5bd139a0559 /src/tokenizer | |
parent | 2a0c35906d96203a3dc2b41cf8a1be74e025b285 (diff) |
refactor: move utils module under tokenizer::machine
Diffstat (limited to 'src/tokenizer')
-rw-r--r-- | src/tokenizer/machine.rs | 8 | ||||
-rw-r--r-- | src/tokenizer/machine/utils.rs | 167 |
2 files changed, 172 insertions, 3 deletions
diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs index 07d4c05..fc31a42 100644 --- a/src/tokenizer/machine.rs +++ b/src/tokenizer/machine.rs @@ -1,13 +1,15 @@ +pub(super) mod utils; + use crate::entities::try_read_character_reference; use crate::offset::{Offset, Position}; use crate::token::AttrValueSyntax; use crate::tokenizer::CdataAction; -use crate::utils::{ +use crate::{reader::Reader, Emitter, Error, Tokenizer}; +use utils::{ ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat, }; -use crate::{reader::Reader, Emitter, Error, Tokenizer}; -pub use crate::utils::State; +pub use utils::State; pub enum ControlToken { Eof, diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs new file mode 100644 index 0000000..7d220cf --- /dev/null +++ b/src/tokenizer/machine/utils.rs @@ -0,0 +1,167 @@ +macro_rules! surrogate_pat { + () => { + 0xd800..=0xdfff + }; +} + +pub(crate) use surrogate_pat; + +macro_rules! control_pat { + () => (0x0d | 0x0000..=0x001f | 0x007f..=0x009f) +} + +pub(crate) use control_pat; + +macro_rules! ascii_digit_pat { + () => { + '0'..='9' + }; +} + +pub(crate) use ascii_digit_pat; + +macro_rules! whitespace_pat { + () => { + '\t' | '\u{0A}' | '\u{0C}' | ' ' + }; +} + +pub(crate) use whitespace_pat; + +macro_rules! noncharacter_pat { + () => { + 0xfdd0 + ..=0xfdef + | 0xfffe + | 0xffff + | 0x1fffe + | 0x1ffff + | 0x2fffe + | 0x2ffff + | 0x3fffe + | 0x3ffff + | 0x4fffe + | 0x4ffff + | 0x5fffe + | 0x5ffff + | 0x6fffe + | 0x6ffff + | 0x7fffe + | 0x7ffff + | 0x8fffe + | 0x8ffff + | 0x9fffe + | 0x9ffff + | 0xafffe + | 0xaffff + | 0xbfffe + | 0xbffff + | 0xcfffe + | 0xcffff + | 0xdfffe + | 0xdffff + | 0xefffe + | 0xeffff + | 0xffffe + | 0xfffff + | 0x10fffe + | 0x10ffff + }; +} + +pub(crate) use noncharacter_pat; + +// When integration tests are running, this enum is public and we get warnings about missing docs. +// However, it's not actually part of public API. +#[allow(missing_docs)] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum State { + Data, + RcData, + RawText, + ScriptData, + PlainText, + TagOpen, + EndTagOpen, + TagName, + RcDataLessThanSign, + RcDataEndTagOpen, + RcDataEndTagName, + RawTextLessThanSign, + RawTextEndTagOpen, + RawTextEndTagName, + ScriptDataLessThanSign, + ScriptDataEndTagOpen, + ScriptDataEndTagName, + ScriptDataEscapeStart, + ScriptDataEscapeStartDash, + ScriptDataEscaped, + ScriptDataEscapedDash, + ScriptDataEscapedDashDash, + ScriptDataEscapedLessThanSign, + ScriptDataEscapedEndTagOpen, + ScriptDataEscapedEndTagName, + ScriptDataDoubleEscapeStart, + ScriptDataDoubleEscaped, + ScriptDataDoubleEscapedDash, + ScriptDataDoubleEscapedDashDash, + ScriptDataDoubleEscapedLessThanSign, + ScriptDataDoubleEscapeEnd, + BeforeAttributeName, + AttributeName, + AfterAttributeName, + BeforeAttributeValue, + AttributeValueDoubleQuoted, + AttributeValueSingleQuoted, + AttributeValueUnquoted, + AfterAttributeValueQuoted, + SelfClosingStartTag, + BogusComment, + MarkupDeclarationOpen, + CommentStart, + CommentStartDash, + Comment, + CommentLessThanSign, + CommentLessThanSignBang, + CommentLessThanSignBangDash, + CommentLessThanSignBangDashDash, + CommentEndDash, + CommentEnd, + CommentEndBang, + Doctype, + BeforeDoctypeName, + DoctypeName, + AfterDoctypeName, + AfterDoctypePublicKeyword, + BeforeDoctypePublicIdentifier, + DoctypePublicIdentifierDoubleQuoted, + DoctypePublicIdentifierSingleQuoted, + AfterDoctypePublicIdentifier, + BetweenDoctypePublicAndSystemIdentifiers, + AfterDoctypeSystemKeyword, + BeforeDoctypeSystemIdentifier, + DoctypeSystemIdentifierDoubleQuoted, + DoctypeSystemIdentifierSingleQuoted, + AfterDoctypeSystemIdentifier, + BogusDoctype, + CdataSection, + CdataSectionBracket, + CdataSectionEnd, + CharacterReference, + NamedCharacterReference, + AmbiguousAmpersand, + NumericCharacterReference, + HexadecimalCharacterReferenceStart, + DecimalCharacterReferenceStart, + HexadecimalCharacterReference, + DecimalCharacterReference, + NumericCharacterReferenceEnd, +} + +macro_rules! ctostr { + ($c:expr) => { + &*$c.encode_utf8(&mut [0; 4]) + }; +} + +pub(crate) use ctostr; |