diff options
| author | Martin Fischer <martin@push-f.com> | 2023-09-09 21:42:17 +0200 | 
|---|---|---|
| committer | Martin Fischer <martin@push-f.com> | 2023-09-28 10:36:08 +0200 | 
| commit | 2c73901944e2d22747a2a4ebcc11881b3f8c2ad3 (patch) | |
| tree | 310726d807df6f6ae6911033dd31e5bd139a0559 /src/tokenizer | |
| parent | 2a0c35906d96203a3dc2b41cf8a1be74e025b285 (diff) | |
refactor: move utils module under tokenizer::machine
Diffstat (limited to 'src/tokenizer')
| -rw-r--r-- | src/tokenizer/machine.rs | 8 | ||||
| -rw-r--r-- | src/tokenizer/machine/utils.rs | 167 | 
2 files changed, 172 insertions, 3 deletions
```diff
diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs
index 07d4c05..fc31a42 100644
--- a/src/tokenizer/machine.rs
+++ b/src/tokenizer/machine.rs
@@ -1,13 +1,15 @@
+pub(super) mod utils;
+
 use crate::entities::try_read_character_reference;
 use crate::offset::{Offset, Position};
 use crate::token::AttrValueSyntax;
 use crate::tokenizer::CdataAction;
-use crate::utils::{
+use crate::{reader::Reader, Emitter, Error, Tokenizer};
+use utils::{
     ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat,
 };
-use crate::{reader::Reader, Emitter, Error, Tokenizer};
 
-pub use crate::utils::State;
+pub use utils::State;
 
 pub enum ControlToken {
     Eof,
diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs
new file mode 100644
index 0000000..7d220cf
--- /dev/null
+++ b/src/tokenizer/machine/utils.rs
@@ -0,0 +1,167 @@
+macro_rules! surrogate_pat {
+    () => {
+        0xd800..=0xdfff
+    };
+}
+
+pub(crate) use surrogate_pat;
+
+macro_rules! control_pat {
+    () => (0x0d | 0x0000..=0x001f | 0x007f..=0x009f)
+}
+
+pub(crate) use control_pat;
+
+macro_rules! ascii_digit_pat {
+    () => {
+        '0'..='9'
+    };
+}
+
+pub(crate) use ascii_digit_pat;
+
+macro_rules! whitespace_pat {
+    () => {
+        '\t' | '\u{0A}' | '\u{0C}' | ' '
+    };
+}
+
+pub(crate) use whitespace_pat;
+
+macro_rules! noncharacter_pat {
+    () => {
+        0xfdd0
+            ..=0xfdef
+                | 0xfffe
+                | 0xffff
+                | 0x1fffe
+                | 0x1ffff
+                | 0x2fffe
+                | 0x2ffff
+                | 0x3fffe
+                | 0x3ffff
+                | 0x4fffe
+                | 0x4ffff
+                | 0x5fffe
+                | 0x5ffff
+                | 0x6fffe
+                | 0x6ffff
+                | 0x7fffe
+                | 0x7ffff
+                | 0x8fffe
+                | 0x8ffff
+                | 0x9fffe
+                | 0x9ffff
+                | 0xafffe
+                | 0xaffff
+                | 0xbfffe
+                | 0xbffff
+                | 0xcfffe
+                | 0xcffff
+                | 0xdfffe
+                | 0xdffff
+                | 0xefffe
+                | 0xeffff
+                | 0xffffe
+                | 0xfffff
+                | 0x10fffe
+                | 0x10ffff
+    };
+}
+
+pub(crate) use noncharacter_pat;
+
+// When integration tests are running, this enum is public and we get warnings about missing docs.
+// However, it's not actually part of public API.
+#[allow(missing_docs)]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum State {
+    Data,
+    RcData,
+    RawText,
+    ScriptData,
+    PlainText,
+    TagOpen,
+    EndTagOpen,
+    TagName,
+    RcDataLessThanSign,
+    RcDataEndTagOpen,
+    RcDataEndTagName,
+    RawTextLessThanSign,
+    RawTextEndTagOpen,
+    RawTextEndTagName,
+    ScriptDataLessThanSign,
+    ScriptDataEndTagOpen,
+    ScriptDataEndTagName,
+    ScriptDataEscapeStart,
+    ScriptDataEscapeStartDash,
+    ScriptDataEscaped,
+    ScriptDataEscapedDash,
+    ScriptDataEscapedDashDash,
+    ScriptDataEscapedLessThanSign,
+    ScriptDataEscapedEndTagOpen,
+    ScriptDataEscapedEndTagName,
+    ScriptDataDoubleEscapeStart,
+    ScriptDataDoubleEscaped,
+    ScriptDataDoubleEscapedDash,
+    ScriptDataDoubleEscapedDashDash,
+    ScriptDataDoubleEscapedLessThanSign,
+    ScriptDataDoubleEscapeEnd,
+    BeforeAttributeName,
+    AttributeName,
+    AfterAttributeName,
+    BeforeAttributeValue,
+    AttributeValueDoubleQuoted,
+    AttributeValueSingleQuoted,
+    AttributeValueUnquoted,
+    AfterAttributeValueQuoted,
+    SelfClosingStartTag,
+    BogusComment,
+    MarkupDeclarationOpen,
+    CommentStart,
+    CommentStartDash,
+    Comment,
+    CommentLessThanSign,
+    CommentLessThanSignBang,
+    CommentLessThanSignBangDash,
+    CommentLessThanSignBangDashDash,
+    CommentEndDash,
+    CommentEnd,
+    CommentEndBang,
+    Doctype,
+    BeforeDoctypeName,
+    DoctypeName,
+    AfterDoctypeName,
+    AfterDoctypePublicKeyword,
+    BeforeDoctypePublicIdentifier,
+    DoctypePublicIdentifierDoubleQuoted,
+    DoctypePublicIdentifierSingleQuoted,
+    AfterDoctypePublicIdentifier,
+    BetweenDoctypePublicAndSystemIdentifiers,
+    AfterDoctypeSystemKeyword,
+    BeforeDoctypeSystemIdentifier,
+    DoctypeSystemIdentifierDoubleQuoted,
+    DoctypeSystemIdentifierSingleQuoted,
+    AfterDoctypeSystemIdentifier,
+    BogusDoctype,
+    CdataSection,
+    CdataSectionBracket,
+    CdataSectionEnd,
+    CharacterReference,
+    NamedCharacterReference,
+    AmbiguousAmpersand,
+    NumericCharacterReference,
+    HexadecimalCharacterReferenceStart,
+    DecimalCharacterReferenceStart,
+    HexadecimalCharacterReference,
+    DecimalCharacterReference,
+    NumericCharacterReferenceEnd,
+}
+
+macro_rules! ctostr {
+    ($c:expr) => {
+        &*$c.encode_utf8(&mut [0; 4])
+    };
+}
+
+pub(crate) use ctostr;
```
