From 2c73901944e2d22747a2a4ebcc11881b3f8c2ad3 Mon Sep 17 00:00:00 2001
From: Martin Fischer
Date: Sat, 9 Sep 2023 21:42:17 +0200
Subject: refactor: move utils module under tokenizer::machine

---
 src/lib.rs                     |   3 +-
 src/tokenizer.rs               |   4 +-
 src/tokenizer/machine.rs       |   8 +-
 src/tokenizer/machine/utils.rs | 167 +++++++++++++++++++++++++++++++++++++++++
 src/utils.rs                   | 167 -----------------------------------------
 5 files changed, 175 insertions(+), 174 deletions(-)
 create mode 100644 src/tokenizer/machine/utils.rs
 delete mode 100644 src/utils.rs

(limited to 'src')

diff --git a/src/lib.rs b/src/lib.rs
index baacb38..3c7e77b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -13,7 +13,6 @@ mod entities;
 mod error;
 mod naive_parser;
 mod tokenizer;
-mod utils;
 
 /// Types for HTML attributes.
 pub mod attr {
@@ -33,7 +32,7 @@ pub use token::{Comment, Doctype, EndTag, StartTag, Token};
 pub use tokenizer::{CdataAction, Event, State, Tokenizer};
 
 #[cfg(feature = "integration-tests")]
-pub use utils::State as InternalState;
+pub use tokenizer::InternalState;
 
 /// Relative links in the README.md don't work in rustdoc, so we have to override them.
 macro_rules! file_url {
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 7e05477..6f698f6 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -3,12 +3,12 @@ mod machine;
 use crate::naive_parser::naive_next_state;
 use crate::offset::{Offset, Position};
 use crate::reader::{IntoReader, Reader};
-use crate::utils::{control_pat, noncharacter_pat, surrogate_pat};
 use crate::{Emitter, Error};
+use machine::utils::{control_pat, noncharacter_pat, surrogate_pat};
 use machine::ControlToken;
 
 #[cfg(feature = "integration-tests")]
-use crate::utils::State as InternalState;
+pub use machine::State as InternalState;
 
 // this is a stack that can hold 0 to 2 Ts
 #[derive(Debug, Default, Clone, Copy)]
diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs
index 07d4c05..fc31a42 100644
--- a/src/tokenizer/machine.rs
+++ b/src/tokenizer/machine.rs
@@ -1,13 +1,15 @@
+pub(super) mod utils;
+
+use crate::entities::try_read_character_reference;
 use crate::offset::{Offset, Position};
 use crate::token::AttrValueSyntax;
 use crate::tokenizer::CdataAction;
-use crate::utils::{
+use crate::{reader::Reader, Emitter, Error, Tokenizer};
+use utils::{
     ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat,
 };
-use crate::{reader::Reader, Emitter, Error, Tokenizer};
 
-pub use crate::utils::State;
+pub use utils::State;
 
 pub enum ControlToken {
     Eof,
diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs
new file mode 100644
index 0000000..7d220cf
--- /dev/null
+++ b/src/tokenizer/machine/utils.rs
@@ -0,0 +1,167 @@
+macro_rules! surrogate_pat {
+    () => {
+        0xd800..=0xdfff
+    };
+}
+
+pub(crate) use surrogate_pat;
+
+macro_rules! control_pat {
+    () => (0x0d | 0x0000..=0x001f | 0x007f..=0x009f)
+}
+
+pub(crate) use control_pat;
+
+macro_rules! ascii_digit_pat {
+    () => {
+        '0'..='9'
+    };
+}
+
+pub(crate) use ascii_digit_pat;
+
+macro_rules! whitespace_pat {
+    () => {
+        '\t' | '\u{0A}' | '\u{0C}' | ' '
+    };
+}
+
+pub(crate) use whitespace_pat;
+
+macro_rules! noncharacter_pat {
+    () => {
+        0xfdd0
+            ..=0xfdef
+            | 0xfffe
+            | 0xffff
+            | 0x1fffe
+            | 0x1ffff
+            | 0x2fffe
+            | 0x2ffff
+            | 0x3fffe
+            | 0x3ffff
+            | 0x4fffe
+            | 0x4ffff
+            | 0x5fffe
+            | 0x5ffff
+            | 0x6fffe
+            | 0x6ffff
+            | 0x7fffe
+            | 0x7ffff
+            | 0x8fffe
+            | 0x8ffff
+            | 0x9fffe
+            | 0x9ffff
+            | 0xafffe
+            | 0xaffff
+            | 0xbfffe
+            | 0xbffff
+            | 0xcfffe
+            | 0xcffff
+            | 0xdfffe
+            | 0xdffff
+            | 0xefffe
+            | 0xeffff
+            | 0xffffe
+            | 0xfffff
+            | 0x10fffe
+            | 0x10ffff
+    };
+}
+
+pub(crate) use noncharacter_pat;
+
+// When integration tests are running, this enum is public and we get warnings about missing docs.
+// However, it's not actually part of public API.
+#[allow(missing_docs)]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum State {
+    Data,
+    RcData,
+    RawText,
+    ScriptData,
+    PlainText,
+    TagOpen,
+    EndTagOpen,
+    TagName,
+    RcDataLessThanSign,
+    RcDataEndTagOpen,
+    RcDataEndTagName,
+    RawTextLessThanSign,
+    RawTextEndTagOpen,
+    RawTextEndTagName,
+    ScriptDataLessThanSign,
+    ScriptDataEndTagOpen,
+    ScriptDataEndTagName,
+    ScriptDataEscapeStart,
+    ScriptDataEscapeStartDash,
+    ScriptDataEscaped,
+    ScriptDataEscapedDash,
+    ScriptDataEscapedDashDash,
+    ScriptDataEscapedLessThanSign,
+    ScriptDataEscapedEndTagOpen,
+    ScriptDataEscapedEndTagName,
+    ScriptDataDoubleEscapeStart,
+    ScriptDataDoubleEscaped,
+    ScriptDataDoubleEscapedDash,
+    ScriptDataDoubleEscapedDashDash,
+    ScriptDataDoubleEscapedLessThanSign,
+    ScriptDataDoubleEscapeEnd,
+    BeforeAttributeName,
+    AttributeName,
+    AfterAttributeName,
+    BeforeAttributeValue,
+    AttributeValueDoubleQuoted,
+    AttributeValueSingleQuoted,
+    AttributeValueUnquoted,
+    AfterAttributeValueQuoted,
+    SelfClosingStartTag,
+    BogusComment,
+    MarkupDeclarationOpen,
+    CommentStart,
+    CommentStartDash,
+    Comment,
+    CommentLessThanSign,
+    CommentLessThanSignBang,
+    CommentLessThanSignBangDash,
+    CommentLessThanSignBangDashDash,
+    CommentEndDash,
+    CommentEnd,
+    CommentEndBang,
+    Doctype,
+    BeforeDoctypeName,
+    DoctypeName,
+    AfterDoctypeName,
+    AfterDoctypePublicKeyword,
+    BeforeDoctypePublicIdentifier,
+    DoctypePublicIdentifierDoubleQuoted,
+    DoctypePublicIdentifierSingleQuoted,
+    AfterDoctypePublicIdentifier,
+    BetweenDoctypePublicAndSystemIdentifiers,
+    AfterDoctypeSystemKeyword,
+    BeforeDoctypeSystemIdentifier,
+    DoctypeSystemIdentifierDoubleQuoted,
+    DoctypeSystemIdentifierSingleQuoted,
+    AfterDoctypeSystemIdentifier,
+    BogusDoctype,
+    CdataSection,
+    CdataSectionBracket,
+    CdataSectionEnd,
+    CharacterReference,
+    NamedCharacterReference,
+    AmbiguousAmpersand,
+    NumericCharacterReference,
+    HexadecimalCharacterReferenceStart,
+    DecimalCharacterReferenceStart,
+    HexadecimalCharacterReference,
+    DecimalCharacterReference,
+    NumericCharacterReferenceEnd,
+}
+
+macro_rules! ctostr {
+    ($c:expr) => {
+        &*$c.encode_utf8(&mut [0; 4])
+    };
+}
+
+pub(crate) use ctostr;
diff --git a/src/utils.rs b/src/utils.rs
deleted file mode 100644
index 7d220cf..0000000
--- a/src/utils.rs
+++ /dev/null
@@ -1,167 +0,0 @@
-macro_rules! surrogate_pat {
-    () => {
-        0xd800..=0xdfff
-    };
-}
-
-pub(crate) use surrogate_pat;
-
-macro_rules! control_pat {
-    () => (0x0d | 0x0000..=0x001f | 0x007f..=0x009f)
-}
-
-pub(crate) use control_pat;
-
-macro_rules! ascii_digit_pat {
-    () => {
-        '0'..='9'
-    };
-}
-
-pub(crate) use ascii_digit_pat;
-
-macro_rules! whitespace_pat {
-    () => {
-        '\t' | '\u{0A}' | '\u{0C}' | ' '
-    };
-}
-
-pub(crate) use whitespace_pat;
-
-macro_rules! noncharacter_pat {
-    () => {
-        0xfdd0
-            ..=0xfdef
-            | 0xfffe
-            | 0xffff
-            | 0x1fffe
-            | 0x1ffff
-            | 0x2fffe
-            | 0x2ffff
-            | 0x3fffe
-            | 0x3ffff
-            | 0x4fffe
-            | 0x4ffff
-            | 0x5fffe
-            | 0x5ffff
-            | 0x6fffe
-            | 0x6ffff
-            | 0x7fffe
-            | 0x7ffff
-            | 0x8fffe
-            | 0x8ffff
-            | 0x9fffe
-            | 0x9ffff
-            | 0xafffe
-            | 0xaffff
-            | 0xbfffe
-            | 0xbffff
-            | 0xcfffe
-            | 0xcffff
-            | 0xdfffe
-            | 0xdffff
-            | 0xefffe
-            | 0xeffff
-            | 0xffffe
-            | 0xfffff
-            | 0x10fffe
-            | 0x10ffff
-    };
-}
-
-pub(crate) use noncharacter_pat;
-
-// When integration tests are running, this enum is public and we get warnings about missing docs.
-// However, it's not actually part of public API.
-#[allow(missing_docs)]
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
-pub enum State {
-    Data,
-    RcData,
-    RawText,
-    ScriptData,
-    PlainText,
-    TagOpen,
-    EndTagOpen,
-    TagName,
-    RcDataLessThanSign,
-    RcDataEndTagOpen,
-    RcDataEndTagName,
-    RawTextLessThanSign,
-    RawTextEndTagOpen,
-    RawTextEndTagName,
-    ScriptDataLessThanSign,
-    ScriptDataEndTagOpen,
-    ScriptDataEndTagName,
-    ScriptDataEscapeStart,
-    ScriptDataEscapeStartDash,
-    ScriptDataEscaped,
-    ScriptDataEscapedDash,
-    ScriptDataEscapedDashDash,
-    ScriptDataEscapedLessThanSign,
-    ScriptDataEscapedEndTagOpen,
-    ScriptDataEscapedEndTagName,
-    ScriptDataDoubleEscapeStart,
-    ScriptDataDoubleEscaped,
-    ScriptDataDoubleEscapedDash,
-    ScriptDataDoubleEscapedDashDash,
-    ScriptDataDoubleEscapedLessThanSign,
-    ScriptDataDoubleEscapeEnd,
-    BeforeAttributeName,
-    AttributeName,
-    AfterAttributeName,
-    BeforeAttributeValue,
-    AttributeValueDoubleQuoted,
-    AttributeValueSingleQuoted,
-    AttributeValueUnquoted,
-    AfterAttributeValueQuoted,
-    SelfClosingStartTag,
-    BogusComment,
-    MarkupDeclarationOpen,
-    CommentStart,
-    CommentStartDash,
-    Comment,
-    CommentLessThanSign,
-    CommentLessThanSignBang,
-    CommentLessThanSignBangDash,
-    CommentLessThanSignBangDashDash,
-    CommentEndDash,
-    CommentEnd,
-    CommentEndBang,
-    Doctype,
-    BeforeDoctypeName,
-    DoctypeName,
-    AfterDoctypeName,
-    AfterDoctypePublicKeyword,
-    BeforeDoctypePublicIdentifier,
-    DoctypePublicIdentifierDoubleQuoted,
-    DoctypePublicIdentifierSingleQuoted,
-    AfterDoctypePublicIdentifier,
-    BetweenDoctypePublicAndSystemIdentifiers,
-    AfterDoctypeSystemKeyword,
-    BeforeDoctypeSystemIdentifier,
-    DoctypeSystemIdentifierDoubleQuoted,
-    DoctypeSystemIdentifierSingleQuoted,
-    AfterDoctypeSystemIdentifier,
-    BogusDoctype,
-    CdataSection,
-    CdataSectionBracket,
-    CdataSectionEnd,
-    CharacterReference,
-    NamedCharacterReference,
-    AmbiguousAmpersand,
-    NumericCharacterReference,
-    HexadecimalCharacterReferenceStart,
-    DecimalCharacterReferenceStart,
-    HexadecimalCharacterReference,
-    DecimalCharacterReference,
-    NumericCharacterReferenceEnd,
-}
-
-macro_rules! ctostr {
-    ($c:expr) => {
-        &*$c.encode_utf8(&mut [0; 4])
-    };
-}
-
-pub(crate) use ctostr;
-- 
cgit v1.2.3