diff options
author | Martin Fischer <martin@push-f.com> | 2023-09-09 19:10:49 +0200 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2023-09-09 23:02:47 +0200 |
commit | 1d8e6239875c810197a0679a20412726afb8ff66 (patch) | |
tree | 3daa917d720235117770040405a78dad548032e9 | |
parent | 77eb3eefeeab3ba7ab3c6387e5916192b95b482d (diff) |
refactor: merge token types with attr to new token module
-rw-r--r-- | src/emitter.rs | 145 | ||||
-rw-r--r-- | src/lib.rs | 11 | ||||
-rw-r--r-- | src/machine.rs | 2 | ||||
-rw-r--r-- | src/token.rs (renamed from src/attr.rs) | 137 |
4 files changed, 152 insertions, 143 deletions
diff --git a/src/emitter.rs b/src/emitter.rs index f27c778..23f9ede 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -4,9 +4,9 @@ use std::collections::VecDeque; use std::mem; use std::ops::Range; -use crate::attr::AttrValueSyntax; use crate::offset::NoopOffset; use crate::offset::Offset; +use crate::token::{AttrValueSyntax, Comment, Doctype, EndTag, StartTag, Token}; use crate::Error; /// An emitter is an object providing methods to the tokenizer to produce ("emit") tokens. @@ -214,7 +214,7 @@ pub trait Emitter<O> { pub struct DefaultEmitter<O = NoopOffset> { current_characters: String, current_token: Option<Token<O>>, - current_attribute: Option<(String, crate::attr::AttrInternal<O>)>, + current_attribute: Option<(String, crate::token::AttrInternal<O>)>, seen_attributes: BTreeSet<String>, emitted_tokens: VecDeque<Token<O>>, attr_in_end_tag_span: Option<Range<O>>, @@ -465,7 +465,7 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> { self.flush_current_attribute(); self.current_attribute = Some(( String::new(), - crate::attr::AttrInternal { + crate::token::AttrInternal { name_span: offset..O::default(), value: String::new(), value_span: O::default()..O::default(), @@ -564,147 +564,14 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> { } } -/// An HTML start tag, such as `<p>` or `<a>`. -#[derive(Debug, Eq, PartialEq)] -pub struct StartTag<O> { - /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be - /// expected. - pub self_closing: bool, - - /// The tag name. - /// Uppercase ASCII characters (A-Z) have been converted to lowercase. - pub name: String, - - /// A mapping for any HTML attributes this start tag may have. - /// - /// Duplicate attributes are ignored after the first one as per WHATWG spec. - pub attributes: crate::attr::AttributeMap<O>, - - /// The source code span of the tag. - pub span: Range<O>, - - /// The span of the tag name. - pub name_span: Range<O>, -} - -/// An HTML end/close tag, such as `</p>` or `</a>`. -#[derive(Debug, Eq, PartialEq)] -pub struct EndTag<O> { - /// The tag name. - /// Uppercase ASCII characters (A-Z) have been converted to lowercase. - pub name: String, - - /// The source code span of the tag. - pub span: Range<O>, - - /// The span of the tag name. - pub name_span: Range<O>, -} - -/// An HTML comment. -#[derive(PartialEq, Eq, Debug)] -pub struct Comment<O> { - /// The text within the comment. - pub data: String, - /// The source offset of the comment data. - pub data_span: Range<O>, -} - -impl<O: Offset> Comment<O> { - /// Returns the span for the comment data. - pub fn data_span(&self) -> Range<O> { - self.data_span.clone() - } -} - -/// A doctype. Some examples: -/// -/// * `<!DOCTYPE {name}>` -/// * `<!DOCTYPE {name} PUBLIC '{public_id}'>` -/// * `<!DOCTYPE {name} SYSTEM '{system_id}'>` -/// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>` -#[derive(Debug, Eq, PartialEq)] -pub struct Doctype<O> { - /// The [force-quirks flag]. - /// - /// [force-quirks flag]: https://html.spec.whatwg.org/multipage/parsing.html#force-quirks-flag - pub force_quirks: bool, - - /// The doctype's name. Uppercase ASCII characters (A-Z) have been - /// converted to lowercase. For HTML documents this should be "html". - pub name: Option<String>, - - /// The doctype's public identifier. - pub public_id: Option<String>, - - /// The doctype's system identifier. - pub system_id: Option<String>, - - /// The source code span of the doctype. - pub span: Range<O>, - - /// The span of the name. - name_span: Range<O>, - - /// The span of the public identifier. - public_id_span: Range<O>, - - /// The span of the system identifier. - system_id_span: Range<O>, -} - -impl<O: Offset> Doctype<O> { - /// Returns the span of the name. - pub fn name_span(&self) -> Option<Range<O>> { - self.name.as_ref()?; - Some(self.name_span.clone()) - } - - /// Returns the span of the public identifier. - pub fn public_id_span(&self) -> Option<Range<O>> { - self.public_id.as_ref()?; - Some(self.public_id_span.clone()) - } - - /// Returns the span of the system identifier. - pub fn system_id_span(&self) -> Option<Range<O>> { - self.system_id.as_ref()?; - Some(self.system_id_span.clone()) - } -} - -/// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer. -#[derive(Debug, Eq, PartialEq)] -pub enum Token<O> { - /// An HTML start tag. - StartTag(StartTag<O>), - /// An HTML end tag. - EndTag(EndTag<O>), - /// A literal string. Character references have been resolved. - String(String), - /// An HTML comment. - Comment(Comment<O>), - /// An HTML doctype declaration. - Doctype(Doctype<O>), - /// An HTML parsing error. - /// - /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with - /// more tokens afterward. - Error { - /// What kind of error occurred. - error: Error, - /// The source code span of the error. - span: Range<O>, - }, -} - /// The majority of our testing of the [`DefaultEmitter`] is done against the /// html5lib-tests in the html5lib integration test. This module only tests /// details that aren't present in the html5lib test data. #[cfg(test)] mod tests { - use super::{DefaultEmitter, Token}; - use crate::{attr::AttrValueSyntax, Event, Tokenizer}; + use super::DefaultEmitter; + use crate::token::{AttrValueSyntax, Token}; + use crate::{Event, Tokenizer}; #[test] fn test_attribute_value_syntax() { @@ -15,13 +15,20 @@ mod naive_parser; mod tokenizer; mod utils; -pub mod attr; +/// Types for HTML attributes. +pub mod attr { + pub use crate::token::{ + AttrIntoIter, AttrIter, AttrValueSyntax, Attribute, AttributeMap, AttributeOwned, + }; +} pub mod offset; pub mod reader; +pub mod token; -pub use emitter::{Comment, DefaultEmitter, Doctype, Emitter, EndTag, StartTag, Token}; +pub use emitter::{DefaultEmitter, Emitter}; pub use error::Error; pub use naive_parser::NaiveParser; +pub use token::{Comment, Doctype, EndTag, StartTag, Token}; pub use tokenizer::{CdataAction, Event, State, Tokenizer}; #[cfg(feature = "integration-tests")] diff --git a/src/machine.rs b/src/machine.rs index d175b8b..6d4dc10 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -1,6 +1,6 @@ -use crate::attr::AttrValueSyntax; use crate::entities::try_read_character_reference; use crate::offset::{Offset, Position}; +use crate::token::AttrValueSyntax; use crate::tokenizer::CdataAction; use crate::utils::{ ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat, State, diff --git a/src/attr.rs b/src/token.rs index 096235e..48c90f7 100644 --- a/src/attr.rs +++ b/src/token.rs @@ -1,10 +1,145 @@ -//! Types for HTML attributes. +//! Provides the [`Token`] type. use std::collections::{btree_map, BTreeMap}; use std::iter::FromIterator; use std::ops::{Index, Range}; use crate::offset::Offset; +use crate::Error; + +/// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer. +#[derive(Debug, Eq, PartialEq)] +pub enum Token<O> { + /// An HTML start tag. + StartTag(StartTag<O>), + /// An HTML end tag. + EndTag(EndTag<O>), + /// A literal string. Character references have been resolved. + String(String), + /// An HTML comment. + Comment(Comment<O>), + /// An HTML doctype declaration. + Doctype(Doctype<O>), + /// An HTML parsing error. + /// + /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with + /// more tokens afterward. + Error { + /// What kind of error occurred. + error: Error, + /// The source code span of the error. + span: Range<O>, + }, +} + +/// An HTML start tag, such as `<p>` or `<a>`. +#[derive(Debug, Eq, PartialEq)] +pub struct StartTag<O> { + /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be + /// expected. + pub self_closing: bool, + + /// The tag name. + /// Uppercase ASCII characters (A-Z) have been converted to lowercase. + pub name: String, + + /// A mapping for any HTML attributes this start tag may have. + /// + /// Duplicate attributes are ignored after the first one as per WHATWG spec. + pub attributes: AttributeMap<O>, + + /// The source code span of the tag. + pub span: Range<O>, + + /// The span of the tag name. + pub name_span: Range<O>, +} + +/// An HTML end/close tag, such as `</p>` or `</a>`. +#[derive(Debug, Eq, PartialEq)] +pub struct EndTag<O> { + /// The tag name. + /// Uppercase ASCII characters (A-Z) have been converted to lowercase. + pub name: String, + + /// The source code span of the tag. + pub span: Range<O>, + + /// The span of the tag name. + pub name_span: Range<O>, +} + +/// An HTML comment. +#[derive(PartialEq, Eq, Debug)] +pub struct Comment<O> { + /// The text within the comment. + pub data: String, + /// The source offset of the comment data. + pub data_span: Range<O>, +} + +impl<O: Offset> Comment<O> { + /// Returns the span for the comment data. + pub fn data_span(&self) -> Range<O> { + self.data_span.clone() + } +} + +/// A doctype. Some examples: +/// +/// * `<!DOCTYPE {name}>` +/// * `<!DOCTYPE {name} PUBLIC '{public_id}'>` +/// * `<!DOCTYPE {name} SYSTEM '{system_id}'>` +/// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>` +#[derive(Debug, Eq, PartialEq)] +pub struct Doctype<O> { + /// The [force-quirks flag]. + /// + /// [force-quirks flag]: https://html.spec.whatwg.org/multipage/parsing.html#force-quirks-flag + pub force_quirks: bool, + + /// The doctype's name. Uppercase ASCII characters (A-Z) have been + /// converted to lowercase. For HTML documents this should be "html". + pub name: Option<String>, + + /// The doctype's public identifier. + pub public_id: Option<String>, + + /// The doctype's system identifier. + pub system_id: Option<String>, + + /// The source code span of the doctype. + pub span: Range<O>, + + /// The span of the name. + pub(crate) name_span: Range<O>, + + /// The span of the public identifier. + pub(crate) public_id_span: Range<O>, + + /// The span of the system identifier. + pub(crate) system_id_span: Range<O>, +} + +impl<O: Offset> Doctype<O> { + /// Returns the span of the name. + pub fn name_span(&self) -> Option<Range<O>> { + self.name.as_ref()?; + Some(self.name_span.clone()) + } + + /// Returns the span of the public identifier. + pub fn public_id_span(&self) -> Option<Range<O>> { + self.public_id.as_ref()?; + Some(self.public_id_span.clone()) + } + + /// Returns the span of the system identifier. + pub fn system_id_span(&self) -> Option<Range<O>> { + self.system_id.as_ref()?; + Some(self.system_id_span.clone()) + } +} /// A map of HTML attributes. /// |