diff options
| author | Martin Fischer <martin@push-f.com> | 2023-09-09 19:10:49 +0200 | 
|---|---|---|
| committer | Martin Fischer <martin@push-f.com> | 2023-09-09 23:02:47 +0200 | 
| commit | 1d8e6239875c810197a0679a20412726afb8ff66 (patch) | |
| tree | 3daa917d720235117770040405a78dad548032e9 | |
| parent | 77eb3eefeeab3ba7ab3c6387e5916192b95b482d (diff) | |
refactor: merge token types with attr to new token module
| -rw-r--r-- | src/emitter.rs | 145 | ||||
| -rw-r--r-- | src/lib.rs | 11 | ||||
| -rw-r--r-- | src/machine.rs | 2 | ||||
| -rw-r--r-- | src/token.rs (renamed from src/attr.rs) | 137 | 
4 files changed, 152 insertions, 143 deletions
| diff --git a/src/emitter.rs b/src/emitter.rs index f27c778..23f9ede 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -4,9 +4,9 @@ use std::collections::VecDeque;  use std::mem;  use std::ops::Range; -use crate::attr::AttrValueSyntax;  use crate::offset::NoopOffset;  use crate::offset::Offset; +use crate::token::{AttrValueSyntax, Comment, Doctype, EndTag, StartTag, Token};  use crate::Error;  /// An emitter is an object providing methods to the tokenizer to produce ("emit") tokens. @@ -214,7 +214,7 @@ pub trait Emitter<O> {  pub struct DefaultEmitter<O = NoopOffset> {      current_characters: String,      current_token: Option<Token<O>>, -    current_attribute: Option<(String, crate::attr::AttrInternal<O>)>, +    current_attribute: Option<(String, crate::token::AttrInternal<O>)>,      seen_attributes: BTreeSet<String>,      emitted_tokens: VecDeque<Token<O>>,      attr_in_end_tag_span: Option<Range<O>>, @@ -465,7 +465,7 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {          self.flush_current_attribute();          self.current_attribute = Some((              String::new(), -            crate::attr::AttrInternal { +            crate::token::AttrInternal {                  name_span: offset..O::default(),                  value: String::new(),                  value_span: O::default()..O::default(), @@ -564,147 +564,14 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {      }  } -/// An HTML start tag, such as `<p>` or `<a>`. -#[derive(Debug, Eq, PartialEq)] -pub struct StartTag<O> { -    /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be -    /// expected. -    pub self_closing: bool, - -    /// The tag name. -    /// Uppercase ASCII characters (A-Z) have been converted to lowercase. -    pub name: String, - -    /// A mapping for any HTML attributes this start tag may have. -    /// -    /// Duplicate attributes are ignored after the first one as per WHATWG spec. -    pub attributes: crate::attr::AttributeMap<O>, - -    /// The source code span of the tag. -    pub span: Range<O>, - -    /// The span of the tag name. -    pub name_span: Range<O>, -} - -/// An HTML end/close tag, such as `</p>` or `</a>`. -#[derive(Debug, Eq, PartialEq)] -pub struct EndTag<O> { -    /// The tag name. -    /// Uppercase ASCII characters (A-Z) have been converted to lowercase. -    pub name: String, - -    /// The source code span of the tag. -    pub span: Range<O>, - -    /// The span of the tag name. -    pub name_span: Range<O>, -} - -/// An HTML comment. -#[derive(PartialEq, Eq, Debug)] -pub struct Comment<O> { -    /// The text within the comment. -    pub data: String, -    /// The source offset of the comment data. -    pub data_span: Range<O>, -} - -impl<O: Offset> Comment<O> { -    /// Returns the span for the comment data. -    pub fn data_span(&self) -> Range<O> { -        self.data_span.clone() -    } -} - -/// A doctype. Some examples: -/// -/// * `<!DOCTYPE {name}>` -/// * `<!DOCTYPE {name} PUBLIC '{public_id}'>` -/// * `<!DOCTYPE {name} SYSTEM '{system_id}'>` -/// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>` -#[derive(Debug, Eq, PartialEq)] -pub struct Doctype<O> { -    /// The [force-quirks flag]. -    /// -    /// [force-quirks flag]: https://html.spec.whatwg.org/multipage/parsing.html#force-quirks-flag -    pub force_quirks: bool, - -    /// The doctype's name. Uppercase ASCII characters (A-Z) have been -    /// converted to lowercase. For HTML documents this should be "html". -    pub name: Option<String>, - -    /// The doctype's public identifier. -    pub public_id: Option<String>, - -    /// The doctype's system identifier. -    pub system_id: Option<String>, - -    /// The source code span of the doctype. -    pub span: Range<O>, - -    /// The span of the name. -    name_span: Range<O>, - -    /// The span of the public identifier. -    public_id_span: Range<O>, - -    /// The span of the system identifier. -    system_id_span: Range<O>, -} - -impl<O: Offset> Doctype<O> { -    /// Returns the span of the name. -    pub fn name_span(&self) -> Option<Range<O>> { -        self.name.as_ref()?; -        Some(self.name_span.clone()) -    } - -    /// Returns the span of the public identifier. -    pub fn public_id_span(&self) -> Option<Range<O>> { -        self.public_id.as_ref()?; -        Some(self.public_id_span.clone()) -    } - -    /// Returns the span of the system identifier. -    pub fn system_id_span(&self) -> Option<Range<O>> { -        self.system_id.as_ref()?; -        Some(self.system_id_span.clone()) -    } -} - -/// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer. -#[derive(Debug, Eq, PartialEq)] -pub enum Token<O> { -    /// An HTML start tag. -    StartTag(StartTag<O>), -    /// An HTML end tag. -    EndTag(EndTag<O>), -    /// A literal string. Character references have been resolved. -    String(String), -    /// An HTML comment. -    Comment(Comment<O>), -    /// An HTML doctype declaration. -    Doctype(Doctype<O>), -    /// An HTML parsing error. -    /// -    /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with -    /// more tokens afterward. -    Error { -        /// What kind of error occurred. -        error: Error, -        /// The source code span of the error. -        span: Range<O>, -    }, -} -  /// The majority of our testing of the [`DefaultEmitter`] is done against the  /// html5lib-tests in the html5lib integration test. This module only tests  /// details that aren't present in the html5lib test data.  #[cfg(test)]  mod tests { -    use super::{DefaultEmitter, Token}; -    use crate::{attr::AttrValueSyntax, Event, Tokenizer}; +    use super::DefaultEmitter; +    use crate::token::{AttrValueSyntax, Token}; +    use crate::{Event, Tokenizer};      #[test]      fn test_attribute_value_syntax() { @@ -15,13 +15,20 @@ mod naive_parser;  mod tokenizer;  mod utils; -pub mod attr; +/// Types for HTML attributes. +pub mod attr { +    pub use crate::token::{ +        AttrIntoIter, AttrIter, AttrValueSyntax, Attribute, AttributeMap, AttributeOwned, +    }; +}  pub mod offset;  pub mod reader; +pub mod token; -pub use emitter::{Comment, DefaultEmitter, Doctype, Emitter, EndTag, StartTag, Token}; +pub use emitter::{DefaultEmitter, Emitter};  pub use error::Error;  pub use naive_parser::NaiveParser; +pub use token::{Comment, Doctype, EndTag, StartTag, Token};  pub use tokenizer::{CdataAction, Event, State, Tokenizer};  #[cfg(feature = "integration-tests")] diff --git a/src/machine.rs b/src/machine.rs index d175b8b..6d4dc10 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -1,6 +1,6 @@ -use crate::attr::AttrValueSyntax;  use crate::entities::try_read_character_reference;  use crate::offset::{Offset, Position}; +use crate::token::AttrValueSyntax;  use crate::tokenizer::CdataAction;  use crate::utils::{      ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat, State, diff --git a/src/attr.rs b/src/token.rs index 096235e..48c90f7 100644 --- a/src/attr.rs +++ b/src/token.rs @@ -1,10 +1,145 @@ -//! Types for HTML attributes. +//! Provides the [`Token`] type.  use std::collections::{btree_map, BTreeMap};  use std::iter::FromIterator;  use std::ops::{Index, Range};  use crate::offset::Offset; +use crate::Error; + +/// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer. +#[derive(Debug, Eq, PartialEq)] +pub enum Token<O> { +    /// An HTML start tag. +    StartTag(StartTag<O>), +    /// An HTML end tag. +    EndTag(EndTag<O>), +    /// A literal string. Character references have been resolved. +    String(String), +    /// An HTML comment. +    Comment(Comment<O>), +    /// An HTML doctype declaration. +    Doctype(Doctype<O>), +    /// An HTML parsing error. +    /// +    /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with +    /// more tokens afterward. +    Error { +        /// What kind of error occurred. +        error: Error, +        /// The source code span of the error. +        span: Range<O>, +    }, +} + +/// An HTML start tag, such as `<p>` or `<a>`. +#[derive(Debug, Eq, PartialEq)] +pub struct StartTag<O> { +    /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be +    /// expected. +    pub self_closing: bool, + +    /// The tag name. +    /// Uppercase ASCII characters (A-Z) have been converted to lowercase. +    pub name: String, + +    /// A mapping for any HTML attributes this start tag may have. +    /// +    /// Duplicate attributes are ignored after the first one as per WHATWG spec. +    pub attributes: AttributeMap<O>, + +    /// The source code span of the tag. +    pub span: Range<O>, + +    /// The span of the tag name. +    pub name_span: Range<O>, +} + +/// An HTML end/close tag, such as `</p>` or `</a>`. +#[derive(Debug, Eq, PartialEq)] +pub struct EndTag<O> { +    /// The tag name. +    /// Uppercase ASCII characters (A-Z) have been converted to lowercase. +    pub name: String, + +    /// The source code span of the tag. +    pub span: Range<O>, + +    /// The span of the tag name. +    pub name_span: Range<O>, +} + +/// An HTML comment. +#[derive(PartialEq, Eq, Debug)] +pub struct Comment<O> { +    /// The text within the comment. +    pub data: String, +    /// The source offset of the comment data. +    pub data_span: Range<O>, +} + +impl<O: Offset> Comment<O> { +    /// Returns the span for the comment data. +    pub fn data_span(&self) -> Range<O> { +        self.data_span.clone() +    } +} + +/// A doctype. Some examples: +/// +/// * `<!DOCTYPE {name}>` +/// * `<!DOCTYPE {name} PUBLIC '{public_id}'>` +/// * `<!DOCTYPE {name} SYSTEM '{system_id}'>` +/// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>` +#[derive(Debug, Eq, PartialEq)] +pub struct Doctype<O> { +    /// The [force-quirks flag]. +    /// +    /// [force-quirks flag]: https://html.spec.whatwg.org/multipage/parsing.html#force-quirks-flag +    pub force_quirks: bool, + +    /// The doctype's name. Uppercase ASCII characters (A-Z) have been +    /// converted to lowercase. For HTML documents this should be "html". +    pub name: Option<String>, + +    /// The doctype's public identifier. +    pub public_id: Option<String>, + +    /// The doctype's system identifier. +    pub system_id: Option<String>, + +    /// The source code span of the doctype. +    pub span: Range<O>, + +    /// The span of the name. +    pub(crate) name_span: Range<O>, + +    /// The span of the public identifier. +    pub(crate) public_id_span: Range<O>, + +    /// The span of the system identifier. +    pub(crate) system_id_span: Range<O>, +} + +impl<O: Offset> Doctype<O> { +    /// Returns the span of the name. +    pub fn name_span(&self) -> Option<Range<O>> { +        self.name.as_ref()?; +        Some(self.name_span.clone()) +    } + +    /// Returns the span of the public identifier. +    pub fn public_id_span(&self) -> Option<Range<O>> { +        self.public_id.as_ref()?; +        Some(self.public_id_span.clone()) +    } + +    /// Returns the span of the system identifier. +    pub fn system_id_span(&self) -> Option<Range<O>> { +        self.system_id.as_ref()?; +        Some(self.system_id_span.clone()) +    } +}  /// A map of HTML attributes.  /// | 
