From 1d8e6239875c810197a0679a20412726afb8ff66 Mon Sep 17 00:00:00 2001 From: Martin Fischer Date: Sat, 9 Sep 2023 19:10:49 +0200 Subject: refactor: merge token types with attr to new token module --- src/attr.rs | 208 ---------------------------------- src/emitter.rs | 145 +----------------------- src/lib.rs | 11 +- src/machine.rs | 2 +- src/token.rs | 343 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 359 insertions(+), 350 deletions(-) delete mode 100644 src/attr.rs create mode 100644 src/token.rs (limited to 'src') diff --git a/src/attr.rs b/src/attr.rs deleted file mode 100644 index 096235e..0000000 --- a/src/attr.rs +++ /dev/null @@ -1,208 +0,0 @@ -//! Types for HTML attributes. - -use std::collections::{btree_map, BTreeMap}; -use std::iter::FromIterator; -use std::ops::{Index, Range}; - -use crate::offset::Offset; - -/// A map of HTML attributes. -/// -/// Does not preserve the order of attributes. -/// Iterating always yields attributes in order by name. -/// -/// # Example -/// -/// ``` -/// # use html5tokenizer::attr::AttributeMap; -/// let attrs: AttributeMap<()> = vec![("href".into(), "http://example.com".into())] -/// .into_iter() -/// .collect(); -/// assert_eq!(&attrs["href"], "http://example.com"); -/// ``` -#[derive(Debug, Default, PartialEq, Eq)] -pub struct AttributeMap { - pub(crate) inner: BTreeMap>, -} - -/// The value type internally used by the [`AttributeMap`]. -/// Not part of the public API. -#[derive(Debug, Eq, PartialEq)] -pub(crate) struct AttrInternal { - pub value: String, - /// The span of the attribute name. - pub name_span: Range, - /// The span of the attribute value. - /// For the empty attribute syntax this is just `O::default()..O::default()`. - /// We intentionally don't use `Option>` here to spare us a byte (and padding) per attribute. - pub value_span: Range, - pub value_syntax: Option, -} - -/// The syntax of the attribute value. -#[derive(Clone, Copy, PartialEq, Eq, Debug)] -pub enum AttrValueSyntax { - /// An unquoted attribute value, e.g. `id=foo`. - Unquoted, - /// A single-quoted attribute value, e.g. `id='foo'`. - SingleQuoted, - /// A double-quoted attribute value, e.g. `id="foo"`. - DoubleQuoted, -} - -/// An HTML attribute borrowed from an [`AttributeMap`]. -#[derive(Debug, Eq, PartialEq)] -pub struct Attribute<'a, O> { - name: &'a str, - map_val: &'a AttrInternal, -} - -/// An owned HTML attribute. -#[derive(Debug, PartialEq, Eq)] -pub struct AttributeOwned { - /// The attribute name. - /// Uppercase ASCII characters (A-Z) have been converted to lowercase. - pub name: String, - /// The attribute value. Character references have been resolved. - pub value: String, - /// The span of the attribute name. - pub name_span: Range, - /// The span of the attribute value. - /// `None` in case of the empty attribute syntax (e.g. `disabled` in ``). - pub value_span: Option>, - /// The syntax of the attribute value. - /// `None` indicates the empty attribute syntax (e.g. `disabled` in ``). - pub value_syntax: Option, -} - -impl AttributeMap { - /// Returns the attribute with the given name. - /// - /// The name must not contain any uppercase ASCII character (A-Z) - /// or the method will always return `None`. - pub fn get(&self, name: &str) -> Option> { - self.inner - .get_key_value(name) - .map(|(name, map_val)| Attribute { name, map_val }) - } -} - -impl<'a, O: Offset> Attribute<'a, O> { - /// Returns the attribute name. - /// Uppercase ASCII characters (A-Z) have been converted to lowercase. - pub fn name(&self) -> &'a str { - self.name - } - - /// Returns the attribute value. Character references have been resolved. - pub fn value(&self) -> &'a str { - &self.map_val.value - } - - /// Returns the span of the attribute name. - pub fn name_span(&self) -> Range { - self.map_val.name_span.clone() - } - - /// For explicitly defined values returns the span of the attribute value. - /// - /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in ``). - pub fn value_span(&self) -> Option> { - if self.map_val.value_syntax.is_none() { - return None; - } - Some(self.map_val.value_span.clone()) - } - - /// Returns the attribute value syntax in case the value is explicitly defined. - /// - /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in ``). - pub fn value_syntax(&self) -> Option { - self.map_val.value_syntax - } -} - -// We cannot impl Index because Index::index returns a reference of -// the Output type (and you cannot return a value referencing a temporary value). -impl Index<&str> for AttributeMap { - type Output = str; - - /// Returns the attribute value with the given name. - /// - /// The name must not contain any uppercase ASCII character (A-Z) - /// or the method will always panic. - fn index(&self, name: &str) -> &Self::Output { - &self.inner[name].value - } -} - -impl IntoIterator for AttributeMap { - type Item = AttributeOwned; - - type IntoIter = AttrIntoIter; - - fn into_iter(self) -> Self::IntoIter { - AttrIntoIter(self.inner.into_iter()) - } -} - -/// A consuming iterator over the attributes of an [`AttributeMap`]. -pub struct AttrIntoIter(btree_map::IntoIter>); - -impl Iterator for AttrIntoIter { - type Item = AttributeOwned; - - fn next(&mut self) -> Option { - let (name, map_val) = self.0.next()?; - Some(AttributeOwned { - name, - value: map_val.value, - name_span: map_val.name_span, - value_span: map_val.value_syntax.is_some().then_some(map_val.value_span), - value_syntax: map_val.value_syntax, - }) - } -} - -impl<'a, O> IntoIterator for &'a AttributeMap { - type Item = Attribute<'a, O>; - - type IntoIter = AttrIter<'a, O>; - - fn into_iter(self) -> Self::IntoIter { - AttrIter(self.inner.iter()) - } -} - -/// A borrowed iterator over the attributes of an [`AttributeMap`]. -pub struct AttrIter<'a, S>(btree_map::Iter<'a, String, AttrInternal>); - -impl<'a, S> Iterator for AttrIter<'a, S> { - type Item = Attribute<'a, S>; - - fn next(&mut self) -> Option { - let (name, map_val) = self.0.next()?; - Some(Attribute { name, map_val }) - } -} - -impl FromIterator<(String, String)> for AttributeMap { - fn from_iter>(iter: T) -> Self { - Self { - inner: iter - .into_iter() - .map(|(name, value)| { - ( - name, - AttrInternal { - value, - name_span: O::default()..O::default(), - value_span: O::default()..O::default(), - value_syntax: Some(AttrValueSyntax::DoubleQuoted), - }, - ) - }) - .collect(), - } - } -} diff --git a/src/emitter.rs b/src/emitter.rs index f27c778..23f9ede 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -4,9 +4,9 @@ use std::collections::VecDeque; use std::mem; use std::ops::Range; -use crate::attr::AttrValueSyntax; use crate::offset::NoopOffset; use crate::offset::Offset; +use crate::token::{AttrValueSyntax, Comment, Doctype, EndTag, StartTag, Token}; use crate::Error; /// An emitter is an object providing methods to the tokenizer to produce ("emit") tokens. @@ -214,7 +214,7 @@ pub trait Emitter { pub struct DefaultEmitter { current_characters: String, current_token: Option>, - current_attribute: Option<(String, crate::attr::AttrInternal)>, + current_attribute: Option<(String, crate::token::AttrInternal)>, seen_attributes: BTreeSet, emitted_tokens: VecDeque>, attr_in_end_tag_span: Option>, @@ -465,7 +465,7 @@ impl Emitter for DefaultEmitter { self.flush_current_attribute(); self.current_attribute = Some(( String::new(), - crate::attr::AttrInternal { + crate::token::AttrInternal { name_span: offset..O::default(), value: String::new(), value_span: O::default()..O::default(), @@ -564,147 +564,14 @@ impl Emitter for DefaultEmitter { } } -/// An HTML start tag, such as `

` or ``. -#[derive(Debug, Eq, PartialEq)] -pub struct StartTag { - /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be - /// expected. - pub self_closing: bool, - - /// The tag name. - /// Uppercase ASCII characters (A-Z) have been converted to lowercase. - pub name: String, - - /// A mapping for any HTML attributes this start tag may have. - /// - /// Duplicate attributes are ignored after the first one as per WHATWG spec. - pub attributes: crate::attr::AttributeMap, - - /// The source code span of the tag. - pub span: Range, - - /// The span of the tag name. - pub name_span: Range, -} - -/// An HTML end/close tag, such as `

` or ``. -#[derive(Debug, Eq, PartialEq)] -pub struct EndTag { - /// The tag name. - /// Uppercase ASCII characters (A-Z) have been converted to lowercase. - pub name: String, - - /// The source code span of the tag. - pub span: Range, - - /// The span of the tag name. - pub name_span: Range, -} - -/// An HTML comment. -#[derive(PartialEq, Eq, Debug)] -pub struct Comment { - /// The text within the comment. - pub data: String, - /// The source offset of the comment data. - pub data_span: Range, -} - -impl Comment { - /// Returns the span for the comment data. - pub fn data_span(&self) -> Range { - self.data_span.clone() - } -} - -/// A doctype. Some examples: -/// -/// * `` -/// * `` -/// * `` -/// * `` -#[derive(Debug, Eq, PartialEq)] -pub struct Doctype { - /// The [force-quirks flag]. - /// - /// [force-quirks flag]: https://html.spec.whatwg.org/multipage/parsing.html#force-quirks-flag - pub force_quirks: bool, - - /// The doctype's name. Uppercase ASCII characters (A-Z) have been - /// converted to lowercase. For HTML documents this should be "html". - pub name: Option, - - /// The doctype's public identifier. - pub public_id: Option, - - /// The doctype's system identifier. - pub system_id: Option, - - /// The source code span of the doctype. - pub span: Range, - - /// The span of the name. - name_span: Range, - - /// The span of the public identifier. - public_id_span: Range, - - /// The span of the system identifier. - system_id_span: Range, -} - -impl Doctype { - /// Returns the span of the name. - pub fn name_span(&self) -> Option> { - self.name.as_ref()?; - Some(self.name_span.clone()) - } - - /// Returns the span of the public identifier. - pub fn public_id_span(&self) -> Option> { - self.public_id.as_ref()?; - Some(self.public_id_span.clone()) - } - - /// Returns the span of the system identifier. - pub fn system_id_span(&self) -> Option> { - self.system_id.as_ref()?; - Some(self.system_id_span.clone()) - } -} - -/// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer. -#[derive(Debug, Eq, PartialEq)] -pub enum Token { - /// An HTML start tag. - StartTag(StartTag), - /// An HTML end tag. - EndTag(EndTag), - /// A literal string. Character references have been resolved. - String(String), - /// An HTML comment. - Comment(Comment), - /// An HTML doctype declaration. - Doctype(Doctype), - /// An HTML parsing error. - /// - /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with - /// more tokens afterward. - Error { - /// What kind of error occurred. - error: Error, - /// The source code span of the error. - span: Range, - }, -} - /// The majority of our testing of the [`DefaultEmitter`] is done against the /// html5lib-tests in the html5lib integration test. This module only tests /// details that aren't present in the html5lib test data. #[cfg(test)] mod tests { - use super::{DefaultEmitter, Token}; - use crate::{attr::AttrValueSyntax, Event, Tokenizer}; + use super::DefaultEmitter; + use crate::token::{AttrValueSyntax, Token}; + use crate::{Event, Tokenizer}; #[test] fn test_attribute_value_syntax() { diff --git a/src/lib.rs b/src/lib.rs index 97358d0..2918c80 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,13 +15,20 @@ mod naive_parser; mod tokenizer; mod utils; -pub mod attr; +/// Types for HTML attributes. +pub mod attr { + pub use crate::token::{ + AttrIntoIter, AttrIter, AttrValueSyntax, Attribute, AttributeMap, AttributeOwned, + }; +} pub mod offset; pub mod reader; +pub mod token; -pub use emitter::{Comment, DefaultEmitter, Doctype, Emitter, EndTag, StartTag, Token}; +pub use emitter::{DefaultEmitter, Emitter}; pub use error::Error; pub use naive_parser::NaiveParser; +pub use token::{Comment, Doctype, EndTag, StartTag, Token}; pub use tokenizer::{CdataAction, Event, State, Tokenizer}; #[cfg(feature = "integration-tests")] diff --git a/src/machine.rs b/src/machine.rs index d175b8b..6d4dc10 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -1,6 +1,6 @@ -use crate::attr::AttrValueSyntax; use crate::entities::try_read_character_reference; use crate::offset::{Offset, Position}; +use crate::token::AttrValueSyntax; use crate::tokenizer::CdataAction; use crate::utils::{ ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat, State, diff --git a/src/token.rs b/src/token.rs new file mode 100644 index 0000000..48c90f7 --- /dev/null +++ b/src/token.rs @@ -0,0 +1,343 @@ +//! Provides the [`Token`] type. + +use std::collections::{btree_map, BTreeMap}; +use std::iter::FromIterator; +use std::ops::{Index, Range}; + +use crate::offset::Offset; +use crate::Error; + +/// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer. +#[derive(Debug, Eq, PartialEq)] +pub enum Token { + /// An HTML start tag. + StartTag(StartTag), + /// An HTML end tag. + EndTag(EndTag), + /// A literal string. Character references have been resolved. + String(String), + /// An HTML comment. + Comment(Comment), + /// An HTML doctype declaration. + Doctype(Doctype), + /// An HTML parsing error. + /// + /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with + /// more tokens afterward. + Error { + /// What kind of error occurred. + error: Error, + /// The source code span of the error. + span: Range, + }, +} + +/// An HTML start tag, such as `

` or ``. +#[derive(Debug, Eq, PartialEq)] +pub struct StartTag { + /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be + /// expected. + pub self_closing: bool, + + /// The tag name. + /// Uppercase ASCII characters (A-Z) have been converted to lowercase. + pub name: String, + + /// A mapping for any HTML attributes this start tag may have. + /// + /// Duplicate attributes are ignored after the first one as per WHATWG spec. + pub attributes: AttributeMap, + + /// The source code span of the tag. + pub span: Range, + + /// The span of the tag name. + pub name_span: Range, +} + +/// An HTML end/close tag, such as `

` or ``. +#[derive(Debug, Eq, PartialEq)] +pub struct EndTag { + /// The tag name. + /// Uppercase ASCII characters (A-Z) have been converted to lowercase. + pub name: String, + + /// The source code span of the tag. + pub span: Range, + + /// The span of the tag name. + pub name_span: Range, +} + +/// An HTML comment. +#[derive(PartialEq, Eq, Debug)] +pub struct Comment { + /// The text within the comment. + pub data: String, + /// The source offset of the comment data. + pub data_span: Range, +} + +impl Comment { + /// Returns the span for the comment data. + pub fn data_span(&self) -> Range { + self.data_span.clone() + } +} + +/// A doctype. Some examples: +/// +/// * `` +/// * `` +/// * `` +/// * `` +#[derive(Debug, Eq, PartialEq)] +pub struct Doctype { + /// The [force-quirks flag]. + /// + /// [force-quirks flag]: https://html.spec.whatwg.org/multipage/parsing.html#force-quirks-flag + pub force_quirks: bool, + + /// The doctype's name. Uppercase ASCII characters (A-Z) have been + /// converted to lowercase. For HTML documents this should be "html". + pub name: Option, + + /// The doctype's public identifier. + pub public_id: Option, + + /// The doctype's system identifier. + pub system_id: Option, + + /// The source code span of the doctype. + pub span: Range, + + /// The span of the name. + pub(crate) name_span: Range, + + /// The span of the public identifier. + pub(crate) public_id_span: Range, + + /// The span of the system identifier. + pub(crate) system_id_span: Range, +} + +impl Doctype { + /// Returns the span of the name. + pub fn name_span(&self) -> Option> { + self.name.as_ref()?; + Some(self.name_span.clone()) + } + + /// Returns the span of the public identifier. + pub fn public_id_span(&self) -> Option> { + self.public_id.as_ref()?; + Some(self.public_id_span.clone()) + } + + /// Returns the span of the system identifier. + pub fn system_id_span(&self) -> Option> { + self.system_id.as_ref()?; + Some(self.system_id_span.clone()) + } +} + +/// A map of HTML attributes. +/// +/// Does not preserve the order of attributes. +/// Iterating always yields attributes in order by name. +/// +/// # Example +/// +/// ``` +/// # use html5tokenizer::attr::AttributeMap; +/// let attrs: AttributeMap<()> = vec![("href".into(), "http://example.com".into())] +/// .into_iter() +/// .collect(); +/// assert_eq!(&attrs["href"], "http://example.com"); +/// ``` +#[derive(Debug, Default, PartialEq, Eq)] +pub struct AttributeMap { + pub(crate) inner: BTreeMap>, +} + +/// The value type internally used by the [`AttributeMap`]. +/// Not part of the public API. +#[derive(Debug, Eq, PartialEq)] +pub(crate) struct AttrInternal { + pub value: String, + /// The span of the attribute name. + pub name_span: Range, + /// The span of the attribute value. + /// For the empty attribute syntax this is just `O::default()..O::default()`. + /// We intentionally don't use `Option>` here to spare us a byte (and padding) per attribute. + pub value_span: Range, + pub value_syntax: Option, +} + +/// The syntax of the attribute value. +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum AttrValueSyntax { + /// An unquoted attribute value, e.g. `id=foo`. + Unquoted, + /// A single-quoted attribute value, e.g. `id='foo'`. + SingleQuoted, + /// A double-quoted attribute value, e.g. `id="foo"`. + DoubleQuoted, +} + +/// An HTML attribute borrowed from an [`AttributeMap`]. +#[derive(Debug, Eq, PartialEq)] +pub struct Attribute<'a, O> { + name: &'a str, + map_val: &'a AttrInternal, +} + +/// An owned HTML attribute. +#[derive(Debug, PartialEq, Eq)] +pub struct AttributeOwned { + /// The attribute name. + /// Uppercase ASCII characters (A-Z) have been converted to lowercase. + pub name: String, + /// The attribute value. Character references have been resolved. + pub value: String, + /// The span of the attribute name. + pub name_span: Range, + /// The span of the attribute value. + /// `None` in case of the empty attribute syntax (e.g. `disabled` in ``). + pub value_span: Option>, + /// The syntax of the attribute value. + /// `None` indicates the empty attribute syntax (e.g. `disabled` in ``). + pub value_syntax: Option, +} + +impl AttributeMap { + /// Returns the attribute with the given name. + /// + /// The name must not contain any uppercase ASCII character (A-Z) + /// or the method will always return `None`. + pub fn get(&self, name: &str) -> Option> { + self.inner + .get_key_value(name) + .map(|(name, map_val)| Attribute { name, map_val }) + } +} + +impl<'a, O: Offset> Attribute<'a, O> { + /// Returns the attribute name. + /// Uppercase ASCII characters (A-Z) have been converted to lowercase. + pub fn name(&self) -> &'a str { + self.name + } + + /// Returns the attribute value. Character references have been resolved. + pub fn value(&self) -> &'a str { + &self.map_val.value + } + + /// Returns the span of the attribute name. + pub fn name_span(&self) -> Range { + self.map_val.name_span.clone() + } + + /// For explicitly defined values returns the span of the attribute value. + /// + /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in ``). + pub fn value_span(&self) -> Option> { + if self.map_val.value_syntax.is_none() { + return None; + } + Some(self.map_val.value_span.clone()) + } + + /// Returns the attribute value syntax in case the value is explicitly defined. + /// + /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in ``). + pub fn value_syntax(&self) -> Option { + self.map_val.value_syntax + } +} + +// We cannot impl Index because Index::index returns a reference of +// the Output type (and you cannot return a value referencing a temporary value). +impl Index<&str> for AttributeMap { + type Output = str; + + /// Returns the attribute value with the given name. + /// + /// The name must not contain any uppercase ASCII character (A-Z) + /// or the method will always panic. + fn index(&self, name: &str) -> &Self::Output { + &self.inner[name].value + } +} + +impl IntoIterator for AttributeMap { + type Item = AttributeOwned; + + type IntoIter = AttrIntoIter; + + fn into_iter(self) -> Self::IntoIter { + AttrIntoIter(self.inner.into_iter()) + } +} + +/// A consuming iterator over the attributes of an [`AttributeMap`]. +pub struct AttrIntoIter(btree_map::IntoIter>); + +impl Iterator for AttrIntoIter { + type Item = AttributeOwned; + + fn next(&mut self) -> Option { + let (name, map_val) = self.0.next()?; + Some(AttributeOwned { + name, + value: map_val.value, + name_span: map_val.name_span, + value_span: map_val.value_syntax.is_some().then_some(map_val.value_span), + value_syntax: map_val.value_syntax, + }) + } +} + +impl<'a, O> IntoIterator for &'a AttributeMap { + type Item = Attribute<'a, O>; + + type IntoIter = AttrIter<'a, O>; + + fn into_iter(self) -> Self::IntoIter { + AttrIter(self.inner.iter()) + } +} + +/// A borrowed iterator over the attributes of an [`AttributeMap`]. +pub struct AttrIter<'a, S>(btree_map::Iter<'a, String, AttrInternal>); + +impl<'a, S> Iterator for AttrIter<'a, S> { + type Item = Attribute<'a, S>; + + fn next(&mut self) -> Option { + let (name, map_val) = self.0.next()?; + Some(Attribute { name, map_val }) + } +} + +impl FromIterator<(String, String)> for AttributeMap { + fn from_iter>(iter: T) -> Self { + Self { + inner: iter + .into_iter() + .map(|(name, value)| { + ( + name, + AttrInternal { + value, + name_span: O::default()..O::default(), + value_span: O::default()..O::default(), + value_syntax: Some(AttrValueSyntax::DoubleQuoted), + }, + ) + }) + .collect(), + } + } +} -- cgit v1.2.3