//! Provides the [`Token`] type. use std::collections::{btree_map, BTreeMap}; use std::iter::FromIterator; use std::ops::{Index, Range}; use crate::offset::Offset; use crate::Error; /// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer. #[derive(Debug, Eq, PartialEq)] pub enum Token { /// An HTML start tag. StartTag(StartTag), /// An HTML end tag. EndTag(EndTag), /// A literal string. Character references have been resolved. String(String), /// An HTML comment. Comment(Comment), /// An HTML doctype declaration. Doctype(Doctype), /// An HTML parsing error. /// /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with /// more tokens afterward. Error { /// What kind of error occurred. error: Error, /// The source code span of the error. span: Range, }, } /// An HTML start tag, such as `

` or ``. #[derive(Debug, Eq, PartialEq)] pub struct StartTag { /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be /// expected. pub self_closing: bool, /// The tag name. /// Uppercase ASCII characters (A-Z) have been converted to lowercase. pub name: String, /// A mapping for any HTML attributes this start tag may have. /// /// Duplicate attributes are ignored after the first one as per WHATWG spec. pub attributes: AttributeMap, /// The source code span of the tag. pub span: Range, /// The span of the tag name. pub name_span: Range, } /// An HTML end/close tag, such as `

` or ``. #[derive(Debug, Eq, PartialEq)] pub struct EndTag { /// The tag name. /// Uppercase ASCII characters (A-Z) have been converted to lowercase. pub name: String, /// The source code span of the tag. pub span: Range, /// The span of the tag name. pub name_span: Range, } /// An HTML comment. #[derive(PartialEq, Eq, Debug)] pub struct Comment { /// The text within the comment. pub data: String, /// The source offset of the comment data. pub data_span: Range, } impl Comment { /// Returns the span for the comment data. pub fn data_span(&self) -> Range { self.data_span.clone() } } /// A doctype. Some examples: /// /// * `` /// * `` /// * `` /// * `` #[derive(Debug, Eq, PartialEq)] pub struct Doctype { /// The [force-quirks flag]. /// /// [force-quirks flag]: https://html.spec.whatwg.org/multipage/parsing.html#force-quirks-flag pub force_quirks: bool, /// The doctype's name. Uppercase ASCII characters (A-Z) have been /// converted to lowercase. For HTML documents this should be "html". pub name: Option, /// The doctype's public identifier. pub public_id: Option, /// The doctype's system identifier. pub system_id: Option, /// The source code span of the doctype. pub span: Range, /// The span of the name. pub(crate) name_span: Range, /// The span of the public identifier. pub(crate) public_id_span: Range, /// The span of the system identifier. pub(crate) system_id_span: Range, } impl Doctype { /// Returns the span of the name. pub fn name_span(&self) -> Option> { self.name.as_ref()?; Some(self.name_span.clone()) } /// Returns the span of the public identifier. pub fn public_id_span(&self) -> Option> { self.public_id.as_ref()?; Some(self.public_id_span.clone()) } /// Returns the span of the system identifier. pub fn system_id_span(&self) -> Option> { self.system_id.as_ref()?; Some(self.system_id_span.clone()) } } /// A map of HTML attributes. /// /// Does not preserve the order of attributes. /// Iterating always yields attributes in order by name. /// /// # Example /// /// ``` /// # use html5tokenizer::attr::AttributeMap; /// let attrs: AttributeMap<()> = vec![("href".into(), "http://example.com".into())] /// .into_iter() /// .collect(); /// assert_eq!(&attrs["href"], "http://example.com"); /// ``` #[derive(Debug, Default, PartialEq, Eq)] pub struct AttributeMap { pub(crate) inner: BTreeMap>, } /// The value type internally used by the [`AttributeMap`]. /// Not part of the public API. #[derive(Debug, Eq, PartialEq)] pub(crate) struct AttrInternal { pub value: String, /// The span of the attribute name. pub name_span: Range, /// The span of the attribute value. /// For the empty attribute syntax this is just `O::default()..O::default()`. /// We intentionally don't use `Option>` here to spare us a byte (and padding) per attribute. pub value_span: Range, pub value_syntax: Option, } /// The syntax of the attribute value. #[derive(Clone, Copy, PartialEq, Eq, Debug)] pub enum AttrValueSyntax { /// An unquoted attribute value, e.g. `id=foo`. Unquoted, /// A single-quoted attribute value, e.g. `id='foo'`. SingleQuoted, /// A double-quoted attribute value, e.g. `id="foo"`. DoubleQuoted, } /// An HTML attribute borrowed from an [`AttributeMap`]. #[derive(Debug, Eq, PartialEq)] pub struct Attribute<'a, O> { name: &'a str, map_val: &'a AttrInternal, } /// An owned HTML attribute. #[derive(Debug, PartialEq, Eq)] pub struct AttributeOwned { /// The attribute name. /// Uppercase ASCII characters (A-Z) have been converted to lowercase. pub name: String, /// The attribute value. Character references have been resolved. pub value: String, /// The span of the attribute name. pub name_span: Range, /// The span of the attribute value. /// `None` in case of the empty attribute syntax (e.g. `disabled` in ``). pub value_span: Option>, /// The syntax of the attribute value. /// `None` indicates the empty attribute syntax (e.g. `disabled` in ``). pub value_syntax: Option, } impl AttributeMap { /// Returns the attribute with the given name. /// /// The name must not contain any uppercase ASCII character (A-Z) /// or the method will always return `None`. pub fn get(&self, name: &str) -> Option> { self.inner .get_key_value(name) .map(|(name, map_val)| Attribute { name, map_val }) } } impl<'a, O: Offset> Attribute<'a, O> { /// Returns the attribute name. /// Uppercase ASCII characters (A-Z) have been converted to lowercase. pub fn name(&self) -> &'a str { self.name } /// Returns the attribute value. Character references have been resolved. pub fn value(&self) -> &'a str { &self.map_val.value } /// Returns the span of the attribute name. pub fn name_span(&self) -> Range { self.map_val.name_span.clone() } /// For explicitly defined values returns the span of the attribute value. /// /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in ``). pub fn value_span(&self) -> Option> { if self.map_val.value_syntax.is_none() { return None; } Some(self.map_val.value_span.clone()) } /// Returns the attribute value syntax in case the value is explicitly defined. /// /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in ``). pub fn value_syntax(&self) -> Option { self.map_val.value_syntax } } // We cannot impl Index because Index::index returns a reference of // the Output type (and you cannot return a value referencing a temporary value). impl Index<&str> for AttributeMap { type Output = str; /// Returns the attribute value with the given name. /// /// The name must not contain any uppercase ASCII character (A-Z) /// or the method will always panic. fn index(&self, name: &str) -> &Self::Output { &self.inner[name].value } } impl IntoIterator for AttributeMap { type Item = AttributeOwned; type IntoIter = AttrIntoIter; fn into_iter(self) -> Self::IntoIter { AttrIntoIter(self.inner.into_iter()) } } /// A consuming iterator over the attributes of an [`AttributeMap`]. pub struct AttrIntoIter(btree_map::IntoIter>); impl Iterator for AttrIntoIter { type Item = AttributeOwned; fn next(&mut self) -> Option { let (name, map_val) = self.0.next()?; Some(AttributeOwned { name, value: map_val.value, name_span: map_val.name_span, value_span: map_val.value_syntax.is_some().then_some(map_val.value_span), value_syntax: map_val.value_syntax, }) } } impl<'a, O> IntoIterator for &'a AttributeMap { type Item = Attribute<'a, O>; type IntoIter = AttrIter<'a, O>; fn into_iter(self) -> Self::IntoIter { AttrIter(self.inner.iter()) } } /// A borrowed iterator over the attributes of an [`AttributeMap`]. pub struct AttrIter<'a, S>(btree_map::Iter<'a, String, AttrInternal>); impl<'a, S> Iterator for AttrIter<'a, S> { type Item = Attribute<'a, S>; fn next(&mut self) -> Option { let (name, map_val) = self.0.next()?; Some(Attribute { name, map_val }) } } impl FromIterator<(String, String)> for AttributeMap { fn from_iter>(iter: T) -> Self { Self { inner: iter .into_iter() .map(|(name, value)| { ( name, AttrInternal { value, name_span: O::default()..O::default(), value_span: O::default()..O::default(), value_syntax: Some(AttrValueSyntax::DoubleQuoted), }, ) }) .collect(), } } }