//! Provides the [`Token`] type.

use std::collections::{btree_map, BTreeMap};
use std::fmt::Debug;
use std::iter::FromIterator;
use std::ops::Index;

/// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Token {
    /// A literal character, a resolved character reference,
    /// or part of a resolved character reference (since some
    /// character references resolve to two `char`s).
    Char(char),
    /// An HTML start tag.
    StartTag(StartTag),
    /// An HTML end tag.
    EndTag(EndTag),
    /// An HTML comment.
    Comment(String),
    /// An HTML doctype declaration.
    Doctype(Doctype),
    /// An end-of-file token.
    EndOfFile,
}

/// An HTML start tag, such as `<p>` or `<a>`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct StartTag {
    /// The tag name.
    /// Uppercase ASCII characters (A-Z) have been converted to lowercase.
    pub name: String,

    /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be
    /// expected.
    pub self_closing: bool,

    /// A mapping for any HTML attributes this start tag may have.
    ///
    /// Duplicate attributes are ignored after the first one as per WHATWG spec.
    pub attributes: AttributeMap,
}

/// An HTML end/close tag, such as `</p>` or `</a>`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct EndTag {
    /// The tag name.
    /// Uppercase ASCII characters (A-Z) have been converted to lowercase.
    pub name: String,
}

/// A doctype. Some examples:
///
/// * `<!DOCTYPE {name}>`
/// * `<!DOCTYPE {name} PUBLIC '{public_id}'>`
/// * `<!DOCTYPE {name} SYSTEM '{system_id}'>`
/// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>`
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Doctype {
    /// The [force-quirks flag].
    ///
    /// [force-quirks flag]: https://html.spec.whatwg.org/multipage/parsing.html#force-quirks-flag
    pub force_quirks: bool,

    /// The doctype's name. Uppercase ASCII characters (A-Z) have been
    /// converted to lowercase. For HTML documents this should be "html".
    pub name: Option<String>,

    /// The doctype's public identifier.
    pub public_id: Option<String>,

    /// The doctype's system identifier.
    pub system_id: Option<String>,
}

/// A map of HTML attributes.
///
/// Does not preserve the order of attributes.
/// Iterating always yields attributes in order by name.
///
/// # Example
///
/// ```
/// # use html5tokenizer::attr::AttributeMap;
/// let attrs: AttributeMap = vec![("href".into(), "http://example.com".into())]
///     .into_iter()
///     .collect();
/// assert_eq!(&attrs["href"], "http://example.com");
/// ```
#[derive(Clone, Default, PartialEq, Eq)] // Debug has a custom impl below
pub struct AttributeMap {
    pub(crate) inner: BTreeMap<String, AttrInternal>,
}

/// The value type internally used by the [`AttributeMap`].
/// Not part of the public API.
#[derive(Clone, Default, Eq)] // Debug has a custom impl below
pub(crate) struct AttrInternal {
    pub value: String,
    pub trace_idx: Option<AttributeTraceIdx>,
}

/// The index of an [`AttributeTrace`] within an [`AttributeTraceList`].
///
/// [`AttributeTrace`]: crate::trace::AttributeTrace
/// [`AttributeTraceList`]: crate::trace::AttributeTraceList
#[derive(Clone, Copy, Eq, PartialEq, Debug)]
pub struct AttributeTraceIdx(
    // Using NonZeroUsize so that `Option<AttributeTraceIdx>`
    // has the same size as `AttributeTraceIdx`.
    pub std::num::NonZeroUsize,
);

impl PartialEq for AttrInternal {
    fn eq(&self, other: &Self) -> bool {
        // We intentionally don't include the trace_idx,
        // so that PartialEq of Token only compares semantics.
        self.value == other.value
    }
}

/// An HTML attribute borrowed from an [`AttributeMap`].
#[derive(Eq, PartialEq)] // Debug has a custom impl below
pub struct Attribute<'a> {
    name: &'a str,
    map_val: &'a AttrInternal,
}

/// An owned HTML attribute.
#[derive(Debug, PartialEq, Eq)]
pub struct AttributeOwned {
    /// The attribute name.
    /// Uppercase ASCII characters (A-Z) have been converted to lowercase.
    pub name: String,

    /// The attribute value. Character references have been resolved.
    pub value: String,

    /// The index of the corresponding [`AttributeTrace`] in the
    /// `attribute_traces` field of [`StartTagTrace`], in case this attribute
    /// was present in the source and the [`Emitter`] has tracked this.
    ///
    /// [`AttributeTrace`]: super::trace::AttributeTrace
    /// [`StartTagTrace`]: super::trace::StartTagTrace
    /// [`Emitter`]: super::Emitter
    pub trace_idx: Option<AttributeTraceIdx>,
}

impl AttributeMap {
    /// Returns the value for the given attribute name.
    ///
    /// The name must not contain any uppercase ASCII character (A-Z)
    /// or the method will always return `None`.
    pub fn get(&self, name: &str) -> Option<&str> {
        self.inner.get(name).map(|map_val| map_val.value.as_str())
    }

    /// Returns the value and trace index for a given attribute name.
    ///
    /// The name must not contain any uppercase ASCII character (A-Z)
    /// or the method will always return `None`.
    pub fn value_and_trace_idx(&self, name: &str) -> Option<(&str, Option<AttributeTraceIdx>)> {
        self.inner
            .get(name)
            .map(|map_val| (map_val.value.as_str(), map_val.trace_idx))
    }
}

impl<'a> Attribute<'a> {
    /// Returns the attribute name.
    /// Uppercase ASCII characters (A-Z) have been converted to lowercase.
    pub fn name(&self) -> &'a str {
        self.name
    }

    /// Returns the attribute value. Character references have been resolved.
    pub fn value(&self) -> &'a str {
        &self.map_val.value
    }

    /// Returns the index of the corresponding [`AttributeTrace`] in the
    /// `attribute_traces` field of [`StartTagTrace`], in case this attribute
    /// was present in the source and the [`Emitter`] has tracked that.
    ///
    /// [`AttributeTrace`]: super::trace::AttributeTrace
    /// [`StartTagTrace`]: super::trace::StartTagTrace
    /// [`Emitter`]: super::Emitter
    pub fn trace_idx(&self) -> Option<AttributeTraceIdx> {
        self.map_val.trace_idx
    }
}

// We cannot implement `Index` with `Output = Attribute<'_>` because
// `Index::index` returns a reference to the `Output` type (and you cannot
// return a reference to a temporary value), so indexing yields the value only.
impl Index<&str> for AttributeMap {
    type Output = str;

    /// Returns the attribute value with the given name.
    ///
    /// # Panics
    ///
    /// Panics if the map has no attribute with the given name. Since stored
    /// names never contain uppercase ASCII characters (A-Z), a name containing
    /// one will always panic. Use [`AttributeMap::get`] for a non-panicking
    /// lookup.
    fn index(&self, name: &str) -> &Self::Output {
        &self.inner[name].value
    }
}

impl IntoIterator for AttributeMap {
    type Item = AttributeOwned;
    type IntoIter = AttrIntoIter;

    fn into_iter(self) -> Self::IntoIter {
        AttrIntoIter(self.inner.into_iter())
    }
}

/// A consuming iterator over the attributes of an [`AttributeMap`].
#[derive(Debug)]
pub struct AttrIntoIter(btree_map::IntoIter<String, AttrInternal>);

impl Iterator for AttrIntoIter {
    type Item = AttributeOwned;

    fn next(&mut self) -> Option<Self::Item> {
        let (name, map_val) = self.0.next()?;
        Some(AttributeOwned {
            name,
            value: map_val.value,
            trace_idx: map_val.trace_idx,
        })
    }

    // Forward the exact length known by the underlying map iterator
    // so that `collect()` and friends can reserve up front.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}

impl<'a> IntoIterator for &'a AttributeMap {
    type Item = Attribute<'a>;
    type IntoIter = AttrIter<'a>;

    fn into_iter(self) -> Self::IntoIter {
        AttrIter(self.inner.iter())
    }
}

/// A borrowed iterator over the attributes of an [`AttributeMap`].
#[derive(Debug)]
pub struct AttrIter<'a>(btree_map::Iter<'a, String, AttrInternal>);

impl<'a> Iterator for AttrIter<'a> {
    type Item = Attribute<'a>;

    fn next(&mut self) -> Option<Self::Item> {
        let (name, map_val) = self.0.next()?;
        Some(Attribute { name, map_val })
    }

    // Forward the exact length known by the underlying map iterator
    // so that `collect()` and friends can reserve up front.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}

impl FromIterator<(String, String)> for AttributeMap {
    /// Builds a map from `(name, value)` pairs; none of the resulting
    /// attributes has a trace index.
    fn from_iter<T: IntoIterator<Item = (String, String)>>(iter: T) -> Self {
        Self {
            inner: iter
                .into_iter()
                .map(|(name, value)| {
                    (
                        name,
                        AttrInternal {
                            value,
                            trace_idx: None,
                        },
                    )
                })
                .collect(),
        }
    }
}

impl Debug for AttributeMap {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        self.inner.fmt(f)
    }
}

impl Debug for AttrInternal {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{:?}", self.value)?;
        if let Some(idx) = self.trace_idx {
            write!(f, " (trace #{})", idx.0)?;
        }
        Ok(())
    }
}

impl Debug for Attribute<'_> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("Attribute")
            .field("name", &self.name)
            .field("value", &self.value())
            .field("trace_idx", &self.trace_idx().map(|idx| idx.0))
            .finish()
    }
}