diff options
Diffstat (limited to 'src/token.rs')
-rw-r--r-- | src/token.rs | 343 |
1 files changed, 343 insertions, 0 deletions
diff --git a/src/token.rs b/src/token.rs new file mode 100644 index 0000000..48c90f7 --- /dev/null +++ b/src/token.rs @@ -0,0 +1,343 @@ +//! Provides the [`Token`] type. + +use std::collections::{btree_map, BTreeMap}; +use std::iter::FromIterator; +use std::ops::{Index, Range}; + +use crate::offset::Offset; +use crate::Error; + +/// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer. +#[derive(Debug, Eq, PartialEq)] +pub enum Token<O> { + /// An HTML start tag. + StartTag(StartTag<O>), + /// An HTML end tag. + EndTag(EndTag<O>), + /// A literal string. Character references have been resolved. + String(String), + /// An HTML comment. + Comment(Comment<O>), + /// An HTML doctype declaration. + Doctype(Doctype<O>), + /// An HTML parsing error. + /// + /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with + /// more tokens afterward. + Error { + /// What kind of error occurred. + error: Error, + /// The source code span of the error. + span: Range<O>, + }, +} + +/// An HTML start tag, such as `<p>` or `<a>`. +#[derive(Debug, Eq, PartialEq)] +pub struct StartTag<O> { + /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be + /// expected. + pub self_closing: bool, + + /// The tag name. + /// Uppercase ASCII characters (A-Z) have been converted to lowercase. + pub name: String, + + /// A mapping for any HTML attributes this start tag may have. + /// + /// Duplicate attributes are ignored after the first one as per WHATWG spec. + pub attributes: AttributeMap<O>, + + /// The source code span of the tag. + pub span: Range<O>, + + /// The span of the tag name. + pub name_span: Range<O>, +} + +/// An HTML end/close tag, such as `</p>` or `</a>`. +#[derive(Debug, Eq, PartialEq)] +pub struct EndTag<O> { + /// The tag name. + /// Uppercase ASCII characters (A-Z) have been converted to lowercase. + pub name: String, + + /// The source code span of the tag. + pub span: Range<O>, + + /// The span of the tag name. + pub name_span: Range<O>, +} + +/// An HTML comment. +#[derive(PartialEq, Eq, Debug)] +pub struct Comment<O> { + /// The text within the comment. + pub data: String, + /// The source offset of the comment data. + pub data_span: Range<O>, +} + +impl<O: Offset> Comment<O> { + /// Returns the span for the comment data. + pub fn data_span(&self) -> Range<O> { + self.data_span.clone() + } +} + +/// A doctype. Some examples: +/// +/// * `<!DOCTYPE {name}>` +/// * `<!DOCTYPE {name} PUBLIC '{public_id}'>` +/// * `<!DOCTYPE {name} SYSTEM '{system_id}'>` +/// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>` +#[derive(Debug, Eq, PartialEq)] +pub struct Doctype<O> { + /// The [force-quirks flag]. + /// + /// [force-quirks flag]: https://html.spec.whatwg.org/multipage/parsing.html#force-quirks-flag + pub force_quirks: bool, + + /// The doctype's name. Uppercase ASCII characters (A-Z) have been + /// converted to lowercase. For HTML documents this should be "html". + pub name: Option<String>, + + /// The doctype's public identifier. + pub public_id: Option<String>, + + /// The doctype's system identifier. + pub system_id: Option<String>, + + /// The source code span of the doctype. + pub span: Range<O>, + + /// The span of the name. + pub(crate) name_span: Range<O>, + + /// The span of the public identifier. + pub(crate) public_id_span: Range<O>, + + /// The span of the system identifier. + pub(crate) system_id_span: Range<O>, +} + +impl<O: Offset> Doctype<O> { + /// Returns the span of the name. + pub fn name_span(&self) -> Option<Range<O>> { + self.name.as_ref()?; + Some(self.name_span.clone()) + } + + /// Returns the span of the public identifier. + pub fn public_id_span(&self) -> Option<Range<O>> { + self.public_id.as_ref()?; + Some(self.public_id_span.clone()) + } + + /// Returns the span of the system identifier. + pub fn system_id_span(&self) -> Option<Range<O>> { + self.system_id.as_ref()?; + Some(self.system_id_span.clone()) + } +} + +/// A map of HTML attributes. +/// +/// Does not preserve the order of attributes. +/// Iterating always yields attributes in order by name. +/// +/// # Example +/// +/// ``` +/// # use html5tokenizer::attr::AttributeMap; +/// let attrs: AttributeMap<()> = vec![("href".into(), "http://example.com".into())] +/// .into_iter() +/// .collect(); +/// assert_eq!(&attrs["href"], "http://example.com"); +/// ``` +#[derive(Debug, Default, PartialEq, Eq)] +pub struct AttributeMap<O> { + pub(crate) inner: BTreeMap<String, AttrInternal<O>>, +} + +/// The value type internally used by the [`AttributeMap`]. +/// Not part of the public API. +#[derive(Debug, Eq, PartialEq)] +pub(crate) struct AttrInternal<O> { + pub value: String, + /// The span of the attribute name. + pub name_span: Range<O>, + /// The span of the attribute value. + /// For the empty attribute syntax this is just `O::default()..O::default()`. + /// We intentionally don't use `Option<Range<O>>` here to spare us a byte (and padding) per attribute. + pub value_span: Range<O>, + pub value_syntax: Option<AttrValueSyntax>, +} + +/// The syntax of the attribute value. +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum AttrValueSyntax { + /// An unquoted attribute value, e.g. `id=foo`. + Unquoted, + /// A single-quoted attribute value, e.g. `id='foo'`. + SingleQuoted, + /// A double-quoted attribute value, e.g. `id="foo"`. + DoubleQuoted, +} + +/// An HTML attribute borrowed from an [`AttributeMap`]. +#[derive(Debug, Eq, PartialEq)] +pub struct Attribute<'a, O> { + name: &'a str, + map_val: &'a AttrInternal<O>, +} + +/// An owned HTML attribute. +#[derive(Debug, PartialEq, Eq)] +pub struct AttributeOwned<O> { + /// The attribute name. + /// Uppercase ASCII characters (A-Z) have been converted to lowercase. + pub name: String, + /// The attribute value. Character references have been resolved. + pub value: String, + /// The span of the attribute name. + pub name_span: Range<O>, + /// The span of the attribute value. + /// `None` in case of the empty attribute syntax (e.g. `disabled` in `<input disabled>`). + pub value_span: Option<Range<O>>, + /// The syntax of the attribute value. + /// `None` indicates the empty attribute syntax (e.g. `disabled` in `<input disabled>`). + pub value_syntax: Option<AttrValueSyntax>, +} + +impl<O> AttributeMap<O> { + /// Returns the attribute with the given name. + /// + /// The name must not contain any uppercase ASCII character (A-Z) + /// or the method will always return `None`. + pub fn get(&self, name: &str) -> Option<Attribute<O>> { + self.inner + .get_key_value(name) + .map(|(name, map_val)| Attribute { name, map_val }) + } +} + +impl<'a, O: Offset> Attribute<'a, O> { + /// Returns the attribute name. + /// Uppercase ASCII characters (A-Z) have been converted to lowercase. + pub fn name(&self) -> &'a str { + self.name + } + + /// Returns the attribute value. Character references have been resolved. + pub fn value(&self) -> &'a str { + &self.map_val.value + } + + /// Returns the span of the attribute name. + pub fn name_span(&self) -> Range<O> { + self.map_val.name_span.clone() + } + + /// For explicitly defined values returns the span of the attribute value. + /// + /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`). + pub fn value_span(&self) -> Option<Range<O>> { + if self.map_val.value_syntax.is_none() { + return None; + } + Some(self.map_val.value_span.clone()) + } + + /// Returns the attribute value syntax in case the value is explicitly defined. + /// + /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`). + pub fn value_syntax(&self) -> Option<AttrValueSyntax> { + self.map_val.value_syntax + } +} + +// We cannot impl Index<Output=Attribute> because Index::index returns a reference of +// the Output type (and you cannot return a value referencing a temporary value). +impl<O> Index<&str> for AttributeMap<O> { + type Output = str; + + /// Returns the attribute value with the given name. + /// + /// The name must not contain any uppercase ASCII character (A-Z) + /// or the method will always panic. + fn index(&self, name: &str) -> &Self::Output { + &self.inner[name].value + } +} + +impl<O> IntoIterator for AttributeMap<O> { + type Item = AttributeOwned<O>; + + type IntoIter = AttrIntoIter<O>; + + fn into_iter(self) -> Self::IntoIter { + AttrIntoIter(self.inner.into_iter()) + } +} + +/// A consuming iterator over the attributes of an [`AttributeMap`]. +pub struct AttrIntoIter<O>(btree_map::IntoIter<String, AttrInternal<O>>); + +impl<O> Iterator for AttrIntoIter<O> { + type Item = AttributeOwned<O>; + + fn next(&mut self) -> Option<Self::Item> { + let (name, map_val) = self.0.next()?; + Some(AttributeOwned { + name, + value: map_val.value, + name_span: map_val.name_span, + value_span: map_val.value_syntax.is_some().then_some(map_val.value_span), + value_syntax: map_val.value_syntax, + }) + } +} + +impl<'a, O> IntoIterator for &'a AttributeMap<O> { + type Item = Attribute<'a, O>; + + type IntoIter = AttrIter<'a, O>; + + fn into_iter(self) -> Self::IntoIter { + AttrIter(self.inner.iter()) + } +} + +/// A borrowed iterator over the attributes of an [`AttributeMap`]. +pub struct AttrIter<'a, S>(btree_map::Iter<'a, String, AttrInternal<S>>); + +impl<'a, S> Iterator for AttrIter<'a, S> { + type Item = Attribute<'a, S>; + + fn next(&mut self) -> Option<Self::Item> { + let (name, map_val) = self.0.next()?; + Some(Attribute { name, map_val }) + } +} + +impl<O: Default> FromIterator<(String, String)> for AttributeMap<O> { + fn from_iter<T: IntoIterator<Item = (String, String)>>(iter: T) -> Self { + Self { + inner: iter + .into_iter() + .map(|(name, value)| { + ( + name, + AttrInternal { + value, + name_span: O::default()..O::default(), + value_span: O::default()..O::default(), + value_syntax: Some(AttrValueSyntax::DoubleQuoted), + }, + ) + }) + .collect(), + } + } +} |