about summary refs log tree commit diff
path: root/src/token.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/token.rs')
-rw-r--r-- src/token.rs 343
1 files changed, 343 insertions, 0 deletions
diff --git a/src/token.rs b/src/token.rs
new file mode 100644
index 0000000..48c90f7
--- /dev/null
+++ b/src/token.rs
@@ -0,0 +1,343 @@
+//! Provides the [`Token`] type.
+
+use std::collections::{btree_map, BTreeMap};
+use std::iter::FromIterator;
+use std::ops::{Index, Range};
+
+use crate::offset::Offset;
+use crate::Error;
+
+/// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer.
+///
+/// The type parameter `O` is the offset type used for the source code spans.
+#[derive(Debug, Eq, PartialEq)]
+pub enum Token<O> {
+ /// An HTML start tag.
+ StartTag(StartTag<O>),
+ /// An HTML end tag.
+ EndTag(EndTag<O>),
+ /// A literal string. Character references have been resolved.
+ String(String),
+ /// An HTML comment.
+ Comment(Comment<O>),
+ /// An HTML doctype declaration.
+ Doctype(Doctype<O>),
+ /// An HTML parsing error.
+ ///
+ /// Can be skipped over; the tokenizer is supposed to recover from the error and continue with
+ /// more tokens afterward.
+ Error {
+ /// What kind of error occurred.
+ error: Error,
+ /// The source code span of the error.
+ span: Range<O>,
+ },
+}
+
+/// An HTML start tag, such as `<p>` or `<a>`.
+///
+/// The type parameter `O` is the offset type used for the source code spans.
+#[derive(Debug, Eq, PartialEq)]
+pub struct StartTag<O> {
+ /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be
+ /// expected.
+ pub self_closing: bool,
+
+ /// The tag name.
+ /// Uppercase ASCII characters (A-Z) have been converted to lowercase.
+ pub name: String,
+
+ /// A mapping for any HTML attributes this start tag may have.
+ ///
+ /// Duplicate attributes are ignored after the first one as per WHATWG spec.
+ pub attributes: AttributeMap<O>,
+
+ /// The source code span of the whole tag.
+ pub span: Range<O>,
+
+ /// The source code span of just the tag name.
+ pub name_span: Range<O>,
+}
+
+/// An HTML end/close tag, such as `</p>` or `</a>`.
+///
+/// The type parameter `O` is the offset type used for the source code spans.
+#[derive(Debug, Eq, PartialEq)]
+pub struct EndTag<O> {
+ /// The tag name.
+ /// Uppercase ASCII characters (A-Z) have been converted to lowercase.
+ pub name: String,
+
+ /// The source code span of the whole tag.
+ pub span: Range<O>,
+
+ /// The source code span of just the tag name.
+ pub name_span: Range<O>,
+}
+
+/// An HTML comment.
+///
+/// The type parameter `O` is the offset type used for the source code span.
+#[derive(PartialEq, Eq, Debug)]
+pub struct Comment<O> {
+ /// The text within the comment.
+ pub data: String,
+ /// The source code span of the comment data.
+ pub data_span: Range<O>,
+}
+
+impl<O: Offset> Comment<O> {
+    /// Returns a clone of the span covering the comment data.
+    pub fn data_span(&self) -> Range<O> {
+        Range::clone(&self.data_span)
+    }
+}
+
+/// A doctype. Some examples:
+///
+/// * `<!DOCTYPE {name}>`
+/// * `<!DOCTYPE {name} PUBLIC '{public_id}'>`
+/// * `<!DOCTYPE {name} SYSTEM '{system_id}'>`
+/// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>`
+///
+/// The type parameter `O` is the offset type used for the source code spans.
+#[derive(Debug, Eq, PartialEq)]
+pub struct Doctype<O> {
+ /// The [force-quirks flag].
+ ///
+ /// [force-quirks flag]: https://html.spec.whatwg.org/multipage/parsing.html#force-quirks-flag
+ pub force_quirks: bool,
+
+ /// The doctype's name. Uppercase ASCII characters (A-Z) have been
+ /// converted to lowercase. For HTML documents this should be "html".
+ pub name: Option<String>,
+
+ /// The doctype's public identifier.
+ pub public_id: Option<String>,
+
+ /// The doctype's system identifier.
+ pub system_id: Option<String>,
+
+ /// The source code span of the doctype.
+ pub span: Range<O>,
+
+ /// The span of the name.
+ ///
+ /// Always stored, even when `name` is `None`; [`Doctype::name_span`] only
+ /// exposes it when a name is present.
+ pub(crate) name_span: Range<O>,
+
+ /// The span of the public identifier.
+ ///
+ /// Always stored, even when `public_id` is `None`; [`Doctype::public_id_span`] only
+ /// exposes it when a public identifier is present.
+ pub(crate) public_id_span: Range<O>,
+
+ /// The span of the system identifier.
+ ///
+ /// Always stored, even when `system_id` is `None`; [`Doctype::system_id_span`] only
+ /// exposes it when a system identifier is present.
+ pub(crate) system_id_span: Range<O>,
+}
+
+impl<O: Offset> Doctype<O> {
+    /// Returns the span of the name, or `None` if the doctype has no name.
+    pub fn name_span(&self) -> Option<Range<O>> {
+        self.name.as_ref().map(|_| self.name_span.clone())
+    }
+
+    /// Returns the span of the public identifier, or `None` if the doctype has no
+    /// public identifier.
+    pub fn public_id_span(&self) -> Option<Range<O>> {
+        self.public_id
+            .as_ref()
+            .map(|_| self.public_id_span.clone())
+    }
+
+    /// Returns the span of the system identifier, or `None` if the doctype has no
+    /// system identifier.
+    pub fn system_id_span(&self) -> Option<Range<O>> {
+        self.system_id
+            .as_ref()
+            .map(|_| self.system_id_span.clone())
+    }
+}
+
+/// A map of HTML attributes.
+///
+/// Does not preserve the order of attributes.
+/// Iterating always yields attributes in order by name
+/// (the attributes are stored in a `BTreeMap` keyed by name).
+///
+/// # Example
+///
+/// ```
+/// # use html5tokenizer::attr::AttributeMap;
+/// let attrs: AttributeMap<()> = vec![("href".into(), "http://example.com".into())]
+/// .into_iter()
+/// .collect();
+/// assert_eq!(&attrs["href"], "http://example.com");
+/// ```
+#[derive(Debug, Default, PartialEq, Eq)]
+pub struct AttributeMap<O> {
+ /// Maps each attribute name to its value, spans and value syntax.
+ pub(crate) inner: BTreeMap<String, AttrInternal<O>>,
+}
+
+/// The value type internally used by the [`AttributeMap`].
+/// Not part of the public API.
+#[derive(Debug, Eq, PartialEq)]
+pub(crate) struct AttrInternal<O> {
+ /// The attribute value.
+ pub value: String,
+ /// The span of the attribute name.
+ pub name_span: Range<O>,
+ /// The span of the attribute value.
+ /// For the empty attribute syntax this is just `O::default()..O::default()`.
+ /// We intentionally don't use `Option<Range<O>>` here to spare us a byte (and padding) per attribute.
+ pub value_span: Range<O>,
+ /// The syntax of the attribute value.
+ /// `None` indicates the empty attribute syntax; the public accessors use this
+ /// to decide whether `value_span` is exposed at all.
+ pub value_syntax: Option<AttrValueSyntax>,
+}
+
+/// The syntax of the attribute value.
+///
+/// The empty attribute syntax (e.g. `disabled` in `<input disabled>`) has no variant here;
+/// it is represented by `None` wherever an `Option<AttrValueSyntax>` is used.
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub enum AttrValueSyntax {
+ /// An unquoted attribute value, e.g. `id=foo`.
+ Unquoted,
+ /// A single-quoted attribute value, e.g. `id='foo'`.
+ SingleQuoted,
+ /// A double-quoted attribute value, e.g. `id="foo"`.
+ DoubleQuoted,
+}
+
+/// An HTML attribute borrowed from an [`AttributeMap`].
+#[derive(Debug, Eq, PartialEq)]
+pub struct Attribute<'a, O> {
+ /// The attribute name, borrowed from the map's key.
+ name: &'a str,
+ /// The map entry holding the attribute's value, spans and value syntax.
+ map_val: &'a AttrInternal<O>,
+}
+
+/// An owned HTML attribute.
+///
+/// This is the item type yielded when an [`AttributeMap`] is iterated by value.
+#[derive(Debug, PartialEq, Eq)]
+pub struct AttributeOwned<O> {
+ /// The attribute name.
+ /// Uppercase ASCII characters (A-Z) have been converted to lowercase.
+ pub name: String,
+ /// The attribute value. Character references have been resolved.
+ pub value: String,
+ /// The span of the attribute name.
+ pub name_span: Range<O>,
+ /// The span of the attribute value.
+ /// `None` in case of the empty attribute syntax (e.g. `disabled` in `<input disabled>`).
+ pub value_span: Option<Range<O>>,
+ /// The syntax of the attribute value.
+ /// `None` indicates the empty attribute syntax (e.g. `disabled` in `<input disabled>`).
+ pub value_syntax: Option<AttrValueSyntax>,
+}
+
+impl<O> AttributeMap<O> {
+    /// Returns the attribute with the given name, or `None` if there is no such attribute.
+    ///
+    /// The name must not contain any uppercase ASCII character (A-Z)
+    /// or the method will always return `None`.
+    ///
+    /// The returned [`Attribute`] borrows from this map, which the explicit `'_`
+    /// in the return type makes visible.
+    pub fn get(&self, name: &str) -> Option<Attribute<'_, O>> {
+        // `get_key_value` rather than `get`: the returned `Attribute` must borrow the
+        // name stored in the map, not the caller's (possibly shorter-lived) `name`.
+        self.inner
+            .get_key_value(name)
+            .map(|(name, map_val)| Attribute { name, map_val })
+    }
+}
+
+impl<'a, O: Offset> Attribute<'a, O> {
+    /// Returns the name of the attribute.
+    /// Uppercase ASCII characters (A-Z) have been converted to lowercase.
+    pub fn name(&self) -> &'a str {
+        self.name
+    }
+
+    /// Returns the value of the attribute. Character references have been resolved.
+    pub fn value(&self) -> &'a str {
+        self.map_val.value.as_str()
+    }
+
+    /// Returns a clone of the span covering the attribute name.
+    pub fn name_span(&self) -> Range<O> {
+        Range::clone(&self.map_val.name_span)
+    }
+
+    /// Returns the span of the attribute value if the value is explicitly defined.
+    ///
+    /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`).
+    pub fn value_span(&self) -> Option<Range<O>> {
+        // The stored span is only meaningful when a value syntax was recorded.
+        self.map_val
+            .value_syntax
+            .map(|_| self.map_val.value_span.clone())
+    }
+
+    /// Returns the syntax of the attribute value if the value is explicitly defined.
+    ///
+    /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`).
+    pub fn value_syntax(&self) -> Option<AttrValueSyntax> {
+        self.map_val.value_syntax
+    }
+}
+
+// `Index` cannot use `Attribute` as its `Output`: `Index::index` has to return a reference
+// to the output type, and a reference to a value created inside the method cannot be returned.
+impl<O> Index<&str> for AttributeMap<O> {
+    type Output = str;
+
+    /// Returns the value of the attribute with the given name.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the map contains no attribute with the given name.
+    /// The name must not contain any uppercase ASCII character (A-Z)
+    /// or the method will always panic.
+    fn index(&self, name: &str) -> &Self::Output {
+        self.inner[name].value.as_str()
+    }
+}
+
+/// Consuming iteration: yields every attribute as an [`AttributeOwned`].
+impl<O> IntoIterator for AttributeMap<O> {
+    type Item = AttributeOwned<O>;
+
+    type IntoIter = AttrIntoIter<O>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        let Self { inner } = self;
+        AttrIntoIter(inner.into_iter())
+    }
+}
+
+/// A consuming iterator over the attributes of an [`AttributeMap`].
+///
+/// Wraps the consuming iterator of the map's internal `BTreeMap`.
+pub struct AttrIntoIter<O>(btree_map::IntoIter<String, AttrInternal<O>>);
+
+impl<O> Iterator for AttrIntoIter<O> {
+    type Item = AttributeOwned<O>;
+
+    /// Yields the next attribute, converting the internal map entry into an [`AttributeOwned`].
+    fn next(&mut self) -> Option<Self::Item> {
+        let (name, map_val) = self.0.next()?;
+        let AttrInternal {
+            value,
+            name_span,
+            value_span,
+            value_syntax,
+        } = map_val;
+        Some(AttributeOwned {
+            name,
+            value,
+            name_span,
+            // The stored value span is only exposed when a value syntax was recorded,
+            // i.e. when the attribute does not use the empty attribute syntax.
+            value_span: value_syntax.map(|_| value_span),
+            value_syntax,
+        })
+    }
+
+    /// Forwards the size hint of the underlying map iterator: every map entry
+    /// yields exactly one attribute, so the inner bounds apply unchanged.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.0.size_hint()
+    }
+}
+
+/// Borrowing iteration: yields every attribute as an [`Attribute`] referencing the map.
+impl<'a, O> IntoIterator for &'a AttributeMap<O> {
+    type Item = Attribute<'a, O>;
+
+    type IntoIter = AttrIter<'a, O>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        let entries = self.inner.iter();
+        AttrIter(entries)
+    }
+}
+
+/// A borrowing iterator over the attributes of an [`AttributeMap`].
+///
+/// Wraps the borrowing iterator of the map's internal `BTreeMap`.
+pub struct AttrIter<'a, O>(btree_map::Iter<'a, String, AttrInternal<O>>);
+
+impl<'a, O> Iterator for AttrIter<'a, O> {
+    type Item = Attribute<'a, O>;
+
+    /// Yields the next attribute as a view borrowing the name and entry stored in the map.
+    fn next(&mut self) -> Option<Self::Item> {
+        self.0
+            .next()
+            .map(|(name, map_val)| Attribute { name, map_val })
+    }
+
+    /// Forwards the size hint of the underlying map iterator: every map entry
+    /// yields exactly one attribute, so the inner bounds apply unchanged.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.0.size_hint()
+    }
+}
+
+/// Builds an [`AttributeMap`] from `(name, value)` pairs.
+///
+/// Every span of such an attribute is `O::default()..O::default()` and its
+/// value syntax is recorded as double-quoted.
+impl<O: Default> FromIterator<(String, String)> for AttributeMap<O> {
+    fn from_iter<T: IntoIterator<Item = (String, String)>>(iter: T) -> Self {
+        // Turns one `(name, value)` pair into a map entry with placeholder spans.
+        let to_entry = |(name, value): (String, String)| {
+            let attr = AttrInternal {
+                value,
+                name_span: O::default()..O::default(),
+                value_span: O::default()..O::default(),
+                value_syntax: Some(AttrValueSyntax::DoubleQuoted),
+            };
+            (name, attr)
+        };
+
+        let inner = iter.into_iter().map(to_entry).collect();
+        Self { inner }
+    }
+}