//! Provides the [`Token`] type.
use std::collections::{btree_map, BTreeMap};
use std::fmt::Debug;
use std::iter::FromIterator;
use std::ops::Index;
/// A single token as emitted by a WHATWG-compliant HTML tokenizer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// One `char` of text: a literal character, a resolved character
    /// reference, or only part of a resolved character reference
    /// (some character references resolve to two `char`s).
    Char(char),
    /// A start tag; see [`StartTag`].
    StartTag(StartTag),
    /// An end tag; see [`EndTag`].
    EndTag(EndTag),
    /// A comment, carried as a `String`.
    Comment(String),
    /// A doctype declaration; see [`Doctype`].
    Doctype(Doctype),
    /// The end-of-file marker.
    EndOfFile,
}
/// An HTML start tag, such as `
` or ``.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct StartTag {
/// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be
/// expected.
pub self_closing: bool,
/// The tag name.
/// Uppercase ASCII characters (A-Z) have been converted to lowercase.
pub name: String,
/// A mapping for any HTML attributes this start tag may have.
///
/// Duplicate attributes are ignored after the first one as per WHATWG spec.
pub attributes: AttributeMap,
}
/// An HTML end/close tag, such as `</p>` or `</a>`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct EndTag {
    /// The tag name.
    /// Uppercase ASCII characters (A-Z) have been converted to lowercase.
    pub name: String,
}
/// A doctype. Some examples:
///
/// * `<!DOCTYPE {name}>`
/// * `<!DOCTYPE {name} PUBLIC '{public_id}'>`
/// * `<!DOCTYPE {name} SYSTEM '{system_id}'>`
/// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>`
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Doctype {
    /// The [force-quirks flag].
    ///
    /// [force-quirks flag]: https://html.spec.whatwg.org/multipage/parsing.html#force-quirks-flag
    pub force_quirks: bool,
    /// The doctype's name. Uppercase ASCII characters (A-Z) have been
    /// converted to lowercase. For HTML documents this should be "html".
    ///
    /// `None` when the doctype has no name.
    pub name: Option<String>,
    /// The doctype's public identifier, or `None` if there is none.
    pub public_id: Option<String>,
    /// The doctype's system identifier, or `None` if there is none.
    pub system_id: Option<String>,
}
/// A map of HTML attributes.
///
/// Does not preserve the order of attributes.
/// Iterating always yields attributes in order by name.
///
/// # Example
///
/// ```
/// # use html5tokenizer::attr::AttributeMap;
/// let attrs: AttributeMap = vec![("href".into(), "http://example.com".into())]
/// .into_iter()
/// .collect();
/// assert_eq!(&attrs["href"], "http://example.com");
/// ```
#[derive(Clone, Default, PartialEq, Eq)] // Debug has a custom impl below
pub struct AttributeMap {
pub(crate) inner: BTreeMap,
}
/// The value type internally used by the [`AttributeMap`].
/// Not part of the public API.
#[derive(Clone, Default, Eq)] // Debug has a custom impl below
pub(crate) struct AttrInternal {
pub value: String,
pub trace_idx: Option,
}
/// Identifies one [`AttributeTrace`] by its position in an [`AttributeTraceList`].
///
/// [`AttributeTrace`]: crate::trace::AttributeTrace
/// [`AttributeTraceList`]: crate::trace::AttributeTraceList
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct AttributeTraceIdx(
    // A `NonZeroUsize` is used so that `Option<AttributeTraceIdx>`
    // has the same size as `AttributeTraceIdx`.
    pub std::num::NonZeroUsize,
);
impl PartialEq for AttrInternal {
    /// Compares two entries by their `value` only.
    fn eq(&self, rhs: &Self) -> bool {
        // `trace_idx` is deliberately left out of the comparison,
        // so that PartialEq of Token only compares semantics.
        let Self {
            value: lhs_value,
            trace_idx: _,
        } = self;
        let Self {
            value: rhs_value,
            trace_idx: _,
        } = rhs;
        lhs_value == rhs_value
    }
}
/// A view of a single HTML attribute that borrows from an [`AttributeMap`].
#[derive(PartialEq, Eq)] // Debug has a custom impl below
pub struct Attribute<'a> {
    /// The attribute name.
    name: &'a str,
    /// The map entry holding the attribute value and its trace index.
    map_val: &'a AttrInternal,
}
/// An owned HTML attribute.
#[derive(Debug, PartialEq, Eq)]
pub struct AttributeOwned {
/// The attribute name.
/// Uppercase ASCII characters (A-Z) have been converted to lowercase.
pub name: String,
/// The attribute value. Character references have been resolved.
pub value: String,
/// The index of the corresponding [`AttributeTrace`] in the
/// `attribute_traces` field of [`StartTagTrace`], in case this attribute
/// was present in the source and the [`Emitter`] has tracked this.
///
/// [`AttributeTrace`]: super::trace::AttributeTrace
/// [`StartTagTrace`]: super::trace::AttributeTrace
/// [`Emitter`]: super::Emitter
pub trace_idx: Option,
}
impl AttributeMap {
/// Returns the value for the given attribute name.
///
/// The name must not contain any uppercase ASCII character (A-Z)
/// or the method will always return `None`.
pub fn get(&self, name: &str) -> Option<&str> {
self.inner.get(name).map(|map_val| map_val.value.as_str())
}
/// Returns the value and trace index for a given attribute name.
///
/// The name must not contain any uppercase ASCII character (A-Z)
/// or the method will always return `None`.
pub fn value_and_trace_idx(&self, name: &str) -> Option<(&str, Option)> {
self.inner
.get(name)
.map(|map_val| (map_val.value.as_str(), map_val.trace_idx))
}
}
impl<'a> Attribute<'a> {
/// Returns the attribute name.
/// Uppercase ASCII characters (A-Z) have been converted to lowercase.
pub fn name(&self) -> &'a str {
self.name
}
/// Returns the attribute value. Character references have been resolved.
pub fn value(&self) -> &'a str {
&self.map_val.value
}
/// Returns the index of the corresponding [`AttributeTrace`] in the
/// `attribute_traces` field of [`StartTagTrace`], in case this attribute
/// was present in the source and the [`Emitter`] has tracked that.
///
/// [`AttributeTrace`]: super::trace::AttributeTrace
/// [`StartTagTrace`]: super::trace::AttributeTrace
/// [`Emitter`]: super::Emitter
pub fn trace_idx(&self) -> Option {
self.map_val.trace_idx
}
}
// We cannot impl Index