diff options
Diffstat (limited to 'src/token.rs')
-rw-r--r-- | src/token.rs | 217 |
1 files changed, 71 insertions, 146 deletions
diff --git a/src/token.rs b/src/token.rs index ed8c8c8..4f3c0ce 100644 --- a/src/token.rs +++ b/src/token.rs @@ -2,32 +2,30 @@ use std::collections::{btree_map, BTreeMap}; use std::iter::FromIterator; -use std::ops::{Index, Range}; - -use crate::offset::Offset; +use std::ops::Index; /// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer. #[derive(Debug, Eq, PartialEq)] -pub enum Token<O> { +pub enum Token { /// A literal character, a resolved character reference, /// or part of a resolved character reference (since some /// character references resolve to two `char`s). Char(char), /// An HTML start tag. - StartTag(StartTag<O>), + StartTag(StartTag), /// An HTML end tag. - EndTag(EndTag<O>), + EndTag(EndTag), /// An HTML comment. - Comment(Comment<O>), + Comment(String), /// An HTML doctype declaration. - Doctype(Doctype<O>), + Doctype(Doctype), /// An end-of-file token. EndOfFile, } /// An HTML start tag, such as `<p>` or `<a>`. #[derive(Debug, Eq, PartialEq)] -pub struct StartTag<O> { +pub struct StartTag { /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be /// expected. pub self_closing: bool, @@ -39,43 +37,15 @@ pub struct StartTag<O> { /// A mapping for any HTML attributes this start tag may have. /// /// Duplicate attributes are ignored after the first one as per WHATWG spec. - pub attributes: AttributeMap<O>, - - /// The source code span of the tag. - pub span: Range<O>, - - /// The span of the tag name. - pub name_span: Range<O>, + pub attributes: AttributeMap, } /// An HTML end/close tag, such as `</p>` or `</a>`. #[derive(Debug, Eq, PartialEq)] -pub struct EndTag<O> { +pub struct EndTag { /// The tag name. /// Uppercase ASCII characters (A-Z) have been converted to lowercase. pub name: String, - - /// The source code span of the tag. - pub span: Range<O>, - - /// The span of the tag name. - pub name_span: Range<O>, -} - -/// An HTML comment. 
-#[derive(PartialEq, Eq, Debug)] -pub struct Comment<O> { - /// The text within the comment. - pub data: String, - /// The source offset of the comment data. - pub data_span: Range<O>, -} - -impl<O: Offset> Comment<O> { - /// Returns the span for the comment data. - pub fn data_span(&self) -> Range<O> { - self.data_span.clone() - } } /// A doctype. Some examples: @@ -85,7 +55,7 @@ impl<O: Offset> Comment<O> { /// * `<!DOCTYPE {name} SYSTEM '{system_id}'>` /// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>` #[derive(Debug, Eq, PartialEq)] -pub struct Doctype<O> { +pub struct Doctype { /// The [force-quirks flag]. /// /// [force-quirks flag]: https://html.spec.whatwg.org/multipage/parsing.html#force-quirks-flag @@ -100,38 +70,6 @@ pub struct Doctype<O> { /// The doctype's system identifier. pub system_id: Option<String>, - - /// The source code span of the doctype. - pub span: Range<O>, - - /// The span of the name. - pub(crate) name_span: Range<O>, - - /// The span of the public identifier. - pub(crate) public_id_span: Range<O>, - - /// The span of the system identifier. - pub(crate) system_id_span: Range<O>, -} - -impl<O: Offset> Doctype<O> { - /// Returns the span of the name. - pub fn name_span(&self) -> Option<Range<O>> { - self.name.as_ref()?; - Some(self.name_span.clone()) - } - - /// Returns the span of the public identifier. - pub fn public_id_span(&self) -> Option<Range<O>> { - self.public_id.as_ref()?; - Some(self.public_id_span.clone()) - } - - /// Returns the span of the system identifier. - pub fn system_id_span(&self) -> Option<Range<O>> { - self.system_id.as_ref()?; - Some(self.system_id_span.clone()) - } } /// A map of HTML attributes. 
@@ -143,79 +81,81 @@ impl<O: Offset> Doctype<O> { /// /// ``` /// # use html5tokenizer::attr::AttributeMap; -/// let attrs: AttributeMap<()> = vec![("href".into(), "http://example.com".into())] +/// let attrs: AttributeMap = vec![("href".into(), "http://example.com".into())] /// .into_iter() /// .collect(); /// assert_eq!(&attrs["href"], "http://example.com"); /// ``` #[derive(Debug, Default, PartialEq, Eq)] -pub struct AttributeMap<O> { - pub(crate) inner: BTreeMap<String, AttrInternal<O>>, +pub struct AttributeMap { + pub(crate) inner: BTreeMap<String, AttrInternal>, } /// The value type internally used by the [`AttributeMap`]. /// Not part of the public API. -#[derive(Default, Debug, Eq, PartialEq)] -pub(crate) struct AttrInternal<O> { +#[derive(Default, Debug, Eq)] +pub(crate) struct AttrInternal { pub value: String, - /// The span of the attribute name. - pub name_span: Range<O>, - /// The span of the attribute value. - /// For the empty attribute syntax this is just `O::default()..O::default()`. - /// We intentionally don't use `Option<Range<O>>` here to spare us a byte (and padding) per attribute. - pub value_span: Range<O>, - pub value_syntax: Option<AttrValueSyntax>, + pub trace_idx: Option<AttributeTraceIdx>, } -/// The syntax of the attribute value. -#[derive(Clone, Copy, PartialEq, Eq, Debug)] -pub enum AttrValueSyntax { - /// An unquoted attribute value, e.g. `id=foo`. - Unquoted, - /// A single-quoted attribute value, e.g. `id='foo'`. - SingleQuoted, - /// A double-quoted attribute value, e.g. `id="foo"`. - DoubleQuoted, +/// The index of an [`AttributeTrace`] within an [`AttributeTraceList`]. +/// +/// [`AttributeTrace`]: crate::trace::AttributeTrace +/// [`AttributeTraceList`]: crate::trace::AttributeTraceList +#[derive(Clone, Copy, Eq, PartialEq, Debug)] +pub struct AttributeTraceIdx( + // Using NonZeroUsize so that `Option<AttributeTraceIdx>` + // has the same size as `AttributeTraceIdx`. 
+ pub std::num::NonZeroUsize, +); + +impl PartialEq for AttrInternal { + fn eq(&self, other: &Self) -> bool { + // We intentionally don't include the trace_idx, + // so that PartialEq of Token only compares semantics. + self.value == other.value + } } /// An HTML attribute borrowed from an [`AttributeMap`]. #[derive(Debug, Eq, PartialEq)] -pub struct Attribute<'a, O> { +pub struct Attribute<'a> { name: &'a str, - map_val: &'a AttrInternal<O>, + map_val: &'a AttrInternal, } /// An owned HTML attribute. #[derive(Debug, PartialEq, Eq)] -pub struct AttributeOwned<O> { +pub struct AttributeOwned { /// The attribute name. /// Uppercase ASCII characters (A-Z) have been converted to lowercase. pub name: String, /// The attribute value. Character references have been resolved. pub value: String, - /// The span of the attribute name. - pub name_span: Range<O>, - /// The span of the attribute value. - /// `None` in case of the empty attribute syntax (e.g. `disabled` in `<input disabled>`). - pub value_span: Option<Range<O>>, - /// The syntax of the attribute value. - /// `None` indicates the empty attribute syntax (e.g. `disabled` in `<input disabled>`). - pub value_syntax: Option<AttrValueSyntax>, + /// The index of the corresponding [`AttributeTrace`] in the + /// `attribute_traces` field of [`StartTagTrace`], in case this attribute + /// was present in the source and the [`Emitter`] has tracked this. + /// + /// [`AttributeTrace`]: super::trace::AttributeTrace + /// [`StartTagTrace`]: super::trace::StartTagTrace + /// [`Emitter`]: super::Emitter + pub trace_idx: Option<AttributeTraceIdx>, } -impl<O> AttributeMap<O> { +impl AttributeMap { /// Returns the attribute with the given name. /// /// The name must not contain any uppercase ASCII character (A-Z) /// or the method will always return `None`. 
- pub fn get(&self, name: &str) -> Option<Attribute<O>> { + pub fn get(&self, name: &str) -> Option<Attribute> { self.inner .get_key_value(name) .map(|(name, map_val)| Attribute { name, map_val }) } } -impl<'a, O: Offset> Attribute<'a, O> { +impl<'a> Attribute<'a> { /// Returns the attribute name. /// Uppercase ASCII characters (A-Z) have been converted to lowercase. pub fn name(&self) -> &'a str { @@ -227,32 +167,21 @@ impl<'a, O: Offset> Attribute<'a, O> { &self.map_val.value } - /// Returns the span of the attribute name. - pub fn name_span(&self) -> Range<O> { - self.map_val.name_span.clone() - } - - /// For explicitly defined values returns the span of the attribute value. - /// - /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`). - pub fn value_span(&self) -> Option<Range<O>> { - if self.map_val.value_syntax.is_none() { - return None; - } - Some(self.map_val.value_span.clone()) - } - - /// Returns the attribute value syntax in case the value is explicitly defined. + /// Returns the index of the corresponding [`AttributeTrace`] in the + /// `attribute_traces` field of [`StartTagTrace`], in case this attribute + /// was present in the source and the [`Emitter`] has tracked that. /// - /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`). - pub fn value_syntax(&self) -> Option<AttrValueSyntax> { - self.map_val.value_syntax + /// [`AttributeTrace`]: super::trace::AttributeTrace + /// [`StartTagTrace`]: super::trace::StartTagTrace + /// [`Emitter`]: super::Emitter + pub fn trace_idx(&self) -> Option<AttributeTraceIdx> { + self.map_val.trace_idx } } // We cannot impl Index<Output=Attribute> because Index::index returns a reference of // the Output type (and you cannot return a value referencing a temporary value). 
-impl<O> Index<&str> for AttributeMap<O> { +impl Index<&str> for AttributeMap { type Output = str; /// Returns the attribute value with the given name. @@ -264,10 +193,10 @@ impl<O> Index<&str> for AttributeMap<O> { } } -impl<O> IntoIterator for AttributeMap<O> { - type Item = AttributeOwned<O>; +impl IntoIterator for AttributeMap { + type Item = AttributeOwned; - type IntoIter = AttrIntoIter<O>; + type IntoIter = AttrIntoIter; fn into_iter(self) -> Self::IntoIter { AttrIntoIter(self.inner.into_iter()) @@ -275,27 +204,25 @@ impl<O> IntoIterator for AttributeMap<O> { } /// A consuming iterator over the attributes of an [`AttributeMap`]. -pub struct AttrIntoIter<O>(btree_map::IntoIter<String, AttrInternal<O>>); +pub struct AttrIntoIter(btree_map::IntoIter<String, AttrInternal>); -impl<O> Iterator for AttrIntoIter<O> { - type Item = AttributeOwned<O>; +impl Iterator for AttrIntoIter { + type Item = AttributeOwned; fn next(&mut self) -> Option<Self::Item> { let (name, map_val) = self.0.next()?; Some(AttributeOwned { name, value: map_val.value, - name_span: map_val.name_span, - value_span: map_val.value_syntax.is_some().then_some(map_val.value_span), - value_syntax: map_val.value_syntax, + trace_idx: map_val.trace_idx, }) } } -impl<'a, O> IntoIterator for &'a AttributeMap<O> { - type Item = Attribute<'a, O>; +impl<'a> IntoIterator for &'a AttributeMap { + type Item = Attribute<'a>; - type IntoIter = AttrIter<'a, O>; + type IntoIter = AttrIter<'a>; fn into_iter(self) -> Self::IntoIter { AttrIter(self.inner.iter()) @@ -303,10 +230,10 @@ impl<'a, O> IntoIterator for &'a AttributeMap<O> { } /// A borrowed iterator over the attributes of an [`AttributeMap`]. 
-pub struct AttrIter<'a, S>(btree_map::Iter<'a, String, AttrInternal<S>>); +pub struct AttrIter<'a>(btree_map::Iter<'a, String, AttrInternal>); -impl<'a, S> Iterator for AttrIter<'a, S> { - type Item = Attribute<'a, S>; +impl<'a> Iterator for AttrIter<'a> { + type Item = Attribute<'a>; fn next(&mut self) -> Option<Self::Item> { let (name, map_val) = self.0.next()?; @@ -314,7 +241,7 @@ impl<'a, S> Iterator for AttrIter<'a, S> { } } -impl<O: Default> FromIterator<(String, String)> for AttributeMap<O> { +impl FromIterator<(String, String)> for AttributeMap { fn from_iter<T: IntoIterator<Item = (String, String)>>(iter: T) -> Self { Self { inner: iter @@ -324,9 +251,7 @@ impl<O: Default> FromIterator<(String, String)> for AttributeMap<O> { name, AttrInternal { value, - name_span: O::default()..O::default(), - value_span: O::default()..O::default(), - value_syntax: Some(AttrValueSyntax::DoubleQuoted), + trace_idx: None, }, ) }) |