aboutsummaryrefslogtreecommitdiff
path: root/src/token.rs
diff options
context:
space:
mode:
authorMartin Fischer <martin@push-f.com>2023-09-10 19:37:34 +0200
committerMartin Fischer <martin@push-f.com>2023-09-28 10:36:08 +0200
commit852d5c6f2e65a5ab466662ae1c649a0ed25c70a9 (patch)
tree96d6bcdb2f2274f1081a0b6cfbde314f319159a1 /src/token.rs
parenta03cea75d9d120a7519be91ec872b143b5d74276 (diff)
break!: move offsets out of Token
Previously the Token enum contained the offsets using the O generic type parameter, which could be a usize if you're tracking offsets or a zero-sized type if you didn't care about offsets. This commit moves all the byte offset and syntax information to a new Trace enum, which has several advantages: * Traces can now easily be stored separately, while the tokens are fed to the tree builder. (The tree builder only has to keep track of which tree nodes originate from which tokens.) * No needless generics for functions that take a token but don't care about offsets (a tree construction implementation is bound to have many of such functions). * The FromIterator<(String, String)> impl for AttributeMap no longer has to specify arbitrary values for the spans and the value_syntax). * The PartialEq implementation of Token is now much more useful (since it no longer includes all the offsets). * The Debug formatting of Token is now more readable (since it no longer includes all the offsets). * Function pointers to functions accepting tokens are possible. (Since function pointer types may not have generic parameters.)
Diffstat (limited to 'src/token.rs')
-rw-r--r--src/token.rs217
1 files changed, 71 insertions, 146 deletions
diff --git a/src/token.rs b/src/token.rs
index ed8c8c8..4f3c0ce 100644
--- a/src/token.rs
+++ b/src/token.rs
@@ -2,32 +2,30 @@
use std::collections::{btree_map, BTreeMap};
use std::iter::FromIterator;
-use std::ops::{Index, Range};
-
-use crate::offset::Offset;
+use std::ops::Index;
/// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer.
#[derive(Debug, Eq, PartialEq)]
-pub enum Token<O> {
+pub enum Token {
/// A literal character, a resolved character reference,
/// or part of a resolved character reference (since some
/// character references resolve to two `char`s).
Char(char),
/// An HTML start tag.
- StartTag(StartTag<O>),
+ StartTag(StartTag),
/// An HTML end tag.
- EndTag(EndTag<O>),
+ EndTag(EndTag),
/// An HTML comment.
- Comment(Comment<O>),
+ Comment(String),
/// An HTML doctype declaration.
- Doctype(Doctype<O>),
+ Doctype(Doctype),
/// An end-of-file token.
EndOfFile,
}
/// An HTML start tag, such as `<p>` or `<a>`.
#[derive(Debug, Eq, PartialEq)]
-pub struct StartTag<O> {
+pub struct StartTag {
/// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be
/// expected.
pub self_closing: bool,
@@ -39,43 +37,15 @@ pub struct StartTag<O> {
/// A mapping for any HTML attributes this start tag may have.
///
/// Duplicate attributes are ignored after the first one as per WHATWG spec.
- pub attributes: AttributeMap<O>,
-
- /// The source code span of the tag.
- pub span: Range<O>,
-
- /// The span of the tag name.
- pub name_span: Range<O>,
+ pub attributes: AttributeMap,
}
/// An HTML end/close tag, such as `</p>` or `</a>`.
#[derive(Debug, Eq, PartialEq)]
-pub struct EndTag<O> {
+pub struct EndTag {
/// The tag name.
/// Uppercase ASCII characters (A-Z) have been converted to lowercase.
pub name: String,
-
- /// The source code span of the tag.
- pub span: Range<O>,
-
- /// The span of the tag name.
- pub name_span: Range<O>,
-}
-
-/// An HTML comment.
-#[derive(PartialEq, Eq, Debug)]
-pub struct Comment<O> {
- /// The text within the comment.
- pub data: String,
- /// The source offset of the comment data.
- pub data_span: Range<O>,
-}
-
-impl<O: Offset> Comment<O> {
- /// Returns the span for the comment data.
- pub fn data_span(&self) -> Range<O> {
- self.data_span.clone()
- }
}
/// A doctype. Some examples:
@@ -85,7 +55,7 @@ impl<O: Offset> Comment<O> {
/// * `<!DOCTYPE {name} SYSTEM '{system_id}'>`
/// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>`
#[derive(Debug, Eq, PartialEq)]
-pub struct Doctype<O> {
+pub struct Doctype {
/// The [force-quirks flag].
///
/// [force-quirks flag]: https://html.spec.whatwg.org/multipage/parsing.html#force-quirks-flag
@@ -100,38 +70,6 @@ pub struct Doctype<O> {
/// The doctype's system identifier.
pub system_id: Option<String>,
-
- /// The source code span of the doctype.
- pub span: Range<O>,
-
- /// The span of the name.
- pub(crate) name_span: Range<O>,
-
- /// The span of the public identifier.
- pub(crate) public_id_span: Range<O>,
-
- /// The span of the system identifier.
- pub(crate) system_id_span: Range<O>,
-}
-
-impl<O: Offset> Doctype<O> {
- /// Returns the span of the name.
- pub fn name_span(&self) -> Option<Range<O>> {
- self.name.as_ref()?;
- Some(self.name_span.clone())
- }
-
- /// Returns the span of the public identifier.
- pub fn public_id_span(&self) -> Option<Range<O>> {
- self.public_id.as_ref()?;
- Some(self.public_id_span.clone())
- }
-
- /// Returns the span of the system identifier.
- pub fn system_id_span(&self) -> Option<Range<O>> {
- self.system_id.as_ref()?;
- Some(self.system_id_span.clone())
- }
}
/// A map of HTML attributes.
@@ -143,79 +81,81 @@ impl<O: Offset> Doctype<O> {
///
/// ```
/// # use html5tokenizer::attr::AttributeMap;
-/// let attrs: AttributeMap<()> = vec![("href".into(), "http://example.com".into())]
+/// let attrs: AttributeMap = vec![("href".into(), "http://example.com".into())]
/// .into_iter()
/// .collect();
/// assert_eq!(&attrs["href"], "http://example.com");
/// ```
#[derive(Debug, Default, PartialEq, Eq)]
-pub struct AttributeMap<O> {
- pub(crate) inner: BTreeMap<String, AttrInternal<O>>,
+pub struct AttributeMap {
+ pub(crate) inner: BTreeMap<String, AttrInternal>,
}
/// The value type internally used by the [`AttributeMap`].
/// Not part of the public API.
-#[derive(Default, Debug, Eq, PartialEq)]
-pub(crate) struct AttrInternal<O> {
+#[derive(Default, Debug, Eq)]
+pub(crate) struct AttrInternal {
pub value: String,
- /// The span of the attribute name.
- pub name_span: Range<O>,
- /// The span of the attribute value.
- /// For the empty attribute syntax this is just `O::default()..O::default()`.
- /// We intentionally don't use `Option<Range<O>>` here to spare us a byte (and padding) per attribute.
- pub value_span: Range<O>,
- pub value_syntax: Option<AttrValueSyntax>,
+ pub trace_idx: Option<AttributeTraceIdx>,
}
-/// The syntax of the attribute value.
-#[derive(Clone, Copy, PartialEq, Eq, Debug)]
-pub enum AttrValueSyntax {
- /// An unquoted attribute value, e.g. `id=foo`.
- Unquoted,
- /// A single-quoted attribute value, e.g. `id='foo'`.
- SingleQuoted,
- /// A double-quoted attribute value, e.g. `id="foo"`.
- DoubleQuoted,
+/// The index of an [`AttributeTrace`] within an [`AttributeTraceList`].
+///
+/// [`AttributeTrace`]: crate::trace::AttributeTrace
+/// [`AttributeTraceList`]: crate::trace::AttributeTraceList
+#[derive(Clone, Copy, Eq, PartialEq, Debug)]
+pub struct AttributeTraceIdx(
+ // Using NonZeroUsize so that `Option<AttributeTraceIdx>`
+ // has the same size as `AttributeTraceIdx`.
+ pub std::num::NonZeroUsize,
+);
+
+impl PartialEq for AttrInternal {
+ fn eq(&self, other: &Self) -> bool {
+ // We intentionally don't include the trace_idx,
+ // so that PartialEq of Token only compares semantics.
+ self.value == other.value
+ }
}
/// An HTML attribute borrowed from an [`AttributeMap`].
#[derive(Debug, Eq, PartialEq)]
-pub struct Attribute<'a, O> {
+pub struct Attribute<'a> {
name: &'a str,
- map_val: &'a AttrInternal<O>,
+ map_val: &'a AttrInternal,
}
/// An owned HTML attribute.
#[derive(Debug, PartialEq, Eq)]
-pub struct AttributeOwned<O> {
+pub struct AttributeOwned {
/// The attribute name.
/// Uppercase ASCII characters (A-Z) have been converted to lowercase.
pub name: String,
/// The attribute value. Character references have been resolved.
pub value: String,
- /// The span of the attribute name.
- pub name_span: Range<O>,
- /// The span of the attribute value.
- /// `None` in case of the empty attribute syntax (e.g. `disabled` in `<input disabled>`).
- pub value_span: Option<Range<O>>,
- /// The syntax of the attribute value.
- /// `None` indicates the empty attribute syntax (e.g. `disabled` in `<input disabled>`).
- pub value_syntax: Option<AttrValueSyntax>,
+ /// The index of the corresponding [`AttributeTrace`] in the
+ /// `attribute_traces` field of [`StartTagTrace`], in case this attribute
+ /// was present in the source and the [`Emitter`] has tracked this.
+ ///
+ /// [`AttributeTrace`]: super::trace::AttributeTrace
+ /// [`StartTagTrace`]: super::trace::AttributeTrace
+ /// [`Emitter`]: super::Emitter
+ pub trace_idx: Option<AttributeTraceIdx>,
}
-impl<O> AttributeMap<O> {
+impl AttributeMap {
/// Returns the attribute with the given name.
///
/// The name must not contain any uppercase ASCII character (A-Z)
/// or the method will always return `None`.
- pub fn get(&self, name: &str) -> Option<Attribute<O>> {
+ pub fn get(&self, name: &str) -> Option<Attribute> {
self.inner
.get_key_value(name)
.map(|(name, map_val)| Attribute { name, map_val })
}
}
-impl<'a, O: Offset> Attribute<'a, O> {
+impl<'a> Attribute<'a> {
/// Returns the attribute name.
/// Uppercase ASCII characters (A-Z) have been converted to lowercase.
pub fn name(&self) -> &'a str {
@@ -227,32 +167,21 @@ impl<'a, O: Offset> Attribute<'a, O> {
&self.map_val.value
}
- /// Returns the span of the attribute name.
- pub fn name_span(&self) -> Range<O> {
- self.map_val.name_span.clone()
- }
-
- /// For explicitly defined values returns the span of the attribute value.
- ///
- /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`).
- pub fn value_span(&self) -> Option<Range<O>> {
- if self.map_val.value_syntax.is_none() {
- return None;
- }
- Some(self.map_val.value_span.clone())
- }
-
- /// Returns the attribute value syntax in case the value is explicitly defined.
+ /// Returns the index of the corresponding [`AttributeTrace`] in the
+ /// `attribute_traces` field of [`StartTagTrace`], in case this attribute
+ /// was present in the source and the [`Emitter`] has tracked that.
///
- /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`).
- pub fn value_syntax(&self) -> Option<AttrValueSyntax> {
- self.map_val.value_syntax
+ /// [`AttributeTrace`]: super::trace::AttributeTrace
+ /// [`StartTagTrace`]: super::trace::AttributeTrace
+ /// [`Emitter`]: super::Emitter
+ pub fn trace_idx(&self) -> Option<AttributeTraceIdx> {
+ self.map_val.trace_idx
}
}
// We cannot impl Index<Output=Attribute> because Index::index returns a reference of
// the Output type (and you cannot return a value referencing a temporary value).
-impl<O> Index<&str> for AttributeMap<O> {
+impl Index<&str> for AttributeMap {
type Output = str;
/// Returns the attribute value with the given name.
@@ -264,10 +193,10 @@ impl<O> Index<&str> for AttributeMap<O> {
}
}
-impl<O> IntoIterator for AttributeMap<O> {
- type Item = AttributeOwned<O>;
+impl IntoIterator for AttributeMap {
+ type Item = AttributeOwned;
- type IntoIter = AttrIntoIter<O>;
+ type IntoIter = AttrIntoIter;
fn into_iter(self) -> Self::IntoIter {
AttrIntoIter(self.inner.into_iter())
@@ -275,27 +204,25 @@ impl<O> IntoIterator for AttributeMap<O> {
}
/// A consuming iterator over the attributes of an [`AttributeMap`].
-pub struct AttrIntoIter<O>(btree_map::IntoIter<String, AttrInternal<O>>);
+pub struct AttrIntoIter(btree_map::IntoIter<String, AttrInternal>);
-impl<O> Iterator for AttrIntoIter<O> {
- type Item = AttributeOwned<O>;
+impl Iterator for AttrIntoIter {
+ type Item = AttributeOwned;
fn next(&mut self) -> Option<Self::Item> {
let (name, map_val) = self.0.next()?;
Some(AttributeOwned {
name,
value: map_val.value,
- name_span: map_val.name_span,
- value_span: map_val.value_syntax.is_some().then_some(map_val.value_span),
- value_syntax: map_val.value_syntax,
+ trace_idx: map_val.trace_idx,
})
}
}
-impl<'a, O> IntoIterator for &'a AttributeMap<O> {
- type Item = Attribute<'a, O>;
+impl<'a> IntoIterator for &'a AttributeMap {
+ type Item = Attribute<'a>;
- type IntoIter = AttrIter<'a, O>;
+ type IntoIter = AttrIter<'a>;
fn into_iter(self) -> Self::IntoIter {
AttrIter(self.inner.iter())
@@ -303,10 +230,10 @@ impl<'a, O> IntoIterator for &'a AttributeMap<O> {
}
/// A borrowed iterator over the attributes of an [`AttributeMap`].
-pub struct AttrIter<'a, S>(btree_map::Iter<'a, String, AttrInternal<S>>);
+pub struct AttrIter<'a>(btree_map::Iter<'a, String, AttrInternal>);
-impl<'a, S> Iterator for AttrIter<'a, S> {
- type Item = Attribute<'a, S>;
+impl<'a> Iterator for AttrIter<'a> {
+ type Item = Attribute<'a>;
fn next(&mut self) -> Option<Self::Item> {
let (name, map_val) = self.0.next()?;
@@ -314,7 +241,7 @@ impl<'a, S> Iterator for AttrIter<'a, S> {
}
}
-impl<O: Default> FromIterator<(String, String)> for AttributeMap<O> {
+impl FromIterator<(String, String)> for AttributeMap {
fn from_iter<T: IntoIterator<Item = (String, String)>>(iter: T) -> Self {
Self {
inner: iter
@@ -324,9 +251,7 @@ impl<O: Default> FromIterator<(String, String)> for AttributeMap<O> {
name,
AttrInternal {
value,
- name_span: O::default()..O::default(),
- value_span: O::default()..O::default(),
- value_syntax: Some(AttrValueSyntax::DoubleQuoted),
+ trace_idx: None,
},
)
})