diff options
author | Martin Fischer <martin@push-f.com> | 2023-09-10 19:37:34 +0200 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2023-09-28 10:36:08 +0200 |
commit | 852d5c6f2e65a5ab466662ae1c649a0ed25c70a9 (patch) | |
tree | 96d6bcdb2f2274f1081a0b6cfbde314f319159a1 /src/trace.rs | |
parent | a03cea75d9d120a7519be91ec872b143b5d74276 (diff) |
break!: move offsets out of Token
Previously the Token enum contained the offsets using the O generic
type parameter, which could be a usize if you were tracking offsets or
a zero-sized type if you didn't care about offsets. This commit moves
all the byte offset and syntax information to a new Trace enum,
which has several advantages:
* Traces can now easily be stored separately, while the tokens are
fed to the tree builder. (The tree builder only has to keep track
of which tree nodes originate from which tokens.)
* No needless generics for functions that take a token but don't
care about offsets (a tree construction implementation is bound
to have many such functions).
* The FromIterator<(String, String)> impl for AttributeMap no longer
has to specify arbitrary values for the spans and the value_syntax.
* The PartialEq implementation of Token is now much more useful
(since it no longer includes all the offsets).
* The Debug formatting of Token is now more readable
(since it no longer includes all the offsets).
* Function pointers to functions accepting tokens are possible.
(Since function pointer types may not have generic parameters.)
Diffstat (limited to 'src/trace.rs')
-rw-r--r-- | src/trace.rs | 241 |
1 files changed, 241 insertions, 0 deletions
diff --git a/src/trace.rs b/src/trace.rs new file mode 100644 index 0000000..a816429 --- /dev/null +++ b/src/trace.rs @@ -0,0 +1,241 @@ +//! Provides the [`Trace`] type (byte offsets and syntax information about tokens). + +use std::{ + num::NonZeroUsize, + ops::{Index, Range}, +}; + +use crate::let_else::assume; +use crate::token::AttributeTraceIdx; + +/// Provides byte offsets and syntax information about a [`Token`]. +/// +/// [`Token`]: crate::token::Token +#[allow(missing_docs)] +#[derive(Eq, PartialEq, Debug)] +pub enum Trace { + Char, + StartTag(StartTagTrace), + EndTag(EndTagTrace), + Comment(CommentTrace), + Doctype(DoctypeTrace), + EndOfFile, +} + +/// Provides byte offsets and syntax information for a [`StartTag`] token. +/// +/// [`StartTag`]: crate::token::StartTag +#[derive(Eq, PartialEq, Debug)] +pub struct StartTagTrace { + /// The span of the tag. + pub span: Range<usize>, + + /// The span of the tag name. + pub name_span: Range<usize>, + + /// List of [`AttributeTrace`]s for the attributes that were present in the source. + pub attribute_traces: AttributeTraceList, +} + +/// Provides byte offsets for an [`EndTag`] token. +/// +/// [`EndTag`]: crate::token::EndTag +#[derive(Eq, PartialEq, Debug)] +pub struct EndTagTrace { + /// The span of the tag. + pub span: Range<usize>, + + /// The span of the tag name. + pub name_span: Range<usize>, +} + +/// Provides byte offsets for a [`Token::Comment`]. +/// +/// [`Token::Comment`]: crate::token::Token::Comment +#[derive(Eq, PartialEq, Debug)] +pub struct CommentTrace { + /// The span of the comment data. + pub data_span: Range<usize>, +} + +/// Provides byte offsets for a [`Doctype`] token. +/// +/// [`Doctype`]: crate::token::Doctype +#[derive(Eq, PartialEq, Debug)] +pub struct DoctypeTrace { + pub(crate) span: Range<usize>, + // Using NonZeroUsize to optimize the size of the struct. 
+ name_span: Option<Range<std::num::NonZeroUsize>>, + public_id_span: Option<Range<std::num::NonZeroUsize>>, + system_id_span: Option<Range<std::num::NonZeroUsize>>, +} + +impl DoctypeTrace { + /// Returns the span of the DOCTYPE. + pub fn span(&self) -> Range<usize> { + self.span.clone() + } + + /// Returns the span of the name. + pub fn name_span(&self) -> Option<Range<usize>> { + self.name_span + .as_ref() + .map(|range| range.start.get()..range.end.get()) + } + + /// Returns the span of the public identifier. + pub fn public_id_span(&self) -> Option<Range<usize>> { + self.public_id_span + .as_ref() + .map(|range| range.start.get()..range.end.get()) + } + + /// Returns the span of the system identifier. + pub fn system_id_span(&self) -> Option<Range<usize>> { + self.system_id_span + .as_ref() + .map(|range| range.start.get()..range.end.get()) + } +} + +/// Internal [`DoctypeTrace`] methods. +/// +/// Note that even though it stands to reason that the offsets provided to the `set_` +/// methods can never be zero, we intentionally don't use `new_unchecked` since +/// actually verifying that the offsets provided to the respective Emitter methods can +/// never be zero would be non-trivial (since the tokenizer state machine has 80 states). 
+impl DoctypeTrace { + #[inline] + pub(crate) fn new(span_start: usize) -> Self { + Self { + span: span_start..0, + name_span: None, + public_id_span: None, + system_id_span: None, + } + } + + #[inline] + pub(crate) fn set_name_start(&mut self, start: usize) { + let start = NonZeroUsize::new(start).expect("expected offset to be non-zero"); + self.name_span = Some(start..start); + } + + #[inline] + pub(crate) fn set_public_id_start(&mut self, start: usize) { + let start = NonZeroUsize::new(start).expect("expected offset to be non-zero"); + self.public_id_span = Some(start..start); + } + + #[inline] + pub(crate) fn set_system_id_start(&mut self, start: usize) { + let start = NonZeroUsize::new(start).expect("expected offset to be non-zero"); + self.system_id_span = Some(start..start); + } + + #[inline] + pub(crate) fn set_name_end(&mut self, end: usize) { + assume!(Some(span), &mut self.name_span); + span.end = NonZeroUsize::new(end).expect("expected offset to be non-zero"); + } + + #[inline] + pub(crate) fn set_public_id_end(&mut self, end: usize) { + assume!(Some(span), &mut self.public_id_span); + span.end = NonZeroUsize::new(end).expect("expected offset to be non-zero"); + } + + #[inline] + pub(crate) fn set_system_id_end(&mut self, end: usize) { + assume!(Some(span), &mut self.system_id_span); + span.end = NonZeroUsize::new(end).expect("expected offset to be non-zero"); + } +} + +/// The syntax of the attribute value. +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum AttrValueSyntax { + /// An unquoted attribute value, e.g. `id=foo`. + Unquoted, + /// A single-quoted attribute value, e.g. `id='foo'`. + SingleQuoted, + /// A double-quoted attribute value, e.g. `id="foo"`. + DoubleQuoted, +} + +/// Provides byte offsets and the [`AttrValueSyntax`] for an attribute that was present in the source. 
+#[derive(Eq, PartialEq, Debug)] +pub struct AttributeTrace { + pub(crate) value_syntax: Option<AttrValueSyntax>, + pub(crate) name_span: Range<usize>, + /// We intentionally don't use `Option<Range<usize>>` here to spare us a byte (and padding) per attribute. + /// For the empty attribute syntax this is just `0..0`. + pub(crate) value_span: Range<usize>, +} + +impl AttributeTrace { + /// [`AttributeTrace`] intentionally doesn't implement Default + /// (since it's part of the public API and it wouldn't make sense semantically). + pub(crate) fn new() -> Self { + Self { + value_syntax: None, + name_span: Default::default(), + value_span: Default::default(), + } + } + + /// Returns the span of the attribute name. + pub fn name_span(&self) -> Range<usize> { + self.name_span.clone() + } + + /// For explicitly defined values returns the span of the attribute value. + /// + /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`). + pub fn value_span(&self) -> Option<Range<usize>> { + if self.value_syntax.is_none() { + return None; + } + Some(self.value_span.clone()) + } + + /// Returns the attribute value syntax in case the value is explicitly defined. + /// + /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`). + pub fn value_syntax(&self) -> Option<AttrValueSyntax> { + self.value_syntax + } +} + +/// List of [`AttributeTrace`]s for the attributes that were present in the source. +#[derive(Eq, PartialEq, Debug)] +pub struct AttributeTraceList { + /// We don't use `HashMap<String, AttributeTrace>` since this would require + /// the attribute names to be cloned (which would be less efficient). 
+ traces: Vec<AttributeTrace>, +} + +impl Index<AttributeTraceIdx> for AttributeTraceList { + type Output = AttributeTrace; + + fn index(&self, index: AttributeTraceIdx) -> &Self::Output { + &self.traces[index.0.get() - 1] + } +} + +impl AttributeTraceList { + pub(crate) fn new() -> Self { + Self { + traces: Default::default(), + } + } + + pub(crate) fn insert(&mut self, trace: AttributeTrace) -> AttributeTraceIdx { + self.traces.push(trace); + let len = self.traces.len(); + AttributeTraceIdx( + // SAFETY: len cannot be zero because we push before calling Vec::len. + unsafe { std::num::NonZeroUsize::new_unchecked(len) }, + ) + } +} |