diff options
| author | Martin Fischer <martin@push-f.com> | 2023-09-10 19:37:34 +0200 | 
|---|---|---|
| committer | Martin Fischer <martin@push-f.com> | 2023-09-28 10:36:08 +0200 | 
| commit | 852d5c6f2e65a5ab466662ae1c649a0ed25c70a9 (patch) | |
| tree | 96d6bcdb2f2274f1081a0b6cfbde314f319159a1 /src | |
| parent | a03cea75d9d120a7519be91ec872b143b5d74276 (diff) | |
break!: move offsets out of Token
Previously the Token enum contained the offsets using the O generic
type parameter, which could be a usize if you were tracking offsets or
a zero-sized type if you didn't care about offsets. This commit moves
all the byte offset and syntax information to a new Trace enum,
which has several advantages:
* Traces can now easily be stored separately, while the tokens are
  fed to the tree builder. (The tree builder only has to keep track
  of which tree nodes originate from which tokens.)
* No needless generics for functions that take a token but don't
  care about offsets (a tree construction implementation is bound
  to have many such functions).
* The FromIterator<(String, String)> impl for AttributeMap no longer
  has to specify arbitrary values for the spans and the value_syntax.
* The PartialEq implementation of Token is now much more useful
  (since it no longer includes all the offsets).
* The Debug formatting of Token is now more readable
  (since it no longer includes all the offsets).
* Function pointers to functions accepting tokens are possible.
  (Since function pointer types may not have generic parameters.)
Diffstat (limited to 'src')
| -rw-r--r-- | src/basic_emitter.rs | 2 | ||||
| -rw-r--r-- | src/emitter.rs | 2 | ||||
| -rw-r--r-- | src/let_else.rs | 22 | ||||
| -rw-r--r-- | src/lib.rs | 8 | ||||
| -rw-r--r-- | src/token.rs | 217 | ||||
| -rw-r--r-- | src/tokenizer.rs | 1 | ||||
| -rw-r--r-- | src/tokenizer/machine.rs | 2 | ||||
| -rw-r--r-- | src/trace.rs | 241 | ||||
| -rw-r--r-- | src/tracing_emitter.rs | 148 | 
9 files changed, 432 insertions, 211 deletions
| diff --git a/src/basic_emitter.rs b/src/basic_emitter.rs index 046b645..bcb3f41 100644 --- a/src/basic_emitter.rs +++ b/src/basic_emitter.rs @@ -27,7 +27,7 @@ impl<O> BasicEmitter<O> {  }  impl<O> Iterator for BasicEmitter<O> { -    type Item = Token<O>; +    type Item = Token;      fn next(&mut self) -> Option<Self::Item> {          todo!() diff --git a/src/emitter.rs b/src/emitter.rs index 25e0209..d1e1dfe 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -1,6 +1,6 @@  use std::ops::Range; -use crate::token::AttrValueSyntax; +use crate::trace::AttrValueSyntax;  use crate::Error;  /// An emitter is an object providing methods to the tokenizer to produce ("emit") tokens. diff --git a/src/let_else.rs b/src/let_else.rs index da17a68..a1627f1 100644 --- a/src/let_else.rs +++ b/src/let_else.rs @@ -21,3 +21,25 @@ right: {:?}",  }  pub(crate) use assume; + +/// Binds the given expression to the given pattern, or else executes +/// `unreachable!();` with a helpful panic message and returns. +macro_rules! know { +    ($pattern:pat, $value:expr) => { +        // The expression might change each time it's evaluated, so we +        // have to bind it so that we can reuse it in the panic message. +        let _value = $value; + +        let $pattern = _value else { +            unreachable!( +                "assertion `left matches right` failed: + left: {} +right: {:?}", +                stringify!($pattern), +                _value +            ); +        }; +    }; +} + +pub(crate) use know; @@ -18,19 +18,19 @@ mod tracing_emitter;  /// Types for HTML attributes.  
pub mod attr { -    pub use crate::token::{ -        AttrIntoIter, AttrIter, AttrValueSyntax, Attribute, AttributeMap, AttributeOwned, -    }; +    pub use crate::token::{AttrIntoIter, AttrIter, Attribute, AttributeMap, AttributeOwned}; +    pub use crate::trace::AttrValueSyntax;  }  pub mod offset;  pub mod reader;  pub mod token; +pub mod trace;  pub use basic_emitter::BasicEmitter;  pub use emitter::Emitter;  pub use error::Error;  pub use naive_parser::NaiveParser; -pub use token::{Comment, Doctype, EndTag, StartTag, Token}; +pub use token::{Doctype, EndTag, StartTag, Token};  pub use tokenizer::{CdataAction, Event, State, Tokenizer};  pub use tracing_emitter::TracingEmitter; diff --git a/src/token.rs b/src/token.rs index ed8c8c8..4f3c0ce 100644 --- a/src/token.rs +++ b/src/token.rs @@ -2,32 +2,30 @@  use std::collections::{btree_map, BTreeMap};  use std::iter::FromIterator; -use std::ops::{Index, Range}; - -use crate::offset::Offset; +use std::ops::Index;  /// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer.  #[derive(Debug, Eq, PartialEq)] -pub enum Token<O> { +pub enum Token {      /// A literal character, a resolved character reference,      /// or part of a resolved character reference (since some      /// character references resolve to two `char`s).      Char(char),      /// An HTML start tag. -    StartTag(StartTag<O>), +    StartTag(StartTag),      /// An HTML end tag. -    EndTag(EndTag<O>), +    EndTag(EndTag),      /// An HTML comment. -    Comment(Comment<O>), +    Comment(String),      /// An HTML doctype declaration. -    Doctype(Doctype<O>), +    Doctype(Doctype),      /// An end-of-file token.      EndOfFile,  }  /// An HTML start tag, such as `<p>` or `<a>`.  #[derive(Debug, Eq, PartialEq)] -pub struct StartTag<O> { +pub struct StartTag {      /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be      /// expected.      
pub self_closing: bool, @@ -39,43 +37,15 @@ pub struct StartTag<O> {      /// A mapping for any HTML attributes this start tag may have.      ///      /// Duplicate attributes are ignored after the first one as per WHATWG spec. -    pub attributes: AttributeMap<O>, - -    /// The source code span of the tag. -    pub span: Range<O>, - -    /// The span of the tag name. -    pub name_span: Range<O>, +    pub attributes: AttributeMap,  }  /// An HTML end/close tag, such as `</p>` or `</a>`.  #[derive(Debug, Eq, PartialEq)] -pub struct EndTag<O> { +pub struct EndTag {      /// The tag name.      /// Uppercase ASCII characters (A-Z) have been converted to lowercase.      pub name: String, - -    /// The source code span of the tag. -    pub span: Range<O>, - -    /// The span of the tag name. -    pub name_span: Range<O>, -} - -/// An HTML comment. -#[derive(PartialEq, Eq, Debug)] -pub struct Comment<O> { -    /// The text within the comment. -    pub data: String, -    /// The source offset of the comment data. -    pub data_span: Range<O>, -} - -impl<O: Offset> Comment<O> { -    /// Returns the span for the comment data. -    pub fn data_span(&self) -> Range<O> { -        self.data_span.clone() -    }  }  /// A doctype. Some examples: @@ -85,7 +55,7 @@ impl<O: Offset> Comment<O> {  /// * `<!DOCTYPE {name} SYSTEM '{system_id}'>`  /// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>`  #[derive(Debug, Eq, PartialEq)] -pub struct Doctype<O> { +pub struct Doctype {      /// The [force-quirks flag].      ///      /// [force-quirks flag]: https://html.spec.whatwg.org/multipage/parsing.html#force-quirks-flag @@ -100,38 +70,6 @@ pub struct Doctype<O> {      /// The doctype's system identifier.      pub system_id: Option<String>, - -    /// The source code span of the doctype. -    pub span: Range<O>, - -    /// The span of the name. -    pub(crate) name_span: Range<O>, - -    /// The span of the public identifier. 
-    pub(crate) public_id_span: Range<O>, - -    /// The span of the system identifier. -    pub(crate) system_id_span: Range<O>, -} - -impl<O: Offset> Doctype<O> { -    /// Returns the span of the name. -    pub fn name_span(&self) -> Option<Range<O>> { -        self.name.as_ref()?; -        Some(self.name_span.clone()) -    } - -    /// Returns the span of the public identifier. -    pub fn public_id_span(&self) -> Option<Range<O>> { -        self.public_id.as_ref()?; -        Some(self.public_id_span.clone()) -    } - -    /// Returns the span of the system identifier. -    pub fn system_id_span(&self) -> Option<Range<O>> { -        self.system_id.as_ref()?; -        Some(self.system_id_span.clone()) -    }  }  /// A map of HTML attributes. @@ -143,79 +81,81 @@ impl<O: Offset> Doctype<O> {  ///  /// ```  /// # use html5tokenizer::attr::AttributeMap; -/// let attrs: AttributeMap<()> = vec![("href".into(), "http://example.com".into())] +/// let attrs: AttributeMap = vec![("href".into(), "http://example.com".into())]  ///     .into_iter()  ///     .collect();  /// assert_eq!(&attrs["href"], "http://example.com");  /// ```  #[derive(Debug, Default, PartialEq, Eq)] -pub struct AttributeMap<O> { -    pub(crate) inner: BTreeMap<String, AttrInternal<O>>, +pub struct AttributeMap { +    pub(crate) inner: BTreeMap<String, AttrInternal>,  }  /// The value type internally used by the [`AttributeMap`].  /// Not part of the public API. -#[derive(Default, Debug, Eq, PartialEq)] -pub(crate) struct AttrInternal<O> { +#[derive(Default, Debug, Eq)] +pub(crate) struct AttrInternal {      pub value: String, -    /// The span of the attribute name. -    pub name_span: Range<O>, -    /// The span of the attribute value. -    /// For the empty attribute syntax this is just `O::default()..O::default()`. -    /// We intentionally don't use `Option<Range<O>>` here to spare us a byte (and padding) per attribute. 
-    pub value_span: Range<O>, -    pub value_syntax: Option<AttrValueSyntax>, +    pub trace_idx: Option<AttributeTraceIdx>,  } -/// The syntax of the attribute value. -#[derive(Clone, Copy, PartialEq, Eq, Debug)] -pub enum AttrValueSyntax { -    /// An unquoted attribute value, e.g. `id=foo`. -    Unquoted, -    /// A single-quoted attribute value, e.g. `id='foo'`. -    SingleQuoted, -    /// A double-quoted attribute value, e.g. `id="foo"`. -    DoubleQuoted, +/// The index of an [`AttributeTrace`] within an [`AttributeTraceList`]. +/// +/// [`AttributeTrace`]: crate::trace::AttributeTrace +/// [`AttributeTraceList`]: crate::trace::AttributeTraceList +#[derive(Clone, Copy, Eq, PartialEq, Debug)] +pub struct AttributeTraceIdx( +    // Using NonZeroUsize so that `Option<AttributeTraceIdx>` +    // has the same size as `AttributeTraceIdx`. +    pub std::num::NonZeroUsize, +); + +impl PartialEq for AttrInternal { +    fn eq(&self, other: &Self) -> bool { +        // We intentionally don't include the trace_idx, +        // so that PartialEq of Token only compares semantics. +        self.value == other.value +    }  }  /// An HTML attribute borrowed from an [`AttributeMap`].  #[derive(Debug, Eq, PartialEq)] -pub struct Attribute<'a, O> { +pub struct Attribute<'a> {      name: &'a str, -    map_val: &'a AttrInternal<O>, +    map_val: &'a AttrInternal,  }  /// An owned HTML attribute.  #[derive(Debug, PartialEq, Eq)] -pub struct AttributeOwned<O> { +pub struct AttributeOwned {      /// The attribute name.      /// Uppercase ASCII characters (A-Z) have been converted to lowercase.      pub name: String,      /// The attribute value. Character references have been resolved.      pub value: String, -    /// The span of the attribute name. -    pub name_span: Range<O>, -    /// The span of the attribute value. -    /// `None` in case of the empty attribute syntax (e.g. `disabled` in `<input disabled>`). 
-    pub value_span: Option<Range<O>>, -    /// The syntax of the attribute value. -    /// `None` indicates the empty attribute syntax (e.g. `disabled` in `<input disabled>`). -    pub value_syntax: Option<AttrValueSyntax>, +    /// The index of the corresponding [`AttributeTrace`] in the +    /// `attribute_traces` field of [`StartTagTrace`], in case this attribute +    /// was present in the source and the [`Emitter`] has tracked this. +    /// +    /// [`AttributeTrace`]: super::trace::AttributeTrace +    /// [`StartTagTrace`]: super::trace::AttributeTrace +    /// [`Emitter`]: super::Emitter +    pub trace_idx: Option<AttributeTraceIdx>,  } -impl<O> AttributeMap<O> { +impl AttributeMap {      /// Returns the attribute with the given name.      ///      /// The name must not contain any uppercase ASCII character (A-Z)      /// or the method will always return `None`. -    pub fn get(&self, name: &str) -> Option<Attribute<O>> { +    pub fn get(&self, name: &str) -> Option<Attribute> {          self.inner              .get_key_value(name)              .map(|(name, map_val)| Attribute { name, map_val })      }  } -impl<'a, O: Offset> Attribute<'a, O> { +impl<'a> Attribute<'a> {      /// Returns the attribute name.      /// Uppercase ASCII characters (A-Z) have been converted to lowercase.      pub fn name(&self) -> &'a str { @@ -227,32 +167,21 @@ impl<'a, O: Offset> Attribute<'a, O> {          &self.map_val.value      } -    /// Returns the span of the attribute name. -    pub fn name_span(&self) -> Range<O> { -        self.map_val.name_span.clone() -    } - -    /// For explicitly defined values returns the span of the attribute value. -    /// -    /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`). 
-    pub fn value_span(&self) -> Option<Range<O>> { -        if self.map_val.value_syntax.is_none() { -            return None; -        } -        Some(self.map_val.value_span.clone()) -    } - -    /// Returns the attribute value syntax in case the value is explicitly defined. +    /// Returns the index of the corresponding [`AttributeTrace`] in the +    /// `attribute_traces` field of [`StartTagTrace`], in case this attribute +    /// was present in the source and the [`Emitter`] has tracked that.      /// -    /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`). -    pub fn value_syntax(&self) -> Option<AttrValueSyntax> { -        self.map_val.value_syntax +    /// [`AttributeTrace`]: super::trace::AttributeTrace +    /// [`StartTagTrace`]: super::trace::AttributeTrace +    /// [`Emitter`]: super::Emitter +    pub fn trace_idx(&self) -> Option<AttributeTraceIdx> { +        self.map_val.trace_idx      }  }  // We cannot impl Index<Output=Attribute> because Index::index returns a reference of  // the Output type (and you cannot return a value referencing a temporary value). -impl<O> Index<&str> for AttributeMap<O> { +impl Index<&str> for AttributeMap {      type Output = str;      /// Returns the attribute value with the given name. @@ -264,10 +193,10 @@ impl<O> Index<&str> for AttributeMap<O> {      }  } -impl<O> IntoIterator for AttributeMap<O> { -    type Item = AttributeOwned<O>; +impl IntoIterator for AttributeMap { +    type Item = AttributeOwned; -    type IntoIter = AttrIntoIter<O>; +    type IntoIter = AttrIntoIter;      fn into_iter(self) -> Self::IntoIter {          AttrIntoIter(self.inner.into_iter()) @@ -275,27 +204,25 @@ impl<O> IntoIterator for AttributeMap<O> {  }  /// A consuming iterator over the attributes of an [`AttributeMap`]. 
-pub struct AttrIntoIter<O>(btree_map::IntoIter<String, AttrInternal<O>>); +pub struct AttrIntoIter(btree_map::IntoIter<String, AttrInternal>); -impl<O> Iterator for AttrIntoIter<O> { -    type Item = AttributeOwned<O>; +impl Iterator for AttrIntoIter { +    type Item = AttributeOwned;      fn next(&mut self) -> Option<Self::Item> {          let (name, map_val) = self.0.next()?;          Some(AttributeOwned {              name,              value: map_val.value, -            name_span: map_val.name_span, -            value_span: map_val.value_syntax.is_some().then_some(map_val.value_span), -            value_syntax: map_val.value_syntax, +            trace_idx: map_val.trace_idx,          })      }  } -impl<'a, O> IntoIterator for &'a AttributeMap<O> { -    type Item = Attribute<'a, O>; +impl<'a> IntoIterator for &'a AttributeMap { +    type Item = Attribute<'a>; -    type IntoIter = AttrIter<'a, O>; +    type IntoIter = AttrIter<'a>;      fn into_iter(self) -> Self::IntoIter {          AttrIter(self.inner.iter()) @@ -303,10 +230,10 @@ impl<'a, O> IntoIterator for &'a AttributeMap<O> {  }  /// A borrowed iterator over the attributes of an [`AttributeMap`]. 
-pub struct AttrIter<'a, S>(btree_map::Iter<'a, String, AttrInternal<S>>); +pub struct AttrIter<'a>(btree_map::Iter<'a, String, AttrInternal>); -impl<'a, S> Iterator for AttrIter<'a, S> { -    type Item = Attribute<'a, S>; +impl<'a> Iterator for AttrIter<'a> { +    type Item = Attribute<'a>;      fn next(&mut self) -> Option<Self::Item> {          let (name, map_val) = self.0.next()?; @@ -314,7 +241,7 @@ impl<'a, S> Iterator for AttrIter<'a, S> {      }  } -impl<O: Default> FromIterator<(String, String)> for AttributeMap<O> { +impl FromIterator<(String, String)> for AttributeMap {      fn from_iter<T: IntoIterator<Item = (String, String)>>(iter: T) -> Self {          Self {              inner: iter @@ -324,9 +251,7 @@ impl<O: Default> FromIterator<(String, String)> for AttributeMap<O> {                          name,                          AttrInternal {                              value, -                            name_span: O::default()..O::default(), -                            value_span: O::default()..O::default(), -                            value_syntax: Some(AttrValueSyntax::DoubleQuoted), +                            trace_idx: None,                          },                      )                  }) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index d0e2eaf..decd4df 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -20,6 +20,7 @@ pub use machine::State as InternalState;  /// let emitter = DefaultEmitter::default();  /// let html = "<script><b>";  /// let mut tokens = Tokenizer::new(html, emitter).flatten(); +/// let mut tokens = tokens.map(|event| match event { Event::Token((token, _)) => Event::Token(token), Event::CdataOpen => Event::CdataOpen }); // TODO: remove once BasicEmitter can be used instead  /// assert!(matches!(tokens.next(), Some(Event::Token(Token::StartTag(_)))));  /// assert!(matches!(tokens.next(), Some(Event::Token(Token::StartTag(_)))));  /// ``` diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs index 
8b09aa7..9aaac73 100644 --- a/src/tokenizer/machine.rs +++ b/src/tokenizer/machine.rs @@ -2,8 +2,8 @@ mod utils;  use crate::entities::try_read_character_reference;  use crate::offset::{Offset, Position}; -use crate::token::AttrValueSyntax;  use crate::tokenizer::CdataAction; +use crate::trace::AttrValueSyntax;  use crate::{reader::Reader, Emitter, Error};  use utils::{      ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat, diff --git a/src/trace.rs b/src/trace.rs new file mode 100644 index 0000000..a816429 --- /dev/null +++ b/src/trace.rs @@ -0,0 +1,241 @@ +//! Provides the [`Trace`] type (byte offsets and syntax information about tokens). + +use std::{ +    num::NonZeroUsize, +    ops::{Index, Range}, +}; + +use crate::let_else::assume; +use crate::token::AttributeTraceIdx; + +/// Provides byte offsets and syntax information about a [`Token`]. +/// +/// [`Token`]: crate::token::Token +#[allow(missing_docs)] +#[derive(Eq, PartialEq, Debug)] +pub enum Trace { +    Char, +    StartTag(StartTagTrace), +    EndTag(EndTagTrace), +    Comment(CommentTrace), +    Doctype(DoctypeTrace), +    EndOfFile, +} + +/// Provides byte offsets and syntax information for a [`StartTag`] token. +/// +/// [`StartTag`]: crate::token::StartTag +#[derive(Eq, PartialEq, Debug)] +pub struct StartTagTrace { +    /// The span of the tag. +    pub span: Range<usize>, + +    /// The span of the tag name. +    pub name_span: Range<usize>, + +    /// List of [`AttributeTrace`]s for the attributes that were present in the source. +    pub attribute_traces: AttributeTraceList, +} + +/// Provides byte offsets for an [`EndTag`] token. +/// +/// [`EndTag`]: crate::token::EndTag +#[derive(Eq, PartialEq, Debug)] +pub struct EndTagTrace { +    /// The span of the tag. +    pub span: Range<usize>, + +    /// The span of the tag name. +    pub name_span: Range<usize>, +} + +/// Provides byte offsets for a [`Token::Comment`]. 
+/// +/// [`Token::Comment`]: crate::token::Token::Comment +#[derive(Eq, PartialEq, Debug)] +pub struct CommentTrace { +    /// The offset of the comment data. +    pub data_span: Range<usize>, +} + +/// Provides byte offsets for a [`Doctype`] token. +/// +/// [`Doctype`]: crate::token::Doctype +#[derive(Eq, PartialEq, Debug)] +pub struct DoctypeTrace { +    pub(crate) span: Range<usize>, +    // Using NonZeroUsize to optimize the size of the struct. +    name_span: Option<Range<std::num::NonZeroUsize>>, +    public_id_span: Option<Range<std::num::NonZeroUsize>>, +    system_id_span: Option<Range<std::num::NonZeroUsize>>, +} + +impl DoctypeTrace { +    /// Returns the span of the DOCTYPE. +    pub fn span(&self) -> Range<usize> { +        self.span.clone() +    } + +    /// Returns the span of the name. +    pub fn name_span(&self) -> Option<Range<usize>> { +        self.name_span +            .as_ref() +            .map(|range| range.start.get()..range.end.get()) +    } + +    /// Returns the span of the public identifier. +    pub fn public_id_span(&self) -> Option<Range<usize>> { +        self.public_id_span +            .as_ref() +            .map(|range| range.start.get()..range.end.get()) +    } + +    /// Returns the span of the system identifier. +    pub fn system_id_span(&self) -> Option<Range<usize>> { +        self.system_id_span +            .as_ref() +            .map(|range| range.start.get()..range.end.get()) +    } +} + +/// Internal [`DoctypeTrace`] methods. +/// +/// Note that even though it stands to reason that the offsets provided to the `set_` +/// methods can never be zero, we intentionally don't use `new_unchecked` since +/// actually verifying that the offsets provided to the respective Emitter methods can +/// never be zero would non-trivial (since the tokenizer state machine has 80 states). 
+impl DoctypeTrace { +    #[inline] +    pub(crate) fn new(span_start: usize) -> Self { +        Self { +            span: span_start..0, +            name_span: None, +            public_id_span: None, +            system_id_span: None, +        } +    } + +    #[inline] +    pub(crate) fn set_name_start(&mut self, start: usize) { +        let start = NonZeroUsize::new(start).expect("expected offset to be non-zero"); +        self.name_span = Some(start..start); +    } + +    #[inline] +    pub(crate) fn set_public_id_start(&mut self, start: usize) { +        let start = NonZeroUsize::new(start).expect("expected offset to be non-zero"); +        self.public_id_span = Some(start..start); +    } + +    #[inline] +    pub(crate) fn set_system_id_start(&mut self, start: usize) { +        let start = NonZeroUsize::new(start).expect("expected offset to be non-zero"); +        self.system_id_span = Some(start..start); +    } + +    #[inline] +    pub(crate) fn set_name_end(&mut self, end: usize) { +        assume!(Some(span), &mut self.name_span); +        span.end = NonZeroUsize::new(end).expect("expected offset to be non-zero"); +    } + +    #[inline] +    pub(crate) fn set_public_id_end(&mut self, end: usize) { +        assume!(Some(span), &mut self.public_id_span); +        span.end = NonZeroUsize::new(end).expect("expected offset to be non-zero"); +    } + +    #[inline] +    pub(crate) fn set_system_id_end(&mut self, end: usize) { +        assume!(Some(span), &mut self.system_id_span); +        span.end = NonZeroUsize::new(end).expect("expected offset to be non-zero"); +    } +} + +/// The syntax of the attribute value. +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum AttrValueSyntax { +    /// An unquoted attribute value, e.g. `id=foo`. +    Unquoted, +    /// A single-quoted attribute value, e.g. `id='foo'`. +    SingleQuoted, +    /// A double-quoted attribute value, e.g. `id="foo"`. 
+    DoubleQuoted, +} + +/// Provides byte offsets and the [`AttrValueSyntax`] for an attribute that was present in the source. +#[derive(Eq, PartialEq, Debug)] +pub struct AttributeTrace { +    pub(crate) value_syntax: Option<AttrValueSyntax>, +    pub(crate) name_span: Range<usize>, +    /// We intentionally don't use `Option<Range<O>>` here to spare us a byte (and padding) per attribute. +    /// For the empty attribute syntax this is just `O::default()..O::default()`. +    pub(crate) value_span: Range<usize>, +} + +impl AttributeTrace { +    /// [`AttributeTrace`] intentionally doesn't implement Default +    /// (since it's part of the public API and it wouldn't make sense semantically). +    pub(crate) fn new() -> Self { +        Self { +            value_syntax: None, +            name_span: Default::default(), +            value_span: Default::default(), +        } +    } + +    /// Returns the span of the attribute name. +    pub fn name_span(&self) -> Range<usize> { +        self.name_span.clone() +    } + +    /// For explicitly defined values returns the span of the attribute value. +    /// +    /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`). +    pub fn value_span(&self) -> Option<Range<usize>> { +        if self.value_syntax.is_none() { +            return None; +        } +        Some(self.value_span.clone()) +    } + +    /// Returns the attribute value syntax in case the value is explicitly defined. +    /// +    /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`). +    pub fn value_syntax(&self) -> Option<AttrValueSyntax> { +        self.value_syntax +    } +} + +/// List of [`AttributeTrace`]s for the attributes that were present in the source. 
+#[derive(Eq, PartialEq, Debug)] +pub struct AttributeTraceList { +    /// We don't use `HashMap<String, AttributeTrace>` since this would require +    /// the attribute names to be cloned (which would be less efficient). +    traces: Vec<AttributeTrace>, +} + +impl Index<AttributeTraceIdx> for AttributeTraceList { +    type Output = AttributeTrace; + +    fn index(&self, index: AttributeTraceIdx) -> &Self::Output { +        &self.traces[index.0.get() - 1] +    } +} + +impl AttributeTraceList { +    pub(crate) fn new() -> Self { +        Self { +            traces: Default::default(), +        } +    } + +    pub(crate) fn insert(&mut self, trace: AttributeTrace) -> AttributeTraceIdx { +        self.traces.push(trace); +        let len = self.traces.len(); +        AttributeTraceIdx( +            // SAFETY: len cannot be zero because we push before calling Vec::len. +            unsafe { std::num::NonZeroUsize::new_unchecked(len) }, +        ) +    } +} diff --git a/src/tracing_emitter.rs b/src/tracing_emitter.rs index 76b20bf..408e832 100644 --- a/src/tracing_emitter.rs +++ b/src/tracing_emitter.rs @@ -3,20 +3,25 @@ use std::collections::BTreeSet;  use std::collections::VecDeque;  use std::ops::Range; -use crate::let_else::assume; -use crate::token::{AttrValueSyntax, Comment, Doctype, EndTag, StartTag}; +use crate::let_else::{assume, know}; +use crate::token::{Doctype, EndTag, StartTag, Token}; +use crate::trace::AttributeTrace; +use crate::trace::AttributeTraceList; +use crate::trace::{ +    AttrValueSyntax, CommentTrace, DoctypeTrace, EndTagTrace, StartTagTrace, Trace, +};  use crate::Emitter;  use crate::Error; -type Token = crate::token::Token<usize>; -  /// The default implementation of [`Emitter`], used to produce tokens.  
pub struct TracingEmitter {      current_token: Option<Token>, +    current_trace: Option<Trace>,      current_attribute_name: String, -    current_attr_internal: crate::token::AttrInternal<usize>, +    current_attr_internal: crate::token::AttrInternal, +    current_attribute_trace: crate::trace::AttributeTrace,      seen_attributes: BTreeSet<String>, -    emitted_tokens: VecDeque<Token>, +    emitted_tokens: VecDeque<(Token, Trace)>,      errors: VecDeque<(Error, Range<usize>)>,      attr_in_end_tag_span: Option<Range<usize>>,  } @@ -25,8 +30,10 @@ impl Default for TracingEmitter {      fn default() -> Self {          TracingEmitter {              current_token: None, +            current_trace: None,              current_attribute_name: String::new(),              current_attr_internal: Default::default(), +            current_attribute_trace: crate::trace::AttributeTrace::new(),              seen_attributes: BTreeSet::new(),              emitted_tokens: VecDeque::new(),              errors: VecDeque::new(), @@ -43,7 +50,7 @@ impl TracingEmitter {  }  impl Iterator for TracingEmitter { -    type Item = Token; +    type Item = (Token, Trace);      fn next(&mut self) -> Option<Self::Item> {          self.emitted_tokens.pop_back() @@ -56,27 +63,32 @@ impl Emitter<usize> for TracingEmitter {      }      fn emit_char(&mut self, c: char) { -        self.emit_token(Token::Char(c)); +        self.emit_token(Token::Char(c), Trace::Char);      }      fn emit_eof(&mut self) { -        self.emit_token(Token::EndOfFile); +        self.emit_token(Token::EndOfFile, Trace::EndOfFile);      }      fn init_start_tag(&mut self, tag_offset: usize, name_offset: usize) {          self.current_token = Some(Token::StartTag(StartTag { -            span: tag_offset..0,              self_closing: false,              name: String::new(),              attributes: Default::default(), +        })); +        self.current_trace = Some(Trace::StartTag(StartTagTrace { +            span: 
tag_offset..0,              name_span: name_offset..0, +            attribute_traces: AttributeTraceList::new(),          }));      }      fn init_end_tag(&mut self, tag_offset: usize, name_offset: usize) {          self.current_token = Some(Token::EndTag(EndTag { -            span: tag_offset..0,              name: String::new(), +        })); +        self.current_trace = Some(Trace::EndTag(EndTagTrace { +            span: tag_offset..0,              name_span: name_offset..0,          }));          self.seen_attributes.clear(); @@ -93,17 +105,17 @@ impl Emitter<usize> for TracingEmitter {      fn terminate_tag_name(&mut self, offset: usize) {          assume!(              Some( -                Token::StartTag(StartTag { name_span, .. }) -                    | Token::EndTag(EndTag { name_span, .. }) +                Trace::StartTag(StartTagTrace { name_span, .. }) +                    | Trace::EndTag(EndTagTrace { name_span, .. })              ), -            &mut self.current_token +            &mut self.current_trace          );          name_span.end = offset;      }      fn init_attribute_name(&mut self, offset: usize) {          self.flush_current_attribute(); -        self.current_attr_internal.name_span.start = offset; +        self.current_attribute_trace.name_span.start = offset;      }      fn push_attribute_name(&mut self, s: &str) { @@ -111,12 +123,12 @@ impl Emitter<usize> for TracingEmitter {      }      fn terminate_attribute_name(&mut self, offset: usize) { -        self.current_attr_internal.name_span.end = offset; +        self.current_attribute_trace.name_span.end = offset;      }      fn init_attribute_value(&mut self, syntax: AttrValueSyntax, offset: usize) { -        self.current_attr_internal.value_span.start = offset; -        self.current_attr_internal.value_syntax = Some(syntax); +        self.current_attribute_trace.value_span.start = offset; +        self.current_attribute_trace.value_syntax = Some(syntax);      }      fn 
push_attribute_value(&mut self, s: &str) { @@ -124,7 +136,7 @@ impl Emitter<usize> for TracingEmitter {      }      fn terminate_attribute_value(&mut self, offset: usize) { -        self.current_attr_internal.value_span.end = offset; +        self.current_attribute_trace.value_span.end = offset;      }      fn set_self_closing(&mut self, slash_span: Range<usize>) { @@ -144,43 +156,47 @@ impl Emitter<usize> for TracingEmitter {      fn emit_current_tag(&mut self, offset: usize) {          self.flush_current_attribute();          let mut token = self.current_token.take().unwrap(); +        let mut trace = self.current_trace.take().unwrap();          match &mut token { -            Token::EndTag(tag) => { +            Token::EndTag(_) => {                  if !self.seen_attributes.is_empty() {                      let span = self.attr_in_end_tag_span.take().unwrap();                      self.report_error(Error::EndTagWithAttributes, span);                  }                  self.seen_attributes.clear(); -                tag.span.end = offset; +                know!(Trace::EndTag(tag_trace), &mut trace); +                tag_trace.span.end = offset;              } -            Token::StartTag(tag) => { -                tag.span.end = offset; +            Token::StartTag(_) => { +                know!(Trace::StartTag(tag_trace), &mut trace); +                tag_trace.span.end = offset;              }              other => {                  debug_assert!(false, "unexpected current_token: {other:?}");                  return;              }          } -        self.emit_token(token); +        self.emit_token(token, trace);      }      fn init_comment(&mut self, data_start_offset: usize) { -        self.current_token = Some(Token::Comment(Comment { -            data: String::new(), +        self.current_token = Some(Token::Comment(String::new())); +        self.current_trace = Some(Trace::Comment(CommentTrace {              data_span: data_start_offset..0,          
}));      }      fn push_comment(&mut self, s: &str) { -        assume!(Some(Token::Comment(comment)), &mut self.current_token); -        comment.data.push_str(s); +        assume!(Some(Token::Comment(data)), &mut self.current_token); +        data.push_str(s);      }      fn emit_current_comment(&mut self, data_end_offset: usize) { -        let mut token = self.current_token.take().unwrap(); -        assume!(Token::Comment(comment), &mut token); -        comment.data_span.end = data_end_offset; -        self.emit_token(token); +        let token = self.current_token.take().unwrap(); +        let mut trace = self.current_trace.take().unwrap(); +        assume!(Trace::Comment(comment_trace), &mut trace); +        comment_trace.data_span.end = data_end_offset; +        self.emit_token(token, trace);      }      fn init_doctype(&mut self, offset: usize) { @@ -189,17 +205,15 @@ impl Emitter<usize> for TracingEmitter {              force_quirks: false,              public_id: None,              system_id: None, -            span: offset..0, -            name_span: 0..0, -            public_id_span: 0..0, -            system_id_span: 0..0,          })); +        self.current_trace = Some(Trace::Doctype(DoctypeTrace::new(offset)));      }      fn init_doctype_name(&mut self, offset: usize) {          assume!(Some(Token::Doctype(doctype)), &mut self.current_token);          doctype.name = Some("".into()); -        doctype.name_span.start = offset; +        know!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace); +        doctype_trace.set_name_start(offset);      }      fn push_doctype_name(&mut self, s: &str) { @@ -214,14 +228,15 @@ impl Emitter<usize> for TracingEmitter {      }      fn terminate_doctype_name(&mut self, offset: usize) { -        assume!(Some(Token::Doctype(doctype)), &mut self.current_token); -        doctype.name_span.end = offset; +        assume!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace); +        
doctype_trace.set_name_end(offset);      }      fn init_doctype_public_id(&mut self, offset: usize) {          assume!(Some(Token::Doctype(doctype)), &mut self.current_token);          doctype.public_id = Some("".to_owned()); -        doctype.public_id_span.start = offset; +        know!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace); +        doctype_trace.set_public_id_start(offset);      }      fn push_doctype_public_id(&mut self, s: &str) { @@ -236,14 +251,15 @@ impl Emitter<usize> for TracingEmitter {      }      fn terminate_doctype_public_id(&mut self, offset: usize) { -        assume!(Some(Token::Doctype(doctype)), &mut self.current_token); -        doctype.public_id_span.end = offset; +        assume!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace); +        doctype_trace.set_public_id_end(offset);      }      fn init_doctype_system_id(&mut self, offset: usize) {          assume!(Some(Token::Doctype(doctype)), &mut self.current_token);          doctype.system_id = Some("".to_owned()); -        doctype.system_id_span.start = offset; +        know!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace); +        doctype_trace.set_system_id_start(offset);      }      fn push_doctype_system_id(&mut self, s: &str) { @@ -258,8 +274,8 @@ impl Emitter<usize> for TracingEmitter {      }      fn terminate_doctype_system_id(&mut self, offset: usize) { -        assume!(Some(Token::Doctype(doctype)), &mut self.current_token); -        doctype.system_id_span.end = offset; +        assume!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace); +        doctype_trace.set_system_id_end(offset);      }      fn set_force_quirks(&mut self) { @@ -268,15 +284,17 @@ impl Emitter<usize> for TracingEmitter {      }      fn emit_current_doctype(&mut self, offset: usize) { -        assume!(Some(Token::Doctype(mut doctype)), self.current_token.take()); -        doctype.span.end = offset; -        self.emit_token(Token::Doctype(doctype)); +  
      assume!(Some(mut trace), self.current_trace.take()); +        assume!(Trace::Doctype(doctype_trace), &mut trace); +        doctype_trace.span.end = offset; +        let token = self.current_token.take().unwrap(); +        self.emit_token(token, trace);      }  }  impl TracingEmitter { -    fn emit_token(&mut self, token: Token) { -        self.emitted_tokens.push_front(token); +    fn emit_token(&mut self, token: Token, trace: Trace) { +        self.emitted_tokens.push_front((token, trace));      }      fn flush_current_attribute(&mut self) { @@ -284,21 +302,26 @@ impl TracingEmitter {              return;          }          let name = std::mem::take(&mut self.current_attribute_name); -        let attr_internal = std::mem::take(&mut self.current_attr_internal); +        let mut attr_internal = std::mem::take(&mut self.current_attr_internal); +        let attr_trace = +            std::mem::replace(&mut self.current_attribute_trace, AttributeTrace::new());          match &mut self.current_token {              Some(Token::StartTag(tag)) => match tag.attributes.inner.entry(name) {                  Entry::Vacant(vacant) => { +                    know!(Some(Trace::StartTag(trace)), &mut self.current_trace); +                    let trace_idx = trace.attribute_traces.insert(attr_trace); +                    attr_internal.trace_idx = Some(trace_idx);                      vacant.insert(attr_internal);                  }                  Entry::Occupied(_) => { -                    self.report_error(Error::DuplicateAttribute, attr_internal.name_span); +                    self.report_error(Error::DuplicateAttribute, attr_trace.name_span);                  }              },              Some(Token::EndTag(_)) => { -                self.attr_in_end_tag_span = Some(attr_internal.name_span.clone()); +                self.attr_in_end_tag_span = Some(attr_trace.name_span.clone());                  if !self.seen_attributes.insert(name) { -                    
self.report_error(Error::DuplicateAttribute, attr_internal.name_span); +                    self.report_error(Error::DuplicateAttribute, attr_trace.name_span);                  }              }              other => debug_assert!(false, "unexpected current_token: {other:?}"), @@ -306,6 +329,12 @@ impl TracingEmitter {      }  } +impl From<(Token, Trace)> for Token { +    fn from((token, _): (Token, Trace)) -> Self { +        token +    } +} +  /// The majority of our testing of the [`TracingEmitter`] is done against the  /// html5lib-tests in the html5lib integration test. This module only tests  /// details that aren't present in the html5lib test data. @@ -313,8 +342,8 @@ impl TracingEmitter {  mod tests {      use super::TracingEmitter;      use crate::offset::PosTrackingReader; -    use crate::token::{AttrValueSyntax, Token}; -    use crate::{Event, Tokenizer}; +    use crate::trace::{AttrValueSyntax, Trace}; +    use crate::{Event, Token, Tokenizer};      #[test]      fn test_attribute_value_syntax() { @@ -325,7 +354,9 @@ mod tests {              TracingEmitter::default(),          )          .flatten(); -        let Event::Token(Token::StartTag(tag)) = tokenizer.next().unwrap() else { +        let Event::Token((Token::StartTag(tag), Trace::StartTag(tag_trace))) = +            tokenizer.next().unwrap() +        else {              panic!("expected start tag");          };          for (name, syntax) in [ @@ -334,8 +365,9 @@ mod tests {              ("single-quoted", Some(AttrValueSyntax::SingleQuoted)),              ("double-quoted", Some(AttrValueSyntax::DoubleQuoted)),          ] { +            let attr_trace_idx = tag.attributes.get(name).unwrap().trace_idx().unwrap();              assert_eq!( -                tag.attributes.get(name).unwrap().value_syntax(), +                tag_trace.attribute_traces[attr_trace_idx].value_syntax(),                  syntax,                  "unexpected value for attribute {name}"              ); | 
