diff options
| -rw-r--r-- | CHANGELOG.md | 4 | ||||
| -rw-r--r-- | examples/spans.rs | 13 | ||||
| -rw-r--r-- | integration_tests/tests/test_html5lib.rs | 4 | ||||
| -rw-r--r-- | src/basic_emitter.rs | 2 | ||||
| -rw-r--r-- | src/emitter.rs | 2 | ||||
| -rw-r--r-- | src/let_else.rs | 22 | ||||
| -rw-r--r-- | src/lib.rs | 8 | ||||
| -rw-r--r-- | src/token.rs | 217 | ||||
| -rw-r--r-- | src/tokenizer.rs | 1 | ||||
| -rw-r--r-- | src/tokenizer/machine.rs | 2 | ||||
| -rw-r--r-- | src/trace.rs | 241 | ||||
| -rw-r--r-- | src/tracing_emitter.rs | 148 | ||||
| -rw-r--r-- | tests/test_spans.rs | 78 | 
13 files changed, 494 insertions, 248 deletions
| diff --git a/CHANGELOG.md b/CHANGELOG.md index 146d627..f0c1ed6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@  #### Breaking changes +* Byte offsets were moved out of the `Token` enum into a new `Trace` enum. +  * `Token` enum    * Removed the `Error` variant.   @@ -21,6 +23,8 @@  * The `DefaultEmitter` has been renamed to `TracingEmitter`. +* The `DefaultEmitter` now yields `(Token, Trace)` instead of just `Token`. +  * The `DefaultEmitter` now emits `Token::EndOfFile` on the end-of-file.      (Previously it did not emit any token symbolizing the end-of-file.) diff --git a/examples/spans.rs b/examples/spans.rs index c1fe23b..b8d5283 100644 --- a/examples/spans.rs +++ b/examples/spans.rs @@ -4,14 +4,14 @@ use codespan_reporting::{      term,      term::termcolor::{ColorChoice, StandardStream},  }; -use html5tokenizer::{offset::PosTrackingReader, NaiveParser, Token, TracingEmitter}; +use html5tokenizer::{offset::PosTrackingReader, trace::Trace, NaiveParser, Token, TracingEmitter};  fn main() {      let html = r#"<img src=example.jpg alt="some description">"#;      let parser =          NaiveParser::new_with_emitter(PosTrackingReader::new(html), TracingEmitter::default()); -    let Token::StartTag(tag) = parser.flatten().next().unwrap() else { +    let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap() else {          panic!()      }; @@ -20,11 +20,14 @@ fn main() {      let mut labels = Vec::new(); -    labels.push(Label::primary(file_id, tag.name_span).with_message("tag name")); +    labels.push(Label::primary(file_id, trace.name_span).with_message("tag name"));      for attr in &tag.attributes { -        labels.push(Label::primary(file_id, attr.name_span()).with_message("attr name")); -        labels.push(Label::primary(file_id, attr.value_span().unwrap()).with_message("attr value")); +        let attr_trace = &trace.attribute_traces[attr.trace_idx().unwrap()]; +        labels.push(Label::primary(file_id, 
attr_trace.name_span()).with_message("attr name")); +        labels.push( +            Label::primary(file_id, attr_trace.value_span().unwrap()).with_message("attr value"), +        );      }      let diagnostic = Diagnostic::note().with_labels(labels); diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs index eac11dd..42d93f1 100644 --- a/integration_tests/tests/test_html5lib.rs +++ b/integration_tests/tests/test_html5lib.rs @@ -107,7 +107,7 @@ fn run_test_inner<R, O, E, T>(      R: Reader + Position<O>,      O: Offset,      E: Emitter<O> + Iterator<Item = T> + DrainErrors<O>, -    T: Into<Token<O>>, +    T: Into<Token>,  {      println!(          "==== FILE {}, TEST {}, STATE {:?}, TOKENIZER {} ====", @@ -156,7 +156,7 @@ fn run_test_inner<R, O, E, T>(                      actual_tokens.push(TestToken::Character(c.into()));                  }              } -            Token::Comment(comment) => actual_tokens.push(TestToken::Comment(comment.data)), +            Token::Comment(comment) => actual_tokens.push(TestToken::Comment(comment)),              Token::Doctype(doctype) => actual_tokens.push(TestToken::Doctype {                  name: doctype.name,                  public_id: doctype.public_id, diff --git a/src/basic_emitter.rs b/src/basic_emitter.rs index 046b645..bcb3f41 100644 --- a/src/basic_emitter.rs +++ b/src/basic_emitter.rs @@ -27,7 +27,7 @@ impl<O> BasicEmitter<O> {  }  impl<O> Iterator for BasicEmitter<O> { -    type Item = Token<O>; +    type Item = Token;      fn next(&mut self) -> Option<Self::Item> {          todo!() diff --git a/src/emitter.rs b/src/emitter.rs index 25e0209..d1e1dfe 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -1,6 +1,6 @@  use std::ops::Range; -use crate::token::AttrValueSyntax; +use crate::trace::AttrValueSyntax;  use crate::Error;  /// An emitter is an object providing methods to the tokenizer to produce ("emit") tokens. 
diff --git a/src/let_else.rs b/src/let_else.rs index da17a68..a1627f1 100644 --- a/src/let_else.rs +++ b/src/let_else.rs @@ -21,3 +21,25 @@ right: {:?}",  }  pub(crate) use assume; + +/// Binds the given expression to the given pattern, or else panics via +/// `unreachable!` with a helpful panic message. +macro_rules! know { +    ($pattern:pat, $value:expr) => { +        // The expression might change each time it's evaluated, so we +        // have to bind it so that we can reuse it in the panic message. +        let _value = $value; + +        let $pattern = _value else { +            unreachable!( +                "assertion `left matches right` failed: + left: {} +right: {:?}", +                stringify!($pattern), +                _value +            ); +        }; +    }; +} + +pub(crate) use know; @@ -18,19 +18,19 @@ mod tracing_emitter;  /// Types for HTML attributes.  pub mod attr { -    pub use crate::token::{ -        AttrIntoIter, AttrIter, AttrValueSyntax, Attribute, AttributeMap, AttributeOwned, -    }; +    pub use crate::token::{AttrIntoIter, AttrIter, Attribute, AttributeMap, AttributeOwned}; +    pub use crate::trace::AttrValueSyntax;  }  pub mod offset;  pub mod reader;  pub mod token; +pub mod trace;  pub use basic_emitter::BasicEmitter;  pub use emitter::Emitter;  pub use error::Error;  pub use naive_parser::NaiveParser; -pub use token::{Comment, Doctype, EndTag, StartTag, Token}; +pub use token::{Doctype, EndTag, StartTag, Token};  pub use tokenizer::{CdataAction, Event, State, Tokenizer};  pub use tracing_emitter::TracingEmitter; diff --git a/src/token.rs b/src/token.rs index ed8c8c8..4f3c0ce 100644 --- a/src/token.rs +++ b/src/token.rs @@ -2,32 +2,30 @@  use std::collections::{btree_map, BTreeMap};  use std::iter::FromIterator; -use std::ops::{Index, Range}; - -use crate::offset::Offset; +use std::ops::Index;  /// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer.  
#[derive(Debug, Eq, PartialEq)] -pub enum Token<O> { +pub enum Token {      /// A literal character, a resolved character reference,      /// or part of a resolved character reference (since some      /// character references resolve to two `char`s).      Char(char),      /// An HTML start tag. -    StartTag(StartTag<O>), +    StartTag(StartTag),      /// An HTML end tag. -    EndTag(EndTag<O>), +    EndTag(EndTag),      /// An HTML comment. -    Comment(Comment<O>), +    Comment(String),      /// An HTML doctype declaration. -    Doctype(Doctype<O>), +    Doctype(Doctype),      /// An end-of-file token.      EndOfFile,  }  /// An HTML start tag, such as `<p>` or `<a>`.  #[derive(Debug, Eq, PartialEq)] -pub struct StartTag<O> { +pub struct StartTag {      /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be      /// expected.      pub self_closing: bool, @@ -39,43 +37,15 @@ pub struct StartTag<O> {      /// A mapping for any HTML attributes this start tag may have.      ///      /// Duplicate attributes are ignored after the first one as per WHATWG spec. -    pub attributes: AttributeMap<O>, - -    /// The source code span of the tag. -    pub span: Range<O>, - -    /// The span of the tag name. -    pub name_span: Range<O>, +    pub attributes: AttributeMap,  }  /// An HTML end/close tag, such as `</p>` or `</a>`.  #[derive(Debug, Eq, PartialEq)] -pub struct EndTag<O> { +pub struct EndTag {      /// The tag name.      /// Uppercase ASCII characters (A-Z) have been converted to lowercase.      pub name: String, - -    /// The source code span of the tag. -    pub span: Range<O>, - -    /// The span of the tag name. -    pub name_span: Range<O>, -} - -/// An HTML comment. -#[derive(PartialEq, Eq, Debug)] -pub struct Comment<O> { -    /// The text within the comment. -    pub data: String, -    /// The source offset of the comment data. 
-    pub data_span: Range<O>, -} - -impl<O: Offset> Comment<O> { -    /// Returns the span for the comment data. -    pub fn data_span(&self) -> Range<O> { -        self.data_span.clone() -    }  }  /// A doctype. Some examples: @@ -85,7 +55,7 @@ impl<O: Offset> Comment<O> {  /// * `<!DOCTYPE {name} SYSTEM '{system_id}'>`  /// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>`  #[derive(Debug, Eq, PartialEq)] -pub struct Doctype<O> { +pub struct Doctype {      /// The [force-quirks flag].      ///      /// [force-quirks flag]: https://html.spec.whatwg.org/multipage/parsing.html#force-quirks-flag @@ -100,38 +70,6 @@ pub struct Doctype<O> {      /// The doctype's system identifier.      pub system_id: Option<String>, - -    /// The source code span of the doctype. -    pub span: Range<O>, - -    /// The span of the name. -    pub(crate) name_span: Range<O>, - -    /// The span of the public identifier. -    pub(crate) public_id_span: Range<O>, - -    /// The span of the system identifier. -    pub(crate) system_id_span: Range<O>, -} - -impl<O: Offset> Doctype<O> { -    /// Returns the span of the name. -    pub fn name_span(&self) -> Option<Range<O>> { -        self.name.as_ref()?; -        Some(self.name_span.clone()) -    } - -    /// Returns the span of the public identifier. -    pub fn public_id_span(&self) -> Option<Range<O>> { -        self.public_id.as_ref()?; -        Some(self.public_id_span.clone()) -    } - -    /// Returns the span of the system identifier. -    pub fn system_id_span(&self) -> Option<Range<O>> { -        self.system_id.as_ref()?; -        Some(self.system_id_span.clone()) -    }  }  /// A map of HTML attributes. 
@@ -143,79 +81,81 @@ impl<O: Offset> Doctype<O> {  ///  /// ```  /// # use html5tokenizer::attr::AttributeMap; -/// let attrs: AttributeMap<()> = vec![("href".into(), "http://example.com".into())] +/// let attrs: AttributeMap = vec![("href".into(), "http://example.com".into())]  ///     .into_iter()  ///     .collect();  /// assert_eq!(&attrs["href"], "http://example.com");  /// ```  #[derive(Debug, Default, PartialEq, Eq)] -pub struct AttributeMap<O> { -    pub(crate) inner: BTreeMap<String, AttrInternal<O>>, +pub struct AttributeMap { +    pub(crate) inner: BTreeMap<String, AttrInternal>,  }  /// The value type internally used by the [`AttributeMap`].  /// Not part of the public API. -#[derive(Default, Debug, Eq, PartialEq)] -pub(crate) struct AttrInternal<O> { +#[derive(Default, Debug, Eq)] +pub(crate) struct AttrInternal {      pub value: String, -    /// The span of the attribute name. -    pub name_span: Range<O>, -    /// The span of the attribute value. -    /// For the empty attribute syntax this is just `O::default()..O::default()`. -    /// We intentionally don't use `Option<Range<O>>` here to spare us a byte (and padding) per attribute. -    pub value_span: Range<O>, -    pub value_syntax: Option<AttrValueSyntax>, +    pub trace_idx: Option<AttributeTraceIdx>,  } -/// The syntax of the attribute value. -#[derive(Clone, Copy, PartialEq, Eq, Debug)] -pub enum AttrValueSyntax { -    /// An unquoted attribute value, e.g. `id=foo`. -    Unquoted, -    /// A single-quoted attribute value, e.g. `id='foo'`. -    SingleQuoted, -    /// A double-quoted attribute value, e.g. `id="foo"`. -    DoubleQuoted, +/// The index of an [`AttributeTrace`] within an [`AttributeTraceList`]. 
+/// +/// [`AttributeTrace`]: crate::trace::AttributeTrace +/// [`AttributeTraceList`]: crate::trace::AttributeTraceList +#[derive(Clone, Copy, Eq, PartialEq, Debug)] +pub struct AttributeTraceIdx( +    // Using NonZeroUsize so that `Option<AttributeTraceIdx>` +    // has the same size as `AttributeTraceIdx`. +    pub std::num::NonZeroUsize, +); + +impl PartialEq for AttrInternal { +    fn eq(&self, other: &Self) -> bool { +        // We intentionally don't include the trace_idx, +        // so that PartialEq of Token only compares semantics. +        self.value == other.value +    }  }  /// An HTML attribute borrowed from an [`AttributeMap`].  #[derive(Debug, Eq, PartialEq)] -pub struct Attribute<'a, O> { +pub struct Attribute<'a> {      name: &'a str, -    map_val: &'a AttrInternal<O>, +    map_val: &'a AttrInternal,  }  /// An owned HTML attribute.  #[derive(Debug, PartialEq, Eq)] -pub struct AttributeOwned<O> { +pub struct AttributeOwned {      /// The attribute name.      /// Uppercase ASCII characters (A-Z) have been converted to lowercase.      pub name: String,      /// The attribute value. Character references have been resolved.      pub value: String, -    /// The span of the attribute name. -    pub name_span: Range<O>, -    /// The span of the attribute value. -    /// `None` in case of the empty attribute syntax (e.g. `disabled` in `<input disabled>`). -    pub value_span: Option<Range<O>>, -    /// The syntax of the attribute value. -    /// `None` indicates the empty attribute syntax (e.g. `disabled` in `<input disabled>`). -    pub value_syntax: Option<AttrValueSyntax>, +    /// The index of the corresponding [`AttributeTrace`] in the +    /// `attribute_traces` field of [`StartTagTrace`], in case this attribute +    /// was present in the source and the [`Emitter`] has tracked this. 
+    /// +    /// [`AttributeTrace`]: super::trace::AttributeTrace +    /// [`StartTagTrace`]: super::trace::StartTagTrace +    /// [`Emitter`]: super::Emitter +    pub trace_idx: Option<AttributeTraceIdx>,  } -impl<O> AttributeMap<O> { +impl AttributeMap {      /// Returns the attribute with the given name.      ///      /// The name must not contain any uppercase ASCII character (A-Z)      /// or the method will always return `None`. -    pub fn get(&self, name: &str) -> Option<Attribute<O>> { +    pub fn get(&self, name: &str) -> Option<Attribute> {          self.inner              .get_key_value(name)              .map(|(name, map_val)| Attribute { name, map_val })      }  } -impl<'a, O: Offset> Attribute<'a, O> { +impl<'a> Attribute<'a> {      /// Returns the attribute name.      /// Uppercase ASCII characters (A-Z) have been converted to lowercase.      pub fn name(&self) -> &'a str { @@ -227,32 +167,21 @@ impl<'a, O: Offset> Attribute<'a, O> {          &self.map_val.value      } -    /// Returns the span of the attribute name. -    pub fn name_span(&self) -> Range<O> { -        self.map_val.name_span.clone() -    } - -    /// For explicitly defined values returns the span of the attribute value. -    /// -    /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`). -    pub fn value_span(&self) -> Option<Range<O>> { -        if self.map_val.value_syntax.is_none() { -            return None; -        } -        Some(self.map_val.value_span.clone()) -    } - -    /// Returns the attribute value syntax in case the value is explicitly defined. +    /// Returns the index of the corresponding [`AttributeTrace`] in the +    /// `attribute_traces` field of [`StartTagTrace`], in case this attribute +    /// was present in the source and the [`Emitter`] has tracked that.      /// -    /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`). 
-    pub fn value_syntax(&self) -> Option<AttrValueSyntax> { -        self.map_val.value_syntax +    /// [`AttributeTrace`]: super::trace::AttributeTrace +    /// [`StartTagTrace`]: super::trace::StartTagTrace +    /// [`Emitter`]: super::Emitter +    pub fn trace_idx(&self) -> Option<AttributeTraceIdx> { +        self.map_val.trace_idx      }  }  // We cannot impl Index<Output=Attribute> because Index::index returns a reference of  // the Output type (and you cannot return a value referencing a temporary value). -impl<O> Index<&str> for AttributeMap<O> { +impl Index<&str> for AttributeMap {      type Output = str;      /// Returns the attribute value with the given name. @@ -264,10 +193,10 @@ impl<O> Index<&str> for AttributeMap<O> {      }  } -impl<O> IntoIterator for AttributeMap<O> { -    type Item = AttributeOwned<O>; +impl IntoIterator for AttributeMap { +    type Item = AttributeOwned; -    type IntoIter = AttrIntoIter<O>; +    type IntoIter = AttrIntoIter;      fn into_iter(self) -> Self::IntoIter {          AttrIntoIter(self.inner.into_iter()) @@ -275,27 +204,25 @@ impl<O> IntoIterator for AttributeMap<O> {  }  /// A consuming iterator over the attributes of an [`AttributeMap`]. 
-pub struct AttrIntoIter<O>(btree_map::IntoIter<String, AttrInternal<O>>); +pub struct AttrIntoIter(btree_map::IntoIter<String, AttrInternal>); -impl<O> Iterator for AttrIntoIter<O> { -    type Item = AttributeOwned<O>; +impl Iterator for AttrIntoIter { +    type Item = AttributeOwned;      fn next(&mut self) -> Option<Self::Item> {          let (name, map_val) = self.0.next()?;          Some(AttributeOwned {              name,              value: map_val.value, -            name_span: map_val.name_span, -            value_span: map_val.value_syntax.is_some().then_some(map_val.value_span), -            value_syntax: map_val.value_syntax, +            trace_idx: map_val.trace_idx,          })      }  } -impl<'a, O> IntoIterator for &'a AttributeMap<O> { -    type Item = Attribute<'a, O>; +impl<'a> IntoIterator for &'a AttributeMap { +    type Item = Attribute<'a>; -    type IntoIter = AttrIter<'a, O>; +    type IntoIter = AttrIter<'a>;      fn into_iter(self) -> Self::IntoIter {          AttrIter(self.inner.iter()) @@ -303,10 +230,10 @@ impl<'a, O> IntoIterator for &'a AttributeMap<O> {  }  /// A borrowed iterator over the attributes of an [`AttributeMap`]. 
-pub struct AttrIter<'a, S>(btree_map::Iter<'a, String, AttrInternal<S>>); +pub struct AttrIter<'a>(btree_map::Iter<'a, String, AttrInternal>); -impl<'a, S> Iterator for AttrIter<'a, S> { -    type Item = Attribute<'a, S>; +impl<'a> Iterator for AttrIter<'a> { +    type Item = Attribute<'a>;      fn next(&mut self) -> Option<Self::Item> {          let (name, map_val) = self.0.next()?; @@ -314,7 +241,7 @@ impl<'a, S> Iterator for AttrIter<'a, S> {      }  } -impl<O: Default> FromIterator<(String, String)> for AttributeMap<O> { +impl FromIterator<(String, String)> for AttributeMap {      fn from_iter<T: IntoIterator<Item = (String, String)>>(iter: T) -> Self {          Self {              inner: iter @@ -324,9 +251,7 @@ impl<O: Default> FromIterator<(String, String)> for AttributeMap<O> {                          name,                          AttrInternal {                              value, -                            name_span: O::default()..O::default(), -                            value_span: O::default()..O::default(), -                            value_syntax: Some(AttrValueSyntax::DoubleQuoted), +                            trace_idx: None,                          },                      )                  }) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index d0e2eaf..decd4df 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -20,6 +20,7 @@ pub use machine::State as InternalState;  /// let emitter = DefaultEmitter::default();  /// let html = "<script><b>";  /// let mut tokens = Tokenizer::new(html, emitter).flatten(); +/// let mut tokens = tokens.map(|event| match event { Event::Token((token, _)) => Event::Token(token), Event::CdataOpen => Event::CdataOpen }); // TODO: remove once BasicEmitter can be used instead  /// assert!(matches!(tokens.next(), Some(Event::Token(Token::StartTag(_)))));  /// assert!(matches!(tokens.next(), Some(Event::Token(Token::StartTag(_)))));  /// ``` diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs index 
8b09aa7..9aaac73 100644 --- a/src/tokenizer/machine.rs +++ b/src/tokenizer/machine.rs @@ -2,8 +2,8 @@ mod utils;  use crate::entities::try_read_character_reference;  use crate::offset::{Offset, Position}; -use crate::token::AttrValueSyntax;  use crate::tokenizer::CdataAction; +use crate::trace::AttrValueSyntax;  use crate::{reader::Reader, Emitter, Error};  use utils::{      ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat, diff --git a/src/trace.rs b/src/trace.rs new file mode 100644 index 0000000..a816429 --- /dev/null +++ b/src/trace.rs @@ -0,0 +1,241 @@ +//! Provides the [`Trace`] type (byte offsets and syntax information about tokens). + +use std::{ +    num::NonZeroUsize, +    ops::{Index, Range}, +}; + +use crate::let_else::assume; +use crate::token::AttributeTraceIdx; + +/// Provides byte offsets and syntax information about a [`Token`]. +/// +/// [`Token`]: crate::token::Token +#[allow(missing_docs)] +#[derive(Eq, PartialEq, Debug)] +pub enum Trace { +    Char, +    StartTag(StartTagTrace), +    EndTag(EndTagTrace), +    Comment(CommentTrace), +    Doctype(DoctypeTrace), +    EndOfFile, +} + +/// Provides byte offsets and syntax information for a [`StartTag`] token. +/// +/// [`StartTag`]: crate::token::StartTag +#[derive(Eq, PartialEq, Debug)] +pub struct StartTagTrace { +    /// The span of the tag. +    pub span: Range<usize>, + +    /// The span of the tag name. +    pub name_span: Range<usize>, + +    /// List of [`AttributeTrace`]s for the attributes that were present in the source. +    pub attribute_traces: AttributeTraceList, +} + +/// Provides byte offsets for an [`EndTag`] token. +/// +/// [`EndTag`]: crate::token::EndTag +#[derive(Eq, PartialEq, Debug)] +pub struct EndTagTrace { +    /// The span of the tag. +    pub span: Range<usize>, + +    /// The span of the tag name. +    pub name_span: Range<usize>, +} + +/// Provides byte offsets for a [`Token::Comment`]. 
+/// +/// [`Token::Comment`]: crate::token::Token::Comment +#[derive(Eq, PartialEq, Debug)] +pub struct CommentTrace { +    /// The offset of the comment data. +    pub data_span: Range<usize>, +} + +/// Provides byte offsets for a [`Doctype`] token. +/// +/// [`Doctype`]: crate::token::Doctype +#[derive(Eq, PartialEq, Debug)] +pub struct DoctypeTrace { +    pub(crate) span: Range<usize>, +    // Using NonZeroUsize to optimize the size of the struct. +    name_span: Option<Range<std::num::NonZeroUsize>>, +    public_id_span: Option<Range<std::num::NonZeroUsize>>, +    system_id_span: Option<Range<std::num::NonZeroUsize>>, +} + +impl DoctypeTrace { +    /// Returns the span of the DOCTYPE. +    pub fn span(&self) -> Range<usize> { +        self.span.clone() +    } + +    /// Returns the span of the name. +    pub fn name_span(&self) -> Option<Range<usize>> { +        self.name_span +            .as_ref() +            .map(|range| range.start.get()..range.end.get()) +    } + +    /// Returns the span of the public identifier. +    pub fn public_id_span(&self) -> Option<Range<usize>> { +        self.public_id_span +            .as_ref() +            .map(|range| range.start.get()..range.end.get()) +    } + +    /// Returns the span of the system identifier. +    pub fn system_id_span(&self) -> Option<Range<usize>> { +        self.system_id_span +            .as_ref() +            .map(|range| range.start.get()..range.end.get()) +    } +} + +/// Internal [`DoctypeTrace`] methods. +/// +/// Note that even though it stands to reason that the offsets provided to the `set_` +/// methods can never be zero, we intentionally don't use `new_unchecked` since +/// actually verifying that the offsets provided to the respective Emitter methods can +/// never be zero would be non-trivial (since the tokenizer state machine has 80 states). 
+impl DoctypeTrace { +    #[inline] +    pub(crate) fn new(span_start: usize) -> Self { +        Self { +            span: span_start..0, +            name_span: None, +            public_id_span: None, +            system_id_span: None, +        } +    } + +    #[inline] +    pub(crate) fn set_name_start(&mut self, start: usize) { +        let start = NonZeroUsize::new(start).expect("expected offset to be non-zero"); +        self.name_span = Some(start..start); +    } + +    #[inline] +    pub(crate) fn set_public_id_start(&mut self, start: usize) { +        let start = NonZeroUsize::new(start).expect("expected offset to be non-zero"); +        self.public_id_span = Some(start..start); +    } + +    #[inline] +    pub(crate) fn set_system_id_start(&mut self, start: usize) { +        let start = NonZeroUsize::new(start).expect("expected offset to be non-zero"); +        self.system_id_span = Some(start..start); +    } + +    #[inline] +    pub(crate) fn set_name_end(&mut self, end: usize) { +        assume!(Some(span), &mut self.name_span); +        span.end = NonZeroUsize::new(end).expect("expected offset to be non-zero"); +    } + +    #[inline] +    pub(crate) fn set_public_id_end(&mut self, end: usize) { +        assume!(Some(span), &mut self.public_id_span); +        span.end = NonZeroUsize::new(end).expect("expected offset to be non-zero"); +    } + +    #[inline] +    pub(crate) fn set_system_id_end(&mut self, end: usize) { +        assume!(Some(span), &mut self.system_id_span); +        span.end = NonZeroUsize::new(end).expect("expected offset to be non-zero"); +    } +} + +/// The syntax of the attribute value. +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum AttrValueSyntax { +    /// An unquoted attribute value, e.g. `id=foo`. +    Unquoted, +    /// A single-quoted attribute value, e.g. `id='foo'`. +    SingleQuoted, +    /// A double-quoted attribute value, e.g. `id="foo"`. 
+    DoubleQuoted, +} + +/// Provides byte offsets and the [`AttrValueSyntax`] for an attribute that was present in the source. +#[derive(Eq, PartialEq, Debug)] +pub struct AttributeTrace { +    pub(crate) value_syntax: Option<AttrValueSyntax>, +    pub(crate) name_span: Range<usize>, +    /// We intentionally don't use `Option<Range<usize>>` here to spare us a byte (and padding) per attribute. +    /// For the empty attribute syntax this is just `0..0`. +    pub(crate) value_span: Range<usize>, +} + +impl AttributeTrace { +    /// [`AttributeTrace`] intentionally doesn't implement Default +    /// (since it's part of the public API and it wouldn't make sense semantically). +    pub(crate) fn new() -> Self { +        Self { +            value_syntax: None, +            name_span: Default::default(), +            value_span: Default::default(), +        } +    } + +    /// Returns the span of the attribute name. +    pub fn name_span(&self) -> Range<usize> { +        self.name_span.clone() +    } + +    /// For explicitly defined values returns the span of the attribute value. +    /// +    /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`). +    pub fn value_span(&self) -> Option<Range<usize>> { +        if self.value_syntax.is_none() { +            return None; +        } +        Some(self.value_span.clone()) +    } + +    /// Returns the attribute value syntax in case the value is explicitly defined. +    /// +    /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`). +    pub fn value_syntax(&self) -> Option<AttrValueSyntax> { +        self.value_syntax +    } +} + +/// List of [`AttributeTrace`]s for the attributes that were present in the source. 
+#[derive(Eq, PartialEq, Debug)] +pub struct AttributeTraceList { +    /// We don't use `HashMap<String, AttributeTrace>` since this would require +    /// the attribute names to be cloned (which would be less efficient). +    traces: Vec<AttributeTrace>, +} + +impl Index<AttributeTraceIdx> for AttributeTraceList { +    type Output = AttributeTrace; + +    fn index(&self, index: AttributeTraceIdx) -> &Self::Output { +        &self.traces[index.0.get() - 1] +    } +} + +impl AttributeTraceList { +    pub(crate) fn new() -> Self { +        Self { +            traces: Default::default(), +        } +    } + +    pub(crate) fn insert(&mut self, trace: AttributeTrace) -> AttributeTraceIdx { +        self.traces.push(trace); +        let len = self.traces.len(); +        AttributeTraceIdx( +            // SAFETY: len cannot be zero because we push before calling Vec::len. +            unsafe { std::num::NonZeroUsize::new_unchecked(len) }, +        ) +    } +} diff --git a/src/tracing_emitter.rs b/src/tracing_emitter.rs index 76b20bf..408e832 100644 --- a/src/tracing_emitter.rs +++ b/src/tracing_emitter.rs @@ -3,20 +3,25 @@ use std::collections::BTreeSet;  use std::collections::VecDeque;  use std::ops::Range; -use crate::let_else::assume; -use crate::token::{AttrValueSyntax, Comment, Doctype, EndTag, StartTag}; +use crate::let_else::{assume, know}; +use crate::token::{Doctype, EndTag, StartTag, Token}; +use crate::trace::AttributeTrace; +use crate::trace::AttributeTraceList; +use crate::trace::{ +    AttrValueSyntax, CommentTrace, DoctypeTrace, EndTagTrace, StartTagTrace, Trace, +};  use crate::Emitter;  use crate::Error; -type Token = crate::token::Token<usize>; -  /// The default implementation of [`Emitter`], used to produce tokens.  
pub struct TracingEmitter {      current_token: Option<Token>, +    current_trace: Option<Trace>,      current_attribute_name: String, -    current_attr_internal: crate::token::AttrInternal<usize>, +    current_attr_internal: crate::token::AttrInternal, +    current_attribute_trace: crate::trace::AttributeTrace,      seen_attributes: BTreeSet<String>, -    emitted_tokens: VecDeque<Token>, +    emitted_tokens: VecDeque<(Token, Trace)>,      errors: VecDeque<(Error, Range<usize>)>,      attr_in_end_tag_span: Option<Range<usize>>,  } @@ -25,8 +30,10 @@ impl Default for TracingEmitter {      fn default() -> Self {          TracingEmitter {              current_token: None, +            current_trace: None,              current_attribute_name: String::new(),              current_attr_internal: Default::default(), +            current_attribute_trace: crate::trace::AttributeTrace::new(),              seen_attributes: BTreeSet::new(),              emitted_tokens: VecDeque::new(),              errors: VecDeque::new(), @@ -43,7 +50,7 @@ impl TracingEmitter {  }  impl Iterator for TracingEmitter { -    type Item = Token; +    type Item = (Token, Trace);      fn next(&mut self) -> Option<Self::Item> {          self.emitted_tokens.pop_back() @@ -56,27 +63,32 @@ impl Emitter<usize> for TracingEmitter {      }      fn emit_char(&mut self, c: char) { -        self.emit_token(Token::Char(c)); +        self.emit_token(Token::Char(c), Trace::Char);      }      fn emit_eof(&mut self) { -        self.emit_token(Token::EndOfFile); +        self.emit_token(Token::EndOfFile, Trace::EndOfFile);      }      fn init_start_tag(&mut self, tag_offset: usize, name_offset: usize) {          self.current_token = Some(Token::StartTag(StartTag { -            span: tag_offset..0,              self_closing: false,              name: String::new(),              attributes: Default::default(), +        })); +        self.current_trace = Some(Trace::StartTag(StartTagTrace { +            span: 
tag_offset..0,              name_span: name_offset..0, +            attribute_traces: AttributeTraceList::new(),          }));      }      fn init_end_tag(&mut self, tag_offset: usize, name_offset: usize) {          self.current_token = Some(Token::EndTag(EndTag { -            span: tag_offset..0,              name: String::new(), +        })); +        self.current_trace = Some(Trace::EndTag(EndTagTrace { +            span: tag_offset..0,              name_span: name_offset..0,          }));          self.seen_attributes.clear(); @@ -93,17 +105,17 @@ impl Emitter<usize> for TracingEmitter {      fn terminate_tag_name(&mut self, offset: usize) {          assume!(              Some( -                Token::StartTag(StartTag { name_span, .. }) -                    | Token::EndTag(EndTag { name_span, .. }) +                Trace::StartTag(StartTagTrace { name_span, .. }) +                    | Trace::EndTag(EndTagTrace { name_span, .. })              ), -            &mut self.current_token +            &mut self.current_trace          );          name_span.end = offset;      }      fn init_attribute_name(&mut self, offset: usize) {          self.flush_current_attribute(); -        self.current_attr_internal.name_span.start = offset; +        self.current_attribute_trace.name_span.start = offset;      }      fn push_attribute_name(&mut self, s: &str) { @@ -111,12 +123,12 @@ impl Emitter<usize> for TracingEmitter {      }      fn terminate_attribute_name(&mut self, offset: usize) { -        self.current_attr_internal.name_span.end = offset; +        self.current_attribute_trace.name_span.end = offset;      }      fn init_attribute_value(&mut self, syntax: AttrValueSyntax, offset: usize) { -        self.current_attr_internal.value_span.start = offset; -        self.current_attr_internal.value_syntax = Some(syntax); +        self.current_attribute_trace.value_span.start = offset; +        self.current_attribute_trace.value_syntax = Some(syntax);      }      fn 
push_attribute_value(&mut self, s: &str) { @@ -124,7 +136,7 @@ impl Emitter<usize> for TracingEmitter {      }      fn terminate_attribute_value(&mut self, offset: usize) { -        self.current_attr_internal.value_span.end = offset; +        self.current_attribute_trace.value_span.end = offset;      }      fn set_self_closing(&mut self, slash_span: Range<usize>) { @@ -144,43 +156,47 @@ impl Emitter<usize> for TracingEmitter {      fn emit_current_tag(&mut self, offset: usize) {          self.flush_current_attribute();          let mut token = self.current_token.take().unwrap(); +        let mut trace = self.current_trace.take().unwrap();          match &mut token { -            Token::EndTag(tag) => { +            Token::EndTag(_) => {                  if !self.seen_attributes.is_empty() {                      let span = self.attr_in_end_tag_span.take().unwrap();                      self.report_error(Error::EndTagWithAttributes, span);                  }                  self.seen_attributes.clear(); -                tag.span.end = offset; +                know!(Trace::EndTag(tag_trace), &mut trace); +                tag_trace.span.end = offset;              } -            Token::StartTag(tag) => { -                tag.span.end = offset; +            Token::StartTag(_) => { +                know!(Trace::StartTag(tag_trace), &mut trace); +                tag_trace.span.end = offset;              }              other => {                  debug_assert!(false, "unexpected current_token: {other:?}");                  return;              }          } -        self.emit_token(token); +        self.emit_token(token, trace);      }      fn init_comment(&mut self, data_start_offset: usize) { -        self.current_token = Some(Token::Comment(Comment { -            data: String::new(), +        self.current_token = Some(Token::Comment(String::new())); +        self.current_trace = Some(Trace::Comment(CommentTrace {              data_span: data_start_offset..0,          
}));      }      fn push_comment(&mut self, s: &str) { -        assume!(Some(Token::Comment(comment)), &mut self.current_token); -        comment.data.push_str(s); +        assume!(Some(Token::Comment(data)), &mut self.current_token); +        data.push_str(s);      }      fn emit_current_comment(&mut self, data_end_offset: usize) { -        let mut token = self.current_token.take().unwrap(); -        assume!(Token::Comment(comment), &mut token); -        comment.data_span.end = data_end_offset; -        self.emit_token(token); +        let token = self.current_token.take().unwrap(); +        let mut trace = self.current_trace.take().unwrap(); +        assume!(Trace::Comment(comment_trace), &mut trace); +        comment_trace.data_span.end = data_end_offset; +        self.emit_token(token, trace);      }      fn init_doctype(&mut self, offset: usize) { @@ -189,17 +205,15 @@ impl Emitter<usize> for TracingEmitter {              force_quirks: false,              public_id: None,              system_id: None, -            span: offset..0, -            name_span: 0..0, -            public_id_span: 0..0, -            system_id_span: 0..0,          })); +        self.current_trace = Some(Trace::Doctype(DoctypeTrace::new(offset)));      }      fn init_doctype_name(&mut self, offset: usize) {          assume!(Some(Token::Doctype(doctype)), &mut self.current_token);          doctype.name = Some("".into()); -        doctype.name_span.start = offset; +        know!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace); +        doctype_trace.set_name_start(offset);      }      fn push_doctype_name(&mut self, s: &str) { @@ -214,14 +228,15 @@ impl Emitter<usize> for TracingEmitter {      }      fn terminate_doctype_name(&mut self, offset: usize) { -        assume!(Some(Token::Doctype(doctype)), &mut self.current_token); -        doctype.name_span.end = offset; +        assume!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace); +        
doctype_trace.set_name_end(offset);      }      fn init_doctype_public_id(&mut self, offset: usize) {          assume!(Some(Token::Doctype(doctype)), &mut self.current_token);          doctype.public_id = Some("".to_owned()); -        doctype.public_id_span.start = offset; +        know!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace); +        doctype_trace.set_public_id_start(offset);      }      fn push_doctype_public_id(&mut self, s: &str) { @@ -236,14 +251,15 @@ impl Emitter<usize> for TracingEmitter {      }      fn terminate_doctype_public_id(&mut self, offset: usize) { -        assume!(Some(Token::Doctype(doctype)), &mut self.current_token); -        doctype.public_id_span.end = offset; +        assume!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace); +        doctype_trace.set_public_id_end(offset);      }      fn init_doctype_system_id(&mut self, offset: usize) {          assume!(Some(Token::Doctype(doctype)), &mut self.current_token);          doctype.system_id = Some("".to_owned()); -        doctype.system_id_span.start = offset; +        know!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace); +        doctype_trace.set_system_id_start(offset);      }      fn push_doctype_system_id(&mut self, s: &str) { @@ -258,8 +274,8 @@ impl Emitter<usize> for TracingEmitter {      }      fn terminate_doctype_system_id(&mut self, offset: usize) { -        assume!(Some(Token::Doctype(doctype)), &mut self.current_token); -        doctype.system_id_span.end = offset; +        assume!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace); +        doctype_trace.set_system_id_end(offset);      }      fn set_force_quirks(&mut self) { @@ -268,15 +284,17 @@ impl Emitter<usize> for TracingEmitter {      }      fn emit_current_doctype(&mut self, offset: usize) { -        assume!(Some(Token::Doctype(mut doctype)), self.current_token.take()); -        doctype.span.end = offset; -        self.emit_token(Token::Doctype(doctype)); +  
      assume!(Some(mut trace), self.current_trace.take()); +        assume!(Trace::Doctype(doctype_trace), &mut trace); +        doctype_trace.span.end = offset; +        let token = self.current_token.take().unwrap(); +        self.emit_token(token, trace);      }  }  impl TracingEmitter { -    fn emit_token(&mut self, token: Token) { -        self.emitted_tokens.push_front(token); +    fn emit_token(&mut self, token: Token, trace: Trace) { +        self.emitted_tokens.push_front((token, trace));      }      fn flush_current_attribute(&mut self) { @@ -284,21 +302,26 @@ impl TracingEmitter {              return;          }          let name = std::mem::take(&mut self.current_attribute_name); -        let attr_internal = std::mem::take(&mut self.current_attr_internal); +        let mut attr_internal = std::mem::take(&mut self.current_attr_internal); +        let attr_trace = +            std::mem::replace(&mut self.current_attribute_trace, AttributeTrace::new());          match &mut self.current_token {              Some(Token::StartTag(tag)) => match tag.attributes.inner.entry(name) {                  Entry::Vacant(vacant) => { +                    know!(Some(Trace::StartTag(trace)), &mut self.current_trace); +                    let trace_idx = trace.attribute_traces.insert(attr_trace); +                    attr_internal.trace_idx = Some(trace_idx);                      vacant.insert(attr_internal);                  }                  Entry::Occupied(_) => { -                    self.report_error(Error::DuplicateAttribute, attr_internal.name_span); +                    self.report_error(Error::DuplicateAttribute, attr_trace.name_span);                  }              },              Some(Token::EndTag(_)) => { -                self.attr_in_end_tag_span = Some(attr_internal.name_span.clone()); +                self.attr_in_end_tag_span = Some(attr_trace.name_span.clone());                  if !self.seen_attributes.insert(name) { -                    
self.report_error(Error::DuplicateAttribute, attr_internal.name_span); +                    self.report_error(Error::DuplicateAttribute, attr_trace.name_span);                  }              }              other => debug_assert!(false, "unexpected current_token: {other:?}"), @@ -306,6 +329,12 @@ impl TracingEmitter {      }  } +impl From<(Token, Trace)> for Token { +    fn from((token, _): (Token, Trace)) -> Self { +        token +    } +} +  /// The majority of our testing of the [`TracingEmitter`] is done against the  /// html5lib-tests in the html5lib integration test. This module only tests  /// details that aren't present in the html5lib test data. @@ -313,8 +342,8 @@ impl TracingEmitter {  mod tests {      use super::TracingEmitter;      use crate::offset::PosTrackingReader; -    use crate::token::{AttrValueSyntax, Token}; -    use crate::{Event, Tokenizer}; +    use crate::trace::{AttrValueSyntax, Trace}; +    use crate::{Event, Token, Tokenizer};      #[test]      fn test_attribute_value_syntax() { @@ -325,7 +354,9 @@ mod tests {              TracingEmitter::default(),          )          .flatten(); -        let Event::Token(Token::StartTag(tag)) = tokenizer.next().unwrap() else { +        let Event::Token((Token::StartTag(tag), Trace::StartTag(tag_trace))) = +            tokenizer.next().unwrap() +        else {              panic!("expected start tag");          };          for (name, syntax) in [ @@ -334,8 +365,9 @@ mod tests {              ("single-quoted", Some(AttrValueSyntax::SingleQuoted)),              ("double-quoted", Some(AttrValueSyntax::DoubleQuoted)),          ] { +            let attr_trace_idx = tag.attributes.get(name).unwrap().trace_idx().unwrap();              assert_eq!( -                tag.attributes.get(name).unwrap().value_syntax(), +                tag_trace.attribute_traces[attr_trace_idx].value_syntax(),                  syntax,                  "unexpected value for attribute {name}"              ); diff --git 
a/tests/test_spans.rs b/tests/test_spans.rs index 71a6c4b..0e95be0 100644 --- a/tests/test_spans.rs +++ b/tests/test_spans.rs @@ -10,7 +10,8 @@ use codespan_reporting::{  use html5tokenizer::{      offset::PosTrackingReader,      reader::{IntoReader, Reader}, -    NaiveParser, Token, TracingEmitter, +    trace::Trace, +    NaiveParser, Token,  };  use insta::assert_snapshot;  use similar_asserts::assert_eq; @@ -31,7 +32,7 @@ where          PosTrackingReader::new(              Box::new(reader.into_reader()) as Box<dyn Reader<Error = Infallible>>          ), -        TracingEmitter::default(), +        html5tokenizer::TracingEmitter::default(),      )  } @@ -76,9 +77,9 @@ fn start_tag_span() {      let html = "<x> <xyz> <xyz  > <xyz/>";      let labeler = |parser: Parser| {          let mut labels = Vec::new(); -        for token in parser.flatten() { -            if let Token::StartTag(tag) = token { -                labels.push((tag.span, "")); +        for (_, trace) in parser.flatten() { +            if let Trace::StartTag(trace) = trace { +                labels.push((trace.span, ""));              }          }          labels @@ -94,9 +95,9 @@ fn end_tag_span() {      let html = "</x> </xyz> </xyz  > </xyz/>";      let labeler = |parser: Parser| {          let mut labels = Vec::new(); -        for token in parser.flatten() { -            if let Token::EndTag(tag) = token { -                labels.push((tag.span, "")); +        for (_, trace) in parser.flatten() { +            if let Trace::EndTag(trace) = trace { +                labels.push((trace.span, ""));              }          }          labels @@ -112,9 +113,9 @@ fn start_tag_name_span() {      let html = "<x> <xyz> <xyz  > <xyz/>";      let labeler = |parser: Parser| {          let mut labels = Vec::new(); -        for token in parser.flatten() { -            if let Token::StartTag(tag) = token { -                labels.push((tag.name_span, "")); +        for (_, trace) in parser.flatten() { +          
  if let Trace::StartTag(trace) = trace { +                labels.push((trace.name_span, ""));              }          }          labels @@ -130,9 +131,9 @@ fn end_tag_name_span() {      let html = "</x> </xyz> </xyz  > </xyz/>";      let labeler = |parser: Parser| {          let mut labels = Vec::new(); -        for token in parser.flatten() { -            if let Token::EndTag(tag) = token { -                labels.push((tag.name_span, "")); +        for (_, trace) in parser.flatten() { +            if let Trace::EndTag(trace) = trace { +                labels.push((trace.name_span, ""));              }          }          labels @@ -148,11 +149,15 @@ fn attribute_name_span() {      let html = "<test x xyz y=VAL xy=VAL z = VAL yzx = VAL>";      let labeler = |parser: Parser| {          let mut labels = Vec::new(); -        let Token::StartTag(tag) = parser.flatten().next().unwrap() else { +        let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap() +        else {              panic!("expected start tag")          };          for attr in &tag.attributes { -            labels.push((attr.name_span(), "")); +            labels.push(( +                trace.attribute_traces[attr.trace_idx().unwrap()].name_span(), +                "", +            ));          }          labels      }; @@ -167,11 +172,17 @@ fn attribute_value_span() {      let html = "<test x=unquoted y = unquoted z='single-quoted' zz=\"double-quoted\" empty=''>";      let labeler = |parser: Parser| {          let mut labels = Vec::new(); -        let Token::StartTag(tag) = parser.flatten().next().unwrap() else { +        let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap() +        else {              panic!("expected start tag")          };          for attr in &tag.attributes { -            labels.push((attr.value_span().unwrap(), "")); +            labels.push(( +                trace.attribute_traces[attr.trace_idx().unwrap()] +    
                .value_span() +                    .unwrap(), +                "", +            ));          }          labels      }; @@ -186,11 +197,17 @@ fn attribute_value_with_char_ref() {      let html = "<test x=& y='&' z=\"&\">";      let labeler = |parser: Parser| {          let mut labels = Vec::new(); -        let Token::StartTag(tag) = parser.flatten().next().unwrap() else { +        let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap() +        else {              panic!("expected start tag")          };          for attr in &tag.attributes { -            labels.push((attr.value_span().unwrap(), "")); +            labels.push(( +                trace.attribute_traces[attr.trace_idx().unwrap()] +                    .value_span() +                    .unwrap(), +                "", +            ));          }          labels      }; @@ -224,10 +241,10 @@ fn comment_data_span() {      let mut annotated = String::new();      for case in cases {          let labeler = |parser: Parser| { -            let Token::Comment(comment) = parser.flatten().next().unwrap() else { +            let (_, Trace::Comment(comment)) = parser.flatten().next().unwrap() else {                  panic!("expected comment");              }; -            vec![(comment.data_span(), "")] +            vec![(comment.data_span, "")]          };          annotated.push_str(&test_and_annotate(case, labeler)); @@ -263,10 +280,11 @@ fn comment_data_span() {      "###);      for (idx, case) in cases.iter().enumerate() { -        let Token::Comment(comment) = parser(*case).flatten().next().unwrap() else { +        let (Token::Comment(data), Trace::Comment(trace)) = parser(*case).flatten().next().unwrap() +        else {              panic!("expected comment");          }; -        assert_eq!(case[comment.data_span()], comment.data, "case {idx}"); +        assert_eq!(case[trace.data_span], data, "case {idx}");      }  } @@ -280,10 +298,10 @@ fn doctype_span() {     
 let mut annotated = String::new();      for case in cases {          let labeler = |parser: Parser| { -            let Token::Doctype(doctype) = parser.flatten().next().unwrap() else { +            let (_, Trace::Doctype(trace)) = parser.flatten().next().unwrap() else {                  panic!("expected doctype");              }; -            vec![(doctype.span, "")] +            vec![(trace.span(), "")]          };          annotated.push_str(&test_and_annotate(case, labeler));      } @@ -304,18 +322,18 @@ fn doctype_id_spans() {      let mut annotated = String::new();      for case in cases {          let labeler = |parser: Parser| { -            let Token::Doctype(doctype) = parser.flatten().next().unwrap() else { +            let (_, Trace::Doctype(trace)) = parser.flatten().next().unwrap() else {                  panic!("expected doctype");              };              let mut labels = Vec::new(); -            if let Some(name_span) = doctype.name_span() { +            if let Some(name_span) = trace.name_span() {                  labels.push((name_span, "name"));              } -            if let Some(public_id_span) = doctype.public_id_span() { +            if let Some(public_id_span) = trace.public_id_span() {                  labels.push((public_id_span, "public id"));              } -            if let Some(system_id_span) = doctype.system_id_span() { +            if let Some(system_id_span) = trace.system_id_span() {                  labels.push((system_id_span, "system id"));              }              labels | 
