diff options
-rw-r--r-- | CHANGELOG.md | 4 | ||||
-rw-r--r-- | examples/spans.rs | 13 | ||||
-rw-r--r-- | integration_tests/tests/test_html5lib.rs | 4 | ||||
-rw-r--r-- | src/basic_emitter.rs | 2 | ||||
-rw-r--r-- | src/emitter.rs | 2 | ||||
-rw-r--r-- | src/let_else.rs | 22 | ||||
-rw-r--r-- | src/lib.rs | 8 | ||||
-rw-r--r-- | src/token.rs | 217 | ||||
-rw-r--r-- | src/tokenizer.rs | 1 | ||||
-rw-r--r-- | src/tokenizer/machine.rs | 2 | ||||
-rw-r--r-- | src/trace.rs | 241 | ||||
-rw-r--r-- | src/tracing_emitter.rs | 148 | ||||
-rw-r--r-- | tests/test_spans.rs | 78 |
13 files changed, 494 insertions, 248 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index 146d627..f0c1ed6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ #### Breaking changes +* Byte offsets were moved out of the `Token` enum into a new `Trace` enum. + * `Token` enum * Removed the `Error` variant. @@ -21,6 +23,8 @@ * The `DefaultEmitter` has been renamed to `TracingEmitter`. +* The `DefaultEmitter` now yields `(Token, Trace)` instead of just `Token`. + * The `DefaultEmitter` now emits `Token::EndOfFile` on the end-of-file. (Previously it did not emit any token symbolizing the end-of-file.) diff --git a/examples/spans.rs b/examples/spans.rs index c1fe23b..b8d5283 100644 --- a/examples/spans.rs +++ b/examples/spans.rs @@ -4,14 +4,14 @@ use codespan_reporting::{ term, term::termcolor::{ColorChoice, StandardStream}, }; -use html5tokenizer::{offset::PosTrackingReader, NaiveParser, Token, TracingEmitter}; +use html5tokenizer::{offset::PosTrackingReader, trace::Trace, NaiveParser, Token, TracingEmitter}; fn main() { let html = r#"<img src=example.jpg alt="some description">"#; let parser = NaiveParser::new_with_emitter(PosTrackingReader::new(html), TracingEmitter::default()); - let Token::StartTag(tag) = parser.flatten().next().unwrap() else { + let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap() else { panic!() }; @@ -20,11 +20,14 @@ fn main() { let mut labels = Vec::new(); - labels.push(Label::primary(file_id, tag.name_span).with_message("tag name")); + labels.push(Label::primary(file_id, trace.name_span).with_message("tag name")); for attr in &tag.attributes { - labels.push(Label::primary(file_id, attr.name_span()).with_message("attr name")); - labels.push(Label::primary(file_id, attr.value_span().unwrap()).with_message("attr value")); + let attr_trace = &trace.attribute_traces[attr.trace_idx().unwrap()]; + labels.push(Label::primary(file_id, attr_trace.name_span()).with_message("attr name")); + labels.push( + Label::primary(file_id, 
attr_trace.value_span().unwrap()).with_message("attr value"), + ); } let diagnostic = Diagnostic::note().with_labels(labels); diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs index eac11dd..42d93f1 100644 --- a/integration_tests/tests/test_html5lib.rs +++ b/integration_tests/tests/test_html5lib.rs @@ -107,7 +107,7 @@ fn run_test_inner<R, O, E, T>( R: Reader + Position<O>, O: Offset, E: Emitter<O> + Iterator<Item = T> + DrainErrors<O>, - T: Into<Token<O>>, + T: Into<Token>, { println!( "==== FILE {}, TEST {}, STATE {:?}, TOKENIZER {} ====", @@ -156,7 +156,7 @@ fn run_test_inner<R, O, E, T>( actual_tokens.push(TestToken::Character(c.into())); } } - Token::Comment(comment) => actual_tokens.push(TestToken::Comment(comment.data)), + Token::Comment(comment) => actual_tokens.push(TestToken::Comment(comment)), Token::Doctype(doctype) => actual_tokens.push(TestToken::Doctype { name: doctype.name, public_id: doctype.public_id, diff --git a/src/basic_emitter.rs b/src/basic_emitter.rs index 046b645..bcb3f41 100644 --- a/src/basic_emitter.rs +++ b/src/basic_emitter.rs @@ -27,7 +27,7 @@ impl<O> BasicEmitter<O> { } impl<O> Iterator for BasicEmitter<O> { - type Item = Token<O>; + type Item = Token; fn next(&mut self) -> Option<Self::Item> { todo!() diff --git a/src/emitter.rs b/src/emitter.rs index 25e0209..d1e1dfe 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -1,6 +1,6 @@ use std::ops::Range; -use crate::token::AttrValueSyntax; +use crate::trace::AttrValueSyntax; use crate::Error; /// An emitter is an object providing methods to the tokenizer to produce ("emit") tokens. diff --git a/src/let_else.rs b/src/let_else.rs index da17a68..a1627f1 100644 --- a/src/let_else.rs +++ b/src/let_else.rs @@ -21,3 +21,25 @@ right: {:?}", } pub(crate) use assume; + +/// Binds the given expression to the given pattern, or else executes +/// `unreachable!();` with a helpful panic message and returns. +macro_rules! 
know { + ($pattern:pat, $value:expr) => { + // The expression might change each time it's evaluated, so we + // have to bind it so that we can reuse it in the panic message. + let _value = $value; + + let $pattern = _value else { + unreachable!( + "assertion `left matches right` failed: + left: {} +right: {:?}", + stringify!($pattern), + _value + ); + }; + }; +} + +pub(crate) use know; @@ -18,19 +18,19 @@ mod tracing_emitter; /// Types for HTML attributes. pub mod attr { - pub use crate::token::{ - AttrIntoIter, AttrIter, AttrValueSyntax, Attribute, AttributeMap, AttributeOwned, - }; + pub use crate::token::{AttrIntoIter, AttrIter, Attribute, AttributeMap, AttributeOwned}; + pub use crate::trace::AttrValueSyntax; } pub mod offset; pub mod reader; pub mod token; +pub mod trace; pub use basic_emitter::BasicEmitter; pub use emitter::Emitter; pub use error::Error; pub use naive_parser::NaiveParser; -pub use token::{Comment, Doctype, EndTag, StartTag, Token}; +pub use token::{Doctype, EndTag, StartTag, Token}; pub use tokenizer::{CdataAction, Event, State, Tokenizer}; pub use tracing_emitter::TracingEmitter; diff --git a/src/token.rs b/src/token.rs index ed8c8c8..4f3c0ce 100644 --- a/src/token.rs +++ b/src/token.rs @@ -2,32 +2,30 @@ use std::collections::{btree_map, BTreeMap}; use std::iter::FromIterator; -use std::ops::{Index, Range}; - -use crate::offset::Offset; +use std::ops::Index; /// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer. #[derive(Debug, Eq, PartialEq)] -pub enum Token<O> { +pub enum Token { /// A literal character, a resolved character reference, /// or part of a resolved character reference (since some /// character references resolve to two `char`s). Char(char), /// An HTML start tag. - StartTag(StartTag<O>), + StartTag(StartTag), /// An HTML end tag. - EndTag(EndTag<O>), + EndTag(EndTag), /// An HTML comment. - Comment(Comment<O>), + Comment(String), /// An HTML doctype declaration. 
- Doctype(Doctype<O>), + Doctype(Doctype), /// An end-of-file token. EndOfFile, } /// An HTML start tag, such as `<p>` or `<a>`. #[derive(Debug, Eq, PartialEq)] -pub struct StartTag<O> { +pub struct StartTag { /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be /// expected. pub self_closing: bool, @@ -39,43 +37,15 @@ pub struct StartTag<O> { /// A mapping for any HTML attributes this start tag may have. /// /// Duplicate attributes are ignored after the first one as per WHATWG spec. - pub attributes: AttributeMap<O>, - - /// The source code span of the tag. - pub span: Range<O>, - - /// The span of the tag name. - pub name_span: Range<O>, + pub attributes: AttributeMap, } /// An HTML end/close tag, such as `</p>` or `</a>`. #[derive(Debug, Eq, PartialEq)] -pub struct EndTag<O> { +pub struct EndTag { /// The tag name. /// Uppercase ASCII characters (A-Z) have been converted to lowercase. pub name: String, - - /// The source code span of the tag. - pub span: Range<O>, - - /// The span of the tag name. - pub name_span: Range<O>, -} - -/// An HTML comment. -#[derive(PartialEq, Eq, Debug)] -pub struct Comment<O> { - /// The text within the comment. - pub data: String, - /// The source offset of the comment data. - pub data_span: Range<O>, -} - -impl<O: Offset> Comment<O> { - /// Returns the span for the comment data. - pub fn data_span(&self) -> Range<O> { - self.data_span.clone() - } } /// A doctype. Some examples: @@ -85,7 +55,7 @@ impl<O: Offset> Comment<O> { /// * `<!DOCTYPE {name} SYSTEM '{system_id}'>` /// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>` #[derive(Debug, Eq, PartialEq)] -pub struct Doctype<O> { +pub struct Doctype { /// The [force-quirks flag]. /// /// [force-quirks flag]: https://html.spec.whatwg.org/multipage/parsing.html#force-quirks-flag @@ -100,38 +70,6 @@ pub struct Doctype<O> { /// The doctype's system identifier. pub system_id: Option<String>, - - /// The source code span of the doctype. 
- pub span: Range<O>, - - /// The span of the name. - pub(crate) name_span: Range<O>, - - /// The span of the public identifier. - pub(crate) public_id_span: Range<O>, - - /// The span of the system identifier. - pub(crate) system_id_span: Range<O>, -} - -impl<O: Offset> Doctype<O> { - /// Returns the span of the name. - pub fn name_span(&self) -> Option<Range<O>> { - self.name.as_ref()?; - Some(self.name_span.clone()) - } - - /// Returns the span of the public identifier. - pub fn public_id_span(&self) -> Option<Range<O>> { - self.public_id.as_ref()?; - Some(self.public_id_span.clone()) - } - - /// Returns the span of the system identifier. - pub fn system_id_span(&self) -> Option<Range<O>> { - self.system_id.as_ref()?; - Some(self.system_id_span.clone()) - } } /// A map of HTML attributes. @@ -143,79 +81,81 @@ impl<O: Offset> Doctype<O> { /// /// ``` /// # use html5tokenizer::attr::AttributeMap; -/// let attrs: AttributeMap<()> = vec![("href".into(), "http://example.com".into())] +/// let attrs: AttributeMap = vec![("href".into(), "http://example.com".into())] /// .into_iter() /// .collect(); /// assert_eq!(&attrs["href"], "http://example.com"); /// ``` #[derive(Debug, Default, PartialEq, Eq)] -pub struct AttributeMap<O> { - pub(crate) inner: BTreeMap<String, AttrInternal<O>>, +pub struct AttributeMap { + pub(crate) inner: BTreeMap<String, AttrInternal>, } /// The value type internally used by the [`AttributeMap`]. /// Not part of the public API. -#[derive(Default, Debug, Eq, PartialEq)] -pub(crate) struct AttrInternal<O> { +#[derive(Default, Debug, Eq)] +pub(crate) struct AttrInternal { pub value: String, - /// The span of the attribute name. - pub name_span: Range<O>, - /// The span of the attribute value. - /// For the empty attribute syntax this is just `O::default()..O::default()`. - /// We intentionally don't use `Option<Range<O>>` here to spare us a byte (and padding) per attribute. 
- pub value_span: Range<O>, - pub value_syntax: Option<AttrValueSyntax>, + pub trace_idx: Option<AttributeTraceIdx>, } -/// The syntax of the attribute value. -#[derive(Clone, Copy, PartialEq, Eq, Debug)] -pub enum AttrValueSyntax { - /// An unquoted attribute value, e.g. `id=foo`. - Unquoted, - /// A single-quoted attribute value, e.g. `id='foo'`. - SingleQuoted, - /// A double-quoted attribute value, e.g. `id="foo"`. - DoubleQuoted, +/// The index of an [`AttributeTrace`] within an [`AttributeTraceList`]. +/// +/// [`AttributeTrace`]: crate::trace::AttributeTrace +/// [`AttributeTraceList`]: crate::trace::AttributeTraceList +#[derive(Clone, Copy, Eq, PartialEq, Debug)] +pub struct AttributeTraceIdx( + // Using NonZeroUsize so that `Option<AttributeTraceIdx>` + // has the same size as `AttributeTraceIdx`. + pub std::num::NonZeroUsize, +); + +impl PartialEq for AttrInternal { + fn eq(&self, other: &Self) -> bool { + // We intentionally don't include the trace_idx, + // so that PartialEq of Token only compares semantics. + self.value == other.value + } } /// An HTML attribute borrowed from an [`AttributeMap`]. #[derive(Debug, Eq, PartialEq)] -pub struct Attribute<'a, O> { +pub struct Attribute<'a> { name: &'a str, - map_val: &'a AttrInternal<O>, + map_val: &'a AttrInternal, } /// An owned HTML attribute. #[derive(Debug, PartialEq, Eq)] -pub struct AttributeOwned<O> { +pub struct AttributeOwned { /// The attribute name. /// Uppercase ASCII characters (A-Z) have been converted to lowercase. pub name: String, /// The attribute value. Character references have been resolved. pub value: String, - /// The span of the attribute name. - pub name_span: Range<O>, - /// The span of the attribute value. - /// `None` in case of the empty attribute syntax (e.g. `disabled` in `<input disabled>`). - pub value_span: Option<Range<O>>, - /// The syntax of the attribute value. - /// `None` indicates the empty attribute syntax (e.g. `disabled` in `<input disabled>`). 
- pub value_syntax: Option<AttrValueSyntax>, + /// The index of the corresponding [`AttributeTrace`] in the + /// `attribute_traces` field of [`StartTagTrace`], in case this attribute + /// was present in the source and the [`Emitter`] has tracked this. + /// + /// [`AttributeTrace`]: super::trace::AttributeTrace + /// [`StartTagTrace`]: super::trace::AttributeTrace + /// [`Emitter`]: super::Emitter + pub trace_idx: Option<AttributeTraceIdx>, } -impl<O> AttributeMap<O> { +impl AttributeMap { /// Returns the attribute with the given name. /// /// The name must not contain any uppercase ASCII character (A-Z) /// or the method will always return `None`. - pub fn get(&self, name: &str) -> Option<Attribute<O>> { + pub fn get(&self, name: &str) -> Option<Attribute> { self.inner .get_key_value(name) .map(|(name, map_val)| Attribute { name, map_val }) } } -impl<'a, O: Offset> Attribute<'a, O> { +impl<'a> Attribute<'a> { /// Returns the attribute name. /// Uppercase ASCII characters (A-Z) have been converted to lowercase. pub fn name(&self) -> &'a str { @@ -227,32 +167,21 @@ impl<'a, O: Offset> Attribute<'a, O> { &self.map_val.value } - /// Returns the span of the attribute name. - pub fn name_span(&self) -> Range<O> { - self.map_val.name_span.clone() - } - - /// For explicitly defined values returns the span of the attribute value. - /// - /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`). - pub fn value_span(&self) -> Option<Range<O>> { - if self.map_val.value_syntax.is_none() { - return None; - } - Some(self.map_val.value_span.clone()) - } - - /// Returns the attribute value syntax in case the value is explicitly defined. + /// Returns the index of the corresponding [`AttributeTrace`] in the + /// `attribute_traces` field of [`StartTagTrace`], in case this attribute + /// was present in the source and the [`Emitter`] has tracked that. /// - /// Returns `None` for attributes using the empty attribute syntax (e.g. 
`disabled` in `<input disabled>`). - pub fn value_syntax(&self) -> Option<AttrValueSyntax> { - self.map_val.value_syntax + /// [`AttributeTrace`]: super::trace::AttributeTrace + /// [`StartTagTrace`]: super::trace::StartTagTrace + /// [`Emitter`]: super::Emitter + pub fn trace_idx(&self) -> Option<AttributeTraceIdx> { + self.map_val.trace_idx } } // We cannot impl Index<Output=Attribute> because Index::index returns a reference of // the Output type (and you cannot return a value referencing a temporary value). -impl<O> Index<&str> for AttributeMap<O> { +impl Index<&str> for AttributeMap { type Output = str; /// Returns the attribute value with the given name. @@ -264,10 +193,10 @@ impl<O> Index<&str> for AttributeMap<O> { } } -impl<O> IntoIterator for AttributeMap<O> { - type Item = AttributeOwned<O>; +impl IntoIterator for AttributeMap { + type Item = AttributeOwned; - type IntoIter = AttrIntoIter<O>; + type IntoIter = AttrIntoIter; fn into_iter(self) -> Self::IntoIter { AttrIntoIter(self.inner.into_iter()) @@ -275,27 +204,25 @@ impl<O> IntoIterator for AttributeMap<O> { } /// A consuming iterator over the attributes of an [`AttributeMap`]. 
-pub struct AttrIntoIter<O>(btree_map::IntoIter<String, AttrInternal<O>>); +pub struct AttrIntoIter(btree_map::IntoIter<String, AttrInternal>); -impl<O> Iterator for AttrIntoIter<O> { - type Item = AttributeOwned<O>; +impl Iterator for AttrIntoIter { + type Item = AttributeOwned; fn next(&mut self) -> Option<Self::Item> { let (name, map_val) = self.0.next()?; Some(AttributeOwned { name, value: map_val.value, - name_span: map_val.name_span, - value_span: map_val.value_syntax.is_some().then_some(map_val.value_span), - value_syntax: map_val.value_syntax, + trace_idx: map_val.trace_idx, }) } } -impl<'a, O> IntoIterator for &'a AttributeMap<O> { - type Item = Attribute<'a, O>; +impl<'a> IntoIterator for &'a AttributeMap { + type Item = Attribute<'a>; - type IntoIter = AttrIter<'a, O>; + type IntoIter = AttrIter<'a>; fn into_iter(self) -> Self::IntoIter { AttrIter(self.inner.iter()) @@ -303,10 +230,10 @@ impl<'a, O> IntoIterator for &'a AttributeMap<O> { } /// A borrowed iterator over the attributes of an [`AttributeMap`]. 
-pub struct AttrIter<'a, S>(btree_map::Iter<'a, String, AttrInternal<S>>); +pub struct AttrIter<'a>(btree_map::Iter<'a, String, AttrInternal>); -impl<'a, S> Iterator for AttrIter<'a, S> { - type Item = Attribute<'a, S>; +impl<'a> Iterator for AttrIter<'a> { + type Item = Attribute<'a>; fn next(&mut self) -> Option<Self::Item> { let (name, map_val) = self.0.next()?; @@ -314,7 +241,7 @@ impl<'a, S> Iterator for AttrIter<'a, S> { } } -impl<O: Default> FromIterator<(String, String)> for AttributeMap<O> { +impl FromIterator<(String, String)> for AttributeMap { fn from_iter<T: IntoIterator<Item = (String, String)>>(iter: T) -> Self { Self { inner: iter @@ -324,9 +251,7 @@ impl<O: Default> FromIterator<(String, String)> for AttributeMap<O> { name, AttrInternal { value, - name_span: O::default()..O::default(), - value_span: O::default()..O::default(), - value_syntax: Some(AttrValueSyntax::DoubleQuoted), + trace_idx: None, }, ) }) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index d0e2eaf..decd4df 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -20,6 +20,7 @@ pub use machine::State as InternalState; /// let emitter = DefaultEmitter::default(); /// let html = "<script><b>"; /// let mut tokens = Tokenizer::new(html, emitter).flatten(); +/// let mut tokens = tokens.map(|event| match event { Event::Token((token, _)) => Event::Token(token), Event::CdataOpen => Event::CdataOpen }); // TODO: remove once BasicEmitter can be used instead /// assert!(matches!(tokens.next(), Some(Event::Token(Token::StartTag(_))))); /// assert!(matches!(tokens.next(), Some(Event::Token(Token::StartTag(_))))); /// ``` diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs index 8b09aa7..9aaac73 100644 --- a/src/tokenizer/machine.rs +++ b/src/tokenizer/machine.rs @@ -2,8 +2,8 @@ mod utils; use crate::entities::try_read_character_reference; use crate::offset::{Offset, Position}; -use crate::token::AttrValueSyntax; use crate::tokenizer::CdataAction; +use 
crate::trace::AttrValueSyntax; use crate::{reader::Reader, Emitter, Error}; use utils::{ ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat, diff --git a/src/trace.rs b/src/trace.rs new file mode 100644 index 0000000..a816429 --- /dev/null +++ b/src/trace.rs @@ -0,0 +1,241 @@ +//! Provides the [`Trace`] type (byte offsets and syntax information about tokens). + +use std::{ + num::NonZeroUsize, + ops::{Index, Range}, +}; + +use crate::let_else::assume; +use crate::token::AttributeTraceIdx; + +/// Provides byte offsets and syntax information about a [`Token`]. +/// +/// [`Token`]: crate::token::Token +#[allow(missing_docs)] +#[derive(Eq, PartialEq, Debug)] +pub enum Trace { + Char, + StartTag(StartTagTrace), + EndTag(EndTagTrace), + Comment(CommentTrace), + Doctype(DoctypeTrace), + EndOfFile, +} + +/// Provides byte offsets and syntax information for a [`StartTag`] token. +/// +/// [`StartTag`]: crate::token::StartTag +#[derive(Eq, PartialEq, Debug)] +pub struct StartTagTrace { + /// The span of the tag. + pub span: Range<usize>, + + /// The span of the tag name. + pub name_span: Range<usize>, + + /// List of [`AttributeTrace`]s for the attributes that were present in the source. + pub attribute_traces: AttributeTraceList, +} + +/// Provides byte offsets for an [`EndTag`] token. +/// +/// [`EndTag`]: crate::token::EndTag +#[derive(Eq, PartialEq, Debug)] +pub struct EndTagTrace { + /// The span of the tag. + pub span: Range<usize>, + + /// The span of the tag name. + pub name_span: Range<usize>, +} + +/// Provides byte offsets for a [`Token::Comment`]. +/// +/// [`Token::Comment`]: crate::token::Token::Comment +#[derive(Eq, PartialEq, Debug)] +pub struct CommentTrace { + /// The offset of the comment data. + pub data_span: Range<usize>, +} + +/// Provides byte offsets for a [`Doctype`] token. 
+/// +/// [`Doctype`]: crate::token::Doctype +#[derive(Eq, PartialEq, Debug)] +pub struct DoctypeTrace { + pub(crate) span: Range<usize>, + // Using NonZeroUsize to optimize the size of the struct. + name_span: Option<Range<std::num::NonZeroUsize>>, + public_id_span: Option<Range<std::num::NonZeroUsize>>, + system_id_span: Option<Range<std::num::NonZeroUsize>>, +} + +impl DoctypeTrace { + /// Returns the span of the DOCTYPE. + pub fn span(&self) -> Range<usize> { + self.span.clone() + } + + /// Returns the span of the name. + pub fn name_span(&self) -> Option<Range<usize>> { + self.name_span + .as_ref() + .map(|range| range.start.get()..range.end.get()) + } + + /// Returns the span of the public identifier. + pub fn public_id_span(&self) -> Option<Range<usize>> { + self.public_id_span + .as_ref() + .map(|range| range.start.get()..range.end.get()) + } + + /// Returns the span of the system identifier. + pub fn system_id_span(&self) -> Option<Range<usize>> { + self.system_id_span + .as_ref() + .map(|range| range.start.get()..range.end.get()) + } +} + +/// Internal [`DoctypeTrace`] methods. +/// +/// Note that even though it stands to reason that the offsets provided to the `set_` +/// methods can never be zero, we intentionally don't use `new_unchecked` since +/// actually verifying that the offsets provided to the respective Emitter methods can +/// never be zero would be non-trivial (since the tokenizer state machine has 80 states).
+impl DoctypeTrace { + #[inline] + pub(crate) fn new(span_start: usize) -> Self { + Self { + span: span_start..0, + name_span: None, + public_id_span: None, + system_id_span: None, + } + } + + #[inline] + pub(crate) fn set_name_start(&mut self, start: usize) { + let start = NonZeroUsize::new(start).expect("expected offset to be non-zero"); + self.name_span = Some(start..start); + } + + #[inline] + pub(crate) fn set_public_id_start(&mut self, start: usize) { + let start = NonZeroUsize::new(start).expect("expected offset to be non-zero"); + self.public_id_span = Some(start..start); + } + + #[inline] + pub(crate) fn set_system_id_start(&mut self, start: usize) { + let start = NonZeroUsize::new(start).expect("expected offset to be non-zero"); + self.system_id_span = Some(start..start); + } + + #[inline] + pub(crate) fn set_name_end(&mut self, end: usize) { + assume!(Some(span), &mut self.name_span); + span.end = NonZeroUsize::new(end).expect("expected offset to be non-zero"); + } + + #[inline] + pub(crate) fn set_public_id_end(&mut self, end: usize) { + assume!(Some(span), &mut self.public_id_span); + span.end = NonZeroUsize::new(end).expect("expected offset to be non-zero"); + } + + #[inline] + pub(crate) fn set_system_id_end(&mut self, end: usize) { + assume!(Some(span), &mut self.system_id_span); + span.end = NonZeroUsize::new(end).expect("expected offset to be non-zero"); + } +} + +/// The syntax of the attribute value. +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum AttrValueSyntax { + /// An unquoted attribute value, e.g. `id=foo`. + Unquoted, + /// A single-quoted attribute value, e.g. `id='foo'`. + SingleQuoted, + /// A double-quoted attribute value, e.g. `id="foo"`. + DoubleQuoted, +} + +/// Provides byte offsets and the [`AttrValueSyntax`] for an attribute that was present in the source. 
+#[derive(Eq, PartialEq, Debug)] +pub struct AttributeTrace { + pub(crate) value_syntax: Option<AttrValueSyntax>, + pub(crate) name_span: Range<usize>, + /// We intentionally don't use `Option<Range<usize>>` here to spare us a byte (and padding) per attribute. + /// For the empty attribute syntax this is just `0..0`. + pub(crate) value_span: Range<usize>, +} + +impl AttributeTrace { + /// [`AttributeTrace`] intentionally doesn't implement Default + /// (since it's part of the public API and it wouldn't make sense semantically). + pub(crate) fn new() -> Self { + Self { + value_syntax: None, + name_span: Default::default(), + value_span: Default::default(), + } + } + + /// Returns the span of the attribute name. + pub fn name_span(&self) -> Range<usize> { + self.name_span.clone() + } + + /// For explicitly defined values returns the span of the attribute value. + /// + /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`). + pub fn value_span(&self) -> Option<Range<usize>> { + if self.value_syntax.is_none() { + return None; + } + Some(self.value_span.clone()) + } + + /// Returns the attribute value syntax in case the value is explicitly defined. + /// + /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`). + pub fn value_syntax(&self) -> Option<AttrValueSyntax> { + self.value_syntax + } +} + +/// List of [`AttributeTrace`]s for the attributes that were present in the source. +#[derive(Eq, PartialEq, Debug)] +pub struct AttributeTraceList { + /// We don't use `HashMap<String, AttributeTrace>` since this would require + /// the attribute names to be cloned (which would be less efficient).
+ traces: Vec<AttributeTrace>, +} + +impl Index<AttributeTraceIdx> for AttributeTraceList { + type Output = AttributeTrace; + + fn index(&self, index: AttributeTraceIdx) -> &Self::Output { + &self.traces[index.0.get() - 1] + } +} + +impl AttributeTraceList { + pub(crate) fn new() -> Self { + Self { + traces: Default::default(), + } + } + + pub(crate) fn insert(&mut self, trace: AttributeTrace) -> AttributeTraceIdx { + self.traces.push(trace); + let len = self.traces.len(); + AttributeTraceIdx( + // SAFETY: len cannot be zero because we push before calling Vec::len. + unsafe { std::num::NonZeroUsize::new_unchecked(len) }, + ) + } +} diff --git a/src/tracing_emitter.rs b/src/tracing_emitter.rs index 76b20bf..408e832 100644 --- a/src/tracing_emitter.rs +++ b/src/tracing_emitter.rs @@ -3,20 +3,25 @@ use std::collections::BTreeSet; use std::collections::VecDeque; use std::ops::Range; -use crate::let_else::assume; -use crate::token::{AttrValueSyntax, Comment, Doctype, EndTag, StartTag}; +use crate::let_else::{assume, know}; +use crate::token::{Doctype, EndTag, StartTag, Token}; +use crate::trace::AttributeTrace; +use crate::trace::AttributeTraceList; +use crate::trace::{ + AttrValueSyntax, CommentTrace, DoctypeTrace, EndTagTrace, StartTagTrace, Trace, +}; use crate::Emitter; use crate::Error; -type Token = crate::token::Token<usize>; - /// The default implementation of [`Emitter`], used to produce tokens. 
pub struct TracingEmitter { current_token: Option<Token>, + current_trace: Option<Trace>, current_attribute_name: String, - current_attr_internal: crate::token::AttrInternal<usize>, + current_attr_internal: crate::token::AttrInternal, + current_attribute_trace: crate::trace::AttributeTrace, seen_attributes: BTreeSet<String>, - emitted_tokens: VecDeque<Token>, + emitted_tokens: VecDeque<(Token, Trace)>, errors: VecDeque<(Error, Range<usize>)>, attr_in_end_tag_span: Option<Range<usize>>, } @@ -25,8 +30,10 @@ impl Default for TracingEmitter { fn default() -> Self { TracingEmitter { current_token: None, + current_trace: None, current_attribute_name: String::new(), current_attr_internal: Default::default(), + current_attribute_trace: crate::trace::AttributeTrace::new(), seen_attributes: BTreeSet::new(), emitted_tokens: VecDeque::new(), errors: VecDeque::new(), @@ -43,7 +50,7 @@ impl TracingEmitter { } impl Iterator for TracingEmitter { - type Item = Token; + type Item = (Token, Trace); fn next(&mut self) -> Option<Self::Item> { self.emitted_tokens.pop_back() @@ -56,27 +63,32 @@ impl Emitter<usize> for TracingEmitter { } fn emit_char(&mut self, c: char) { - self.emit_token(Token::Char(c)); + self.emit_token(Token::Char(c), Trace::Char); } fn emit_eof(&mut self) { - self.emit_token(Token::EndOfFile); + self.emit_token(Token::EndOfFile, Trace::EndOfFile); } fn init_start_tag(&mut self, tag_offset: usize, name_offset: usize) { self.current_token = Some(Token::StartTag(StartTag { - span: tag_offset..0, self_closing: false, name: String::new(), attributes: Default::default(), + })); + self.current_trace = Some(Trace::StartTag(StartTagTrace { + span: tag_offset..0, name_span: name_offset..0, + attribute_traces: AttributeTraceList::new(), })); } fn init_end_tag(&mut self, tag_offset: usize, name_offset: usize) { self.current_token = Some(Token::EndTag(EndTag { - span: tag_offset..0, name: String::new(), + })); + self.current_trace = Some(Trace::EndTag(EndTagTrace { + span: 
tag_offset..0, name_span: name_offset..0, })); self.seen_attributes.clear(); @@ -93,17 +105,17 @@ impl Emitter<usize> for TracingEmitter { fn terminate_tag_name(&mut self, offset: usize) { assume!( Some( - Token::StartTag(StartTag { name_span, .. }) - | Token::EndTag(EndTag { name_span, .. }) + Trace::StartTag(StartTagTrace { name_span, .. }) + | Trace::EndTag(EndTagTrace { name_span, .. }) ), - &mut self.current_token + &mut self.current_trace ); name_span.end = offset; } fn init_attribute_name(&mut self, offset: usize) { self.flush_current_attribute(); - self.current_attr_internal.name_span.start = offset; + self.current_attribute_trace.name_span.start = offset; } fn push_attribute_name(&mut self, s: &str) { @@ -111,12 +123,12 @@ impl Emitter<usize> for TracingEmitter { } fn terminate_attribute_name(&mut self, offset: usize) { - self.current_attr_internal.name_span.end = offset; + self.current_attribute_trace.name_span.end = offset; } fn init_attribute_value(&mut self, syntax: AttrValueSyntax, offset: usize) { - self.current_attr_internal.value_span.start = offset; - self.current_attr_internal.value_syntax = Some(syntax); + self.current_attribute_trace.value_span.start = offset; + self.current_attribute_trace.value_syntax = Some(syntax); } fn push_attribute_value(&mut self, s: &str) { @@ -124,7 +136,7 @@ impl Emitter<usize> for TracingEmitter { } fn terminate_attribute_value(&mut self, offset: usize) { - self.current_attr_internal.value_span.end = offset; + self.current_attribute_trace.value_span.end = offset; } fn set_self_closing(&mut self, slash_span: Range<usize>) { @@ -144,43 +156,47 @@ impl Emitter<usize> for TracingEmitter { fn emit_current_tag(&mut self, offset: usize) { self.flush_current_attribute(); let mut token = self.current_token.take().unwrap(); + let mut trace = self.current_trace.take().unwrap(); match &mut token { - Token::EndTag(tag) => { + Token::EndTag(_) => { if !self.seen_attributes.is_empty() { let span = 
self.attr_in_end_tag_span.take().unwrap(); self.report_error(Error::EndTagWithAttributes, span); } self.seen_attributes.clear(); - tag.span.end = offset; + know!(Trace::EndTag(tag_trace), &mut trace); + tag_trace.span.end = offset; } - Token::StartTag(tag) => { - tag.span.end = offset; + Token::StartTag(_) => { + know!(Trace::StartTag(tag_trace), &mut trace); + tag_trace.span.end = offset; } other => { debug_assert!(false, "unexpected current_token: {other:?}"); return; } } - self.emit_token(token); + self.emit_token(token, trace); } fn init_comment(&mut self, data_start_offset: usize) { - self.current_token = Some(Token::Comment(Comment { - data: String::new(), + self.current_token = Some(Token::Comment(String::new())); + self.current_trace = Some(Trace::Comment(CommentTrace { data_span: data_start_offset..0, })); } fn push_comment(&mut self, s: &str) { - assume!(Some(Token::Comment(comment)), &mut self.current_token); - comment.data.push_str(s); + assume!(Some(Token::Comment(data)), &mut self.current_token); + data.push_str(s); } fn emit_current_comment(&mut self, data_end_offset: usize) { - let mut token = self.current_token.take().unwrap(); - assume!(Token::Comment(comment), &mut token); - comment.data_span.end = data_end_offset; - self.emit_token(token); + let token = self.current_token.take().unwrap(); + let mut trace = self.current_trace.take().unwrap(); + assume!(Trace::Comment(comment_trace), &mut trace); + comment_trace.data_span.end = data_end_offset; + self.emit_token(token, trace); } fn init_doctype(&mut self, offset: usize) { @@ -189,17 +205,15 @@ impl Emitter<usize> for TracingEmitter { force_quirks: false, public_id: None, system_id: None, - span: offset..0, - name_span: 0..0, - public_id_span: 0..0, - system_id_span: 0..0, })); + self.current_trace = Some(Trace::Doctype(DoctypeTrace::new(offset))); } fn init_doctype_name(&mut self, offset: usize) { assume!(Some(Token::Doctype(doctype)), &mut self.current_token); doctype.name = Some("".into()); - 
doctype.name_span.start = offset; + know!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace); + doctype_trace.set_name_start(offset); } fn push_doctype_name(&mut self, s: &str) { @@ -214,14 +228,15 @@ impl Emitter<usize> for TracingEmitter { } fn terminate_doctype_name(&mut self, offset: usize) { - assume!(Some(Token::Doctype(doctype)), &mut self.current_token); - doctype.name_span.end = offset; + assume!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace); + doctype_trace.set_name_end(offset); } fn init_doctype_public_id(&mut self, offset: usize) { assume!(Some(Token::Doctype(doctype)), &mut self.current_token); doctype.public_id = Some("".to_owned()); - doctype.public_id_span.start = offset; + know!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace); + doctype_trace.set_public_id_start(offset); } fn push_doctype_public_id(&mut self, s: &str) { @@ -236,14 +251,15 @@ impl Emitter<usize> for TracingEmitter { } fn terminate_doctype_public_id(&mut self, offset: usize) { - assume!(Some(Token::Doctype(doctype)), &mut self.current_token); - doctype.public_id_span.end = offset; + assume!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace); + doctype_trace.set_public_id_end(offset); } fn init_doctype_system_id(&mut self, offset: usize) { assume!(Some(Token::Doctype(doctype)), &mut self.current_token); doctype.system_id = Some("".to_owned()); - doctype.system_id_span.start = offset; + know!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace); + doctype_trace.set_system_id_start(offset); } fn push_doctype_system_id(&mut self, s: &str) { @@ -258,8 +274,8 @@ impl Emitter<usize> for TracingEmitter { } fn terminate_doctype_system_id(&mut self, offset: usize) { - assume!(Some(Token::Doctype(doctype)), &mut self.current_token); - doctype.system_id_span.end = offset; + assume!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace); + doctype_trace.set_system_id_end(offset); } fn set_force_quirks(&mut self) { @@ -268,15 
+284,17 @@ impl Emitter<usize> for TracingEmitter { } fn emit_current_doctype(&mut self, offset: usize) { - assume!(Some(Token::Doctype(mut doctype)), self.current_token.take()); - doctype.span.end = offset; - self.emit_token(Token::Doctype(doctype)); + assume!(Some(mut trace), self.current_trace.take()); + assume!(Trace::Doctype(doctype_trace), &mut trace); + doctype_trace.span.end = offset; + let token = self.current_token.take().unwrap(); + self.emit_token(token, trace); } } impl TracingEmitter { - fn emit_token(&mut self, token: Token) { - self.emitted_tokens.push_front(token); + fn emit_token(&mut self, token: Token, trace: Trace) { + self.emitted_tokens.push_front((token, trace)); } fn flush_current_attribute(&mut self) { @@ -284,21 +302,26 @@ impl TracingEmitter { return; } let name = std::mem::take(&mut self.current_attribute_name); - let attr_internal = std::mem::take(&mut self.current_attr_internal); + let mut attr_internal = std::mem::take(&mut self.current_attr_internal); + let attr_trace = + std::mem::replace(&mut self.current_attribute_trace, AttributeTrace::new()); match &mut self.current_token { Some(Token::StartTag(tag)) => match tag.attributes.inner.entry(name) { Entry::Vacant(vacant) => { + know!(Some(Trace::StartTag(trace)), &mut self.current_trace); + let trace_idx = trace.attribute_traces.insert(attr_trace); + attr_internal.trace_idx = Some(trace_idx); vacant.insert(attr_internal); } Entry::Occupied(_) => { - self.report_error(Error::DuplicateAttribute, attr_internal.name_span); + self.report_error(Error::DuplicateAttribute, attr_trace.name_span); } }, Some(Token::EndTag(_)) => { - self.attr_in_end_tag_span = Some(attr_internal.name_span.clone()); + self.attr_in_end_tag_span = Some(attr_trace.name_span.clone()); if !self.seen_attributes.insert(name) { - self.report_error(Error::DuplicateAttribute, attr_internal.name_span); + self.report_error(Error::DuplicateAttribute, attr_trace.name_span); } } other => debug_assert!(false, "unexpected 
current_token: {other:?}"), @@ -306,6 +329,12 @@ impl TracingEmitter { } } +impl From<(Token, Trace)> for Token { + fn from((token, _): (Token, Trace)) -> Self { + token + } +} + /// The majority of our testing of the [`TracingEmitter`] is done against the /// html5lib-tests in the html5lib integration test. This module only tests /// details that aren't present in the html5lib test data. @@ -313,8 +342,8 @@ impl TracingEmitter { mod tests { use super::TracingEmitter; use crate::offset::PosTrackingReader; - use crate::token::{AttrValueSyntax, Token}; - use crate::{Event, Tokenizer}; + use crate::trace::{AttrValueSyntax, Trace}; + use crate::{Event, Token, Tokenizer}; #[test] fn test_attribute_value_syntax() { @@ -325,7 +354,9 @@ mod tests { TracingEmitter::default(), ) .flatten(); - let Event::Token(Token::StartTag(tag)) = tokenizer.next().unwrap() else { + let Event::Token((Token::StartTag(tag), Trace::StartTag(tag_trace))) = + tokenizer.next().unwrap() + else { panic!("expected start tag"); }; for (name, syntax) in [ @@ -334,8 +365,9 @@ mod tests { ("single-quoted", Some(AttrValueSyntax::SingleQuoted)), ("double-quoted", Some(AttrValueSyntax::DoubleQuoted)), ] { + let attr_trace_idx = tag.attributes.get(name).unwrap().trace_idx().unwrap(); assert_eq!( - tag.attributes.get(name).unwrap().value_syntax(), + tag_trace.attribute_traces[attr_trace_idx].value_syntax(), syntax, "unexpected value for attribute {name}" ); diff --git a/tests/test_spans.rs b/tests/test_spans.rs index 71a6c4b..0e95be0 100644 --- a/tests/test_spans.rs +++ b/tests/test_spans.rs @@ -10,7 +10,8 @@ use codespan_reporting::{ use html5tokenizer::{ offset::PosTrackingReader, reader::{IntoReader, Reader}, - NaiveParser, Token, TracingEmitter, + trace::Trace, + NaiveParser, Token, }; use insta::assert_snapshot; use similar_asserts::assert_eq; @@ -31,7 +32,7 @@ where PosTrackingReader::new( Box::new(reader.into_reader()) as Box<dyn Reader<Error = Infallible>> ), - TracingEmitter::default(), + 
html5tokenizer::TracingEmitter::default(), ) } @@ -76,9 +77,9 @@ fn start_tag_span() { let html = "<x> <xyz> <xyz > <xyz/>"; let labeler = |parser: Parser| { let mut labels = Vec::new(); - for token in parser.flatten() { - if let Token::StartTag(tag) = token { - labels.push((tag.span, "")); + for (_, trace) in parser.flatten() { + if let Trace::StartTag(trace) = trace { + labels.push((trace.span, "")); } } labels @@ -94,9 +95,9 @@ fn end_tag_span() { let html = "</x> </xyz> </xyz > </xyz/>"; let labeler = |parser: Parser| { let mut labels = Vec::new(); - for token in parser.flatten() { - if let Token::EndTag(tag) = token { - labels.push((tag.span, "")); + for (_, trace) in parser.flatten() { + if let Trace::EndTag(trace) = trace { + labels.push((trace.span, "")); } } labels @@ -112,9 +113,9 @@ fn start_tag_name_span() { let html = "<x> <xyz> <xyz > <xyz/>"; let labeler = |parser: Parser| { let mut labels = Vec::new(); - for token in parser.flatten() { - if let Token::StartTag(tag) = token { - labels.push((tag.name_span, "")); + for (_, trace) in parser.flatten() { + if let Trace::StartTag(trace) = trace { + labels.push((trace.name_span, "")); } } labels @@ -130,9 +131,9 @@ fn end_tag_name_span() { let html = "</x> </xyz> </xyz > </xyz/>"; let labeler = |parser: Parser| { let mut labels = Vec::new(); - for token in parser.flatten() { - if let Token::EndTag(tag) = token { - labels.push((tag.name_span, "")); + for (_, trace) in parser.flatten() { + if let Trace::EndTag(trace) = trace { + labels.push((trace.name_span, "")); } } labels @@ -148,11 +149,15 @@ fn attribute_name_span() { let html = "<test x xyz y=VAL xy=VAL z = VAL yzx = VAL>"; let labeler = |parser: Parser| { let mut labels = Vec::new(); - let Token::StartTag(tag) = parser.flatten().next().unwrap() else { + let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap() + else { panic!("expected start tag") }; for attr in &tag.attributes { - labels.push((attr.name_span(), "")); + 
labels.push(( + trace.attribute_traces[attr.trace_idx().unwrap()].name_span(), + "", + )); } labels }; @@ -167,11 +172,17 @@ fn attribute_value_span() { let html = "<test x=unquoted y = unquoted z='single-quoted' zz=\"double-quoted\" empty=''>"; let labeler = |parser: Parser| { let mut labels = Vec::new(); - let Token::StartTag(tag) = parser.flatten().next().unwrap() else { + let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap() + else { panic!("expected start tag") }; for attr in &tag.attributes { - labels.push((attr.value_span().unwrap(), "")); + labels.push(( + trace.attribute_traces[attr.trace_idx().unwrap()] + .value_span() + .unwrap(), + "", + )); } labels }; @@ -186,11 +197,17 @@ fn attribute_value_with_char_ref() { let html = "<test x=& y='&' z=\"&\">"; let labeler = |parser: Parser| { let mut labels = Vec::new(); - let Token::StartTag(tag) = parser.flatten().next().unwrap() else { + let (Token::StartTag(tag), Trace::StartTag(trace)) = parser.flatten().next().unwrap() + else { panic!("expected start tag") }; for attr in &tag.attributes { - labels.push((attr.value_span().unwrap(), "")); + labels.push(( + trace.attribute_traces[attr.trace_idx().unwrap()] + .value_span() + .unwrap(), + "", + )); } labels }; @@ -224,10 +241,10 @@ fn comment_data_span() { let mut annotated = String::new(); for case in cases { let labeler = |parser: Parser| { - let Token::Comment(comment) = parser.flatten().next().unwrap() else { + let (_, Trace::Comment(comment)) = parser.flatten().next().unwrap() else { panic!("expected comment"); }; - vec![(comment.data_span(), "")] + vec![(comment.data_span, "")] }; annotated.push_str(&test_and_annotate(case, labeler)); @@ -263,10 +280,11 @@ fn comment_data_span() { "###); for (idx, case) in cases.iter().enumerate() { - let Token::Comment(comment) = parser(*case).flatten().next().unwrap() else { + let (Token::Comment(data), Trace::Comment(trace)) = parser(*case).flatten().next().unwrap() + else { 
panic!("expected comment"); }; - assert_eq!(case[comment.data_span()], comment.data, "case {idx}"); + assert_eq!(case[trace.data_span], data, "case {idx}"); } } @@ -280,10 +298,10 @@ fn doctype_span() { let mut annotated = String::new(); for case in cases { let labeler = |parser: Parser| { - let Token::Doctype(doctype) = parser.flatten().next().unwrap() else { + let (_, Trace::Doctype(trace)) = parser.flatten().next().unwrap() else { panic!("expected doctype"); }; - vec![(doctype.span, "")] + vec![(trace.span(), "")] }; annotated.push_str(&test_and_annotate(case, labeler)); } @@ -304,18 +322,18 @@ fn doctype_id_spans() { let mut annotated = String::new(); for case in cases { let labeler = |parser: Parser| { - let Token::Doctype(doctype) = parser.flatten().next().unwrap() else { + let (_, Trace::Doctype(trace)) = parser.flatten().next().unwrap() else { panic!("expected doctype"); }; let mut labels = Vec::new(); - if let Some(name_span) = doctype.name_span() { + if let Some(name_span) = trace.name_span() { labels.push((name_span, "name")); } - if let Some(public_id_span) = doctype.public_id_span() { + if let Some(public_id_span) = trace.public_id_span() { labels.push((public_id_span, "public id")); } - if let Some(system_id_span) = doctype.system_id_span() { + if let Some(system_id_span) = trace.system_id_span() { labels.push((system_id_span, "system id")); } labels |