summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/basic_emitter.rs2
-rw-r--r--src/emitter.rs2
-rw-r--r--src/let_else.rs22
-rw-r--r--src/lib.rs8
-rw-r--r--src/token.rs217
-rw-r--r--src/tokenizer.rs1
-rw-r--r--src/tokenizer/machine.rs2
-rw-r--r--src/trace.rs241
-rw-r--r--src/tracing_emitter.rs148
9 files changed, 432 insertions, 211 deletions
diff --git a/src/basic_emitter.rs b/src/basic_emitter.rs
index 046b645..bcb3f41 100644
--- a/src/basic_emitter.rs
+++ b/src/basic_emitter.rs
@@ -27,7 +27,7 @@ impl<O> BasicEmitter<O> {
}
impl<O> Iterator for BasicEmitter<O> {
- type Item = Token<O>;
+ type Item = Token;
fn next(&mut self) -> Option<Self::Item> {
todo!()
diff --git a/src/emitter.rs b/src/emitter.rs
index 25e0209..d1e1dfe 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -1,6 +1,6 @@
use std::ops::Range;
-use crate::token::AttrValueSyntax;
+use crate::trace::AttrValueSyntax;
use crate::Error;
/// An emitter is an object providing methods to the tokenizer to produce ("emit") tokens.
diff --git a/src/let_else.rs b/src/let_else.rs
index da17a68..a1627f1 100644
--- a/src/let_else.rs
+++ b/src/let_else.rs
@@ -21,3 +21,25 @@ right: {:?}",
}
pub(crate) use assume;
+
+/// Binds the given expression to the given pattern, or else panics
+/// via `unreachable!()` with a helpful message.
+macro_rules! know {
+ ($pattern:pat, $value:expr) => {
+ // The expression might change each time it's evaluated, so we
+ // have to bind it so that we can reuse it in the panic message.
+ let _value = $value;
+
+ let $pattern = _value else {
+ unreachable!(
+ "assertion `left matches right` failed:
+ left: {}
+right: {:?}",
+ stringify!($pattern),
+ _value
+ );
+ };
+ };
+}
+
+pub(crate) use know;
diff --git a/src/lib.rs b/src/lib.rs
index 16728ad..f4e0369 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -18,19 +18,19 @@ mod tracing_emitter;
/// Types for HTML attributes.
pub mod attr {
- pub use crate::token::{
- AttrIntoIter, AttrIter, AttrValueSyntax, Attribute, AttributeMap, AttributeOwned,
- };
+ pub use crate::token::{AttrIntoIter, AttrIter, Attribute, AttributeMap, AttributeOwned};
+ pub use crate::trace::AttrValueSyntax;
}
pub mod offset;
pub mod reader;
pub mod token;
+pub mod trace;
pub use basic_emitter::BasicEmitter;
pub use emitter::Emitter;
pub use error::Error;
pub use naive_parser::NaiveParser;
-pub use token::{Comment, Doctype, EndTag, StartTag, Token};
+pub use token::{Doctype, EndTag, StartTag, Token};
pub use tokenizer::{CdataAction, Event, State, Tokenizer};
pub use tracing_emitter::TracingEmitter;
diff --git a/src/token.rs b/src/token.rs
index ed8c8c8..4f3c0ce 100644
--- a/src/token.rs
+++ b/src/token.rs
@@ -2,32 +2,30 @@
use std::collections::{btree_map, BTreeMap};
use std::iter::FromIterator;
-use std::ops::{Index, Range};
-
-use crate::offset::Offset;
+use std::ops::Index;
/// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer.
#[derive(Debug, Eq, PartialEq)]
-pub enum Token<O> {
+pub enum Token {
/// A literal character, a resolved character reference,
/// or part of a resolved character reference (since some
/// character references resolve to two `char`s).
Char(char),
/// An HTML start tag.
- StartTag(StartTag<O>),
+ StartTag(StartTag),
/// An HTML end tag.
- EndTag(EndTag<O>),
+ EndTag(EndTag),
/// An HTML comment.
- Comment(Comment<O>),
+ Comment(String),
/// An HTML doctype declaration.
- Doctype(Doctype<O>),
+ Doctype(Doctype),
/// An end-of-file token.
EndOfFile,
}
/// An HTML start tag, such as `<p>` or `<a>`.
#[derive(Debug, Eq, PartialEq)]
-pub struct StartTag<O> {
+pub struct StartTag {
/// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be
/// expected.
pub self_closing: bool,
@@ -39,43 +37,15 @@ pub struct StartTag<O> {
/// A mapping for any HTML attributes this start tag may have.
///
/// Duplicate attributes are ignored after the first one as per WHATWG spec.
- pub attributes: AttributeMap<O>,
-
- /// The source code span of the tag.
- pub span: Range<O>,
-
- /// The span of the tag name.
- pub name_span: Range<O>,
+ pub attributes: AttributeMap,
}
/// An HTML end/close tag, such as `</p>` or `</a>`.
#[derive(Debug, Eq, PartialEq)]
-pub struct EndTag<O> {
+pub struct EndTag {
/// The tag name.
/// Uppercase ASCII characters (A-Z) have been converted to lowercase.
pub name: String,
-
- /// The source code span of the tag.
- pub span: Range<O>,
-
- /// The span of the tag name.
- pub name_span: Range<O>,
-}
-
-/// An HTML comment.
-#[derive(PartialEq, Eq, Debug)]
-pub struct Comment<O> {
- /// The text within the comment.
- pub data: String,
- /// The source offset of the comment data.
- pub data_span: Range<O>,
-}
-
-impl<O: Offset> Comment<O> {
- /// Returns the span for the comment data.
- pub fn data_span(&self) -> Range<O> {
- self.data_span.clone()
- }
}
/// A doctype. Some examples:
@@ -85,7 +55,7 @@ impl<O: Offset> Comment<O> {
/// * `<!DOCTYPE {name} SYSTEM '{system_id}'>`
/// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>`
#[derive(Debug, Eq, PartialEq)]
-pub struct Doctype<O> {
+pub struct Doctype {
/// The [force-quirks flag].
///
/// [force-quirks flag]: https://html.spec.whatwg.org/multipage/parsing.html#force-quirks-flag
@@ -100,38 +70,6 @@ pub struct Doctype<O> {
/// The doctype's system identifier.
pub system_id: Option<String>,
-
- /// The source code span of the doctype.
- pub span: Range<O>,
-
- /// The span of the name.
- pub(crate) name_span: Range<O>,
-
- /// The span of the public identifier.
- pub(crate) public_id_span: Range<O>,
-
- /// The span of the system identifier.
- pub(crate) system_id_span: Range<O>,
-}
-
-impl<O: Offset> Doctype<O> {
- /// Returns the span of the name.
- pub fn name_span(&self) -> Option<Range<O>> {
- self.name.as_ref()?;
- Some(self.name_span.clone())
- }
-
- /// Returns the span of the public identifier.
- pub fn public_id_span(&self) -> Option<Range<O>> {
- self.public_id.as_ref()?;
- Some(self.public_id_span.clone())
- }
-
- /// Returns the span of the system identifier.
- pub fn system_id_span(&self) -> Option<Range<O>> {
- self.system_id.as_ref()?;
- Some(self.system_id_span.clone())
- }
}
/// A map of HTML attributes.
@@ -143,79 +81,81 @@ impl<O: Offset> Doctype<O> {
///
/// ```
/// # use html5tokenizer::attr::AttributeMap;
-/// let attrs: AttributeMap<()> = vec![("href".into(), "http://example.com".into())]
+/// let attrs: AttributeMap = vec![("href".into(), "http://example.com".into())]
/// .into_iter()
/// .collect();
/// assert_eq!(&attrs["href"], "http://example.com");
/// ```
#[derive(Debug, Default, PartialEq, Eq)]
-pub struct AttributeMap<O> {
- pub(crate) inner: BTreeMap<String, AttrInternal<O>>,
+pub struct AttributeMap {
+ pub(crate) inner: BTreeMap<String, AttrInternal>,
}
/// The value type internally used by the [`AttributeMap`].
/// Not part of the public API.
-#[derive(Default, Debug, Eq, PartialEq)]
-pub(crate) struct AttrInternal<O> {
+#[derive(Default, Debug, Eq)]
+pub(crate) struct AttrInternal {
pub value: String,
- /// The span of the attribute name.
- pub name_span: Range<O>,
- /// The span of the attribute value.
- /// For the empty attribute syntax this is just `O::default()..O::default()`.
- /// We intentionally don't use `Option<Range<O>>` here to spare us a byte (and padding) per attribute.
- pub value_span: Range<O>,
- pub value_syntax: Option<AttrValueSyntax>,
+ pub trace_idx: Option<AttributeTraceIdx>,
}
-/// The syntax of the attribute value.
-#[derive(Clone, Copy, PartialEq, Eq, Debug)]
-pub enum AttrValueSyntax {
- /// An unquoted attribute value, e.g. `id=foo`.
- Unquoted,
- /// A single-quoted attribute value, e.g. `id='foo'`.
- SingleQuoted,
- /// A double-quoted attribute value, e.g. `id="foo"`.
- DoubleQuoted,
+/// The index of an [`AttributeTrace`] within an [`AttributeTraceList`].
+///
+/// [`AttributeTrace`]: crate::trace::AttributeTrace
+/// [`AttributeTraceList`]: crate::trace::AttributeTraceList
+#[derive(Clone, Copy, Eq, PartialEq, Debug)]
+pub struct AttributeTraceIdx(
+ // Using NonZeroUsize so that `Option<AttributeTraceIdx>`
+ // has the same size as `AttributeTraceIdx`.
+ pub std::num::NonZeroUsize,
+);
+
+impl PartialEq for AttrInternal {
+ fn eq(&self, other: &Self) -> bool {
+ // We intentionally don't include the trace_idx,
+ // so that PartialEq of Token only compares semantics.
+ self.value == other.value
+ }
}
/// An HTML attribute borrowed from an [`AttributeMap`].
#[derive(Debug, Eq, PartialEq)]
-pub struct Attribute<'a, O> {
+pub struct Attribute<'a> {
name: &'a str,
- map_val: &'a AttrInternal<O>,
+ map_val: &'a AttrInternal,
}
/// An owned HTML attribute.
#[derive(Debug, PartialEq, Eq)]
-pub struct AttributeOwned<O> {
+pub struct AttributeOwned {
/// The attribute name.
/// Uppercase ASCII characters (A-Z) have been converted to lowercase.
pub name: String,
/// The attribute value. Character references have been resolved.
pub value: String,
- /// The span of the attribute name.
- pub name_span: Range<O>,
- /// The span of the attribute value.
- /// `None` in case of the empty attribute syntax (e.g. `disabled` in `<input disabled>`).
- pub value_span: Option<Range<O>>,
- /// The syntax of the attribute value.
- /// `None` indicates the empty attribute syntax (e.g. `disabled` in `<input disabled>`).
- pub value_syntax: Option<AttrValueSyntax>,
+ /// The index of the corresponding [`AttributeTrace`] in the
+ /// `attribute_traces` field of [`StartTagTrace`], in case this attribute
+ /// was present in the source and the [`Emitter`] has tracked this.
+ ///
+ /// [`AttributeTrace`]: super::trace::AttributeTrace
+ /// [`StartTagTrace`]: super::trace::StartTagTrace
+ /// [`Emitter`]: super::Emitter
+ pub trace_idx: Option<AttributeTraceIdx>,
}
-impl<O> AttributeMap<O> {
+impl AttributeMap {
/// Returns the attribute with the given name.
///
/// The name must not contain any uppercase ASCII character (A-Z)
/// or the method will always return `None`.
- pub fn get(&self, name: &str) -> Option<Attribute<O>> {
+ pub fn get(&self, name: &str) -> Option<Attribute> {
self.inner
.get_key_value(name)
.map(|(name, map_val)| Attribute { name, map_val })
}
}
-impl<'a, O: Offset> Attribute<'a, O> {
+impl<'a> Attribute<'a> {
/// Returns the attribute name.
/// Uppercase ASCII characters (A-Z) have been converted to lowercase.
pub fn name(&self) -> &'a str {
@@ -227,32 +167,21 @@ impl<'a, O: Offset> Attribute<'a, O> {
&self.map_val.value
}
- /// Returns the span of the attribute name.
- pub fn name_span(&self) -> Range<O> {
- self.map_val.name_span.clone()
- }
-
- /// For explicitly defined values returns the span of the attribute value.
- ///
- /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`).
- pub fn value_span(&self) -> Option<Range<O>> {
- if self.map_val.value_syntax.is_none() {
- return None;
- }
- Some(self.map_val.value_span.clone())
- }
-
- /// Returns the attribute value syntax in case the value is explicitly defined.
+ /// Returns the index of the corresponding [`AttributeTrace`] in the
+ /// `attribute_traces` field of [`StartTagTrace`], in case this attribute
+ /// was present in the source and the [`Emitter`] has tracked that.
///
- /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`).
- pub fn value_syntax(&self) -> Option<AttrValueSyntax> {
- self.map_val.value_syntax
+ /// [`AttributeTrace`]: super::trace::AttributeTrace
+ /// [`StartTagTrace`]: super::trace::StartTagTrace
+ /// [`Emitter`]: super::Emitter
+ pub fn trace_idx(&self) -> Option<AttributeTraceIdx> {
+ self.map_val.trace_idx
}
}
// We cannot impl Index<Output=Attribute> because Index::index returns a reference of
// the Output type (and you cannot return a value referencing a temporary value).
-impl<O> Index<&str> for AttributeMap<O> {
+impl Index<&str> for AttributeMap {
type Output = str;
/// Returns the attribute value with the given name.
@@ -264,10 +193,10 @@ impl<O> Index<&str> for AttributeMap<O> {
}
}
-impl<O> IntoIterator for AttributeMap<O> {
- type Item = AttributeOwned<O>;
+impl IntoIterator for AttributeMap {
+ type Item = AttributeOwned;
- type IntoIter = AttrIntoIter<O>;
+ type IntoIter = AttrIntoIter;
fn into_iter(self) -> Self::IntoIter {
AttrIntoIter(self.inner.into_iter())
@@ -275,27 +204,25 @@ impl<O> IntoIterator for AttributeMap<O> {
}
/// A consuming iterator over the attributes of an [`AttributeMap`].
-pub struct AttrIntoIter<O>(btree_map::IntoIter<String, AttrInternal<O>>);
+pub struct AttrIntoIter(btree_map::IntoIter<String, AttrInternal>);
-impl<O> Iterator for AttrIntoIter<O> {
- type Item = AttributeOwned<O>;
+impl Iterator for AttrIntoIter {
+ type Item = AttributeOwned;
fn next(&mut self) -> Option<Self::Item> {
let (name, map_val) = self.0.next()?;
Some(AttributeOwned {
name,
value: map_val.value,
- name_span: map_val.name_span,
- value_span: map_val.value_syntax.is_some().then_some(map_val.value_span),
- value_syntax: map_val.value_syntax,
+ trace_idx: map_val.trace_idx,
})
}
}
-impl<'a, O> IntoIterator for &'a AttributeMap<O> {
- type Item = Attribute<'a, O>;
+impl<'a> IntoIterator for &'a AttributeMap {
+ type Item = Attribute<'a>;
- type IntoIter = AttrIter<'a, O>;
+ type IntoIter = AttrIter<'a>;
fn into_iter(self) -> Self::IntoIter {
AttrIter(self.inner.iter())
@@ -303,10 +230,10 @@ impl<'a, O> IntoIterator for &'a AttributeMap<O> {
}
/// A borrowed iterator over the attributes of an [`AttributeMap`].
-pub struct AttrIter<'a, S>(btree_map::Iter<'a, String, AttrInternal<S>>);
+pub struct AttrIter<'a>(btree_map::Iter<'a, String, AttrInternal>);
-impl<'a, S> Iterator for AttrIter<'a, S> {
- type Item = Attribute<'a, S>;
+impl<'a> Iterator for AttrIter<'a> {
+ type Item = Attribute<'a>;
fn next(&mut self) -> Option<Self::Item> {
let (name, map_val) = self.0.next()?;
@@ -314,7 +241,7 @@ impl<'a, S> Iterator for AttrIter<'a, S> {
}
}
-impl<O: Default> FromIterator<(String, String)> for AttributeMap<O> {
+impl FromIterator<(String, String)> for AttributeMap {
fn from_iter<T: IntoIterator<Item = (String, String)>>(iter: T) -> Self {
Self {
inner: iter
@@ -324,9 +251,7 @@ impl<O: Default> FromIterator<(String, String)> for AttributeMap<O> {
name,
AttrInternal {
value,
- name_span: O::default()..O::default(),
- value_span: O::default()..O::default(),
- value_syntax: Some(AttrValueSyntax::DoubleQuoted),
+ trace_idx: None,
},
)
})
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index d0e2eaf..decd4df 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -20,6 +20,7 @@ pub use machine::State as InternalState;
/// let emitter = DefaultEmitter::default();
/// let html = "<script><b>";
/// let mut tokens = Tokenizer::new(html, emitter).flatten();
+/// let mut tokens = tokens.map(|event| match event { Event::Token((token, _)) => Event::Token(token), Event::CdataOpen => Event::CdataOpen }); // TODO: remove once BasicEmitter can be used instead
/// assert!(matches!(tokens.next(), Some(Event::Token(Token::StartTag(_)))));
/// assert!(matches!(tokens.next(), Some(Event::Token(Token::StartTag(_)))));
/// ```
diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs
index 8b09aa7..9aaac73 100644
--- a/src/tokenizer/machine.rs
+++ b/src/tokenizer/machine.rs
@@ -2,8 +2,8 @@ mod utils;
use crate::entities::try_read_character_reference;
use crate::offset::{Offset, Position};
-use crate::token::AttrValueSyntax;
use crate::tokenizer::CdataAction;
+use crate::trace::AttrValueSyntax;
use crate::{reader::Reader, Emitter, Error};
use utils::{
ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat,
diff --git a/src/trace.rs b/src/trace.rs
new file mode 100644
index 0000000..a816429
--- /dev/null
+++ b/src/trace.rs
@@ -0,0 +1,241 @@
+//! Provides the [`Trace`] type (byte offsets and syntax information about tokens).
+
+use std::{
+ num::NonZeroUsize,
+ ops::{Index, Range},
+};
+
+use crate::let_else::assume;
+use crate::token::AttributeTraceIdx;
+
+/// Provides byte offsets and syntax information about a [`Token`].
+///
+/// [`Token`]: crate::token::Token
+#[allow(missing_docs)]
+#[derive(Eq, PartialEq, Debug)]
+pub enum Trace {
+ Char,
+ StartTag(StartTagTrace),
+ EndTag(EndTagTrace),
+ Comment(CommentTrace),
+ Doctype(DoctypeTrace),
+ EndOfFile,
+}
+
+/// Provides byte offsets and syntax information for a [`StartTag`] token.
+///
+/// [`StartTag`]: crate::token::StartTag
+#[derive(Eq, PartialEq, Debug)]
+pub struct StartTagTrace {
+ /// The span of the tag.
+ pub span: Range<usize>,
+
+ /// The span of the tag name.
+ pub name_span: Range<usize>,
+
+ /// List of [`AttributeTrace`]s for the attributes that were present in the source.
+ pub attribute_traces: AttributeTraceList,
+}
+
+/// Provides byte offsets for an [`EndTag`] token.
+///
+/// [`EndTag`]: crate::token::EndTag
+#[derive(Eq, PartialEq, Debug)]
+pub struct EndTagTrace {
+ /// The span of the tag.
+ pub span: Range<usize>,
+
+ /// The span of the tag name.
+ pub name_span: Range<usize>,
+}
+
+/// Provides byte offsets for a [`Token::Comment`].
+///
+/// [`Token::Comment`]: crate::token::Token::Comment
+#[derive(Eq, PartialEq, Debug)]
+pub struct CommentTrace {
+ /// The span of the comment data.
+ pub data_span: Range<usize>,
+}
+
+/// Provides byte offsets for a [`Doctype`] token.
+///
+/// [`Doctype`]: crate::token::Doctype
+#[derive(Eq, PartialEq, Debug)]
+pub struct DoctypeTrace {
+ pub(crate) span: Range<usize>,
+ // Using NonZeroUsize to optimize the size of the struct.
+ name_span: Option<Range<std::num::NonZeroUsize>>,
+ public_id_span: Option<Range<std::num::NonZeroUsize>>,
+ system_id_span: Option<Range<std::num::NonZeroUsize>>,
+}
+
+impl DoctypeTrace {
+ /// Returns the span of the DOCTYPE.
+ pub fn span(&self) -> Range<usize> {
+ self.span.clone()
+ }
+
+ /// Returns the span of the name.
+ pub fn name_span(&self) -> Option<Range<usize>> {
+ self.name_span
+ .as_ref()
+ .map(|range| range.start.get()..range.end.get())
+ }
+
+ /// Returns the span of the public identifier.
+ pub fn public_id_span(&self) -> Option<Range<usize>> {
+ self.public_id_span
+ .as_ref()
+ .map(|range| range.start.get()..range.end.get())
+ }
+
+ /// Returns the span of the system identifier.
+ pub fn system_id_span(&self) -> Option<Range<usize>> {
+ self.system_id_span
+ .as_ref()
+ .map(|range| range.start.get()..range.end.get())
+ }
+}
+
+/// Internal [`DoctypeTrace`] methods.
+///
+/// Note that even though it stands to reason that the offsets provided to the `set_`
+/// methods can never be zero, we intentionally don't use `new_unchecked` since
+/// actually verifying that the offsets provided to the respective Emitter methods can
+/// never be zero would be non-trivial (since the tokenizer state machine has 80 states).
+impl DoctypeTrace {
+ #[inline]
+ pub(crate) fn new(span_start: usize) -> Self {
+ Self {
+ span: span_start..0,
+ name_span: None,
+ public_id_span: None,
+ system_id_span: None,
+ }
+ }
+
+ #[inline]
+ pub(crate) fn set_name_start(&mut self, start: usize) {
+ let start = NonZeroUsize::new(start).expect("expected offset to be non-zero");
+ self.name_span = Some(start..start);
+ }
+
+ #[inline]
+ pub(crate) fn set_public_id_start(&mut self, start: usize) {
+ let start = NonZeroUsize::new(start).expect("expected offset to be non-zero");
+ self.public_id_span = Some(start..start);
+ }
+
+ #[inline]
+ pub(crate) fn set_system_id_start(&mut self, start: usize) {
+ let start = NonZeroUsize::new(start).expect("expected offset to be non-zero");
+ self.system_id_span = Some(start..start);
+ }
+
+ #[inline]
+ pub(crate) fn set_name_end(&mut self, end: usize) {
+ assume!(Some(span), &mut self.name_span);
+ span.end = NonZeroUsize::new(end).expect("expected offset to be non-zero");
+ }
+
+ #[inline]
+ pub(crate) fn set_public_id_end(&mut self, end: usize) {
+ assume!(Some(span), &mut self.public_id_span);
+ span.end = NonZeroUsize::new(end).expect("expected offset to be non-zero");
+ }
+
+ #[inline]
+ pub(crate) fn set_system_id_end(&mut self, end: usize) {
+ assume!(Some(span), &mut self.system_id_span);
+ span.end = NonZeroUsize::new(end).expect("expected offset to be non-zero");
+ }
+}
+
+/// The syntax of the attribute value.
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub enum AttrValueSyntax {
+ /// An unquoted attribute value, e.g. `id=foo`.
+ Unquoted,
+ /// A single-quoted attribute value, e.g. `id='foo'`.
+ SingleQuoted,
+ /// A double-quoted attribute value, e.g. `id="foo"`.
+ DoubleQuoted,
+}
+
+/// Provides byte offsets and the [`AttrValueSyntax`] for an attribute that was present in the source.
+#[derive(Eq, PartialEq, Debug)]
+pub struct AttributeTrace {
+ pub(crate) value_syntax: Option<AttrValueSyntax>,
+ pub(crate) name_span: Range<usize>,
+ /// We intentionally don't use `Option<Range<usize>>` here to spare us a byte (and padding) per attribute.
+ /// For the empty attribute syntax this is just `0..0`.
+ pub(crate) value_span: Range<usize>,
+}
+
+impl AttributeTrace {
+ /// [`AttributeTrace`] intentionally doesn't implement Default
+ /// (since it's part of the public API and it wouldn't make sense semantically).
+ pub(crate) fn new() -> Self {
+ Self {
+ value_syntax: None,
+ name_span: Default::default(),
+ value_span: Default::default(),
+ }
+ }
+
+ /// Returns the span of the attribute name.
+ pub fn name_span(&self) -> Range<usize> {
+ self.name_span.clone()
+ }
+
+ /// For explicitly defined values returns the span of the attribute value.
+ ///
+ /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`).
+ pub fn value_span(&self) -> Option<Range<usize>> {
+ if self.value_syntax.is_none() {
+ return None;
+ }
+ Some(self.value_span.clone())
+ }
+
+ /// Returns the attribute value syntax in case the value is explicitly defined.
+ ///
+ /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`).
+ pub fn value_syntax(&self) -> Option<AttrValueSyntax> {
+ self.value_syntax
+ }
+}
+
+/// List of [`AttributeTrace`]s for the attributes that were present in the source.
+#[derive(Eq, PartialEq, Debug)]
+pub struct AttributeTraceList {
+ /// We don't use `HashMap<String, AttributeTrace>` since this would require
+ /// the attribute names to be cloned (which would be less efficient).
+ traces: Vec<AttributeTrace>,
+}
+
+impl Index<AttributeTraceIdx> for AttributeTraceList {
+ type Output = AttributeTrace;
+
+ fn index(&self, index: AttributeTraceIdx) -> &Self::Output {
+ &self.traces[index.0.get() - 1]
+ }
+}
+
+impl AttributeTraceList {
+ pub(crate) fn new() -> Self {
+ Self {
+ traces: Default::default(),
+ }
+ }
+
+ pub(crate) fn insert(&mut self, trace: AttributeTrace) -> AttributeTraceIdx {
+ self.traces.push(trace);
+ let len = self.traces.len();
+ AttributeTraceIdx(
+ // SAFETY: len cannot be zero because we push before calling Vec::len.
+ unsafe { std::num::NonZeroUsize::new_unchecked(len) },
+ )
+ }
+}
diff --git a/src/tracing_emitter.rs b/src/tracing_emitter.rs
index 76b20bf..408e832 100644
--- a/src/tracing_emitter.rs
+++ b/src/tracing_emitter.rs
@@ -3,20 +3,25 @@ use std::collections::BTreeSet;
use std::collections::VecDeque;
use std::ops::Range;
-use crate::let_else::assume;
-use crate::token::{AttrValueSyntax, Comment, Doctype, EndTag, StartTag};
+use crate::let_else::{assume, know};
+use crate::token::{Doctype, EndTag, StartTag, Token};
+use crate::trace::AttributeTrace;
+use crate::trace::AttributeTraceList;
+use crate::trace::{
+ AttrValueSyntax, CommentTrace, DoctypeTrace, EndTagTrace, StartTagTrace, Trace,
+};
use crate::Emitter;
use crate::Error;
-type Token = crate::token::Token<usize>;
-
/// The default implementation of [`Emitter`], used to produce tokens.
pub struct TracingEmitter {
current_token: Option<Token>,
+ current_trace: Option<Trace>,
current_attribute_name: String,
- current_attr_internal: crate::token::AttrInternal<usize>,
+ current_attr_internal: crate::token::AttrInternal,
+ current_attribute_trace: crate::trace::AttributeTrace,
seen_attributes: BTreeSet<String>,
- emitted_tokens: VecDeque<Token>,
+ emitted_tokens: VecDeque<(Token, Trace)>,
errors: VecDeque<(Error, Range<usize>)>,
attr_in_end_tag_span: Option<Range<usize>>,
}
@@ -25,8 +30,10 @@ impl Default for TracingEmitter {
fn default() -> Self {
TracingEmitter {
current_token: None,
+ current_trace: None,
current_attribute_name: String::new(),
current_attr_internal: Default::default(),
+ current_attribute_trace: crate::trace::AttributeTrace::new(),
seen_attributes: BTreeSet::new(),
emitted_tokens: VecDeque::new(),
errors: VecDeque::new(),
@@ -43,7 +50,7 @@ impl TracingEmitter {
}
impl Iterator for TracingEmitter {
- type Item = Token;
+ type Item = (Token, Trace);
fn next(&mut self) -> Option<Self::Item> {
self.emitted_tokens.pop_back()
@@ -56,27 +63,32 @@ impl Emitter<usize> for TracingEmitter {
}
fn emit_char(&mut self, c: char) {
- self.emit_token(Token::Char(c));
+ self.emit_token(Token::Char(c), Trace::Char);
}
fn emit_eof(&mut self) {
- self.emit_token(Token::EndOfFile);
+ self.emit_token(Token::EndOfFile, Trace::EndOfFile);
}
fn init_start_tag(&mut self, tag_offset: usize, name_offset: usize) {
self.current_token = Some(Token::StartTag(StartTag {
- span: tag_offset..0,
self_closing: false,
name: String::new(),
attributes: Default::default(),
+ }));
+ self.current_trace = Some(Trace::StartTag(StartTagTrace {
+ span: tag_offset..0,
name_span: name_offset..0,
+ attribute_traces: AttributeTraceList::new(),
}));
}
fn init_end_tag(&mut self, tag_offset: usize, name_offset: usize) {
self.current_token = Some(Token::EndTag(EndTag {
- span: tag_offset..0,
name: String::new(),
+ }));
+ self.current_trace = Some(Trace::EndTag(EndTagTrace {
+ span: tag_offset..0,
name_span: name_offset..0,
}));
self.seen_attributes.clear();
@@ -93,17 +105,17 @@ impl Emitter<usize> for TracingEmitter {
fn terminate_tag_name(&mut self, offset: usize) {
assume!(
Some(
- Token::StartTag(StartTag { name_span, .. })
- | Token::EndTag(EndTag { name_span, .. })
+ Trace::StartTag(StartTagTrace { name_span, .. })
+ | Trace::EndTag(EndTagTrace { name_span, .. })
),
- &mut self.current_token
+ &mut self.current_trace
);
name_span.end = offset;
}
fn init_attribute_name(&mut self, offset: usize) {
self.flush_current_attribute();
- self.current_attr_internal.name_span.start = offset;
+ self.current_attribute_trace.name_span.start = offset;
}
fn push_attribute_name(&mut self, s: &str) {
@@ -111,12 +123,12 @@ impl Emitter<usize> for TracingEmitter {
}
fn terminate_attribute_name(&mut self, offset: usize) {
- self.current_attr_internal.name_span.end = offset;
+ self.current_attribute_trace.name_span.end = offset;
}
fn init_attribute_value(&mut self, syntax: AttrValueSyntax, offset: usize) {
- self.current_attr_internal.value_span.start = offset;
- self.current_attr_internal.value_syntax = Some(syntax);
+ self.current_attribute_trace.value_span.start = offset;
+ self.current_attribute_trace.value_syntax = Some(syntax);
}
fn push_attribute_value(&mut self, s: &str) {
@@ -124,7 +136,7 @@ impl Emitter<usize> for TracingEmitter {
}
fn terminate_attribute_value(&mut self, offset: usize) {
- self.current_attr_internal.value_span.end = offset;
+ self.current_attribute_trace.value_span.end = offset;
}
fn set_self_closing(&mut self, slash_span: Range<usize>) {
@@ -144,43 +156,47 @@ impl Emitter<usize> for TracingEmitter {
fn emit_current_tag(&mut self, offset: usize) {
self.flush_current_attribute();
let mut token = self.current_token.take().unwrap();
+ let mut trace = self.current_trace.take().unwrap();
match &mut token {
- Token::EndTag(tag) => {
+ Token::EndTag(_) => {
if !self.seen_attributes.is_empty() {
let span = self.attr_in_end_tag_span.take().unwrap();
self.report_error(Error::EndTagWithAttributes, span);
}
self.seen_attributes.clear();
- tag.span.end = offset;
+ know!(Trace::EndTag(tag_trace), &mut trace);
+ tag_trace.span.end = offset;
}
- Token::StartTag(tag) => {
- tag.span.end = offset;
+ Token::StartTag(_) => {
+ know!(Trace::StartTag(tag_trace), &mut trace);
+ tag_trace.span.end = offset;
}
other => {
debug_assert!(false, "unexpected current_token: {other:?}");
return;
}
}
- self.emit_token(token);
+ self.emit_token(token, trace);
}
fn init_comment(&mut self, data_start_offset: usize) {
- self.current_token = Some(Token::Comment(Comment {
- data: String::new(),
+ self.current_token = Some(Token::Comment(String::new()));
+ self.current_trace = Some(Trace::Comment(CommentTrace {
data_span: data_start_offset..0,
}));
}
fn push_comment(&mut self, s: &str) {
- assume!(Some(Token::Comment(comment)), &mut self.current_token);
- comment.data.push_str(s);
+ assume!(Some(Token::Comment(data)), &mut self.current_token);
+ data.push_str(s);
}
fn emit_current_comment(&mut self, data_end_offset: usize) {
- let mut token = self.current_token.take().unwrap();
- assume!(Token::Comment(comment), &mut token);
- comment.data_span.end = data_end_offset;
- self.emit_token(token);
+ let token = self.current_token.take().unwrap();
+ let mut trace = self.current_trace.take().unwrap();
+ assume!(Trace::Comment(comment_trace), &mut trace);
+ comment_trace.data_span.end = data_end_offset;
+ self.emit_token(token, trace);
}
fn init_doctype(&mut self, offset: usize) {
@@ -189,17 +205,15 @@ impl Emitter<usize> for TracingEmitter {
force_quirks: false,
public_id: None,
system_id: None,
- span: offset..0,
- name_span: 0..0,
- public_id_span: 0..0,
- system_id_span: 0..0,
}));
+ self.current_trace = Some(Trace::Doctype(DoctypeTrace::new(offset)));
}
fn init_doctype_name(&mut self, offset: usize) {
assume!(Some(Token::Doctype(doctype)), &mut self.current_token);
doctype.name = Some("".into());
- doctype.name_span.start = offset;
+ know!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace);
+ doctype_trace.set_name_start(offset);
}
fn push_doctype_name(&mut self, s: &str) {
@@ -214,14 +228,15 @@ impl Emitter<usize> for TracingEmitter {
}
fn terminate_doctype_name(&mut self, offset: usize) {
- assume!(Some(Token::Doctype(doctype)), &mut self.current_token);
- doctype.name_span.end = offset;
+ assume!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace);
+ doctype_trace.set_name_end(offset);
}
fn init_doctype_public_id(&mut self, offset: usize) {
assume!(Some(Token::Doctype(doctype)), &mut self.current_token);
doctype.public_id = Some("".to_owned());
- doctype.public_id_span.start = offset;
+ know!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace);
+ doctype_trace.set_public_id_start(offset);
}
fn push_doctype_public_id(&mut self, s: &str) {
@@ -236,14 +251,15 @@ impl Emitter<usize> for TracingEmitter {
}
fn terminate_doctype_public_id(&mut self, offset: usize) {
- assume!(Some(Token::Doctype(doctype)), &mut self.current_token);
- doctype.public_id_span.end = offset;
+ assume!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace);
+ doctype_trace.set_public_id_end(offset);
}
fn init_doctype_system_id(&mut self, offset: usize) {
assume!(Some(Token::Doctype(doctype)), &mut self.current_token);
doctype.system_id = Some("".to_owned());
- doctype.system_id_span.start = offset;
+ know!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace);
+ doctype_trace.set_system_id_start(offset);
}
fn push_doctype_system_id(&mut self, s: &str) {
@@ -258,8 +274,8 @@ impl Emitter<usize> for TracingEmitter {
}
fn terminate_doctype_system_id(&mut self, offset: usize) {
- assume!(Some(Token::Doctype(doctype)), &mut self.current_token);
- doctype.system_id_span.end = offset;
+ assume!(Some(Trace::Doctype(doctype_trace)), &mut self.current_trace);
+ doctype_trace.set_system_id_end(offset);
}
fn set_force_quirks(&mut self) {
@@ -268,15 +284,17 @@ impl Emitter<usize> for TracingEmitter {
}
fn emit_current_doctype(&mut self, offset: usize) {
- assume!(Some(Token::Doctype(mut doctype)), self.current_token.take());
- doctype.span.end = offset;
- self.emit_token(Token::Doctype(doctype));
+ assume!(Some(mut trace), self.current_trace.take());
+ assume!(Trace::Doctype(doctype_trace), &mut trace);
+ doctype_trace.span.end = offset;
+ let token = self.current_token.take().unwrap();
+ self.emit_token(token, trace);
}
}
impl TracingEmitter {
- fn emit_token(&mut self, token: Token) {
- self.emitted_tokens.push_front(token);
+ fn emit_token(&mut self, token: Token, trace: Trace) {
+ self.emitted_tokens.push_front((token, trace));
}
fn flush_current_attribute(&mut self) {
@@ -284,21 +302,26 @@ impl TracingEmitter {
return;
}
let name = std::mem::take(&mut self.current_attribute_name);
- let attr_internal = std::mem::take(&mut self.current_attr_internal);
+ let mut attr_internal = std::mem::take(&mut self.current_attr_internal);
+ let attr_trace =
+ std::mem::replace(&mut self.current_attribute_trace, AttributeTrace::new());
match &mut self.current_token {
Some(Token::StartTag(tag)) => match tag.attributes.inner.entry(name) {
Entry::Vacant(vacant) => {
+ know!(Some(Trace::StartTag(trace)), &mut self.current_trace);
+ let trace_idx = trace.attribute_traces.insert(attr_trace);
+ attr_internal.trace_idx = Some(trace_idx);
vacant.insert(attr_internal);
}
Entry::Occupied(_) => {
- self.report_error(Error::DuplicateAttribute, attr_internal.name_span);
+ self.report_error(Error::DuplicateAttribute, attr_trace.name_span);
}
},
Some(Token::EndTag(_)) => {
- self.attr_in_end_tag_span = Some(attr_internal.name_span.clone());
+ self.attr_in_end_tag_span = Some(attr_trace.name_span.clone());
if !self.seen_attributes.insert(name) {
- self.report_error(Error::DuplicateAttribute, attr_internal.name_span);
+ self.report_error(Error::DuplicateAttribute, attr_trace.name_span);
}
}
other => debug_assert!(false, "unexpected current_token: {other:?}"),
@@ -306,6 +329,12 @@ impl TracingEmitter {
}
}
+impl From<(Token, Trace)> for Token {
+ fn from((token, _): (Token, Trace)) -> Self {
+ token
+ }
+}
+
/// The majority of our testing of the [`TracingEmitter`] is done against the
/// html5lib-tests in the html5lib integration test. This module only tests
/// details that aren't present in the html5lib test data.
@@ -313,8 +342,8 @@ impl TracingEmitter {
mod tests {
use super::TracingEmitter;
use crate::offset::PosTrackingReader;
- use crate::token::{AttrValueSyntax, Token};
- use crate::{Event, Tokenizer};
+ use crate::trace::{AttrValueSyntax, Trace};
+ use crate::{Event, Token, Tokenizer};
#[test]
fn test_attribute_value_syntax() {
@@ -325,7 +354,9 @@ mod tests {
TracingEmitter::default(),
)
.flatten();
- let Event::Token(Token::StartTag(tag)) = tokenizer.next().unwrap() else {
+ let Event::Token((Token::StartTag(tag), Trace::StartTag(tag_trace))) =
+ tokenizer.next().unwrap()
+ else {
panic!("expected start tag");
};
for (name, syntax) in [
@@ -334,8 +365,9 @@ mod tests {
("single-quoted", Some(AttrValueSyntax::SingleQuoted)),
("double-quoted", Some(AttrValueSyntax::DoubleQuoted)),
] {
+ let attr_trace_idx = tag.attributes.get(name).unwrap().trace_idx().unwrap();
assert_eq!(
- tag.attributes.get(name).unwrap().value_syntax(),
+ tag_trace.attribute_traces[attr_trace_idx].value_syntax(),
syntax,
"unexpected value for attribute {name}"
);