From 65aca9cbf0318bd3a2f936641b4f5bc3729c98c2 Mon Sep 17 00:00:00 2001 From: Martin Fischer Date: Wed, 16 Aug 2023 09:45:18 +0200 Subject: break!: introduce AttributeMap This has a number of benefits: * it hides the implementation of the map * it hides the type used for the map values (which lets us e.g. change name_span to name_offset while still being able to provide a convenient `Attribute::name_span` method.) * it lets us provide convenience impls for the map such as `FromIterator<(String, String)>` --- integration_tests/tests/test_html5lib.rs | 2 +- src/attr.rs | 127 +++++++++++++++++++++++++++++-- src/emitter.rs | 9 +-- src/lib.rs | 3 +- tests/test_spans.rs | 8 +- 5 files changed, 129 insertions(+), 20 deletions(-) diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs index 0f96063..a624c30 100644 --- a/integration_tests/tests/test_html5lib.rs +++ b/integration_tests/tests/test_html5lib.rs @@ -129,7 +129,7 @@ fn run_test_inner( attributes: tag .attributes .into_iter() - .map(|(name, map_val)| (name, map_val.value)) + .map(|attr| (attr.name().to_owned(), attr.value().to_owned())) .collect(), self_closing: tag.self_closing, }), diff --git a/src/attr.rs b/src/attr.rs index d0d506e..9e4c984 100644 --- a/src/attr.rs +++ b/src/attr.rs @@ -1,14 +1,125 @@ -use std::ops::Range; +//! Types for HTML attributes. -/// A HTML attribute value (plus spans). +use std::collections::{btree_map, BTreeMap}; +use std::iter::FromIterator; +use std::ops::{Index, Range}; + +use crate::offset::Offset; + +/// A map of HTML attributes. +/// +/// Does not preserve the order of attributes. +/// Iterating always yields attributes in order by name. +/// +/// # Example +/// +/// ``` +/// # use html5tokenizer::attr::AttributeMap; +/// let attrs: AttributeMap<()> = vec![("href".into(), "http://example.com".into())] +/// .into_iter() +/// .collect(); +/// assert_eq!(&attrs["href"], "http://example.com"); +/// ``` +#[derive(Debug, Default, PartialEq, Eq)] +pub struct AttributeMap { + pub(crate) inner: BTreeMap>, +} + +/// The value type internally used by the [`AttributeMap`]. +/// Not part of the public API. #[derive(Debug, Eq, PartialEq)] -pub struct Attribute { - /// The value of the attribute. +pub(crate) struct AttrInternal { pub value: String, - - /// The source code span of the attribute name. pub name_span: Range, - - /// The source code span of the attribute value. pub value_span: Range, } + +/// An HTML attribute borrowed from an [`AttributeMap`]. +#[derive(Debug, Eq, PartialEq)] +pub struct Attribute<'a, O> { + name: &'a str, + map_val: &'a AttrInternal, +} + +impl AttributeMap { + /// Returns the attribute with the given name. + pub fn get(&self, name: &str) -> Option> { + self.inner + .get_key_value(name) + .map(|(name, map_val)| Attribute { name, map_val }) + } +} + +impl<'a, O: Offset> Attribute<'a, O> { + /// Returns the attribute name. + pub fn name(&self) -> &'a str { + self.name + } + + /// Returns the attribute value. + pub fn value(&self) -> &'a str { + &self.map_val.value + } + + /// Returns the span of the attribute name. + pub fn name_span(&self) -> Range { + self.map_val.name_span.clone() + } + + /// Returns the span of the attribute value. + pub fn value_span(&self) -> Range { + self.map_val.value_span.clone() + } +} + +// We cannot impl Index because Index::index returns a reference of +// the Output type (and you cannot return a value referencing a temporary value). +impl Index<&str> for AttributeMap { + type Output = str; + + fn index(&self, name: &str) -> &Self::Output { + &self.inner[name].value + } +} + +impl<'a, O> IntoIterator for &'a AttributeMap { + type Item = Attribute<'a, O>; + + type IntoIter = AttrIter<'a, O>; + + fn into_iter(self) -> Self::IntoIter { + AttrIter(self.inner.iter()) + } +} + +/// A borrowed iterator over the attributes of an [`AttributeMap`]. +pub struct AttrIter<'a, S>(btree_map::Iter<'a, String, AttrInternal>); + +impl<'a, S> Iterator for AttrIter<'a, S> { + type Item = Attribute<'a, S>; + + fn next(&mut self) -> Option { + let (name, map_val) = self.0.next()?; + Some(Attribute { name, map_val }) + } +} + +impl FromIterator<(String, String)> for AttributeMap { + fn from_iter>(iter: T) -> Self { + Self { + inner: iter + .into_iter() + .map(|(name, value)| { + ( + name, + AttrInternal { + value, + name_span: O::default()..O::default(), + value_span: O::default()..O::default(), + }, + ) + }) + .collect(), + } + } +} diff --git a/src/emitter.rs b/src/emitter.rs index d3258e2..8856589 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -1,5 +1,4 @@ use std::collections::btree_map::Entry; -use std::collections::BTreeMap; use std::collections::BTreeSet; use std::collections::VecDeque; use std::mem; @@ -179,7 +178,7 @@ pub trait Emitter { pub struct DefaultEmitter { current_characters: String, current_token: Option>, - current_attribute: Option<(String, crate::attr::Attribute)>, + current_attribute: Option<(String, crate::attr::AttrInternal)>, seen_attributes: BTreeSet, emitted_tokens: VecDeque>, attr_in_end_tag_span: Option>, @@ -210,7 +209,7 @@ impl DefaultEmitter { { if let Some((k, v)) = self.current_attribute.take() { match self.current_token { - Some(Token::StartTag(ref mut tag)) => match tag.attributes.entry(k) { + Some(Token::StartTag(ref mut tag)) => match tag.attributes.inner.entry(k) { Entry::Vacant(vacant) => { vacant.insert(v); } @@ -380,7 +379,7 @@ impl Emitter for DefaultEmitter { self.flush_current_attribute(); self.current_attribute = Some(( String::new(), - crate::attr::Attribute { + crate::attr::AttrInternal { name_span: offset..offset, value: String::new(), value_span: Range::default(), @@ -461,7 +460,7 @@ pub struct StartTag { /// /// Duplicate attributes are ignored after the first one as per WHATWG spec. Implement your own /// [`Emitter`] to tweak this behavior. - pub attributes: BTreeMap>, + pub attributes: crate::attr::AttributeMap, /// The source code span of the tag. pub span: Range, diff --git a/src/lib.rs b/src/lib.rs index 4f2cf9c..cbaf94d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,7 +3,7 @@ #![forbid(unsafe_code)] #![doc = include_str!("../README.md")] -mod attr; +pub mod attr; mod emitter; mod entities; mod error; @@ -16,7 +16,6 @@ mod utils; #[cfg(feature = "integration-tests")] pub use utils::State as InternalState; -pub use attr::Attribute; pub use emitter::{Comment, DefaultEmitter, Doctype, Emitter, EndTag, StartTag, Token}; pub use error::Error; pub use tokenizer::{State, Tokenizer}; diff --git a/tests/test_spans.rs b/tests/test_spans.rs index 33f5d11..99ff9ee 100644 --- a/tests/test_spans.rs +++ b/tests/test_spans.rs @@ -110,8 +110,8 @@ fn attribute_name_span() { let Token::StartTag(tag) = tokenizer(html).next().unwrap() else { panic!("expected start tag") }; - for (_name, attr) in tag.attributes { - labels.push((attr.name_span, "")); + for attr in &tag.attributes { + labels.push((attr.name_span(), "")); } assert_snapshot!(annotate(html, labels), @r###" @@ -126,8 +126,8 @@ fn attribute_value_span() { let Token::StartTag(tag) = tokenizer(html).next().unwrap() else { panic!("expected start tag") }; - for (_name, attr) in tag.attributes { - labels.push((attr.value_span, "")); + for attr in &tag.attributes { + labels.push((attr.value_span(), "")); } assert_snapshot!(annotate(html, labels), @r###" -- cgit v1.2.3