From ae5d8185a5b419f89d520504c1cb4c59c26879bf Mon Sep 17 00:00:00 2001 From: Martin Fischer Date: Sat, 12 Aug 2023 08:15:45 +0200 Subject: feat: make attribute value syntax recognizable Note that while making this breaking change, we're also swapping the parameter order for more consistency so that the reader parameter always comes last in Emitter methods. --- src/attr.rs | 24 ++++++++++++++++++++++++ src/emitter.rs | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- src/machine.rs | 7 ++++--- 3 files changed, 80 insertions(+), 7 deletions(-) (limited to 'src') diff --git a/src/attr.rs b/src/attr.rs index 4c7e330..d062a84 100644 --- a/src/attr.rs +++ b/src/attr.rs @@ -34,6 +34,18 @@ pub(crate) struct AttrInternal { pub name_offset: O, /// The start offset of the attribute value. pub value_offset: O, + pub value_syntax: Option<AttrValueSyntax>, +} + +/// The syntax of the attribute value. +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum AttrValueSyntax { + /// An unquoted attribute value, e.g. `id=foo`. + Unquoted, + /// A single-quoted attribute value, e.g. `id='foo'`. + SingleQuoted, + /// A double-quoted attribute value, e.g. `id="foo"`. + DoubleQuoted, } /// An HTML attribute borrowed from an [`AttributeMap`]. @@ -54,6 +66,9 @@ pub struct AttributeOwned { pub name_offset: O, /// The start offset of the attribute value. pub value_offset: O, // TODO: wrap this in an Option once we can recognize the empty attribute syntax + /// The syntax of the attribute value. + /// `None` indicates the empty attribute syntax (e.g. `disabled` in `<input disabled>`). + pub value_syntax: Option<AttrValueSyntax>, } impl AttributeMap { @@ -85,6 +100,13 @@ impl<'a, O: Offset> Attribute<'a, O> { pub fn value_span(&self) -> Range<O> { self.map_val.value_offset..self.map_val.value_offset + self.map_val.value.len() } + + /// Returns the attribute value syntax in case the value is explicitly defined. + /// + /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`).
+ pub fn value_syntax(&self) -> Option<AttrValueSyntax> { + self.map_val.value_syntax + } } // We cannot impl Index because Index::index returns a reference of @@ -120,6 +142,7 @@ impl Iterator for AttrIntoIter { value: map_val.value, name_offset: map_val.name_offset, value_offset: map_val.value_offset, + value_syntax: map_val.value_syntax, }) } } @@ -158,6 +181,7 @@ impl FromIterator<(String, String)> for AttributeMap { value, name_offset: O::default(), value_offset: O::default(), + value_syntax: Some(AttrValueSyntax::DoubleQuoted), }, ) }) diff --git a/src/emitter.rs b/src/emitter.rs index 17a4882..63ef4b1 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -4,6 +4,7 @@ use std::collections::VecDeque; use std::mem; use std::ops::Range; +use crate::attr::AttrValueSyntax; use crate::offset::NoopOffset; use crate::offset::Offset; use crate::Error; @@ -121,11 +122,10 @@ pub trait Emitter { fn init_attribute_name(&mut self, offset: O); /// Called before the first push_attribute_value call. - /// If the value is wrappend in double or single quotes `quoted` is set to true, otherwise false. /// /// If there is no current attribute, this method may panic. #[allow(unused_variables)] - fn init_attribute_value(&mut self, offset: O, quoted: bool) {} + fn init_attribute_value(&mut self, syntax: AttrValueSyntax, offset: O) {} /// Append a string to the current attribute's name.
/// @@ -385,11 +385,14 @@ impl Emitter for DefaultEmitter { name_offset: offset, value: String::new(), value_offset: O::default(), + value_syntax: None, }, )); } - fn init_attribute_value(&mut self, offset: O, _quoted: bool) { - self.current_attribute.as_mut().unwrap().1.value_offset = offset; + fn init_attribute_value(&mut self, syntax: AttrValueSyntax, offset: O) { + let (_, current_attribute) = self.current_attribute.as_mut().unwrap(); + current_attribute.value_offset = offset; + current_attribute.value_syntax = Some(syntax); } fn push_attribute_name(&mut self, s: &str) { @@ -554,3 +557,48 @@ pub enum Token { span: Range, }, } + +/// The majority of our testing of the [`DefaultEmitter`] is done against the +/// html5lib-tests in the html5lib integration test. This module only tests +/// details that aren't present in the html5lib test data. +#[cfg(test)] +mod tests { + use super::{DefaultEmitter, Token}; + use crate::{attr::AttrValueSyntax, Tokenizer}; + + #[test] + fn test_attribute_value_syntax() { + let mut tokenizer = Tokenizer::new( + "<div empty unquoted=foo single-quoted='foo' double-quoted=\"foo\">",
+ DefaultEmitter::default(), + ) + .flatten(); + let Token::StartTag(start_tag) = tokenizer.next().unwrap() else { + panic!("expected start tag"); + }; + assert_eq!( + start_tag.attributes.get("empty").unwrap().value_syntax(), + None + ); + assert_eq!( + start_tag.attributes.get("unquoted").unwrap().value_syntax(), + Some(AttrValueSyntax::Unquoted) + ); + assert_eq!( + start_tag + .attributes + .get("single-quoted") + .unwrap() + .value_syntax(), + Some(AttrValueSyntax::SingleQuoted) + ); + assert_eq!( + start_tag + .attributes + .get("double-quoted") + .unwrap() + .value_syntax(), + Some(AttrValueSyntax::DoubleQuoted) + ); + } +} diff --git a/src/machine.rs b/src/machine.rs index a58a754..ccd3052 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -1,3 +1,4 @@ +use crate::attr::AttrValueSyntax; use crate::entities::try_read_character_reference; use crate::offset::{Offset, Position}; use crate::utils::{ @@ -757,13 +758,13 @@ where Some(whitespace_pat!()) => Ok(ControlToken::Continue), Some('"') => { slf.emitter - .init_attribute_value(slf.reader.position(), true); + .init_attribute_value(AttrValueSyntax::DoubleQuoted, slf.reader.position()); slf.state = State::AttributeValueDoubleQuoted; Ok(ControlToken::Continue) } Some('\'') => { slf.emitter - .init_attribute_value(slf.reader.position(), true); + .init_attribute_value(AttrValueSyntax::SingleQuoted, slf.reader.position()); slf.state = State::AttributeValueSingleQuoted; Ok(ControlToken::Continue) } @@ -775,7 +776,7 @@ where } c => { slf.emitter - .init_attribute_value(slf.reader.position() - 1, false); + .init_attribute_value(AttrValueSyntax::Unquoted, slf.reader.position() - 1); slf.state = State::AttributeValueUnquoted; slf.unread_char(c); Ok(ControlToken::Continue) -- cgit v1.2.3