diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/lib.rs | 1 | ||||
-rw-r--r-- | src/tokenizer/interface.rs | 27 | ||||
-rw-r--r-- | src/tokenizer/mod.rs | 267 |
3 files changed, 211 insertions, 84 deletions
@@ -10,6 +10,7 @@ #![doc = include_str!("../README.md")] #![crate_type = "dylib"] #![cfg_attr(test, deny(warnings))] +#![cfg_attr(docsrs, feature(doc_cfg))] #![allow(unused_parens)] #[macro_use] diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs index 2c6cc38..f12fb16 100644 --- a/src/tokenizer/interface.rs +++ b/src/tokenizer/interface.rs @@ -9,6 +9,8 @@ use crate::tokenizer::states; use std::borrow::Cow; +#[cfg(feature = "spans")] +use std::ops::Range; pub use self::TagKind::{EndTag, StartTag}; pub use self::Token::{CharacterTokens, CommentToken, DoctypeToken, TagToken}; @@ -47,12 +49,30 @@ pub enum TagKind { /// The tokenizer creates all attributes this way, but the tree /// builder will adjust certain attribute names inside foreign /// content (MathML, SVG). -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug)] +#[derive(PartialEq, Eq, Clone, Debug)] pub struct Attribute { /// The name of the attribute (e.g. the `class` in `<div class="test">`) pub name: String, /// The value of the attribute (e.g. the `"test"` in `<div class="test">`) pub value: String, + #[cfg(feature = "spans")] + #[cfg_attr(docsrs, doc(cfg(feature = "spans")))] + pub name_span: Range<usize>, + #[cfg(feature = "spans")] + #[cfg_attr(docsrs, doc(cfg(feature = "spans")))] + pub value_span: Range<usize>, +} + +impl Ord for Attribute { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + (&self.name, &self.value).cmp(&(&other.name, &other.value)) + } +} + +impl PartialOrd for Attribute { + fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { + (&self.name, &self.value).partial_cmp(&(&other.name, &other.value)) + } } /// A tag token. @@ -62,11 +82,14 @@ pub struct Tag { pub name: String, pub self_closing: bool, pub attrs: Vec<Attribute>, + #[cfg(feature = "spans")] + #[cfg_attr(docsrs, doc(cfg(feature = "spans")))] + pub name_span: Range<usize>, } impl Tag { /// Are the tags equivalent when we don't care about attribute order? - /// Also ignores the self-closing flag. + /// Also ignores the self-closing flag and spans. pub fn equiv_modulo_attr_order(&self, other: &Tag) -> bool { if (self.kind != other.kind) || (self.name != other.name) { return false; diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 2d5e1ed..4511cf8 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -59,7 +59,8 @@ fn option_push(opt_str: &mut Option<String>, c: char) { #[derive(Clone)] pub struct TokenizerOpts { /// Report all parse errors described in the spec, at some - /// performance penalty? Default: false + /// performance penalty? Defaults to false, except when the + /// `spans` feature is enabled in which case it defaults to true. pub exact_errors: bool, /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning @@ -82,7 +83,7 @@ pub struct TokenizerOpts { impl Default for TokenizerOpts { fn default() -> TokenizerOpts { TokenizerOpts { - exact_errors: false, + exact_errors: cfg!(feature = "spans"), discard_bom: true, profile: false, initial_state: None, @@ -162,6 +163,36 @@ pub struct Tokenizer<Sink> { /// Track current line current_line: u64, + + #[cfg(feature = "spans")] + spans: Spans, +} + +#[cfg(feature = "spans")] +#[derive(Default)] +struct Spans { + /// Track current byte position + current_pos: usize, + + /// Current tag name span. + current_tag_name: core::ops::Range<usize>, + + /// Current attribute name span. + current_attr_name: core::ops::Range<usize>, + + /// Current attribute value span. + current_attr_value: core::ops::Range<usize>, +} + +#[cfg(feature = "spans")] +impl Spans { + fn end_tag_name(&mut self) { + self.current_tag_name.end = self.current_pos - 1; + } + + fn end_attr_name(&mut self) { + self.current_attr_name.end = self.current_pos - 1; + } } impl<Sink: TokenSink> Tokenizer<Sink> { @@ -193,6 +224,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> { state_profile: BTreeMap::new(), time_in_sink: 0, current_line: 1, + #[cfg(feature = "spans")] + spans: Spans::default(), } } @@ -263,6 +296,11 @@ impl<Sink: TokenSink> Tokenizer<Sink> { self.emit_error(Cow::Owned(msg)); } + #[cfg(feature = "spans")] + { + self.spans.current_pos += c.len_utf8(); + } + self.current_char = c; Some(c) } @@ -430,6 +468,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> { name, self_closing: self.current_tag_self_closing, attrs: replace(&mut self.current_tag_attrs, vec![]), + #[cfg(feature = "spans")] + name_span: self.spans.current_tag_name.clone(), }); match self.process_token(token) { @@ -516,6 +556,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> { self.current_tag_attrs.push(Attribute { name: name, value: replace(&mut self.current_attr_value, String::new()), + #[cfg(feature = "spans")] + name_span: self.spans.current_attr_name.clone(), + #[cfg(feature = "spans")] + value_span: self.spans.current_attr_value.clone(), }); } } @@ -830,6 +874,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> { } c => match lower_ascii_letter(c) { Some(cl) => { + #[cfg(feature = "spans")] + { + self.spans.current_tag_name.start = self.spans.current_pos - 1; + } go!(self: create_tag StartTag cl); return go!(self: to TagName); } @@ -858,6 +906,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> { } c => match lower_ascii_letter(c) { Some(cl) => { + #[cfg(feature = "spans")] + { + self.spans.current_tag_name.start = self.spans.current_pos - 1; + } go!(self: create_tag EndTag cl); return go!(self: to TagName); } @@ -875,12 +927,18 @@ impl<Sink: TokenSink> Tokenizer<Sink> { states::TagName => loop { match get_char!(self, input)? { '\t' | '\n' | '\x0C' | ' ' => { + #[cfg(feature = "spans")] + self.spans.end_tag_name(); return go!(self: to BeforeAttributeName); } '/' => { + #[cfg(feature = "spans")] + self.spans.end_tag_name(); return go!(self: to SelfClosingStartTag); } '>' => { + #[cfg(feature = "spans")] + self.spans.end_tag_name(); return go!(self: emit_tag Data); } '\0' => { @@ -1168,6 +1226,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> { c => match lower_ascii_letter(c) { Some(cl) => { go!(self: create_attr cl); + #[cfg(feature = "spans")] + { + self.spans.current_attr_name.start = self.spans.current_pos - 1; + } return go!(self: to AttributeName); } None => { @@ -1186,15 +1248,23 @@ impl<Sink: TokenSink> Tokenizer<Sink> { states::AttributeName => loop { match get_char!(self, input)? { '\t' | '\n' | '\x0C' | ' ' => { + #[cfg(feature = "spans")] + self.spans.end_attr_name(); return go!(self: to AfterAttributeName); } '/' => { + #[cfg(feature = "spans")] + self.spans.end_attr_name(); return go!(self: to SelfClosingStartTag); } '=' => { + #[cfg(feature = "spans")] + self.spans.end_attr_name(); return go!(self: to BeforeAttributeValue); } '>' => { + #[cfg(feature = "spans")] + self.spans.end_attr_name(); return go!(self: emit_tag Data); } '\0' => { @@ -1285,101 +1355,134 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }, //§ attribute-value-(double-quoted)-state - states::AttributeValue(DoubleQuoted) => loop { - match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n'))? { - FromSet('"') => { - return go!(self: to AfterAttributeValueQuoted); - } - FromSet('&') => { - return go!(self: consume_char_ref '"'); - } - FromSet('\0') => { - go!(self: error); - go!(self: push_value '\u{fffd}'); - } - FromSet(c) => { - go!(self: push_value c); - } - NotFromSet(ref b) => { - go!(self: append_value b); + states::AttributeValue(DoubleQuoted) => { + #[cfg(feature = "spans")] + { + self.spans.current_attr_value.start = self.spans.current_pos; + } + loop { + match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n'))? { + FromSet('"') => { + return go!(self: to AfterAttributeValueQuoted); + } + FromSet('&') => { + return go!(self: consume_char_ref '"'); + } + FromSet('\0') => { + go!(self: error); + go!(self: push_value '\u{fffd}'); + } + FromSet(c) => { + go!(self: push_value c); + } + NotFromSet(ref b) => { + go!(self: append_value b); + } } } - }, + } //§ attribute-value-(single-quoted)-state - states::AttributeValue(SingleQuoted) => loop { - match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n'))? { - FromSet('\'') => { - return go!(self: to AfterAttributeValueQuoted); - } - FromSet('&') => { - return go!(self: consume_char_ref '\''); - } - FromSet('\0') => { - go!(self: error); - go!(self: push_value '\u{fffd}'); - } - FromSet(c) => { - go!(self: push_value c); - } - NotFromSet(ref b) => { - go!(self: append_value b); + states::AttributeValue(SingleQuoted) => { + #[cfg(feature = "spans")] + { + self.spans.current_attr_value.start = self.spans.current_pos; + } + loop { + match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n'))? { + FromSet('\'') => { + return go!(self: to AfterAttributeValueQuoted); + } + FromSet('&') => { + return go!(self: consume_char_ref '\''); + } + FromSet('\0') => { + go!(self: error); + go!(self: push_value '\u{fffd}'); + } + FromSet(c) => { + go!(self: push_value c); + } + NotFromSet(ref b) => { + go!(self: append_value b); + } } } - }, + } //§ attribute-value-(unquoted)-state - states::AttributeValue(Unquoted) => loop { - match pop_except_from!( - self, - input, - small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0') - )? { - FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => { - return go!(self: to BeforeAttributeName); - } - FromSet('&') => { - return go!(self: consume_char_ref '>'); - } - FromSet('>') => { - return go!(self: emit_tag Data); - } - FromSet('\0') => { - go!(self: error); - go!(self: push_value '\u{fffd}'); - } - FromSet(c) => { - go_match!(self: c, + states::AttributeValue(Unquoted) => { + #[cfg(feature = "spans")] + { + self.spans.current_attr_value.start = self.spans.current_pos; + } + loop { + match pop_except_from!( + self, + input, + small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0') + )? { + FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => { + #[cfg(feature = "spans")] + { + self.spans.current_attr_value.end = self.spans.current_pos - 1; + } + return go!(self: to BeforeAttributeName); + } + FromSet('&') => { + return go!(self: consume_char_ref '>'); + } + FromSet('>') => { + #[cfg(feature = "spans")] + { + self.spans.current_attr_value.end = self.spans.current_pos - 1; + } + return go!(self: emit_tag Data); + } + FromSet('\0') => { + go!(self: error); + go!(self: push_value '\u{fffd}'); + } + FromSet(c) => { + go_match!(self: c, '"' , '\'' , '<' , '=' , '`' => error); - { - go!(self: push_value c); - }; - } - NotFromSet(ref b) => { - go!(self: append_value b); + { + go!(self: push_value c); + }; + } + NotFromSet(ref b) => { + go!(self: append_value b); + } } } - }, + } //§ after-attribute-value-(quoted)-state - states::AfterAttributeValueQuoted => loop { - match get_char!(self, input)? { - '\t' | '\n' | '\x0C' | ' ' => { - return go!(self: to BeforeAttributeName); - } - '/' => { - return go!(self: to SelfClosingStartTag); - } - '>' => { - return go!(self: emit_tag Data); - } - _ => { - go!(self: error); - self.reconsume = true; - return go!(self: to BeforeAttributeName); + states::AfterAttributeValueQuoted => { + #[cfg(feature = "spans")] + { + self.spans.current_attr_value.end = self.spans.current_pos - 1; + } + + loop { + match get_char!(self, input)? { + '\t' | '\n' | '\x0C' | ' ' => { + return go!(self: to BeforeAttributeName); + } + '/' => { + return go!(self: to SelfClosingStartTag); + } + '>' => { + return go!(self: emit_tag Data); + } + _ => { + go!(self: error); + self.reconsume = true; + return go!(self: to BeforeAttributeName); + } } } - }, + } //§ self-closing-start-tag-state states::SelfClosingStartTag => loop { |