summary refs log tree commit diff
path: root/src/tokenizer
diff options
context:
space:
mode:
author Martin Fischer <martin@push-f.com> 2021-11-30 07:28:21 +0100
committer Martin Fischer <martin@push-f.com> 2021-11-30 11:22:35 +0100
commit baf1477c587fe22d27e94408cf2505d588ba007e (patch)
tree f3e027e3c149cfeb7187a625756ea4b2de47c82a /src/tokenizer
parent 25087cce997abc386f881648dfd39c83dfef7667 (diff)
add spans feature
Diffstat (limited to 'src/tokenizer')
-rw-r--r-- src/tokenizer/interface.rs 27
-rw-r--r-- src/tokenizer/mod.rs 267
2 files changed, 210 insertions, 84 deletions
diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs
index 2c6cc38..f12fb16 100644
--- a/src/tokenizer/interface.rs
+++ b/src/tokenizer/interface.rs
@@ -9,6 +9,8 @@
use crate::tokenizer::states;
use std::borrow::Cow;
+#[cfg(feature = "spans")]
+use std::ops::Range;
pub use self::TagKind::{EndTag, StartTag};
pub use self::Token::{CharacterTokens, CommentToken, DoctypeToken, TagToken};
@@ -47,12 +49,30 @@ pub enum TagKind {
/// The tokenizer creates all attributes this way, but the tree
/// builder will adjust certain attribute names inside foreign
/// content (MathML, SVG).
-#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug)]
+#[derive(PartialEq, Eq, Clone, Debug)]
pub struct Attribute {
/// The name of the attribute (e.g. the `class` in `<div class="test">`)
pub name: String,
/// The value of the attribute (e.g. the `"test"` in `<div class="test">`)
pub value: String,
+ #[cfg(feature = "spans")]
+ #[cfg_attr(docsrs, doc(cfg(feature = "spans")))]
+ pub name_span: Range<usize>, // byte-offset range of the attribute name, copied from the tokenizer's span tracker when the attribute is pushed
+ #[cfg(feature = "spans")]
+ #[cfg_attr(docsrs, doc(cfg(feature = "spans")))]
+ pub value_span: Range<usize>, // byte-offset range of the attribute value, copied from the tokenizer's span tracker when the attribute is pushed
+}
+
+impl Ord for Attribute { // Hand-written so that ordering looks only at `name` and `value`, never at the span fields.
+ fn cmp(&self, other: &Self) -> std::cmp::Ordering { // NOTE(review): the derived PartialEq/Eq also compares the span fields when `spans` is enabled, so this can return Equal for attributes that are not `==` — confirm that is intended.
+ (&self.name, &self.value).cmp(&(&other.name, &other.value)) // lexicographic: name first, value as the tie-breaker
+ }
+}
+
+// Partial ordering for `Attribute`: always `Some`, and by construction
+// identical to the total order defined by `Ord::cmp`.
+impl PartialOrd for Attribute {
+ fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+ // Delegate to `cmp` instead of repeating the (name, value) key, so the
+ // two orderings cannot drift apart if the comparison key ever changes.
+ Some(self.cmp(other))
+ }
}
/// A tag token.
@@ -62,11 +82,14 @@ pub struct Tag {
pub name: String,
pub self_closing: bool,
pub attrs: Vec<Attribute>,
+ #[cfg(feature = "spans")]
+ #[cfg_attr(docsrs, doc(cfg(feature = "spans")))]
+ pub name_span: Range<usize>,
}
impl Tag {
/// Are the tags equivalent when we don't care about attribute order?
- /// Also ignores the self-closing flag.
+ /// Also ignores the self-closing flag and spans.
pub fn equiv_modulo_attr_order(&self, other: &Tag) -> bool {
if (self.kind != other.kind) || (self.name != other.name) {
return false;
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 2d5e1ed..4511cf8 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -59,7 +59,8 @@ fn option_push(opt_str: &mut Option<String>, c: char) {
#[derive(Clone)]
pub struct TokenizerOpts {
/// Report all parse errors described in the spec, at some
- /// performance penalty? Default: false
+ /// performance penalty? Defaults to false, except when the
+ /// `spans` feature is enabled in which case it defaults to true.
pub exact_errors: bool,
/// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
@@ -82,7 +83,7 @@ pub struct TokenizerOpts {
impl Default for TokenizerOpts {
fn default() -> TokenizerOpts {
TokenizerOpts {
- exact_errors: false,
+ exact_errors: cfg!(feature = "spans"),
discard_bom: true,
profile: false,
initial_state: None,
@@ -162,6 +163,36 @@ pub struct Tokenizer<Sink> {
/// Track current line
current_line: u64,
+
+ #[cfg(feature = "spans")]
+ spans: Spans,
+}
+
+#[cfg(feature = "spans")]
+#[derive(Default)]
+struct Spans { // Position bookkeeping for the `spans` feature: the running input offset plus the ranges of the tag and attribute currently being built.
+ /// Track current byte position
+ current_pos: usize, // advanced by `c.len_utf8()` where the tokenizer stores each character it has read
+
+ /// Current tag name span.
+ current_tag_name: core::ops::Range<usize>, // cloned into `Tag::name_span` when the tag token is built
+
+ /// Current attribute name span.
+ current_attr_name: core::ops::Range<usize>, // cloned into `Attribute::name_span` when the attribute is pushed
+
+ /// Current attribute value span.
+ current_attr_value: core::ops::Range<usize>, // cloned into `Attribute::value_span` when the attribute is pushed
+}
+
+#[cfg(feature = "spans")]
+impl Spans {
+ fn end_tag_name(&mut self) { // Closes the tag-name span; in this patch it is called from the TagName state right after a delimiter (whitespace, `/` or `>`) has been read.
+ self.current_tag_name.end = self.current_pos - 1; // `current_pos` already counts the one-byte ASCII delimiter, so step back over it; relies on at least one character having been read
+ }
+
+ fn end_attr_name(&mut self) { // Closes the attribute-name span; in this patch it is called from the AttributeName state right after a delimiter (whitespace, `/`, `=` or `>`) has been read.
+ self.current_attr_name.end = self.current_pos - 1; // same one-byte step back over the delimiter as in `end_tag_name`
+ }
}
impl<Sink: TokenSink> Tokenizer<Sink> {
@@ -193,6 +224,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
state_profile: BTreeMap::new(),
time_in_sink: 0,
current_line: 1,
+ #[cfg(feature = "spans")]
+ spans: Spans::default(),
}
}
@@ -263,6 +296,11 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
self.emit_error(Cow::Owned(msg));
}
+ #[cfg(feature = "spans")]
+ {
+ self.spans.current_pos += c.len_utf8();
+ }
+
self.current_char = c;
Some(c)
}
@@ -430,6 +468,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
name,
self_closing: self.current_tag_self_closing,
attrs: replace(&mut self.current_tag_attrs, vec![]),
+ #[cfg(feature = "spans")]
+ name_span: self.spans.current_tag_name.clone(),
});
match self.process_token(token) {
@@ -516,6 +556,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
self.current_tag_attrs.push(Attribute {
name: name,
value: replace(&mut self.current_attr_value, String::new()),
+ #[cfg(feature = "spans")]
+ name_span: self.spans.current_attr_name.clone(),
+ #[cfg(feature = "spans")]
+ value_span: self.spans.current_attr_value.clone(),
});
}
}
@@ -830,6 +874,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
}
c => match lower_ascii_letter(c) {
Some(cl) => {
+ #[cfg(feature = "spans")]
+ {
+ self.spans.current_tag_name.start = self.spans.current_pos - 1;
+ }
go!(self: create_tag StartTag cl);
return go!(self: to TagName);
}
@@ -858,6 +906,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
}
c => match lower_ascii_letter(c) {
Some(cl) => {
+ #[cfg(feature = "spans")]
+ {
+ self.spans.current_tag_name.start = self.spans.current_pos - 1;
+ }
go!(self: create_tag EndTag cl);
return go!(self: to TagName);
}
@@ -875,12 +927,18 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
states::TagName => loop {
match get_char!(self, input)? {
'\t' | '\n' | '\x0C' | ' ' => {
+ #[cfg(feature = "spans")]
+ self.spans.end_tag_name();
return go!(self: to BeforeAttributeName);
}
'/' => {
+ #[cfg(feature = "spans")]
+ self.spans.end_tag_name();
return go!(self: to SelfClosingStartTag);
}
'>' => {
+ #[cfg(feature = "spans")]
+ self.spans.end_tag_name();
return go!(self: emit_tag Data);
}
'\0' => {
@@ -1168,6 +1226,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
c => match lower_ascii_letter(c) {
Some(cl) => {
go!(self: create_attr cl);
+ #[cfg(feature = "spans")]
+ {
+ self.spans.current_attr_name.start = self.spans.current_pos - 1;
+ }
return go!(self: to AttributeName);
}
None => {
@@ -1186,15 +1248,23 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
states::AttributeName => loop {
match get_char!(self, input)? {
'\t' | '\n' | '\x0C' | ' ' => {
+ #[cfg(feature = "spans")]
+ self.spans.end_attr_name();
return go!(self: to AfterAttributeName);
}
'/' => {
+ #[cfg(feature = "spans")]
+ self.spans.end_attr_name();
return go!(self: to SelfClosingStartTag);
}
'=' => {
+ #[cfg(feature = "spans")]
+ self.spans.end_attr_name();
return go!(self: to BeforeAttributeValue);
}
'>' => {
+ #[cfg(feature = "spans")]
+ self.spans.end_attr_name();
return go!(self: emit_tag Data);
}
'\0' => {
@@ -1285,101 +1355,134 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
},
//§ attribute-value-(double-quoted)-state
- states::AttributeValue(DoubleQuoted) => loop {
- match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n'))? {
- FromSet('"') => {
- return go!(self: to AfterAttributeValueQuoted);
- }
- FromSet('&') => {
- return go!(self: consume_char_ref '"');
- }
- FromSet('\0') => {
- go!(self: error);
- go!(self: push_value '\u{fffd}');
- }
- FromSet(c) => {
- go!(self: push_value c);
- }
- NotFromSet(ref b) => {
- go!(self: append_value b);
+ states::AttributeValue(DoubleQuoted) => {
+ #[cfg(feature = "spans")]
+ {
+ self.spans.current_attr_value.start = self.spans.current_pos;
+ }
+ loop {
+ match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n'))? {
+ FromSet('"') => {
+ return go!(self: to AfterAttributeValueQuoted);
+ }
+ FromSet('&') => {
+ return go!(self: consume_char_ref '"');
+ }
+ FromSet('\0') => {
+ go!(self: error);
+ go!(self: push_value '\u{fffd}');
+ }
+ FromSet(c) => {
+ go!(self: push_value c);
+ }
+ NotFromSet(ref b) => {
+ go!(self: append_value b);
+ }
}
}
- },
+ }
//§ attribute-value-(single-quoted)-state
- states::AttributeValue(SingleQuoted) => loop {
- match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n'))? {
- FromSet('\'') => {
- return go!(self: to AfterAttributeValueQuoted);
- }
- FromSet('&') => {
- return go!(self: consume_char_ref '\'');
- }
- FromSet('\0') => {
- go!(self: error);
- go!(self: push_value '\u{fffd}');
- }
- FromSet(c) => {
- go!(self: push_value c);
- }
- NotFromSet(ref b) => {
- go!(self: append_value b);
+ states::AttributeValue(SingleQuoted) => {
+ #[cfg(feature = "spans")]
+ {
+ self.spans.current_attr_value.start = self.spans.current_pos;
+ }
+ loop {
+ match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n'))? {
+ FromSet('\'') => {
+ return go!(self: to AfterAttributeValueQuoted);
+ }
+ FromSet('&') => {
+ return go!(self: consume_char_ref '\'');
+ }
+ FromSet('\0') => {
+ go!(self: error);
+ go!(self: push_value '\u{fffd}');
+ }
+ FromSet(c) => {
+ go!(self: push_value c);
+ }
+ NotFromSet(ref b) => {
+ go!(self: append_value b);
+ }
}
}
- },
+ }
//§ attribute-value-(unquoted)-state
- states::AttributeValue(Unquoted) => loop {
- match pop_except_from!(
- self,
- input,
- small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
- )? {
- FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
- return go!(self: to BeforeAttributeName);
- }
- FromSet('&') => {
- return go!(self: consume_char_ref '>');
- }
- FromSet('>') => {
- return go!(self: emit_tag Data);
- }
- FromSet('\0') => {
- go!(self: error);
- go!(self: push_value '\u{fffd}');
- }
- FromSet(c) => {
- go_match!(self: c,
+ states::AttributeValue(Unquoted) => {
+ #[cfg(feature = "spans")]
+ {
+ self.spans.current_attr_value.start = self.spans.current_pos;
+ }
+ loop {
+ match pop_except_from!(
+ self,
+ input,
+ small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
+ )? {
+ FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
+ #[cfg(feature = "spans")]
+ {
+ self.spans.current_attr_value.end = self.spans.current_pos - 1;
+ }
+ return go!(self: to BeforeAttributeName);
+ }
+ FromSet('&') => {
+ return go!(self: consume_char_ref '>');
+ }
+ FromSet('>') => {
+ #[cfg(feature = "spans")]
+ {
+ self.spans.current_attr_value.end = self.spans.current_pos - 1;
+ }
+ return go!(self: emit_tag Data);
+ }
+ FromSet('\0') => {
+ go!(self: error);
+ go!(self: push_value '\u{fffd}');
+ }
+ FromSet(c) => {
+ go_match!(self: c,
'"' , '\'' , '<' , '=' , '`' => error);
- {
- go!(self: push_value c);
- };
- }
- NotFromSet(ref b) => {
- go!(self: append_value b);
+ {
+ go!(self: push_value c);
+ };
+ }
+ NotFromSet(ref b) => {
+ go!(self: append_value b);
+ }
}
}
- },
+ }
//§ after-attribute-value-(quoted)-state
- states::AfterAttributeValueQuoted => loop {
- match get_char!(self, input)? {
- '\t' | '\n' | '\x0C' | ' ' => {
- return go!(self: to BeforeAttributeName);
- }
- '/' => {
- return go!(self: to SelfClosingStartTag);
- }
- '>' => {
- return go!(self: emit_tag Data);
- }
- _ => {
- go!(self: error);
- self.reconsume = true;
- return go!(self: to BeforeAttributeName);
+ states::AfterAttributeValueQuoted => {
+ #[cfg(feature = "spans")]
+ {
+ self.spans.current_attr_value.end = self.spans.current_pos - 1;
+ }
+
+ loop {
+ match get_char!(self, input)? {
+ '\t' | '\n' | '\x0C' | ' ' => {
+ return go!(self: to BeforeAttributeName);
+ }
+ '/' => {
+ return go!(self: to SelfClosingStartTag);
+ }
+ '>' => {
+ return go!(self: emit_tag Data);
+ }
+ _ => {
+ go!(self: error);
+ self.reconsume = true;
+ return go!(self: to BeforeAttributeName);
+ }
}
}
- },
+ }
//§ self-closing-start-tag-state
states::SelfClosingStartTag => loop {