diff options
Diffstat (limited to 'src/tokenizer/mod.rs')
-rw-r--r-- | src/tokenizer/mod.rs | 267 |
1 files changed, 185 insertions, 82 deletions
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 2d5e1ed..4511cf8 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -59,7 +59,8 @@ fn option_push(opt_str: &mut Option<String>, c: char) { #[derive(Clone)] pub struct TokenizerOpts { /// Report all parse errors described in the spec, at some - /// performance penalty? Default: false + /// performance penalty? Defaults to false, except when the + /// `spans` feature is enabled in which case it defaults to true. pub exact_errors: bool, /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning @@ -82,7 +83,7 @@ pub struct TokenizerOpts { impl Default for TokenizerOpts { fn default() -> TokenizerOpts { TokenizerOpts { - exact_errors: false, + exact_errors: cfg!(feature = "spans"), discard_bom: true, profile: false, initial_state: None, @@ -162,6 +163,36 @@ pub struct Tokenizer<Sink> { /// Track current line current_line: u64, + + #[cfg(feature = "spans")] + spans: Spans, +} + +#[cfg(feature = "spans")] +#[derive(Default)] +struct Spans { + /// Track current byte position + current_pos: usize, + + /// Current tag name span. + current_tag_name: core::ops::Range<usize>, + + /// Current attribute name span. + current_attr_name: core::ops::Range<usize>, + + /// Current attribute value span. + current_attr_value: core::ops::Range<usize>, +} + +#[cfg(feature = "spans")] +impl Spans { + fn end_tag_name(&mut self) { + self.current_tag_name.end = self.current_pos - 1; + } + + fn end_attr_name(&mut self) { + self.current_attr_name.end = self.current_pos - 1; + } } impl<Sink: TokenSink> Tokenizer<Sink> { @@ -193,6 +224,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> { state_profile: BTreeMap::new(), time_in_sink: 0, current_line: 1, + #[cfg(feature = "spans")] + spans: Spans::default(), } } @@ -263,6 +296,11 @@ impl<Sink: TokenSink> Tokenizer<Sink> { self.emit_error(Cow::Owned(msg)); } + #[cfg(feature = "spans")] + { + self.spans.current_pos += c.len_utf8(); + } + self.current_char = c; Some(c) } @@ -430,6 +468,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> { name, self_closing: self.current_tag_self_closing, attrs: replace(&mut self.current_tag_attrs, vec![]), + #[cfg(feature = "spans")] + name_span: self.spans.current_tag_name.clone(), }); match self.process_token(token) { @@ -516,6 +556,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> { self.current_tag_attrs.push(Attribute { name: name, value: replace(&mut self.current_attr_value, String::new()), + #[cfg(feature = "spans")] + name_span: self.spans.current_attr_name.clone(), + #[cfg(feature = "spans")] + value_span: self.spans.current_attr_value.clone(), }); } } @@ -830,6 +874,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> { } c => match lower_ascii_letter(c) { Some(cl) => { + #[cfg(feature = "spans")] + { + self.spans.current_tag_name.start = self.spans.current_pos - 1; + } go!(self: create_tag StartTag cl); return go!(self: to TagName); } @@ -858,6 +906,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> { } c => match lower_ascii_letter(c) { Some(cl) => { + #[cfg(feature = "spans")] + { + self.spans.current_tag_name.start = self.spans.current_pos - 1; + } go!(self: create_tag EndTag cl); return go!(self: to TagName); } @@ -875,12 +927,18 @@ impl<Sink: TokenSink> Tokenizer<Sink> { states::TagName => loop { match get_char!(self, input)? { '\t' | '\n' | '\x0C' | ' ' => { + #[cfg(feature = "spans")] + self.spans.end_tag_name(); return go!(self: to BeforeAttributeName); } '/' => { + #[cfg(feature = "spans")] + self.spans.end_tag_name(); return go!(self: to SelfClosingStartTag); } '>' => { + #[cfg(feature = "spans")] + self.spans.end_tag_name(); return go!(self: emit_tag Data); } '\0' => { @@ -1168,6 +1226,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> { c => match lower_ascii_letter(c) { Some(cl) => { go!(self: create_attr cl); + #[cfg(feature = "spans")] + { + self.spans.current_attr_name.start = self.spans.current_pos - 1; + } return go!(self: to AttributeName); } None => { @@ -1186,15 +1248,23 @@ impl<Sink: TokenSink> Tokenizer<Sink> { states::AttributeName => loop { match get_char!(self, input)? { '\t' | '\n' | '\x0C' | ' ' => { + #[cfg(feature = "spans")] + self.spans.end_attr_name(); return go!(self: to AfterAttributeName); } '/' => { + #[cfg(feature = "spans")] + self.spans.end_attr_name(); return go!(self: to SelfClosingStartTag); } '=' => { + #[cfg(feature = "spans")] + self.spans.end_attr_name(); return go!(self: to BeforeAttributeValue); } '>' => { + #[cfg(feature = "spans")] + self.spans.end_attr_name(); return go!(self: emit_tag Data); } '\0' => { @@ -1285,101 +1355,134 @@ impl<Sink: TokenSink> Tokenizer<Sink> { }, //§ attribute-value-(double-quoted)-state - states::AttributeValue(DoubleQuoted) => loop { - match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n'))? { - FromSet('"') => { - return go!(self: to AfterAttributeValueQuoted); - } - FromSet('&') => { - return go!(self: consume_char_ref '"'); - } - FromSet('\0') => { - go!(self: error); - go!(self: push_value '\u{fffd}'); - } - FromSet(c) => { - go!(self: push_value c); - } - NotFromSet(ref b) => { - go!(self: append_value b); + states::AttributeValue(DoubleQuoted) => { + #[cfg(feature = "spans")] + { + self.spans.current_attr_value.start = self.spans.current_pos; + } + loop { + match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n'))? { + FromSet('"') => { + return go!(self: to AfterAttributeValueQuoted); + } + FromSet('&') => { + return go!(self: consume_char_ref '"'); + } + FromSet('\0') => { + go!(self: error); + go!(self: push_value '\u{fffd}'); + } + FromSet(c) => { + go!(self: push_value c); + } + NotFromSet(ref b) => { + go!(self: append_value b); + } } } - }, + } //§ attribute-value-(single-quoted)-state - states::AttributeValue(SingleQuoted) => loop { - match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n'))? { - FromSet('\'') => { - return go!(self: to AfterAttributeValueQuoted); - } - FromSet('&') => { - return go!(self: consume_char_ref '\''); - } - FromSet('\0') => { - go!(self: error); - go!(self: push_value '\u{fffd}'); - } - FromSet(c) => { - go!(self: push_value c); - } - NotFromSet(ref b) => { - go!(self: append_value b); + states::AttributeValue(SingleQuoted) => { + #[cfg(feature = "spans")] + { + self.spans.current_attr_value.start = self.spans.current_pos; + } + loop { + match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n'))? { + FromSet('\'') => { + return go!(self: to AfterAttributeValueQuoted); + } + FromSet('&') => { + return go!(self: consume_char_ref '\''); + } + FromSet('\0') => { + go!(self: error); + go!(self: push_value '\u{fffd}'); + } + FromSet(c) => { + go!(self: push_value c); + } + NotFromSet(ref b) => { + go!(self: append_value b); + } } } - }, + } //§ attribute-value-(unquoted)-state - states::AttributeValue(Unquoted) => loop { - match pop_except_from!( - self, - input, - small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0') - )? { - FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => { - return go!(self: to BeforeAttributeName); - } - FromSet('&') => { - return go!(self: consume_char_ref '>'); - } - FromSet('>') => { - return go!(self: emit_tag Data); - } - FromSet('\0') => { - go!(self: error); - go!(self: push_value '\u{fffd}'); - } - FromSet(c) => { - go_match!(self: c, + states::AttributeValue(Unquoted) => { + #[cfg(feature = "spans")] + { + self.spans.current_attr_value.start = self.spans.current_pos; + } + loop { + match pop_except_from!( + self, + input, + small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0') + )? { + FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => { + #[cfg(feature = "spans")] + { + self.spans.current_attr_value.end = self.spans.current_pos - 1; + } + return go!(self: to BeforeAttributeName); + } + FromSet('&') => { + return go!(self: consume_char_ref '>'); + } + FromSet('>') => { + #[cfg(feature = "spans")] + { + self.spans.current_attr_value.end = self.spans.current_pos - 1; + } + return go!(self: emit_tag Data); + } + FromSet('\0') => { + go!(self: error); + go!(self: push_value '\u{fffd}'); + } + FromSet(c) => { + go_match!(self: c, '"' , '\'' , '<' , '=' , '`' => error); - { - go!(self: push_value c); - }; - } - NotFromSet(ref b) => { - go!(self: append_value b); + { + go!(self: push_value c); + }; + } + NotFromSet(ref b) => { + go!(self: append_value b); + } } } - }, + } //§ after-attribute-value-(quoted)-state - states::AfterAttributeValueQuoted => loop { - match get_char!(self, input)? { - '\t' | '\n' | '\x0C' | ' ' => { - return go!(self: to BeforeAttributeName); - } - '/' => { - return go!(self: to SelfClosingStartTag); - } - '>' => { - return go!(self: emit_tag Data); - } - _ => { - go!(self: error); - self.reconsume = true; - return go!(self: to BeforeAttributeName); + states::AfterAttributeValueQuoted => { + #[cfg(feature = "spans")] + { + self.spans.current_attr_value.end = self.spans.current_pos - 1; + } + + loop { + match get_char!(self, input)? { + '\t' | '\n' | '\x0C' | ' ' => { + return go!(self: to BeforeAttributeName); + } + '/' => { + return go!(self: to SelfClosingStartTag); + } + '>' => { + return go!(self: emit_tag Data); + } + _ => { + go!(self: error); + self.reconsume = true; + return go!(self: to BeforeAttributeName); + } } } - }, + } //§ self-closing-start-tag-state states::SelfClosingStartTag => loop { |