diff options
Diffstat (limited to 'src/tokenizer')
| -rw-r--r-- | src/tokenizer/interface.rs | 27 | ||||
| -rw-r--r-- | src/tokenizer/mod.rs | 267 | 
2 files changed, 210 insertions, 84 deletions
diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs index 2c6cc38..f12fb16 100644 --- a/src/tokenizer/interface.rs +++ b/src/tokenizer/interface.rs @@ -9,6 +9,8 @@  use crate::tokenizer::states;  use std::borrow::Cow; +#[cfg(feature = "spans")] +use std::ops::Range;  pub use self::TagKind::{EndTag, StartTag};  pub use self::Token::{CharacterTokens, CommentToken, DoctypeToken, TagToken}; @@ -47,12 +49,30 @@ pub enum TagKind {  /// The tokenizer creates all attributes this way, but the tree  /// builder will adjust certain attribute names inside foreign  /// content (MathML, SVG). -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug)] +#[derive(PartialEq, Eq, Clone, Debug)]  pub struct Attribute {      /// The name of the attribute (e.g. the `class` in `<div class="test">`)      pub name: String,      /// The value of the attribute (e.g. the `"test"` in `<div class="test">`)      pub value: String, +    #[cfg(feature = "spans")] +    #[cfg_attr(docsrs, doc(cfg(feature = "spans")))] +    pub name_span: Range<usize>, +    #[cfg(feature = "spans")] +    #[cfg_attr(docsrs, doc(cfg(feature = "spans")))] +    pub value_span: Range<usize>, +} + +impl Ord for Attribute { +    fn cmp(&self, other: &Self) -> std::cmp::Ordering { +        (&self.name, &self.value).cmp(&(&other.name, &other.value)) +    } +} + +impl PartialOrd for Attribute { +    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { +        (&self.name, &self.value).partial_cmp(&(&other.name, &other.value)) +    }  }  /// A tag token. @@ -62,11 +82,14 @@ pub struct Tag {      pub name: String,      pub self_closing: bool,      pub attrs: Vec<Attribute>, +    #[cfg(feature = "spans")] +    #[cfg_attr(docsrs, doc(cfg(feature = "spans")))] +    pub name_span: Range<usize>,  }  impl Tag {      /// Are the tags equivalent when we don't care about attribute order? -    /// Also ignores the self-closing flag. +    /// Also ignores the self-closing flag and spans.      pub fn equiv_modulo_attr_order(&self, other: &Tag) -> bool {          if (self.kind != other.kind) || (self.name != other.name) {              return false; diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 2d5e1ed..4511cf8 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -59,7 +59,8 @@ fn option_push(opt_str: &mut Option<String>, c: char) {  #[derive(Clone)]  pub struct TokenizerOpts {      /// Report all parse errors described in the spec, at some -    /// performance penalty?  Default: false +    /// performance penalty? Defaults to false, except when the +    /// `spans` feature is enabled in which case it defaults to true.      pub exact_errors: bool,      /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning @@ -82,7 +83,7 @@ pub struct TokenizerOpts {  impl Default for TokenizerOpts {      fn default() -> TokenizerOpts {          TokenizerOpts { -            exact_errors: false, +            exact_errors: cfg!(feature = "spans"),              discard_bom: true,              profile: false,              initial_state: None, @@ -162,6 +163,36 @@ pub struct Tokenizer<Sink> {      /// Track current line      current_line: u64, + +    #[cfg(feature = "spans")] +    spans: Spans, +} + +#[cfg(feature = "spans")] +#[derive(Default)] +struct Spans { +    /// Track current byte position +    current_pos: usize, + +    /// Current tag name span. +    current_tag_name: core::ops::Range<usize>, + +    /// Current attribute name span. +    current_attr_name: core::ops::Range<usize>, + +    /// Current attribute value span. +    current_attr_value: core::ops::Range<usize>, +} + +#[cfg(feature = "spans")] +impl Spans { +    fn end_tag_name(&mut self) { +        self.current_tag_name.end = self.current_pos - 1; +    } + +    fn end_attr_name(&mut self) { +        self.current_attr_name.end = self.current_pos - 1; +    }  }  impl<Sink: TokenSink> Tokenizer<Sink> { @@ -193,6 +224,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {              state_profile: BTreeMap::new(),              time_in_sink: 0,              current_line: 1, +            #[cfg(feature = "spans")] +            spans: Spans::default(),          }      } @@ -263,6 +296,11 @@ impl<Sink: TokenSink> Tokenizer<Sink> {              self.emit_error(Cow::Owned(msg));          } +        #[cfg(feature = "spans")] +        { +            self.spans.current_pos += c.len_utf8(); +        } +          self.current_char = c;          Some(c)      } @@ -430,6 +468,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {              name,              self_closing: self.current_tag_self_closing,              attrs: replace(&mut self.current_tag_attrs, vec![]), +            #[cfg(feature = "spans")] +            name_span: self.spans.current_tag_name.clone(),          });          match self.process_token(token) { @@ -516,6 +556,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {              self.current_tag_attrs.push(Attribute {                  name: name,                  value: replace(&mut self.current_attr_value, String::new()), +                #[cfg(feature = "spans")] +                name_span: self.spans.current_attr_name.clone(), +                #[cfg(feature = "spans")] +                value_span: self.spans.current_attr_value.clone(),              });          }      } @@ -830,6 +874,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {                      }                      c => match lower_ascii_letter(c) {                          Some(cl) => { +                            #[cfg(feature = "spans")] +                            { +                                self.spans.current_tag_name.start = self.spans.current_pos - 1; +                            }                              go!(self: create_tag StartTag cl);                              return go!(self: to TagName);                          } @@ -858,6 +906,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {                      }                      c => match lower_ascii_letter(c) {                          Some(cl) => { +                            #[cfg(feature = "spans")] +                            { +                                self.spans.current_tag_name.start = self.spans.current_pos - 1; +                            }                              go!(self: create_tag EndTag cl);                              return go!(self: to TagName);                          } @@ -875,12 +927,18 @@ impl<Sink: TokenSink> Tokenizer<Sink> {              states::TagName => loop {                  match get_char!(self, input)? {                      '\t' | '\n' | '\x0C' | ' ' => { +                        #[cfg(feature = "spans")] +                        self.spans.end_tag_name();                          return go!(self: to BeforeAttributeName);                      }                      '/' => { +                        #[cfg(feature = "spans")] +                        self.spans.end_tag_name();                          return go!(self: to SelfClosingStartTag);                      }                      '>' => { +                        #[cfg(feature = "spans")] +                        self.spans.end_tag_name();                          return go!(self: emit_tag Data);                      }                      '\0' => { @@ -1168,6 +1226,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {                      c => match lower_ascii_letter(c) {                          Some(cl) => {                              go!(self: create_attr cl); +                            #[cfg(feature = "spans")] +                            { +                                self.spans.current_attr_name.start = self.spans.current_pos - 1; +                            }                              return go!(self: to AttributeName);                          }                          None => { @@ -1186,15 +1248,23 @@ impl<Sink: TokenSink> Tokenizer<Sink> {              states::AttributeName => loop {                  match get_char!(self, input)? {                      '\t' | '\n' | '\x0C' | ' ' => { +                        #[cfg(feature = "spans")] +                        self.spans.end_attr_name();                          return go!(self: to AfterAttributeName);                      }                      '/' => { +                        #[cfg(feature = "spans")] +                        self.spans.end_attr_name();                          return go!(self: to SelfClosingStartTag);                      }                      '=' => { +                        #[cfg(feature = "spans")] +                        self.spans.end_attr_name();                          return go!(self: to BeforeAttributeValue);                      }                      '>' => { +                        #[cfg(feature = "spans")] +                        self.spans.end_attr_name();                          return go!(self: emit_tag Data);                      }                      '\0' => { @@ -1285,101 +1355,134 @@ impl<Sink: TokenSink> Tokenizer<Sink> {              },              //§ attribute-value-(double-quoted)-state -            states::AttributeValue(DoubleQuoted) => loop { -                match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n'))? { -                    FromSet('"') => { -                        return go!(self: to AfterAttributeValueQuoted); -                    } -                    FromSet('&') => { -                        return go!(self: consume_char_ref '"'); -                    } -                    FromSet('\0') => { -                        go!(self: error); -                        go!(self: push_value '\u{fffd}'); -                    } -                    FromSet(c) => { -                        go!(self: push_value c); -                    } -                    NotFromSet(ref b) => { -                        go!(self: append_value b); +            states::AttributeValue(DoubleQuoted) => { +                #[cfg(feature = "spans")] +                { +                    self.spans.current_attr_value.start = self.spans.current_pos; +                } +                loop { +                    match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n'))? { +                        FromSet('"') => { +                            return go!(self: to AfterAttributeValueQuoted); +                        } +                        FromSet('&') => { +                            return go!(self: consume_char_ref '"'); +                        } +                        FromSet('\0') => { +                            go!(self: error); +                            go!(self: push_value '\u{fffd}'); +                        } +                        FromSet(c) => { +                            go!(self: push_value c); +                        } +                        NotFromSet(ref b) => { +                            go!(self: append_value b); +                        }                      }                  } -            }, +            }              //§ attribute-value-(single-quoted)-state -            states::AttributeValue(SingleQuoted) => loop { -                match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n'))? { -                    FromSet('\'') => { -                        return go!(self: to AfterAttributeValueQuoted); -                    } -                    FromSet('&') => { -                        return go!(self: consume_char_ref '\''); -                    } -                    FromSet('\0') => { -                        go!(self: error); -                        go!(self: push_value '\u{fffd}'); -                    } -                    FromSet(c) => { -                        go!(self: push_value c); -                    } -                    NotFromSet(ref b) => { -                        go!(self: append_value b); +            states::AttributeValue(SingleQuoted) => { +                #[cfg(feature = "spans")] +                { +                    self.spans.current_attr_value.start = self.spans.current_pos; +                } +                loop { +                    match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n'))? { +                        FromSet('\'') => { +                            return go!(self: to AfterAttributeValueQuoted); +                        } +                        FromSet('&') => { +                            return go!(self: consume_char_ref '\''); +                        } +                        FromSet('\0') => { +                            go!(self: error); +                            go!(self: push_value '\u{fffd}'); +                        } +                        FromSet(c) => { +                            go!(self: push_value c); +                        } +                        NotFromSet(ref b) => { +                            go!(self: append_value b); +                        }                      }                  } -            }, +            }              //§ attribute-value-(unquoted)-state -            states::AttributeValue(Unquoted) => loop { -                match pop_except_from!( -                    self, -                    input, -                    small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0') -                )? { -                    FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => { -                        return go!(self: to BeforeAttributeName); -                    } -                    FromSet('&') => { -                        return go!(self: consume_char_ref '>'); -                    } -                    FromSet('>') => { -                        return go!(self: emit_tag Data); -                    } -                    FromSet('\0') => { -                        go!(self: error); -                        go!(self: push_value '\u{fffd}'); -                    } -                    FromSet(c) => { -                        go_match!(self: c, +            states::AttributeValue(Unquoted) => { +                #[cfg(feature = "spans")] +                { +                    self.spans.current_attr_value.start = self.spans.current_pos; +                } +                loop { +                    match pop_except_from!( +                        self, +                        input, +                        small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0') +                    )? { +                        FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => { +                            #[cfg(feature = "spans")] +                            { +                                self.spans.current_attr_value.end = self.spans.current_pos - 1; +                            } +                            return go!(self: to BeforeAttributeName); +                        } +                        FromSet('&') => { +                            return go!(self: consume_char_ref '>'); +                        } +                        FromSet('>') => { +                            #[cfg(feature = "spans")] +                            { +                                self.spans.current_attr_value.end = self.spans.current_pos - 1; +                            } +                            return go!(self: emit_tag Data); +                        } +                        FromSet('\0') => { +                            go!(self: error); +                            go!(self: push_value '\u{fffd}'); +                        } +                        FromSet(c) => { +                            go_match!(self: c,                              '"' , '\'' , '<' , '=' , '`' => error); -                        { -                            go!(self: push_value c); -                        }; -                    } -                    NotFromSet(ref b) => { -                        go!(self: append_value b); +                            { +                                go!(self: push_value c); +                            }; +                        } +                        NotFromSet(ref b) => { +                            go!(self: append_value b); +                        }                      }                  } -            }, +            }              //§ after-attribute-value-(quoted)-state -            states::AfterAttributeValueQuoted => loop { -                match get_char!(self, input)? { -                    '\t' | '\n' | '\x0C' | ' ' => { -                        return go!(self: to BeforeAttributeName); -                    } -                    '/' => { -                        return go!(self: to SelfClosingStartTag); -                    } -                    '>' => { -                        return go!(self: emit_tag Data); -                    } -                    _ => { -                        go!(self: error); -                        self.reconsume = true; -                        return go!(self: to BeforeAttributeName); +            states::AfterAttributeValueQuoted => { +                #[cfg(feature = "spans")] +                { +                    self.spans.current_attr_value.end = self.spans.current_pos - 1; +                } + +                loop { +                    match get_char!(self, input)? { +                        '\t' | '\n' | '\x0C' | ' ' => { +                            return go!(self: to BeforeAttributeName); +                        } +                        '/' => { +                            return go!(self: to SelfClosingStartTag); +                        } +                        '>' => { +                            return go!(self: emit_tag Data); +                        } +                        _ => { +                            go!(self: error); +                            self.reconsume = true; +                            return go!(self: to BeforeAttributeName); +                        }                      }                  } -            }, +            }              //§ self-closing-start-tag-state              states::SelfClosingStartTag => loop {  | 
