diff options
| author | Martin Fischer <martin@push-f.com> | 2021-11-30 07:28:21 +0100 | 
|---|---|---|
| committer | Martin Fischer <martin@push-f.com> | 2021-11-30 11:22:35 +0100 | 
| commit | baf1477c587fe22d27e94408cf2505d588ba007e (patch) | |
| tree | f3e027e3c149cfeb7187a625756ea4b2de47c82a | |
| parent | 25087cce997abc386f881648dfd39c83dfef7667 (diff) | |
add spans feature
| -rw-r--r-- | Cargo.toml | 6 | ||||
| -rw-r--r-- | README.md | 4 | ||||
| -rw-r--r-- | src/lib.rs | 1 | ||||
| -rw-r--r-- | src/tokenizer/interface.rs | 27 | ||||
| -rw-r--r-- | src/tokenizer/mod.rs | 267 | ||||
| -rw-r--r-- | tests/files/test.html | 7 | ||||
| -rw-r--r-- | tests/files/test.out | 17 | ||||
| -rw-r--r-- | tests/spans.rs | 89 | 
8 files changed, 334 insertions, 84 deletions
| @@ -12,6 +12,7 @@ edition = "2018"  [features]  default = ["named-entities"] +spans = []  # resolve named entities like &amp;  named-entities = ["phf", "phf_codegen"] @@ -23,8 +24,13 @@ phf = { version = "0.9", optional = true  }  phf_codegen = { version = "0.9", optional = true }  [dev-dependencies] +codespan-reporting = "0.11.1"  criterion = "0.3"  [[bench]]  name = "html5ever"  harness = false + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] @@ -15,6 +15,10 @@ changes:    tag names (e.g. for `script` and `styles`) ... with the html5ever tokenizer    you had to do this yourself. +* An optional `spans` feature has been added to make the tokenizer report the +  source code spans for tag names, attribute names and attribute values. +  The feature is disabled by default. +  * The API has been cleaned up a bit (e.g. the internal tokenizer state enums    are no longer public). @@ -10,6 +10,7 @@  #![doc = include_str!("../README.md")]  #![crate_type = "dylib"]  #![cfg_attr(test, deny(warnings))] +#![cfg_attr(docsrs, feature(doc_cfg))]  #![allow(unused_parens)]  #[macro_use] diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs index 2c6cc38..f12fb16 100644 --- a/src/tokenizer/interface.rs +++ b/src/tokenizer/interface.rs @@ -9,6 +9,8 @@  use crate::tokenizer::states;  use std::borrow::Cow; +#[cfg(feature = "spans")] +use std::ops::Range;  pub use self::TagKind::{EndTag, StartTag};  pub use self::Token::{CharacterTokens, CommentToken, DoctypeToken, TagToken}; @@ -47,12 +49,30 @@ pub enum TagKind {  /// The tokenizer creates all attributes this way, but the tree  /// builder will adjust certain attribute names inside foreign  /// content (MathML, SVG). -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug)] +#[derive(PartialEq, Eq, Clone, Debug)]  pub struct Attribute {      /// The name of the attribute (e.g. the `class` in `<div class="test">`)      pub name: String,      /// The value of the attribute (e.g. 
the `"test"` in `<div class="test">`)      pub value: String, +    #[cfg(feature = "spans")] +    #[cfg_attr(docsrs, doc(cfg(feature = "spans")))] +    pub name_span: Range<usize>, +    #[cfg(feature = "spans")] +    #[cfg_attr(docsrs, doc(cfg(feature = "spans")))] +    pub value_span: Range<usize>, +} + +impl Ord for Attribute { +    fn cmp(&self, other: &Self) -> std::cmp::Ordering { +        (&self.name, &self.value).cmp(&(&other.name, &other.value)) +    } +} + +impl PartialOrd for Attribute { +    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { +        (&self.name, &self.value).partial_cmp(&(&other.name, &other.value)) +    }  }  /// A tag token. @@ -62,11 +82,14 @@ pub struct Tag {      pub name: String,      pub self_closing: bool,      pub attrs: Vec<Attribute>, +    #[cfg(feature = "spans")] +    #[cfg_attr(docsrs, doc(cfg(feature = "spans")))] +    pub name_span: Range<usize>,  }  impl Tag {      /// Are the tags equivalent when we don't care about attribute order? -    /// Also ignores the self-closing flag. +    /// Also ignores the self-closing flag and spans.      pub fn equiv_modulo_attr_order(&self, other: &Tag) -> bool {          if (self.kind != other.kind) || (self.name != other.name) {              return false; diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 2d5e1ed..4511cf8 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -59,7 +59,8 @@ fn option_push(opt_str: &mut Option<String>, c: char) {  #[derive(Clone)]  pub struct TokenizerOpts {      /// Report all parse errors described in the spec, at some -    /// performance penalty?  Default: false +    /// performance penalty? Defaults to false, except when the +    /// `spans` feature is enabled in which case it defaults to true.      
pub exact_errors: bool,      /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning @@ -82,7 +83,7 @@ pub struct TokenizerOpts {  impl Default for TokenizerOpts {      fn default() -> TokenizerOpts {          TokenizerOpts { -            exact_errors: false, +            exact_errors: cfg!(feature = "spans"),              discard_bom: true,              profile: false,              initial_state: None, @@ -162,6 +163,36 @@ pub struct Tokenizer<Sink> {      /// Track current line      current_line: u64, + +    #[cfg(feature = "spans")] +    spans: Spans, +} + +#[cfg(feature = "spans")] +#[derive(Default)] +struct Spans { +    /// Track current byte position +    current_pos: usize, + +    /// Current tag name span. +    current_tag_name: core::ops::Range<usize>, + +    /// Current attribute name span. +    current_attr_name: core::ops::Range<usize>, + +    /// Current attribute value span. +    current_attr_value: core::ops::Range<usize>, +} + +#[cfg(feature = "spans")] +impl Spans { +    fn end_tag_name(&mut self) { +        self.current_tag_name.end = self.current_pos - 1; +    } + +    fn end_attr_name(&mut self) { +        self.current_attr_name.end = self.current_pos - 1; +    }  }  impl<Sink: TokenSink> Tokenizer<Sink> { @@ -193,6 +224,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {              state_profile: BTreeMap::new(),              time_in_sink: 0,              current_line: 1, +            #[cfg(feature = "spans")] +            spans: Spans::default(),          }      } @@ -263,6 +296,11 @@ impl<Sink: TokenSink> Tokenizer<Sink> {              self.emit_error(Cow::Owned(msg));          } +        #[cfg(feature = "spans")] +        { +            self.spans.current_pos += c.len_utf8(); +        } +          self.current_char = c;          Some(c)      } @@ -430,6 +468,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {              name,              self_closing: self.current_tag_self_closing,              attrs: replace(&mut 
self.current_tag_attrs, vec![]), +            #[cfg(feature = "spans")] +            name_span: self.spans.current_tag_name.clone(),          });          match self.process_token(token) { @@ -516,6 +556,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {              self.current_tag_attrs.push(Attribute {                  name: name,                  value: replace(&mut self.current_attr_value, String::new()), +                #[cfg(feature = "spans")] +                name_span: self.spans.current_attr_name.clone(), +                #[cfg(feature = "spans")] +                value_span: self.spans.current_attr_value.clone(),              });          }      } @@ -830,6 +874,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {                      }                      c => match lower_ascii_letter(c) {                          Some(cl) => { +                            #[cfg(feature = "spans")] +                            { +                                self.spans.current_tag_name.start = self.spans.current_pos - 1; +                            }                              go!(self: create_tag StartTag cl);                              return go!(self: to TagName);                          } @@ -858,6 +906,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {                      }                      c => match lower_ascii_letter(c) {                          Some(cl) => { +                            #[cfg(feature = "spans")] +                            { +                                self.spans.current_tag_name.start = self.spans.current_pos - 1; +                            }                              go!(self: create_tag EndTag cl);                              return go!(self: to TagName);                          } @@ -875,12 +927,18 @@ impl<Sink: TokenSink> Tokenizer<Sink> {              states::TagName => loop {                  match get_char!(self, input)? 
{                      '\t' | '\n' | '\x0C' | ' ' => { +                        #[cfg(feature = "spans")] +                        self.spans.end_tag_name();                          return go!(self: to BeforeAttributeName);                      }                      '/' => { +                        #[cfg(feature = "spans")] +                        self.spans.end_tag_name();                          return go!(self: to SelfClosingStartTag);                      }                      '>' => { +                        #[cfg(feature = "spans")] +                        self.spans.end_tag_name();                          return go!(self: emit_tag Data);                      }                      '\0' => { @@ -1168,6 +1226,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {                      c => match lower_ascii_letter(c) {                          Some(cl) => {                              go!(self: create_attr cl); +                            #[cfg(feature = "spans")] +                            { +                                self.spans.current_attr_name.start = self.spans.current_pos - 1; +                            }                              return go!(self: to AttributeName);                          }                          None => { @@ -1186,15 +1248,23 @@ impl<Sink: TokenSink> Tokenizer<Sink> {              states::AttributeName => loop {                  match get_char!(self, input)? 
{                      '\t' | '\n' | '\x0C' | ' ' => { +                        #[cfg(feature = "spans")] +                        self.spans.end_attr_name();                          return go!(self: to AfterAttributeName);                      }                      '/' => { +                        #[cfg(feature = "spans")] +                        self.spans.end_attr_name();                          return go!(self: to SelfClosingStartTag);                      }                      '=' => { +                        #[cfg(feature = "spans")] +                        self.spans.end_attr_name();                          return go!(self: to BeforeAttributeValue);                      }                      '>' => { +                        #[cfg(feature = "spans")] +                        self.spans.end_attr_name();                          return go!(self: emit_tag Data);                      }                      '\0' => { @@ -1285,101 +1355,134 @@ impl<Sink: TokenSink> Tokenizer<Sink> {              },              //§ attribute-value-(double-quoted)-state -            states::AttributeValue(DoubleQuoted) => loop { -                match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n'))? 
{ -                    FromSet('"') => { -                        return go!(self: to AfterAttributeValueQuoted); -                    } -                    FromSet('&') => { -                        return go!(self: consume_char_ref '"'); -                    } -                    FromSet('\0') => { -                        go!(self: error); -                        go!(self: push_value '\u{fffd}'); -                    } -                    FromSet(c) => { -                        go!(self: push_value c); -                    } -                    NotFromSet(ref b) => { -                        go!(self: append_value b); +            states::AttributeValue(DoubleQuoted) => { +                #[cfg(feature = "spans")] +                { +                    self.spans.current_attr_value.start = self.spans.current_pos; +                } +                loop { +                    match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n'))? { +                        FromSet('"') => { +                            return go!(self: to AfterAttributeValueQuoted); +                        } +                        FromSet('&') => { +                            return go!(self: consume_char_ref '"'); +                        } +                        FromSet('\0') => { +                            go!(self: error); +                            go!(self: push_value '\u{fffd}'); +                        } +                        FromSet(c) => { +                            go!(self: push_value c); +                        } +                        NotFromSet(ref b) => { +                            go!(self: append_value b); +                        }                      }                  } -            }, +            }              //§ attribute-value-(single-quoted)-state -            states::AttributeValue(SingleQuoted) => loop { -                match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n'))? 
{ -                    FromSet('\'') => { -                        return go!(self: to AfterAttributeValueQuoted); -                    } -                    FromSet('&') => { -                        return go!(self: consume_char_ref '\''); -                    } -                    FromSet('\0') => { -                        go!(self: error); -                        go!(self: push_value '\u{fffd}'); -                    } -                    FromSet(c) => { -                        go!(self: push_value c); -                    } -                    NotFromSet(ref b) => { -                        go!(self: append_value b); +            states::AttributeValue(SingleQuoted) => { +                #[cfg(feature = "spans")] +                { +                    self.spans.current_attr_value.start = self.spans.current_pos; +                } +                loop { +                    match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n'))? { +                        FromSet('\'') => { +                            return go!(self: to AfterAttributeValueQuoted); +                        } +                        FromSet('&') => { +                            return go!(self: consume_char_ref '\''); +                        } +                        FromSet('\0') => { +                            go!(self: error); +                            go!(self: push_value '\u{fffd}'); +                        } +                        FromSet(c) => { +                            go!(self: push_value c); +                        } +                        NotFromSet(ref b) => { +                            go!(self: append_value b); +                        }                      }                  } -            }, +            }              //§ attribute-value-(unquoted)-state -            states::AttributeValue(Unquoted) => loop { -                match pop_except_from!( -                    self, -                    input, -                    
small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0') -                )? { -                    FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => { -                        return go!(self: to BeforeAttributeName); -                    } -                    FromSet('&') => { -                        return go!(self: consume_char_ref '>'); -                    } -                    FromSet('>') => { -                        return go!(self: emit_tag Data); -                    } -                    FromSet('\0') => { -                        go!(self: error); -                        go!(self: push_value '\u{fffd}'); -                    } -                    FromSet(c) => { -                        go_match!(self: c, +            states::AttributeValue(Unquoted) => { +                #[cfg(feature = "spans")] +                { +                    self.spans.current_attr_value.start = self.spans.current_pos; +                } +                loop { +                    match pop_except_from!( +                        self, +                        input, +                        small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0') +                    )? 
{ +                        FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => { +                            #[cfg(feature = "spans")] +                            { +                                self.spans.current_attr_value.end = self.spans.current_pos - 1; +                            } +                            return go!(self: to BeforeAttributeName); +                        } +                        FromSet('&') => { +                            return go!(self: consume_char_ref '>'); +                        } +                        FromSet('>') => { +                            #[cfg(feature = "spans")] +                            { +                                self.spans.current_attr_value.end = self.spans.current_pos - 1; +                            } +                            return go!(self: emit_tag Data); +                        } +                        FromSet('\0') => { +                            go!(self: error); +                            go!(self: push_value '\u{fffd}'); +                        } +                        FromSet(c) => { +                            go_match!(self: c,                              '"' , '\'' , '<' , '=' , '`' => error); -                        { -                            go!(self: push_value c); -                        }; -                    } -                    NotFromSet(ref b) => { -                        go!(self: append_value b); +                            { +                                go!(self: push_value c); +                            }; +                        } +                        NotFromSet(ref b) => { +                            go!(self: append_value b); +                        }                      }                  } -            }, +            }              //§ after-attribute-value-(quoted)-state -            states::AfterAttributeValueQuoted => loop { -                match get_char!(self, input)? 
{ -                    '\t' | '\n' | '\x0C' | ' ' => { -                        return go!(self: to BeforeAttributeName); -                    } -                    '/' => { -                        return go!(self: to SelfClosingStartTag); -                    } -                    '>' => { -                        return go!(self: emit_tag Data); -                    } -                    _ => { -                        go!(self: error); -                        self.reconsume = true; -                        return go!(self: to BeforeAttributeName); +            states::AfterAttributeValueQuoted => { +                #[cfg(feature = "spans")] +                { +                    self.spans.current_attr_value.end = self.spans.current_pos - 1; +                } + +                loop { +                    match get_char!(self, input)? { +                        '\t' | '\n' | '\x0C' | ' ' => { +                            return go!(self: to BeforeAttributeName); +                        } +                        '/' => { +                            return go!(self: to SelfClosingStartTag); +                        } +                        '>' => { +                            return go!(self: emit_tag Data); +                        } +                        _ => { +                            go!(self: error); +                            self.reconsume = true; +                            return go!(self: to BeforeAttributeName); +                        }                      }                  } -            }, +            }              //§ self-closing-start-tag-state              states::SelfClosingStartTag => loop { diff --git a/tests/files/test.html b/tests/files/test.html new file mode 100644 index 0000000..0dcbdbf --- /dev/null +++ b/tests/files/test.html @@ -0,0 +1,7 @@ +This is a file. 
+ +Here is a tag: <strong >very cool</strong> + +Tags can have attributes: <div id = foo >...</div> + +Attribute values can be quoted: <input name = 'age' type = "number"> diff --git a/tests/files/test.out b/tests/files/test.out new file mode 100644 index 0000000..7127ebc --- /dev/null +++ b/tests/files/test.out @@ -0,0 +1,17 @@ +note: +  ┌─ test.html:3:17 +  │ +3 │ Here is a tag: <strong >very cool</strong> +  │                 ^^^^^^             ^^^^^^ EndTag +  │                 │ +  │                 StartTag +4 │ +5 │ Tags can have attributes: <div id = foo >...</div> +  │                                ^^   ^^^ attribute value +  │                                │ +  │                                attribute name +6 │ +7 │ Attribute values can be quoted: <input name = 'age' type = "number"> +  │                                                ^^^          ^^^^^^ in double quotes +  │                                                │ +  │                                                in single quotes diff --git a/tests/spans.rs b/tests/spans.rs new file mode 100644 index 0000000..bfa42f6 --- /dev/null +++ b/tests/spans.rs @@ -0,0 +1,89 @@ +#![cfg(feature = "spans")] +use std::include_str; + +use codespan_reporting::{ +    self, +    diagnostic::{Diagnostic, Label}, +    files::SimpleFiles, +    term::{self, termcolor::Buffer}, +}; +use html5tokenizer::{ +    BufferQueue, Tag, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts, +}; + +#[derive(Default)] +struct TagSink { +    tags: Vec<Tag>, +} + +impl TokenSink for TagSink { +    fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult { +        if let Token::TagToken(tag) = token { +            self.tags.push(tag); +        } +        TokenSinkResult::Continue +    } +} + +#[test] +fn test() { +    let sink = TagSink::default(); + +    let mut input = BufferQueue::new(); +    let text = include_str!("files/test.html"); +    input.push_back(text.to_string()); + +    let mut tok 
= Tokenizer::new(sink, TokenizerOpts::default()); +    let _ = tok.feed(&mut input); + +    let mut files = SimpleFiles::new(); +    let file_id = files.add("test.html", text); +    let mut labels = Vec::new(); + +    let tags = tok.sink.tags; +    for tag in &tags[..2] { +        labels.push( +            Label::primary(file_id, tag.name_span.clone()).with_message(format!("{:?}", tag.kind)), +        ); +    } +    labels.push( +        Label::primary(file_id, tags[2].attrs[0].name_span.clone()).with_message("attribute name"), +    ); +    labels.push( +        Label::primary(file_id, tags[2].attrs[0].value_span.clone()) +            .with_message("attribute value"), +    ); +    labels.push( +        Label::primary(file_id, tags[4].attrs[0].value_span.clone()) +            .with_message("in single quotes"), +    ); +    labels.push( +        Label::primary(file_id, tags[4].attrs[1].value_span.clone()) +            .with_message("in double quotes"), +    ); +    let diagnostic = Diagnostic::note().with_labels(labels); + +    let mut writer = Buffer::no_color(); +    let config = codespan_reporting::term::Config::default(); +    term::emit(&mut writer, &config, &files, &diagnostic).unwrap(); + +    let actual = remove_trailing_spaces(std::str::from_utf8(writer.as_slice()).unwrap()); +    let expected = include_str!("files/test.out"); + +    if actual != expected { +        println!( +            "EXPECTED:\n{banner}\n{expected}{banner}\n\nACTUAL OUTPUT:\n{banner}\n{actual}{banner}", +            banner = "-".repeat(30), +            expected = expected, +            actual = actual +        ); +        panic!("failed"); +    } +} + +fn remove_trailing_spaces(text: &str) -> String { +    text.lines() +        .map(|l| l.trim_end()) +        .collect::<Vec<_>>() +        .join("\n") +} | 
