Diffstat (limited to 'src/tokenizer')
-rw-r--r--   src/tokenizer/char_ref/data.rs    2
-rw-r--r--   src/tokenizer/char_ref/mod.rs    25
-rw-r--r--   src/tokenizer/mod.rs             24
3 files changed, 49 insertions, 2 deletions
diff --git a/src/tokenizer/char_ref/data.rs b/src/tokenizer/char_ref/data.rs
index fa839ba..9487034 100644
--- a/src/tokenizer/char_ref/data.rs
+++ b/src/tokenizer/char_ref/data.rs
@@ -7,6 +7,7 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 //! Data that is known at compile-time and hard-coded into the binary.
+#[cfg(feature = "named-entities")]
 use phf::Map;
 
 /// The spec replaces most characters in the ISO-2022 C1 control code range
@@ -47,4 +48,5 @@ pub static C1_REPLACEMENTS: [Option<char>; 32] = [
     Some('\u{0178}'),
 ];
 
+#[cfg(feature = "named-entities")]
 include!(concat!(env!("OUT_DIR"), "/named_entities.rs"));
diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs
index 41f4c13..9c01bdf 100644
--- a/src/tokenizer/char_ref/mod.rs
+++ b/src/tokenizer/char_ref/mod.rs
@@ -40,6 +40,7 @@ enum State {
     Octothorpe,
     Numeric(u32), // base
     NumericSemicolon,
+    #[cfg(feature = "named-entities")]
     Named,
     BogusName,
 }
@@ -55,7 +56,9 @@ pub struct CharRefTokenizer {
     hex_marker: Option<char>,
 
     name_buf_opt: Option<String>,
+    #[cfg(feature = "named-entities")]
    name_match: Option<(u32, u32)>,
+    #[cfg(feature = "named-entities")]
     name_len: usize,
 }
 
@@ -72,7 +75,9 @@ impl CharRefTokenizer {
             seen_digit: false,
             hex_marker: None,
             name_buf_opt: None,
+            #[cfg(feature = "named-entities")]
             name_match: None,
+            #[cfg(feature = "named-entities")]
             name_len: 0,
         }
     }
@@ -83,6 +88,7 @@ impl CharRefTokenizer {
         self.result.expect("get_result called before done")
     }
 
+    #[cfg(feature = "named-entities")]
     fn name_buf(&self) -> &str {
         self.name_buf_opt
             .as_ref()
@@ -127,6 +133,7 @@ impl CharRefTokenizer {
             Octothorpe => self.do_octothorpe(tokenizer, input),
             Numeric(base) => self.do_numeric(tokenizer, input, base),
             NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
+            #[cfg(feature = "named-entities")]
             Named => self.do_named(tokenizer, input),
             BogusName => self.do_bogus_name(tokenizer, input),
         }
@@ -148,7 +155,14 @@ impl CharRefTokenizer {
             }
 
             _ => {
-                self.state = Named;
+                #[cfg(feature = "named-entities")]
+                {
+                    self.state = Named;
+                }
+                #[cfg(not(feature = "named-entities"))]
+                {
+                    self.state = BogusName;
+                }
                 self.name_buf_opt = Some(String::new());
                 Progress
             }
@@ -270,6 +284,7 @@ impl CharRefTokenizer {
         self.finish_one(c)
     }
 
+    #[cfg(feature = "named-entities")]
     fn do_named<Sink: TokenSink>(
         &mut self,
         tokenizer: &mut Tokenizer<Sink>,
@@ -294,6 +309,7 @@ impl CharRefTokenizer {
         }
     }
 
+    #[cfg(feature = "named-entities")]
     fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) {
         let msg = format_if!(
             tokenizer.opts.exact_errors,
@@ -308,6 +324,7 @@ impl CharRefTokenizer {
         input.push_front(self.name_buf_opt.take().unwrap());
     }
 
+    #[cfg(feature = "named-entities")]
     fn finish_named<Sink: TokenSink>(
         &mut self,
         tokenizer: &mut Tokenizer<Sink>,
@@ -405,7 +422,10 @@ impl CharRefTokenizer {
         self.name_buf_mut().push(c);
         match c {
             _ if is_ascii_alnum(c) => return Progress,
-            ';' => self.emit_name_error(tokenizer),
+            ';' => {
+                #[cfg(feature = "named-entities")]
+                self.emit_name_error(tokenizer);
+            }
             _ => (),
         }
         self.unconsume_name(input);
@@ -428,6 +448,7 @@ impl CharRefTokenizer {
                     self.finish_numeric(tokenizer);
                 }
 
+                #[cfg(feature = "named-entities")]
                 Named => drop(self.finish_named(tokenizer, input, None)),
 
                 BogusName => {
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 0acdcaf..5f3d65d 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -1705,4 +1705,28 @@ mod test {
         let results = tokenize(vector, opts);
         assert_eq!(results, expected);
     }
+
+    #[test]
+    #[cfg(not(feature = "named-entities"))]
+    fn named_entities() {
+        let opts = TokenizerOpts::default();
+        let vector = vec![String::from("&\r\n"), String::from("&aamp;\r\n")];
+        let expected = vec![(Token::CharacterTokens("&\n&aamp;\n".into()), 3)];
+        let results = tokenize(vector, opts);
+        assert_eq!(results, expected);
+    }
+
+    #[test]
+    #[cfg(feature = "named-entities")]
+    fn named_entities() {
+        let opts = TokenizerOpts::default();
+        let vector = vec![String::from("&\r\n"), String::from("&aamp;\r\n")];
+        let expected = vec![
+            (CharacterTokens("&\n".into()), 3),
+            (ParseError("Invalid character reference".into()), 3),
+            (CharacterTokens("&aamp;\n".into()), 4),
+        ];
+        let results = tokenize(vector, opts);
+        assert_eq!(results, expected);
+    }
 }
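
The `_ => { ... }` arm in the diff replaces a single assignment with two cfg-gated block statements rather than a runtime `if cfg!(feature = "named-entities")` check, because with the feature disabled the `Named` variant is compiled out entirely and both branches of a runtime check would still have to type-check. Below is a minimal, self-contained sketch of that pattern; the names (`State`, `begin_name`) are illustrative and not taken from the patch.

// Self-contained illustration (not from the patch) of the cfg-gating
// pattern used in the character-reference state machine above.
#[allow(dead_code)]
#[derive(Debug)]
enum State {
    #[cfg(feature = "named-entities")]
    Named,
    BogusName,
}

fn begin_name() -> State {
    let state;
    // Exactly one of these blocks survives cfg expansion, so `state` is
    // always initialized; the other block (and, without the feature, the
    // `Named` variant itself) is removed before type checking.
    #[cfg(feature = "named-entities")]
    {
        state = State::Named;
    }
    #[cfg(not(feature = "named-entities"))]
    {
        state = State::BogusName;
    }
    state
}

fn main() {
    // Prints "Named" when built with the feature enabled (assuming a
    // `named-entities` feature is declared in Cargo.toml), "BogusName"
    // otherwise.
    println!("{:?}", begin_name());
}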

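Not shown in this commit, but implied by the gated `include!` in data.rs: with the feature off, the generated named_entities.rs is never included, so the build script could skip the phf code-generation step. The following is a hedged build.rs sketch, assuming the crate declares a `named-entities` Cargo feature; Cargo exposes each enabled feature to build scripts as a `CARGO_FEATURE_*` environment variable, with names uppercased and dashes mapped to underscores.

// build.rs sketch; an assumption, not part of this commit.
use std::env;
use std::fs;
use std::path::Path;

fn main() {
    if env::var_os("CARGO_FEATURE_NAMED_ENTITIES").is_none() {
        // Feature disabled: char_ref/data.rs no longer include!s the
        // generated file, so nothing needs to be generated.
        return;
    }

    let out_dir = env::var("OUT_DIR").expect("OUT_DIR is set by Cargo");
    let dest = Path::new(&out_dir).join("named_entities.rs");

    // Placeholder for the real phf_codegen step that emits the named-entity
    // map consumed by src/tokenizer/char_ref/data.rs.
    fs::write(&dest, "// generated named-entity table\n")
        .expect("failed to write named_entities.rs");
}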