diff options
Diffstat (limited to 'src/tokenizer')
-rw-r--r-- | src/tokenizer/char_ref/data.rs | 2 |
-rw-r--r-- | src/tokenizer/char_ref/mod.rs | 25 |
-rw-r--r-- | src/tokenizer/mod.rs | 24 |
3 files changed, 49 insertions(+), 2 deletions(-)
diff --git a/src/tokenizer/char_ref/data.rs b/src/tokenizer/char_ref/data.rs index fa839ba..9487034 100644 --- a/src/tokenizer/char_ref/data.rs +++ b/src/tokenizer/char_ref/data.rs @@ -7,6 +7,7 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. //! Data that is known at compile-time and hard-coded into the binary. +#[cfg(feature = "named-entities")] use phf::Map; /// The spec replaces most characters in the ISO-2022 C1 control code range @@ -47,4 +48,5 @@ pub static C1_REPLACEMENTS: [Option<char>; 32] = [ Some('\u{0178}'), ]; +#[cfg(feature = "named-entities")] include!(concat!(env!("OUT_DIR"), "/named_entities.rs")); diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs index 41f4c13..9c01bdf 100644 --- a/src/tokenizer/char_ref/mod.rs +++ b/src/tokenizer/char_ref/mod.rs @@ -40,6 +40,7 @@ enum State { Octothorpe, Numeric(u32), // base NumericSemicolon, + #[cfg(feature = "named-entities")] Named, BogusName, } @@ -55,7 +56,9 @@ pub struct CharRefTokenizer { hex_marker: Option<char>, name_buf_opt: Option<String>, + #[cfg(feature = "named-entities")] name_match: Option<(u32, u32)>, + #[cfg(feature = "named-entities")] name_len: usize, } @@ -72,7 +75,9 @@ impl CharRefTokenizer { seen_digit: false, hex_marker: None, name_buf_opt: None, + #[cfg(feature = "named-entities")] name_match: None, + #[cfg(feature = "named-entities")] name_len: 0, } } @@ -83,6 +88,7 @@ impl CharRefTokenizer { self.result.expect("get_result called before done") } + #[cfg(feature = "named-entities")] fn name_buf(&self) -> &str { self.name_buf_opt .as_ref() @@ -127,6 +133,7 @@ impl CharRefTokenizer { Octothorpe => self.do_octothorpe(tokenizer, input), Numeric(base) => self.do_numeric(tokenizer, input, base), NumericSemicolon => self.do_numeric_semicolon(tokenizer, input), + #[cfg(feature = "named-entities")] Named => self.do_named(tokenizer, input), BogusName => self.do_bogus_name(tokenizer, input), } @@ -148,7 +155,14 @@ impl 
CharRefTokenizer { } _ => { - self.state = Named; + #[cfg(feature = "named-entities")] + { + self.state = Named; + } + #[cfg(not(feature = "named-entities"))] + { + self.state = BogusName; + } self.name_buf_opt = Some(String::new()); Progress } @@ -270,6 +284,7 @@ impl CharRefTokenizer { self.finish_one(c) } + #[cfg(feature = "named-entities")] fn do_named<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, @@ -294,6 +309,7 @@ impl CharRefTokenizer { } } + #[cfg(feature = "named-entities")] fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) { let msg = format_if!( tokenizer.opts.exact_errors, @@ -308,6 +324,7 @@ impl CharRefTokenizer { input.push_front(self.name_buf_opt.take().unwrap()); } + #[cfg(feature = "named-entities")] fn finish_named<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, @@ -405,7 +422,10 @@ impl CharRefTokenizer { self.name_buf_mut().push(c); match c { _ if is_ascii_alnum(c) => return Progress, - ';' => self.emit_name_error(tokenizer), + ';' => { + #[cfg(feature = "named-entities")] + self.emit_name_error(tokenizer); + } _ => (), } self.unconsume_name(input); @@ -428,6 +448,7 @@ impl CharRefTokenizer { self.finish_numeric(tokenizer); } + #[cfg(feature = "named-entities")] Named => drop(self.finish_named(tokenizer, input, None)), BogusName => { diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 0acdcaf..5f3d65d 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -1705,4 +1705,28 @@ mod test { let results = tokenize(vector, opts); assert_eq!(results, expected); } + + #[test] + #[cfg(not(feature = "named-entities"))] + fn named_entities() { + let opts = TokenizerOpts::default(); + let vector = vec![String::from("&\r\n"), String::from("&aamp;\r\n")]; + let expected = vec![(Token::CharacterTokens("&\n&aamp;\n".into()), 3)]; + let results = tokenize(vector, opts); + assert_eq!(results, expected); + } + + #[test] + #[cfg(feature = "named-entities")] + fn named_entities() { + 
let opts = TokenizerOpts::default(); + let vector = vec![String::from("&\r\n"), String::from("&aamp;\r\n")]; + let expected = vec![ + (CharacterTokens("&\n".into()), 3), + (ParseError("Invalid character reference".into()), 3), + (CharacterTokens("&aamp;\n".into()), 4), + ]; + let results = tokenize(vector, opts); + assert_eq!(results, expected); + } } |