summary | refs | log | tree | commit | diff
path: root/src/tokenizer
diff options
context:
space:
mode:
Diffstat (limited to 'src/tokenizer')
-rw-r--r-- src/tokenizer/char_ref/data.rs 2
-rw-r--r-- src/tokenizer/char_ref/mod.rs 25
-rw-r--r-- src/tokenizer/mod.rs 24
3 files changed, 49 insertions, 2 deletions
diff --git a/src/tokenizer/char_ref/data.rs b/src/tokenizer/char_ref/data.rs
index fa839ba..9487034 100644
--- a/src/tokenizer/char_ref/data.rs
+++ b/src/tokenizer/char_ref/data.rs
@@ -7,6 +7,7 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Data that is known at compile-time and hard-coded into the binary.
+#[cfg(feature = "named-entities")]
use phf::Map;
/// The spec replaces most characters in the ISO-2022 C1 control code range
@@ -47,4 +48,5 @@ pub static C1_REPLACEMENTS: [Option<char>; 32] = [
Some('\u{0178}'),
];
+#[cfg(feature = "named-entities")]
include!(concat!(env!("OUT_DIR"), "/named_entities.rs"));
diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs
index 41f4c13..9c01bdf 100644
--- a/src/tokenizer/char_ref/mod.rs
+++ b/src/tokenizer/char_ref/mod.rs
@@ -40,6 +40,7 @@ enum State {
Octothorpe,
Numeric(u32), // base
NumericSemicolon,
+ #[cfg(feature = "named-entities")]
Named,
BogusName,
}
@@ -55,7 +56,9 @@ pub struct CharRefTokenizer {
hex_marker: Option<char>,
name_buf_opt: Option<String>,
+ #[cfg(feature = "named-entities")]
name_match: Option<(u32, u32)>,
+ #[cfg(feature = "named-entities")]
name_len: usize,
}
@@ -72,7 +75,9 @@ impl CharRefTokenizer {
seen_digit: false,
hex_marker: None,
name_buf_opt: None,
+ #[cfg(feature = "named-entities")]
name_match: None,
+ #[cfg(feature = "named-entities")]
name_len: 0,
}
}
@@ -83,6 +88,7 @@ impl CharRefTokenizer {
self.result.expect("get_result called before done")
}
+ #[cfg(feature = "named-entities")]
fn name_buf(&self) -> &str {
self.name_buf_opt
.as_ref()
@@ -127,6 +133,7 @@ impl CharRefTokenizer {
Octothorpe => self.do_octothorpe(tokenizer, input),
Numeric(base) => self.do_numeric(tokenizer, input, base),
NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
+ #[cfg(feature = "named-entities")]
Named => self.do_named(tokenizer, input),
BogusName => self.do_bogus_name(tokenizer, input),
}
@@ -148,7 +155,14 @@ impl CharRefTokenizer {
}
_ => {
- self.state = Named;
+ #[cfg(feature = "named-entities")]
+ {
+ self.state = Named;
+ }
+ #[cfg(not(feature = "named-entities"))]
+ {
+ self.state = BogusName;
+ }
self.name_buf_opt = Some(String::new());
Progress
}
@@ -270,6 +284,7 @@ impl CharRefTokenizer {
self.finish_one(c)
}
+ #[cfg(feature = "named-entities")]
fn do_named<Sink: TokenSink>(
&mut self,
tokenizer: &mut Tokenizer<Sink>,
@@ -294,6 +309,7 @@ impl CharRefTokenizer {
}
}
+ #[cfg(feature = "named-entities")]
fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) {
let msg = format_if!(
tokenizer.opts.exact_errors,
@@ -308,6 +324,7 @@ impl CharRefTokenizer {
input.push_front(self.name_buf_opt.take().unwrap());
}
+ #[cfg(feature = "named-entities")]
fn finish_named<Sink: TokenSink>(
&mut self,
tokenizer: &mut Tokenizer<Sink>,
@@ -405,7 +422,10 @@ impl CharRefTokenizer {
self.name_buf_mut().push(c);
match c {
_ if is_ascii_alnum(c) => return Progress,
- ';' => self.emit_name_error(tokenizer),
+ ';' => {
+ #[cfg(feature = "named-entities")]
+ self.emit_name_error(tokenizer);
+ }
_ => (),
}
self.unconsume_name(input);
@@ -428,6 +448,7 @@ impl CharRefTokenizer {
self.finish_numeric(tokenizer);
}
+ #[cfg(feature = "named-entities")]
Named => drop(self.finish_named(tokenizer, input, None)),
BogusName => {
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 0acdcaf..5f3d65d 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -1705,4 +1705,28 @@ mod test {
let results = tokenize(vector, opts);
assert_eq!(results, expected);
}
+
+ #[test]
+ #[cfg(not(feature = "named-entities"))] // this variant is compiled only when the "named-entities" feature is off
+ fn named_entities() { // feature off: both `&amp;` and `&aamp;` are expected to come through as literal text
+ let opts = TokenizerOpts::default();
+ let vector = vec![String::from("&amp;\r\n"), String::from("&aamp;\r\n")]; // two input chunks, each ending in CRLF
+ let expected = vec![(Token::CharacterTokens("&amp;\n&aamp;\n".into()), 3)]; // a single character run with `\r\n` appearing as `\n`; the 3 is presumably a line number -- confirm against `tokenize`
+ let results = tokenize(vector, opts);
+ assert_eq!(results, expected);
+ }
+
+ #[test]
+ #[cfg(feature = "named-entities")] // this variant is compiled only when the "named-entities" feature is on
+ fn named_entities() { // feature on: `&amp;` is expected to decode, `&aamp;` to be reported and kept as written
+ let opts = TokenizerOpts::default();
+ let vector = vec![String::from("&amp;\r\n"), String::from("&aamp;\r\n")]; // same two CRLF-terminated chunks as the feature-off variant
+ let expected = vec![
+ (CharacterTokens("&\n".into()), 3), // `&amp;` replaced by `&`; the paired integer is presumably a line number -- confirm against `tokenize`
+ (ParseError("Invalid character reference".into()), 3), // error expected for the `&aamp;` input
+ (CharacterTokens("&aamp;\n".into()), 4), // the unmatched reference text is emitted unchanged
+ ];
+ let results = tokenize(vector, opts);
+ assert_eq!(results, expected);
+ }
}