about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Martin Fischer <martin@push-f.com> 2021-11-19 08:38:57 +0100
committer Martin Fischer <martin@push-f.com> 2021-11-19 08:38:58 +0100
commit 98ad8cec144900c7799772b3a53241825b416b4f (patch)
tree 8a18df3e9ff4c4be5316112c0f02db72f9ef94f2
parent 7207abccd9dccb15eb37f43a8f763cac99be14d4 (diff)
feature gate named-entities (making phf optional)
-rw-r--r-- Cargo.toml 10
-rw-r--r-- README.md 5
-rw-r--r-- build.rs 11
-rw-r--r-- src/tokenizer/char_ref/data.rs 2
-rw-r--r-- src/tokenizer/char_ref/mod.rs 25
-rw-r--r-- src/tokenizer/mod.rs 24
6 files changed, 68 insertions, 9 deletions
diff --git a/Cargo.toml b/Cargo.toml
index c9086e8..a4257b3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,11 +10,17 @@ categories = [ "parser-implementations", "web-programming" ]
keywords = ["html", "html5", "tokenizer", "parser"]
edition = "2018"
+[features]
+default = ["named-entities"]
+
+# resolve named entities like &amp;
+named-entities = ["phf", "phf_codegen"]
+
[dependencies]
-phf = "0.9"
+phf = { version = "0.9", optional = true }
[build-dependencies]
-phf_codegen = "0.9"
+phf_codegen = { version = "0.9", optional = true }
[dev-dependencies]
typed-arena = "1.3.0"
diff --git a/README.md b/README.md
index e5b51be..193b43d 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,11 @@ If you want to parse HTML into a tree (DOM) you should by all means use
html5ever, this crate is merely for those who only want an HTML5 tokenizer and
seek to minimize their compile dependencies (html5ever pulls in 56).
+To efficiently resolve named entities like `&amp;` the tokenizer uses
+[phf](https://crates.io/crates/phf) for a compile-time static map. If you
+don't need to resolve named entities, you can avoid the `phf` dependency
+by disabling the `named-entities` feature (which is enabled by default).
+
## Credits
Thanks to the developers of html5ever for their awesome parser!
diff --git a/build.rs b/build.rs
index 8d4404c..d41934c 100644
--- a/build.rs
+++ b/build.rs
@@ -7,20 +7,21 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.
+#[cfg(feature = "named-entities")]
extern crate phf_codegen;
-use std::collections::HashMap;
-use std::env;
-use std::fs::File;
-use std::io::Write;
-use std::path::Path;
+#[cfg(feature = "named-entities")]
+use {std::collections::HashMap, std::env, std::fs::File, std::io::Write, std::path::Path};
+#[cfg(feature = "named-entities")]
mod entities;
fn main() {
+ #[cfg(feature = "named-entities")]
named_entities_to_phf(&Path::new(&env::var("OUT_DIR").unwrap()).join("named_entities.rs"));
}
+#[cfg(feature = "named-entities")]
fn named_entities_to_phf(to: &Path) {
let mut entities: HashMap<&str, (u32, u32)> = entities::NAMED_ENTITIES
.iter()
diff --git a/src/tokenizer/char_ref/data.rs b/src/tokenizer/char_ref/data.rs
index fa839ba..9487034 100644
--- a/src/tokenizer/char_ref/data.rs
+++ b/src/tokenizer/char_ref/data.rs
@@ -7,6 +7,7 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Data that is known at compile-time and hard-coded into the binary.
+#[cfg(feature = "named-entities")]
use phf::Map;
/// The spec replaces most characters in the ISO-2022 C1 control code range
@@ -47,4 +48,5 @@ pub static C1_REPLACEMENTS: [Option<char>; 32] = [
Some('\u{0178}'),
];
+#[cfg(feature = "named-entities")]
include!(concat!(env!("OUT_DIR"), "/named_entities.rs"));
diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs
index 41f4c13..9c01bdf 100644
--- a/src/tokenizer/char_ref/mod.rs
+++ b/src/tokenizer/char_ref/mod.rs
@@ -40,6 +40,7 @@ enum State {
Octothorpe,
Numeric(u32), // base
NumericSemicolon,
+ #[cfg(feature = "named-entities")]
Named,
BogusName,
}
@@ -55,7 +56,9 @@ pub struct CharRefTokenizer {
hex_marker: Option<char>,
name_buf_opt: Option<String>,
+ #[cfg(feature = "named-entities")]
name_match: Option<(u32, u32)>,
+ #[cfg(feature = "named-entities")]
name_len: usize,
}
@@ -72,7 +75,9 @@ impl CharRefTokenizer {
seen_digit: false,
hex_marker: None,
name_buf_opt: None,
+ #[cfg(feature = "named-entities")]
name_match: None,
+ #[cfg(feature = "named-entities")]
name_len: 0,
}
}
@@ -83,6 +88,7 @@ impl CharRefTokenizer {
self.result.expect("get_result called before done")
}
+ #[cfg(feature = "named-entities")]
fn name_buf(&self) -> &str {
self.name_buf_opt
.as_ref()
@@ -127,6 +133,7 @@ impl CharRefTokenizer {
Octothorpe => self.do_octothorpe(tokenizer, input),
Numeric(base) => self.do_numeric(tokenizer, input, base),
NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
+ #[cfg(feature = "named-entities")]
Named => self.do_named(tokenizer, input),
BogusName => self.do_bogus_name(tokenizer, input),
}
@@ -148,7 +155,14 @@ impl CharRefTokenizer {
}
_ => {
- self.state = Named;
+ #[cfg(feature = "named-entities")]
+ {
+ self.state = Named;
+ }
+ #[cfg(not(feature = "named-entities"))]
+ {
+ self.state = BogusName;
+ }
self.name_buf_opt = Some(String::new());
Progress
}
@@ -270,6 +284,7 @@ impl CharRefTokenizer {
self.finish_one(c)
}
+ #[cfg(feature = "named-entities")]
fn do_named<Sink: TokenSink>(
&mut self,
tokenizer: &mut Tokenizer<Sink>,
@@ -294,6 +309,7 @@ impl CharRefTokenizer {
}
}
+ #[cfg(feature = "named-entities")]
fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) {
let msg = format_if!(
tokenizer.opts.exact_errors,
@@ -308,6 +324,7 @@ impl CharRefTokenizer {
input.push_front(self.name_buf_opt.take().unwrap());
}
+ #[cfg(feature = "named-entities")]
fn finish_named<Sink: TokenSink>(
&mut self,
tokenizer: &mut Tokenizer<Sink>,
@@ -405,7 +422,10 @@ impl CharRefTokenizer {
self.name_buf_mut().push(c);
match c {
_ if is_ascii_alnum(c) => return Progress,
- ';' => self.emit_name_error(tokenizer),
+ ';' => {
+ #[cfg(feature = "named-entities")]
+ self.emit_name_error(tokenizer);
+ }
_ => (),
}
self.unconsume_name(input);
@@ -428,6 +448,7 @@ impl CharRefTokenizer {
self.finish_numeric(tokenizer);
}
+ #[cfg(feature = "named-entities")]
Named => drop(self.finish_named(tokenizer, input, None)),
BogusName => {
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 0acdcaf..5f3d65d 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -1705,4 +1705,28 @@ mod test {
let results = tokenize(vector, opts);
assert_eq!(results, expected);
}
+
+ #[test]
+ #[cfg(not(feature = "named-entities"))]
+ fn named_entities() {
+ let opts = TokenizerOpts::default();
+ let vector = vec![String::from("&amp;\r\n"), String::from("&aamp;\r\n")];
+ let expected = vec![(Token::CharacterTokens("&amp;\n&aamp;\n".into()), 3)];
+ let results = tokenize(vector, opts);
+ assert_eq!(results, expected);
+ }
+
+ #[test]
+ #[cfg(feature = "named-entities")]
+ fn named_entities() {
+ let opts = TokenizerOpts::default();
+ let vector = vec![String::from("&amp;\r\n"), String::from("&aamp;\r\n")];
+ let expected = vec![
+ (CharacterTokens("&\n".into()), 3),
+ (ParseError("Invalid character reference".into()), 3),
+ (CharacterTokens("&aamp;\n".into()), 4),
+ ];
+ let results = tokenize(vector, opts);
+ assert_eq!(results, expected);
+ }
}