diff options
author | Martin Fischer <martin@push-f.com> | 2021-11-12 03:47:19 +0100 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2021-11-12 03:52:47 +0100 |
commit | c914b71a28ce7177171b83df2815352bf5741844 (patch) | |
tree | e11a30a30730a9fc3dd03ff49e3a5db029ce253d /src/tokenizer | |
parent | ea7c3c9b8b45d846bcb45573c9b2250d8152be30 (diff) |
fix named entities
In 462bb0ef0ba9e027f5138c87438328db718d15da I dropped the
markup5ever::data dependency but omitted to include its
named_entities data. This commit remedies that mistake.
Diffstat (limited to 'src/tokenizer')
-rw-r--r-- | src/tokenizer/char_ref/data.rs | 50 | ||||
-rw-r--r-- | src/tokenizer/char_ref/mod.rs | 58 |
2 files changed, 68 insertions, 40 deletions
diff --git a/src/tokenizer/char_ref/data.rs b/src/tokenizer/char_ref/data.rs new file mode 100644 index 0000000..fa839ba --- /dev/null +++ b/src/tokenizer/char_ref/data.rs @@ -0,0 +1,50 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +//! Data that is known at compile-time and hard-coded into the binary. +use phf::Map; + +/// The spec replaces most characters in the ISO-2022 C1 control code range +/// (U+0080 through U+009F) with these characters, based on Windows 8-bit +/// codepages. +pub static C1_REPLACEMENTS: [Option<char>; 32] = [ + Some('\u{20ac}'), + None, + Some('\u{201a}'), + Some('\u{0192}'), + Some('\u{201e}'), + Some('\u{2026}'), + Some('\u{2020}'), + Some('\u{2021}'), + Some('\u{02c6}'), + Some('\u{2030}'), + Some('\u{0160}'), + Some('\u{2039}'), + Some('\u{0152}'), + None, + Some('\u{017d}'), + None, + None, + Some('\u{2018}'), + Some('\u{2019}'), + Some('\u{201c}'), + Some('\u{201d}'), + Some('\u{2022}'), + Some('\u{2013}'), + Some('\u{2014}'), + Some('\u{02dc}'), + Some('\u{2122}'), + Some('\u{0161}'), + Some('\u{203a}'), + Some('\u{0153}'), + None, + Some('\u{017e}'), + Some('\u{0178}'), +]; + +include!(concat!(env!("OUT_DIR"), "/named_entities.rs")); diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs index 4c231b2..7b27bff 100644 --- a/src/tokenizer/char_ref/mod.rs +++ b/src/tokenizer/char_ref/mod.rs @@ -17,6 +17,8 @@ use std::char::from_u32; use self::State::*; pub use self::Status::*; +mod data; + //§ tokenizing-character-references pub struct CharRef { /// The resulting character(s) @@ -110,44 +112,6 @@ impl CharRefTokenizer { } 
} -/// The spec replaces most characters in the ISO-2022 C1 control code range -/// (U+0080 through U+009F) with these characters, based on Windows 8-bit -/// codepages. -pub static C1_REPLACEMENTS: [Option<char>; 32] = [ - Some('\u{20ac}'), - None, - Some('\u{201a}'), - Some('\u{0192}'), - Some('\u{201e}'), - Some('\u{2026}'), - Some('\u{2020}'), - Some('\u{2021}'), - Some('\u{02c6}'), - Some('\u{2030}'), - Some('\u{0160}'), - Some('\u{2039}'), - Some('\u{0152}'), - None, - Some('\u{017d}'), - None, - None, - Some('\u{2018}'), - Some('\u{2019}'), - Some('\u{201c}'), - Some('\u{201d}'), - Some('\u{2022}'), - Some('\u{2013}'), - Some('\u{2014}'), - Some('\u{02dc}'), - Some('\u{2122}'), - Some('\u{0161}'), - Some('\u{203a}'), - Some('\u{0153}'), - None, - Some('\u{017e}'), - Some('\u{0178}'), -]; - impl CharRefTokenizer { pub fn step<Sink: TokenSink>( &mut self, @@ -281,7 +245,7 @@ impl CharRefTokenizer { n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true), 0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true), - 0x80..=0x9F => match C1_REPLACEMENTS[(self.num - 0x80) as usize] { + 0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] { Some(c) => (c, true), None => (conv(self.num), true), }, @@ -313,7 +277,21 @@ impl CharRefTokenizer { ) -> Status { let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); self.name_buf_mut().push(c); - self.finish_named(tokenizer, input, Some(c)) + match data::NAMED_ENTITIES.get(&self.name_buf()[..]) { + // We have either a full match or a prefix of one. + Some(&m) => { + if m.0 != 0 { + // We have a full match, but there might be a longer one to come. + self.name_match = Some(m); + self.name_len = self.name_buf().len(); + } + // Otherwise we just have a prefix match. + Progress + }, + + // Can't continue the match. + None => self.finish_named(tokenizer, input, Some(c)), + } } fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) { |