stop relying on markup5ever::data

author: Martin Fischer <martin@push-f.com> 2021-04-08 09:38:16 +0200
committer: Martin Fischer <martin@push-f.com> 2021-04-08 15:40:48 +0200
commit: 462bb0ef0ba9e027f5138c87438328db718d15da (patch)
tree: dd63ed85d6c630a842cdf780713aa6ecf10490d1 /src/tokenizer
parent: 6133f17a178c0b746a124f52df36fe98d7d2db7a (diff)
1 files changed, 40 insertions, 17 deletions
diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs
index 2840c73..8f56efb 100644
--- a/src/tokenizer/char_ref/mod.rs
+++ b/src/tokenizer/char_ref/mod.rs
@@ -9,7 +9,6 @@
 
 use super::{TokenSink, Tokenizer};
 use markup5ever::buffer_queue::BufferQueue;
-use markup5ever::data;
 use markup5ever::tendril::StrTendril;
 use crate::util::str::is_ascii_alnum;
 
@@ -114,6 +113,44 @@ impl CharRefTokenizer {
     }
 }
 
+/// The spec replaces most characters in the ISO-2022 C1 control code range
+/// (U+0080 through U+009F) with these characters, based on Windows 8-bit
+/// codepages.
+pub static C1_REPLACEMENTS: [Option<char>; 32] = [
+    Some('\u{20ac}'), // 0x80 -> EURO SIGN
+    None, // 0x81: no replacement
+    Some('\u{201a}'), // 0x82 -> SINGLE LOW-9 QUOTATION MARK
+    Some('\u{0192}'), // 0x83 -> LATIN SMALL LETTER F WITH HOOK
+    Some('\u{201e}'), // 0x84 -> DOUBLE LOW-9 QUOTATION MARK
+    Some('\u{2026}'), // 0x85 -> HORIZONTAL ELLIPSIS
+    Some('\u{2020}'), // 0x86 -> DAGGER
+    Some('\u{2021}'), // 0x87 -> DOUBLE DAGGER
+    Some('\u{02c6}'), // 0x88 -> MODIFIER LETTER CIRCUMFLEX ACCENT
+    Some('\u{2030}'), // 0x89 -> PER MILLE SIGN
+    Some('\u{0160}'), // 0x8A -> LATIN CAPITAL LETTER S WITH CARON
+    Some('\u{2039}'), // 0x8B -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+    Some('\u{0152}'), // 0x8C -> LATIN CAPITAL LIGATURE OE
+    None, // 0x8D: no replacement
+    Some('\u{017d}'), // 0x8E -> LATIN CAPITAL LETTER Z WITH CARON
+    None, // 0x8F: no replacement
+    None, // 0x90: no replacement
+    Some('\u{2018}'), // 0x91 -> LEFT SINGLE QUOTATION MARK
+    Some('\u{2019}'), // 0x92 -> RIGHT SINGLE QUOTATION MARK
+    Some('\u{201c}'), // 0x93 -> LEFT DOUBLE QUOTATION MARK
+    Some('\u{201d}'), // 0x94 -> RIGHT DOUBLE QUOTATION MARK
+    Some('\u{2022}'), // 0x95 -> BULLET
+    Some('\u{2013}'), // 0x96 -> EN DASH
+    Some('\u{2014}'), // 0x97 -> EM DASH
+    Some('\u{02dc}'), // 0x98 -> SMALL TILDE
+    Some('\u{2122}'), // 0x99 -> TRADE MARK SIGN
+    Some('\u{0161}'), // 0x9A -> LATIN SMALL LETTER S WITH CARON
+    Some('\u{203a}'), // 0x9B -> SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+    Some('\u{0153}'), // 0x9C -> LATIN SMALL LIGATURE OE
+    None, // 0x9D: no replacement
+    Some('\u{017e}'), // 0x9E -> LATIN SMALL LETTER Z WITH CARON
+    Some('\u{0178}'), // 0x9F -> LATIN CAPITAL LETTER Y WITH DIAERESIS
+];
+
 impl CharRefTokenizer {
     pub fn step<Sink: TokenSink>(
         &mut self,
@@ -248,7 +285,7 @@ impl CharRefTokenizer {
             n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
             0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),
 
-            0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
+            0x80..=0x9F => match C1_REPLACEMENTS[(self.num - 0x80) as usize] {
                 Some(c) => (c, true),
                 None => (conv(self.num), true),
             },
@@ -280,21 +317,7 @@ impl CharRefTokenizer {
     ) -> Status {
         let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
         self.name_buf_mut().push_char(c);
-        match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
-            // We have either a full match or a prefix of one.
-            Some(&m) => {
-                if m.0 != 0 {
-                    // We have a full match, but there might be a longer one to come.
-                    self.name_match = Some(m);
-                    self.name_len = self.name_buf().len();
-                }
-                // Otherwise we just have a prefix match.
-                Progress
-            },
-
-            // Can't continue the match.
-            None => self.finish_named(tokenizer, input, Some(c)),
-        }
+        self.finish_named(tokenizer, input, Some(c))
     }
 
     fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) {
author	Martin Fischer <martin@push-f.com>	2021-04-08 09:38:16 +0200
committer	Martin Fischer <martin@push-f.com>	2021-04-08 15:40:48 +0200
commit	462bb0ef0ba9e027f5138c87438328db718d15da (patch)
tree	dd63ed85d6c630a842cdf780713aa6ecf10490d1 /src/tokenizer
parent	6133f17a178c0b746a124f52df36fe98d7d2db7a (diff)