-rw-r--r--  src/tokenizer/char_ref/mod.rs | 57
1 file changed, 40 insertions(+), 17 deletions(-)
diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs
index 2840c73..8f56efb 100644
--- a/src/tokenizer/char_ref/mod.rs
+++ b/src/tokenizer/char_ref/mod.rs
@@ -9,7 +9,6 @@
 use super::{TokenSink, Tokenizer};
 
 use markup5ever::buffer_queue::BufferQueue;
-use markup5ever::data;
 use markup5ever::tendril::StrTendril;
 
 use crate::util::str::is_ascii_alnum;
@@ -114,6 +113,44 @@ impl CharRefTokenizer {
     }
 }
 
+/// The spec replaces most characters in the ISO-2022 C1 control code range
+/// (U+0080 through U+009F) with these characters, based on Windows 8-bit
+/// codepages.
+pub static C1_REPLACEMENTS: [Option<char>; 32] = [
+    Some('\u{20ac}'),
+    None,
+    Some('\u{201a}'),
+    Some('\u{0192}'),
+    Some('\u{201e}'),
+    Some('\u{2026}'),
+    Some('\u{2020}'),
+    Some('\u{2021}'),
+    Some('\u{02c6}'),
+    Some('\u{2030}'),
+    Some('\u{0160}'),
+    Some('\u{2039}'),
+    Some('\u{0152}'),
+    None,
+    Some('\u{017d}'),
+    None,
+    None,
+    Some('\u{2018}'),
+    Some('\u{2019}'),
+    Some('\u{201c}'),
+    Some('\u{201d}'),
+    Some('\u{2022}'),
+    Some('\u{2013}'),
+    Some('\u{2014}'),
+    Some('\u{02dc}'),
+    Some('\u{2122}'),
+    Some('\u{0161}'),
+    Some('\u{203a}'),
+    Some('\u{0153}'),
+    None,
+    Some('\u{017e}'),
+    Some('\u{0178}'),
+];
+
 impl CharRefTokenizer {
     pub fn step<Sink: TokenSink>(
         &mut self,
@@ -248,7 +285,7 @@ impl CharRefTokenizer {
             n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
             0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),
 
-            0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
+            0x80..=0x9F => match C1_REPLACEMENTS[(self.num - 0x80) as usize] {
                 Some(c) => (c, true),
                 None => (conv(self.num), true),
             },
@@ -280,21 +317,7 @@ impl CharRefTokenizer {
     ) -> Status {
         let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
         self.name_buf_mut().push_char(c);
-        match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
-            // We have either a full match or a prefix of one.
-            Some(&m) => {
-                if m.0 != 0 {
-                    // We have a full match, but there might be a longer one to come.
-                    self.name_match = Some(m);
-                    self.name_len = self.name_buf().len();
-                }
-                // Otherwise we just have a prefix match.
-                Progress
-            },
-
-            // Can't continue the match.
-            None => self.finish_named(tokenizer, input, Some(c)),
-        }
+        self.finish_named(tokenizer, input, Some(c))
     }
 
     fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) {
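
As context for the change above: the patch inlines the C1_REPLACEMENTS table (previously imported from markup5ever::data) and indexes it from the numeric-reference path for code points in the range U+0080 through U+009F. The following standalone sketch, not part of the patch, illustrates that remapping; the remap_numeric helper and the truncated C1_SAMPLE table are hypothetical names introduced only for illustration.

// Hypothetical, standalone sketch: how a numeric character reference in the
// C1 range is remapped through a table shaped like C1_REPLACEMENTS. The
// table here is truncated to the first four entries purely for illustration.
const C1_SAMPLE: [Option<char>; 4] = [
    Some('\u{20ac}'), // 0x80 -> EURO SIGN
    None,             // 0x81 has no replacement; the raw code point is kept
    Some('\u{201a}'), // 0x82 -> SINGLE LOW-9 QUOTATION MARK
    Some('\u{0192}'), // 0x83 -> LATIN SMALL LETTER F WITH HOOK
];

fn remap_numeric(num: u32) -> char {
    match num {
        // C1 range covered by the sample table: consult it first.
        0x80..=0x83 => C1_SAMPLE[(num - 0x80) as usize]
            .unwrap_or_else(|| char::from_u32(num).expect("valid code point")),
        // Everything else: plain conversion, U+FFFD on invalid input.
        _ => char::from_u32(num).unwrap_or('\u{fffd}'),
    }
}

fn main() {
    // "&#128;" in markup carries the numeric value 0x80, which maps to '€'.
    assert_eq!(remap_numeric(0x80), '\u{20ac}');
    // "&#129;" (0x81) has no Windows-codepage replacement, so it stays U+0081.
    assert_eq!(remap_numeric(0x81), '\u{81}');
    println!("&#128; -> {}", remap_numeric(0x80));
}

The other half of the patch drops the incremental data::NAMED_ENTITIES prefix matching in the named-reference path and defers directly to finish_named; that path is not modeled in the sketch above.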