diff options
| author | Martin Fischer <martin@push-f.com> | 2021-04-08 09:38:16 +0200 | 
|---|---|---|
| committer | Martin Fischer <martin@push-f.com> | 2021-04-08 15:40:48 +0200 | 
| commit | 462bb0ef0ba9e027f5138c87438328db718d15da (patch) | |
| tree | dd63ed85d6c630a842cdf780713aa6ecf10490d1 /src/tokenizer/char_ref | |
| parent | 6133f17a178c0b746a124f52df36fe98d7d2db7a (diff) | |
stop relying on markup5ever::data
Diffstat (limited to 'src/tokenizer/char_ref')
| -rw-r--r-- | src/tokenizer/char_ref/mod.rs | 57 | 
1 file changed, 40 insertions, 17 deletions
| diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs index 2840c73..8f56efb 100644 --- a/src/tokenizer/char_ref/mod.rs +++ b/src/tokenizer/char_ref/mod.rs @@ -9,7 +9,6 @@  use super::{TokenSink, Tokenizer};  use markup5ever::buffer_queue::BufferQueue; -use markup5ever::data;  use markup5ever::tendril::StrTendril;  use crate::util::str::is_ascii_alnum; @@ -114,6 +113,44 @@ impl CharRefTokenizer {      }  } +/// The spec replaces most characters in the ISO-2022 C1 control code range +/// (U+0080 through U+009F) with these characters, based on Windows 8-bit +/// codepages. +pub static C1_REPLACEMENTS: [Option<char>; 32] = [ +    Some('\u{20ac}'), +    None, +    Some('\u{201a}'), +    Some('\u{0192}'), +    Some('\u{201e}'), +    Some('\u{2026}'), +    Some('\u{2020}'), +    Some('\u{2021}'), +    Some('\u{02c6}'), +    Some('\u{2030}'), +    Some('\u{0160}'), +    Some('\u{2039}'), +    Some('\u{0152}'), +    None, +    Some('\u{017d}'), +    None, +    None, +    Some('\u{2018}'), +    Some('\u{2019}'), +    Some('\u{201c}'), +    Some('\u{201d}'), +    Some('\u{2022}'), +    Some('\u{2013}'), +    Some('\u{2014}'), +    Some('\u{02dc}'), +    Some('\u{2122}'), +    Some('\u{0161}'), +    Some('\u{203a}'), +    Some('\u{0153}'), +    None, +    Some('\u{017e}'), +    Some('\u{0178}'), +]; +  impl CharRefTokenizer {      pub fn step<Sink: TokenSink>(          &mut self, @@ -248,7 +285,7 @@ impl CharRefTokenizer {              n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),              0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true), -            0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] { +            0x80..=0x9F => match C1_REPLACEMENTS[(self.num - 0x80) as usize] {                  Some(c) => (c, true),                  None => (conv(self.num), true),              }, @@ -280,21 +317,7 @@ impl CharRefTokenizer {      ) -> Status {          let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);   
       self.name_buf_mut().push_char(c); -        match data::NAMED_ENTITIES.get(&self.name_buf()[..]) { -            // We have either a full match or a prefix of one. -            Some(&m) => { -                if m.0 != 0 { -                    // We have a full match, but there might be a longer one to come. -                    self.name_match = Some(m); -                    self.name_len = self.name_buf().len(); -                } -                // Otherwise we just have a prefix match. -                Progress -            }, - -            // Can't continue the match. -            None => self.finish_named(tokenizer, input, Some(c)), -        } +        self.finish_named(tokenizer, input, Some(c))      }      fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) { | 
