diff options
author | Martin Fischer <martin@push-f.com> | 2021-11-12 03:47:19 +0100 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2021-11-12 03:52:47 +0100 |
commit | c914b71a28ce7177171b83df2815352bf5741844 (patch) | |
tree | e11a30a30730a9fc3dd03ff49e3a5db029ce253d /src/tokenizer/char_ref/mod.rs | |
parent | ea7c3c9b8b45d846bcb45573c9b2250d8152be30 (diff) |
fix named entities
In 462bb0ef0ba9e027f5138c87438328db718d15da I dropped the
markup5ever::data dependency but omitted to include its
named_entities data. This commit remedies that mistake.
Diffstat (limited to 'src/tokenizer/char_ref/mod.rs')
-rw-r--r-- | src/tokenizer/char_ref/mod.rs | 58 |
1 file changed, 18 insertions, 40 deletions
diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs index 4c231b2..7b27bff 100644 --- a/src/tokenizer/char_ref/mod.rs +++ b/src/tokenizer/char_ref/mod.rs @@ -17,6 +17,8 @@ use std::char::from_u32; use self::State::*; pub use self::Status::*; +mod data; + //§ tokenizing-character-references pub struct CharRef { /// The resulting character(s) @@ -110,44 +112,6 @@ impl CharRefTokenizer { } } -/// The spec replaces most characters in the ISO-2022 C1 control code range -/// (U+0080 through U+009F) with these characters, based on Windows 8-bit -/// codepages. -pub static C1_REPLACEMENTS: [Option<char>; 32] = [ - Some('\u{20ac}'), - None, - Some('\u{201a}'), - Some('\u{0192}'), - Some('\u{201e}'), - Some('\u{2026}'), - Some('\u{2020}'), - Some('\u{2021}'), - Some('\u{02c6}'), - Some('\u{2030}'), - Some('\u{0160}'), - Some('\u{2039}'), - Some('\u{0152}'), - None, - Some('\u{017d}'), - None, - None, - Some('\u{2018}'), - Some('\u{2019}'), - Some('\u{201c}'), - Some('\u{201d}'), - Some('\u{2022}'), - Some('\u{2013}'), - Some('\u{2014}'), - Some('\u{02dc}'), - Some('\u{2122}'), - Some('\u{0161}'), - Some('\u{203a}'), - Some('\u{0153}'), - None, - Some('\u{017e}'), - Some('\u{0178}'), -]; - impl CharRefTokenizer { pub fn step<Sink: TokenSink>( &mut self, @@ -281,7 +245,7 @@ impl CharRefTokenizer { n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true), 0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true), - 0x80..=0x9F => match C1_REPLACEMENTS[(self.num - 0x80) as usize] { + 0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] { Some(c) => (c, true), None => (conv(self.num), true), }, @@ -313,7 +277,21 @@ impl CharRefTokenizer { ) -> Status { let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); self.name_buf_mut().push(c); - self.finish_named(tokenizer, input, Some(c)) + match data::NAMED_ENTITIES.get(&self.name_buf()[..]) { + // We have either a full match or a prefix of one. 
+ Some(&m) => { + if m.0 != 0 { + // We have a full match, but there might be a longer one to come. + self.name_match = Some(m); + self.name_len = self.name_buf().len(); + } + // Otherwise we just have a prefix match. + Progress + }, + + // Can't continue the match. + None => self.finish_named(tokenizer, input, Some(c)), + } } fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) { |