about | summary | refs | log | tree | commit | diff
path: root/src/tokenizer/char_ref/mod.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/tokenizer/char_ref/mod.rs')
-rw-r--r-- src/tokenizer/char_ref/mod.rs 58
1 files changed, 18 insertions, 40 deletions
diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs
index 4c231b2..7b27bff 100644
--- a/src/tokenizer/char_ref/mod.rs
+++ b/src/tokenizer/char_ref/mod.rs
@@ -17,6 +17,8 @@ use std::char::from_u32;
use self::State::*;
pub use self::Status::*;
+mod data;
+
//§ tokenizing-character-references
pub struct CharRef {
/// The resulting character(s)
@@ -110,44 +112,6 @@ impl CharRefTokenizer {
}
}
-/// The spec replaces most characters in the ISO-2022 C1 control code range
-/// (U+0080 through U+009F) with these characters, based on Windows 8-bit
-/// codepages.
-pub static C1_REPLACEMENTS: [Option<char>; 32] = [
- Some('\u{20ac}'),
- None,
- Some('\u{201a}'),
- Some('\u{0192}'),
- Some('\u{201e}'),
- Some('\u{2026}'),
- Some('\u{2020}'),
- Some('\u{2021}'),
- Some('\u{02c6}'),
- Some('\u{2030}'),
- Some('\u{0160}'),
- Some('\u{2039}'),
- Some('\u{0152}'),
- None,
- Some('\u{017d}'),
- None,
- None,
- Some('\u{2018}'),
- Some('\u{2019}'),
- Some('\u{201c}'),
- Some('\u{201d}'),
- Some('\u{2022}'),
- Some('\u{2013}'),
- Some('\u{2014}'),
- Some('\u{02dc}'),
- Some('\u{2122}'),
- Some('\u{0161}'),
- Some('\u{203a}'),
- Some('\u{0153}'),
- None,
- Some('\u{017e}'),
- Some('\u{0178}'),
-];
-
impl CharRefTokenizer {
pub fn step<Sink: TokenSink>(
&mut self,
@@ -281,7 +245,7 @@ impl CharRefTokenizer {
n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),
- 0x80..=0x9F => match C1_REPLACEMENTS[(self.num - 0x80) as usize] {
+ 0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
Some(c) => (c, true),
None => (conv(self.num), true),
},
@@ -313,7 +277,21 @@ impl CharRefTokenizer {
) -> Status {
let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
self.name_buf_mut().push(c);
- self.finish_named(tokenizer, input, Some(c))
+ match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
+ // We have either a full match or a prefix of one.
+ Some(&m) => {
+ if m.0 != 0 {
+ // We have a full match, but there might be a longer one to come.
+ self.name_match = Some(m);
+ self.name_len = self.name_buf().len();
+ }
+ // Otherwise we just have a prefix match.
+ Progress
+ },
+
+ // Can't continue the match.
+ None => self.finish_named(tokenizer, input, Some(c)),
+ }
}
fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) {