Diffstat (limited to 'src/tokenizer')
-rw-r--r--   src/tokenizer/char_ref/data.rs   50
-rw-r--r--   src/tokenizer/char_ref/mod.rs    58
2 files changed, 68 insertions, 40 deletions
diff --git a/src/tokenizer/char_ref/data.rs b/src/tokenizer/char_ref/data.rs
new file mode 100644
index 0000000..fa839ba
--- /dev/null
+++ b/src/tokenizer/char_ref/data.rs
@@ -0,0 +1,50 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+//! Data that is known at compile-time and hard-coded into the binary.
+use phf::Map;
+
+/// The spec replaces most characters in the ISO-2022 C1 control code range
+/// (U+0080 through U+009F) with these characters, based on Windows 8-bit
+/// codepages.
+pub static C1_REPLACEMENTS: [Option<char>; 32] = [
+    Some('\u{20ac}'),
+    None,
+    Some('\u{201a}'),
+    Some('\u{0192}'),
+    Some('\u{201e}'),
+    Some('\u{2026}'),
+    Some('\u{2020}'),
+    Some('\u{2021}'),
+    Some('\u{02c6}'),
+    Some('\u{2030}'),
+    Some('\u{0160}'),
+    Some('\u{2039}'),
+    Some('\u{0152}'),
+    None,
+    Some('\u{017d}'),
+    None,
+    None,
+    Some('\u{2018}'),
+    Some('\u{2019}'),
+    Some('\u{201c}'),
+    Some('\u{201d}'),
+    Some('\u{2022}'),
+    Some('\u{2013}'),
+    Some('\u{2014}'),
+    Some('\u{02dc}'),
+    Some('\u{2122}'),
+    Some('\u{0161}'),
+    Some('\u{203a}'),
+    Some('\u{0153}'),
+    None,
+    Some('\u{017e}'),
+    Some('\u{0178}'),
+];
+
+include!(concat!(env!("OUT_DIR"), "/named_entities.rs"));
diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs
index 4c231b2..7b27bff 100644
--- a/src/tokenizer/char_ref/mod.rs
+++ b/src/tokenizer/char_ref/mod.rs
@@ -17,6 +17,8 @@ use std::char::from_u32;
 use self::State::*;
 pub use self::Status::*;
 
+mod data;
+
 //§ tokenizing-character-references
 pub struct CharRef {
     /// The resulting character(s)
@@ -110,44 +112,6 @@ impl CharRefTokenizer {
     }
 }
 
-/// The spec replaces most characters in the ISO-2022 C1 control code range
-/// (U+0080 through U+009F) with these characters, based on Windows 8-bit
-/// codepages.
-pub static C1_REPLACEMENTS: [Option<char>; 32] = [
-    Some('\u{20ac}'),
-    None,
-    Some('\u{201a}'),
-    Some('\u{0192}'),
-    Some('\u{201e}'),
-    Some('\u{2026}'),
-    Some('\u{2020}'),
-    Some('\u{2021}'),
-    Some('\u{02c6}'),
-    Some('\u{2030}'),
-    Some('\u{0160}'),
-    Some('\u{2039}'),
-    Some('\u{0152}'),
-    None,
-    Some('\u{017d}'),
-    None,
-    None,
-    Some('\u{2018}'),
-    Some('\u{2019}'),
-    Some('\u{201c}'),
-    Some('\u{201d}'),
-    Some('\u{2022}'),
-    Some('\u{2013}'),
-    Some('\u{2014}'),
-    Some('\u{02dc}'),
-    Some('\u{2122}'),
-    Some('\u{0161}'),
-    Some('\u{203a}'),
-    Some('\u{0153}'),
-    None,
-    Some('\u{017e}'),
-    Some('\u{0178}'),
-];
-
 impl CharRefTokenizer {
     pub fn step<Sink: TokenSink>(
         &mut self,
@@ -281,7 +245,7 @@
             n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
             0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),
 
-            0x80..=0x9F => match C1_REPLACEMENTS[(self.num - 0x80) as usize] {
+            0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
                 Some(c) => (c, true),
                 None => (conv(self.num), true),
             },
@@ -313,7 +277,21 @@
     ) -> Status {
         let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
         self.name_buf_mut().push(c);
-        self.finish_named(tokenizer, input, Some(c))
+        match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
+            // We have either a full match or a prefix of one.
+            Some(&m) => {
+                if m.0 != 0 {
+                    // We have a full match, but there might be a longer one to come.
+                    self.name_match = Some(m);
+                    self.name_len = self.name_buf().len();
+                }
+                // Otherwise we just have a prefix match.
+                Progress
+            },
+
+            // Can't continue the match.
+            None => self.finish_named(tokenizer, input, Some(c)),
+        }
     }
 
     fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) {
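Note on the named-reference lookup in the last hunk: judging from the code and its comments, NAMED_ENTITIES (generated into named_entities.rs by the build script) contains an entry for every prefix of every named character reference, and a zero first component marks entries that are only prefixes of longer names. The sketch below is illustrative only, not html5ever's API: the hand-built `entities` table and the `lookup_longest` helper are hypothetical stand-ins for the generated phf::Map and for the per-character tokenizer steps, condensed into one loop to show how "remember the last full match, keep consuming while the buffer is still a prefix" resolves overlapping names such as &not; and &notin;.

// Illustrative sketch only -- not html5ever code. The hand-built map stands in
// for the generated phf::Map `data::NAMED_ENTITIES`, and `lookup_longest` is a
// hypothetical helper that condenses the per-character logic above into one loop.
use std::collections::HashMap;

/// Consume characters while the buffer is still a known prefix, remembering the
/// longest full match seen so far (a full match has a nonzero first code point).
fn lookup_longest(input: &str, entities: &HashMap<&str, (u32, u32)>) -> Option<(usize, (u32, u32))> {
    let mut buf = String::new();
    let mut best = None;

    for c in input.chars() {
        buf.push(c);
        match entities.get(buf.as_str()) {
            // Full match; remember it, but keep going in case a longer one follows.
            Some(&m) if m.0 != 0 => best = Some((buf.len(), m)),
            // Prefix-only entry: keep consuming.
            Some(_) => {}
            // No entity name starts with the buffer: stop and use the best match.
            None => break,
        }
    }
    best
}

fn main() {
    // Toy table containing every prefix of "not" and "notin"; (0, 0) marks
    // prefix-only entries, mirroring the convention the tokenizer checks above.
    let entities: HashMap<&str, (u32, u32)> = [
        ("n", (0, 0)),
        ("no", (0, 0)),
        ("not", (0x00AC, 0)),   // &not;   -> U+00AC NOT SIGN
        ("noti", (0, 0)),
        ("notin", (0x2209, 0)), // &notin; -> U+2209 NOT AN ELEMENT OF
    ]
    .into_iter()
    .collect();

    // "notit" can only be completed as "not"; "notin;" extends to the longer name.
    assert_eq!(lookup_longest("notit", &entities), Some((3, (0x00AC, 0))));
    assert_eq!(lookup_longest("notin;", &entities), Some((5, (0x2209, 0))));
    println!("longest-match lookup behaves as described");
}

In the real tokenizer this happens one character per step, returning Progress until the buffer either stops being a prefix or input runs out, at which point finish_named falls back to the longest full match recorded in name_match/name_len.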
