diff options
author | Martin Fischer <martin@push-f.com> | 2021-04-08 08:42:01 +0200 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2021-04-08 15:40:37 +0200 |
commit | 57e7eefcbe6fb8c3dc4b01c707be9de4c34963a7 (patch) | |
tree | 6a9d296389bf3023396592c8514ed6712e011c7f /src/tokenizer/char_ref |
import https://github.com/servo/html5ever
commit d1206daa740305f55a5fa159e43eb33afc359cb4
Diffstat (limited to 'src/tokenizer/char_ref')
-rw-r--r-- | src/tokenizer/char_ref/mod.rs | 449 |
1 files changed, 449 insertions, 0 deletions
diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs new file mode 100644 index 0000000..a52485d --- /dev/null +++ b/src/tokenizer/char_ref/mod.rs @@ -0,0 +1,449 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use super::{TokenSink, Tokenizer}; +use crate::buffer_queue::BufferQueue; +use crate::data; +use crate::tendril::StrTendril; +use crate::util::str::is_ascii_alnum; + +use log::debug; +use mac::format_if; +use std::borrow::Cow::Borrowed; +use std::char::from_u32; + +use self::State::*; +pub use self::Status::*; + +//ยง tokenizing-character-references +pub struct CharRef { + /// The resulting character(s) + pub chars: [char; 2], + + /// How many slots in `chars` are valid? + pub num_chars: u8, +} + +pub enum Status { + Stuck, + Progress, + Done, +} + +#[derive(Debug)] +enum State { + Begin, + Octothorpe, + Numeric(u32), // base + NumericSemicolon, + Named, + BogusName, +} + +pub struct CharRefTokenizer { + state: State, + addnl_allowed: Option<char>, + result: Option<CharRef>, + + num: u32, + num_too_big: bool, + seen_digit: bool, + hex_marker: Option<char>, + + name_buf_opt: Option<StrTendril>, + name_match: Option<(u32, u32)>, + name_len: usize, +} + +impl CharRefTokenizer { + // NB: We assume that we have an additional allowed character iff we're + // tokenizing in an attribute value. + pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer { + CharRefTokenizer { + state: Begin, + addnl_allowed, + result: None, + num: 0, + num_too_big: false, + seen_digit: false, + hex_marker: None, + name_buf_opt: None, + name_match: None, + name_len: 0, + } + } + + // A CharRefTokenizer can only tokenize one character reference, + // so this method consumes the tokenizer. + pub fn get_result(self) -> CharRef { + self.result.expect("get_result called before done") + } + + fn name_buf(&self) -> &StrTendril { + self.name_buf_opt + .as_ref() + .expect("name_buf missing in named character reference") + } + + fn name_buf_mut(&mut self) -> &mut StrTendril { + self.name_buf_opt + .as_mut() + .expect("name_buf missing in named character reference") + } + + fn finish_none(&mut self) -> Status { + self.result = Some(CharRef { + chars: ['\0', '\0'], + num_chars: 0, + }); + Done + } + + fn finish_one(&mut self, c: char) -> Status { + self.result = Some(CharRef { + chars: [c, '\0'], + num_chars: 1, + }); + Done + } +} + +impl CharRefTokenizer { + pub fn step<Sink: TokenSink>( + &mut self, + tokenizer: &mut Tokenizer<Sink>, + input: &mut BufferQueue, + ) -> Status { + if self.result.is_some() { + return Done; + } + + debug!("char ref tokenizer stepping in state {:?}", self.state); + match self.state { + Begin => self.do_begin(tokenizer, input), + Octothorpe => self.do_octothorpe(tokenizer, input), + Numeric(base) => self.do_numeric(tokenizer, input, base), + NumericSemicolon => self.do_numeric_semicolon(tokenizer, input), + Named => self.do_named(tokenizer, input), + BogusName => self.do_bogus_name(tokenizer, input), + } + } + + fn do_begin<Sink: TokenSink>( + &mut self, + tokenizer: &mut Tokenizer<Sink>, + input: &mut BufferQueue, + ) -> Status { + match unwrap_or_return!(tokenizer.peek(input), Stuck) { + '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(), + c if Some(c) == self.addnl_allowed => self.finish_none(), + + '#' => { + tokenizer.discard_char(input); + self.state = Octothorpe; + Progress + }, + + _ => { + self.state = Named; + self.name_buf_opt = Some(StrTendril::new()); + Progress + }, + } + } + + fn do_octothorpe<Sink: TokenSink>( + &mut self, + tokenizer: &mut Tokenizer<Sink>, + input: &mut BufferQueue, + ) -> Status { + let c = unwrap_or_return!(tokenizer.peek(input), Stuck); + match c { + 'x' | 'X' => { + tokenizer.discard_char(input); + self.hex_marker = Some(c); + self.state = Numeric(16); + }, + + _ => { + self.hex_marker = None; + self.state = Numeric(10); + }, + } + Progress + } + + fn do_numeric<Sink: TokenSink>( + &mut self, + tokenizer: &mut Tokenizer<Sink>, + input: &mut BufferQueue, + base: u32, + ) -> Status { + let c = unwrap_or_return!(tokenizer.peek(input), Stuck); + match c.to_digit(base) { + Some(n) => { + tokenizer.discard_char(input); + self.num = self.num.wrapping_mul(base); + if self.num > 0x10FFFF { + // We might overflow, and the character is definitely invalid. + // We still parse digits and semicolon, but don't use the result. + self.num_too_big = true; + } + self.num = self.num.wrapping_add(n); + self.seen_digit = true; + Progress + }, + + None if !self.seen_digit => self.unconsume_numeric(tokenizer, input), + + None => { + self.state = NumericSemicolon; + Progress + }, + } + } + + fn do_numeric_semicolon<Sink: TokenSink>( + &mut self, + tokenizer: &mut Tokenizer<Sink>, + input: &mut BufferQueue, + ) -> Status { + match unwrap_or_return!(tokenizer.peek(input), Stuck) { + ';' => tokenizer.discard_char(input), + _ => tokenizer.emit_error(Borrowed( + "Semicolon missing after numeric character reference", + )), + }; + self.finish_numeric(tokenizer) + } + + fn unconsume_numeric<Sink: TokenSink>( + &mut self, + tokenizer: &mut Tokenizer<Sink>, + input: &mut BufferQueue, + ) -> Status { + let mut unconsume = StrTendril::from_char('#'); + match self.hex_marker { + Some(c) => unconsume.push_char(c), + None => (), + } + + input.push_front(unconsume); + tokenizer.emit_error(Borrowed("Numeric character reference without digits")); + self.finish_none() + } + + fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status { + fn conv(n: u32) -> char { + from_u32(n).expect("invalid char missed by error handling cases") + } + + let (c, error) = match self.num { + n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true), + 0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true), + + 0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] { + Some(c) => (c, true), + None => (conv(self.num), true), + }, + + 0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true), + + n if (n & 0xFFFE) == 0xFFFE => (conv(n), true), + + n => (conv(n), false), + }; + + if error { + let msg = format_if!( + tokenizer.opts.exact_errors, + "Invalid numeric character reference", + "Invalid numeric character reference value 0x{:06X}", + self.num + ); + tokenizer.emit_error(msg); + } + + self.finish_one(c) + } + + fn do_named<Sink: TokenSink>( + &mut self, + tokenizer: &mut Tokenizer<Sink>, + input: &mut BufferQueue, + ) -> Status { + let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); + self.name_buf_mut().push_char(c); + match data::NAMED_ENTITIES.get(&self.name_buf()[..]) { + // We have either a full match or a prefix of one. + Some(&m) => { + if m.0 != 0 { + // We have a full match, but there might be a longer one to come. + self.name_match = Some(m); + self.name_len = self.name_buf().len(); + } + // Otherwise we just have a prefix match. + Progress + }, + + // Can't continue the match. + None => self.finish_named(tokenizer, input, Some(c)), + } + } + + fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) { + let msg = format_if!( + tokenizer.opts.exact_errors, + "Invalid character reference", + "Invalid character reference &{}", + self.name_buf() + ); + tokenizer.emit_error(msg); + } + + fn unconsume_name(&mut self, input: &mut BufferQueue) { + input.push_front(self.name_buf_opt.take().unwrap()); + } + + fn finish_named<Sink: TokenSink>( + &mut self, + tokenizer: &mut Tokenizer<Sink>, + input: &mut BufferQueue, + end_char: Option<char>, + ) -> Status { + match self.name_match { + None => { + match end_char { + Some(c) if is_ascii_alnum(c) => { + // Keep looking for a semicolon, to determine whether + // we emit a parse error. + self.state = BogusName; + return Progress; + }, + + // Check length because &; is not a parse error. + Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer), + + _ => (), + } + self.unconsume_name(input); + self.finish_none() + }, + + Some((c1, c2)) => { + // We have a complete match, but we may have consumed + // additional characters into self.name_buf. Usually + // at least one, but several in cases like + // + // ¬ => match for U+00AC + // ¬i => valid prefix for ¬in + // ¬it => can't continue match + + let name_len = self.name_len; + assert!(name_len > 0); + let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap(); + + // There might not be a next character after the match, if + // we had a full match and then hit EOF. + let next_after = if name_len == self.name_buf().len() { + None + } else { + Some(self.name_buf()[name_len..].chars().next().unwrap()) + }; + + // "If the character reference is being consumed as part of an + // attribute, and the last character matched is not a U+003B + // SEMICOLON character (;), and the next character is either a + // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII + // character, then, for historical reasons, all the characters + // that were matched after the U+0026 AMPERSAND character (&) + // must be unconsumed, and nothing is returned. However, if + // this next character is in fact a U+003D EQUALS SIGN + // character (=), then this is a parse error" + + let unconsume_all = match (self.addnl_allowed, last_matched, next_after) { + (_, ';', _) => false, + (Some(_), _, Some('=')) => { + tokenizer.emit_error(Borrowed( + "Equals sign after character reference in attribute", + )); + true + }, + (Some(_), _, Some(c)) if is_ascii_alnum(c) => true, + _ => { + tokenizer.emit_error(Borrowed( + "Character reference does not end with semicolon", + )); + false + }, + }; + + if unconsume_all { + self.unconsume_name(input); + self.finish_none() + } else { + input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..])); + self.result = Some(CharRef { + chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()], + num_chars: if c2 == 0 { 1 } else { 2 }, + }); + Done + } + }, + } + } + + fn do_bogus_name<Sink: TokenSink>( + &mut self, + tokenizer: &mut Tokenizer<Sink>, + input: &mut BufferQueue, + ) -> Status { + let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); + self.name_buf_mut().push_char(c); + match c { + _ if is_ascii_alnum(c) => return Progress, + ';' => self.emit_name_error(tokenizer), + _ => (), + } + self.unconsume_name(input); + self.finish_none() + } + + pub fn end_of_file<Sink: TokenSink>( + &mut self, + tokenizer: &mut Tokenizer<Sink>, + input: &mut BufferQueue, + ) { + while self.result.is_none() { + match self.state { + Begin => drop(self.finish_none()), + + Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)), + + Numeric(_) | NumericSemicolon => { + tokenizer.emit_error(Borrowed("EOF in numeric character reference")); + self.finish_numeric(tokenizer); + }, + + Named => drop(self.finish_named(tokenizer, input, None)), + + BogusName => { + self.unconsume_name(input); + self.finish_none(); + }, + + Octothorpe => { + input.push_front(StrTendril::from_slice("#")); + tokenizer.emit_error(Borrowed("EOF after '#' in character reference")); + self.finish_none(); + }, + } + } + } +} |