// Copyright 2014-2017 The html5ever Project Developers. See the // COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. use super::{TokenSink, Tokenizer}; use crate::util::buffer_queue::BufferQueue; use crate::util::str::is_ascii_alnum; use std::borrow::Cow::Borrowed; use std::char::from_u32; use self::State::*; pub use self::Status::*; mod data; //ยง tokenizing-character-references pub struct CharRef { /// The resulting character(s) pub chars: [char; 2], /// How many slots in `chars` are valid? pub num_chars: u8, } pub enum Status { Stuck, Progress, Done, } #[derive(Debug)] enum State { Begin, Octothorpe, Numeric(u32), // base NumericSemicolon, #[cfg(feature = "named-entities")] Named, BogusName, } pub struct CharRefTokenizer { state: State, addnl_allowed: Option, result: Option, num: u32, num_too_big: bool, seen_digit: bool, hex_marker: Option, name_buf_opt: Option, #[cfg(feature = "named-entities")] name_match: Option<(u32, u32)>, #[cfg(feature = "named-entities")] name_len: usize, } impl CharRefTokenizer { // NB: We assume that we have an additional allowed character iff we're // tokenizing in an attribute value. pub fn new(addnl_allowed: Option) -> CharRefTokenizer { CharRefTokenizer { state: Begin, addnl_allowed, result: None, num: 0, num_too_big: false, seen_digit: false, hex_marker: None, name_buf_opt: None, #[cfg(feature = "named-entities")] name_match: None, #[cfg(feature = "named-entities")] name_len: 0, } } // A CharRefTokenizer can only tokenize one character reference, // so this method consumes the tokenizer. pub fn get_result(self) -> CharRef { self.result.expect("get_result called before done") } #[cfg(feature = "named-entities")] fn name_buf(&self) -> &str { self.name_buf_opt .as_ref() .expect("name_buf missing in named character reference") } fn name_buf_mut(&mut self) -> &mut String { self.name_buf_opt .as_mut() .expect("name_buf missing in named character reference") } fn finish_none(&mut self) -> Status { self.result = Some(CharRef { chars: ['\0', '\0'], num_chars: 0, }); Done } fn finish_one(&mut self, c: char) -> Status { self.result = Some(CharRef { chars: [c, '\0'], num_chars: 1, }); Done } } impl CharRefTokenizer { pub fn step( &mut self, tokenizer: &mut Tokenizer, input: &mut BufferQueue, ) -> Status { if self.result.is_some() { return Done; } match self.state { Begin => self.do_begin(tokenizer, input), Octothorpe => self.do_octothorpe(tokenizer, input), Numeric(base) => self.do_numeric(tokenizer, input, base), NumericSemicolon => self.do_numeric_semicolon(tokenizer, input), #[cfg(feature = "named-entities")] Named => self.do_named(tokenizer, input), BogusName => self.do_bogus_name(tokenizer, input), } } fn do_begin( &mut self, tokenizer: &mut Tokenizer, input: &mut BufferQueue, ) -> Status { match unwrap_or_return!(tokenizer.peek(input), Stuck) { '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(), c if Some(c) == self.addnl_allowed => self.finish_none(), '#' => { tokenizer.discard_char(input); self.state = Octothorpe; Progress } _ => { #[cfg(feature = "named-entities")] { self.state = Named; } #[cfg(not(feature = "named-entities"))] { self.state = BogusName; } self.name_buf_opt = Some(String::new()); Progress } } } fn do_octothorpe( &mut self, tokenizer: &mut Tokenizer, input: &mut BufferQueue, ) -> Status { let c = unwrap_or_return!(tokenizer.peek(input), Stuck); match c { 'x' | 'X' => { tokenizer.discard_char(input); self.hex_marker = Some(c); self.state = Numeric(16); } _ => { self.hex_marker = None; self.state = Numeric(10); } } Progress } fn do_numeric( &mut self, tokenizer: &mut Tokenizer, input: &mut BufferQueue, base: u32, ) -> Status { let c = unwrap_or_return!(tokenizer.peek(input), Stuck); match c.to_digit(base) { Some(n) => { tokenizer.discard_char(input); self.num = self.num.wrapping_mul(base); if self.num > 0x10FFFF { // We might overflow, and the character is definitely invalid. // We still parse digits and semicolon, but don't use the result. self.num_too_big = true; } self.num = self.num.wrapping_add(n); self.seen_digit = true; Progress } None if !self.seen_digit => self.unconsume_numeric(tokenizer, input), None => { self.state = NumericSemicolon; Progress } } } fn do_numeric_semicolon( &mut self, tokenizer: &mut Tokenizer, input: &mut BufferQueue, ) -> Status { match unwrap_or_return!(tokenizer.peek(input), Stuck) { ';' => tokenizer.discard_char(input), _ => tokenizer.emit_error(Borrowed( "Semicolon missing after numeric character reference", )), }; self.finish_numeric(tokenizer) } fn unconsume_numeric( &mut self, tokenizer: &mut Tokenizer, input: &mut BufferQueue, ) -> Status { let mut unconsume = String::from('#'); match self.hex_marker { Some(c) => unconsume.push(c), None => (), } input.push_front(unconsume); tokenizer.emit_error(Borrowed("Numeric character reference without digits")); self.finish_none() } fn finish_numeric(&mut self, tokenizer: &mut Tokenizer) -> Status { fn conv(n: u32) -> char { from_u32(n).expect("invalid char missed by error handling cases") } let (c, error) = match self.num { n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true), 0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true), 0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] { Some(c) => (c, true), None => (conv(self.num), true), }, 0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true), n if (n & 0xFFFE) == 0xFFFE => (conv(n), true), n => (conv(n), false), }; if error { let msg = format_if!( tokenizer.opts.exact_errors, "Invalid numeric character reference", "Invalid numeric character reference value 0x{:06X}", self.num ); tokenizer.emit_error(msg); } self.finish_one(c) } #[cfg(feature = "named-entities")] fn do_named( &mut self, tokenizer: &mut Tokenizer, input: &mut BufferQueue, ) -> Status { let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); self.name_buf_mut().push(c); match data::NAMED_ENTITIES.get(&self.name_buf()[..]) { // We have either a full match or a prefix of one. Some(&m) => { if m.0 != 0 { // We have a full match, but there might be a longer one to come. self.name_match = Some(m); self.name_len = self.name_buf().len(); } // Otherwise we just have a prefix match. Progress } // Can't continue the match. None => self.finish_named(tokenizer, input, Some(c)), } } #[cfg(feature = "named-entities")] fn emit_name_error(&mut self, tokenizer: &mut Tokenizer) { let msg = format_if!( tokenizer.opts.exact_errors, "Invalid character reference", "Invalid character reference &{}", self.name_buf() ); tokenizer.emit_error(msg); } fn unconsume_name(&mut self, input: &mut BufferQueue) { input.push_front(self.name_buf_opt.take().unwrap()); } #[cfg(feature = "named-entities")] fn finish_named( &mut self, tokenizer: &mut Tokenizer, input: &mut BufferQueue, end_char: Option, ) -> Status { match self.name_match { None => { match end_char { Some(c) if is_ascii_alnum(c) => { // Keep looking for a semicolon, to determine whether // we emit a parse error. self.state = BogusName; return Progress; } // Check length because &; is not a parse error. Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer), _ => (), } self.unconsume_name(input); self.finish_none() } Some((c1, c2)) => { // We have a complete match, but we may have consumed // additional characters into self.name_buf. Usually // at least one, but several in cases like // // ¬ => match for U+00AC // ¬i => valid prefix for ¬in // ¬it => can't continue match let name_len = self.name_len; assert!(name_len > 0); let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap(); // There might not be a next character after the match, if // we had a full match and then hit EOF. let next_after = if name_len == self.name_buf().len() { None } else { Some(self.name_buf()[name_len..].chars().next().unwrap()) }; // "If the character reference is being consumed as part of an // attribute, and the last character matched is not a U+003B // SEMICOLON character (;), and the next character is either a // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII // character, then, for historical reasons, all the characters // that were matched after the U+0026 AMPERSAND character (&) // must be unconsumed, and nothing is returned. However, if // this next character is in fact a U+003D EQUALS SIGN // character (=), then this is a parse error" let unconsume_all = match (self.addnl_allowed, last_matched, next_after) { (_, ';', _) => false, (Some(_), _, Some('=')) => { tokenizer.emit_error(Borrowed( "Equals sign after character reference in attribute", )); true } (Some(_), _, Some(c)) if is_ascii_alnum(c) => true, _ => { tokenizer.emit_error(Borrowed( "Character reference does not end with semicolon", )); false } }; if unconsume_all { self.unconsume_name(input); self.finish_none() } else { input.push_front(String::from(&self.name_buf()[name_len..])); self.result = Some(CharRef { chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()], num_chars: if c2 == 0 { 1 } else { 2 }, }); Done } } } } fn do_bogus_name( &mut self, tokenizer: &mut Tokenizer, input: &mut BufferQueue, ) -> Status { let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); self.name_buf_mut().push(c); match c { _ if is_ascii_alnum(c) => return Progress, ';' => { #[cfg(feature = "named-entities")] self.emit_name_error(tokenizer); } _ => (), } self.unconsume_name(input); self.finish_none() } pub fn end_of_file( &mut self, tokenizer: &mut Tokenizer, input: &mut BufferQueue, ) { while self.result.is_none() { match self.state { Begin => drop(self.finish_none()), Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)), Numeric(_) | NumericSemicolon => { tokenizer.emit_error(Borrowed("EOF in numeric character reference")); self.finish_numeric(tokenizer); } #[cfg(feature = "named-entities")] Named => drop(self.finish_named(tokenizer, input, None)), BogusName => { self.unconsume_name(input); self.finish_none(); } Octothorpe => { input.push_front(String::from("#")); tokenizer.emit_error(Borrowed("EOF after '#' in character reference")); self.finish_none(); } } } } }