// Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
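// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your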
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::{TokenSink, Tokenizer};
use crate::error::{CharRefError, Error};
use crate::util::buffer_queue::BufferQueue;
use crate::util::str::is_ascii_alnum;
use std::char::from_u32;
use self::State::*;
pub use self::Status::*;
mod data;
//§ tokenizing-character-references
/// The outcome of tokenizing one character reference.
///
/// Within this file `num_chars` is set to 0 (nothing recognized), 1 or 2;
/// the finishing helpers fill unused slots of `chars` with `'\0'`.
pub struct CharRef {
/// The resulting character(s)
pub chars: [char; 2],
/// How many slots in `chars` are valid?
pub num_chars: u8,
}
/// What a single step of the character reference tokenizer reports back.
pub enum Status {
/// Passed to `unwrap_or_return!` at every `peek`/`get_char` call site;
/// presumably returned when no character is available yet (the macro is
/// defined elsewhere -- confirm).
Stuck,
/// The state machine advanced, but no result has been stored yet.
Progress,
/// A result has been stored and can be fetched with `get_result`.
Done,
}
/// States of the character reference state machine driven by `step`.
#[derive(Debug)]
enum State {
/// Initial state: the first character of the reference has not been examined.
Begin,
/// A `#` has been consumed; the next character selects hex or decimal.
Octothorpe,
/// Reading digits of a numeric reference in the given base (10 or 16).
Numeric(u32), // base
/// Digits are complete; an optional terminating `;` is expected.
NumericSemicolon,
/// Matching consumed characters against the named entity table.
#[cfg(feature = "named-entities")]
Named,
/// The name cannot match an entity; alphanumerics are still consumed to
/// find out whether a `;` follows.
BogusName,
}
pub struct CharRefTokenizer {
state: State,
addnl_allowed: Option,
result: Option,
num: u32,
num_too_big: bool,
seen_digit: bool,
hex_marker: Option,
name_buf_opt: Option,
#[cfg(feature = "named-entities")]
name_match: Option<(u32, u32)>,
#[cfg(feature = "named-entities")]
name_len: usize,
}
impl CharRefTokenizer {
// NB: We assume that we have an additional allowed character iff we're
// tokenizing in an attribute value.
pub fn new(addnl_allowed: Option) -> CharRefTokenizer {
CharRefTokenizer {
state: Begin,
addnl_allowed,
result: None,
num: 0,
num_too_big: false,
seen_digit: false,
hex_marker: None,
name_buf_opt: None,
#[cfg(feature = "named-entities")]
name_match: None,
#[cfg(feature = "named-entities")]
name_len: 0,
}
}
// A CharRefTokenizer can only tokenize one character reference,
// so this method consumes the tokenizer.
pub fn get_result(self) -> CharRef {
self.result.expect("get_result called before done")
}
#[cfg(feature = "named-entities")]
fn name_buf(&self) -> &str {
self.name_buf_opt
.as_ref()
.expect("name_buf missing in named character reference")
}
fn name_buf_mut(&mut self) -> &mut String {
self.name_buf_opt
.as_mut()
.expect("name_buf missing in named character reference")
}
fn finish_none(&mut self) -> Status {
self.result = Some(CharRef {
chars: ['\0', '\0'],
num_chars: 0,
});
Done
}
fn finish_one(&mut self, c: char) -> Status {
self.result = Some(CharRef {
chars: [c, '\0'],
num_chars: 1,
});
Done
}
}
impl CharRefTokenizer {
/// Advances the state machine by one step.
///
/// Once a result has been stored this is a no-op that reports `Done`;
/// otherwise the handler for the current state runs and its status is
/// returned unchanged.
///
/// NOTE(review): `TokenSink` is imported at the top of this file but is not
/// referenced by this signature; if `Tokenizer` is generic over its sink, a
/// `<Sink: TokenSink>` parameter is presumably missing here -- confirm
/// against the definition in `super`.
pub fn step(
    &mut self,
    tokenizer: &mut Tokenizer,
    input: &mut BufferQueue,
) -> Status {
    if self.result.is_none() {
        match self.state {
            Begin => self.do_begin(tokenizer, input),
            Octothorpe => self.do_octothorpe(tokenizer, input),
            Numeric(base) => self.do_numeric(tokenizer, input, base),
            NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
            #[cfg(feature = "named-entities")]
            Named => self.do_named(tokenizer, input),
            BogusName => self.do_bogus_name(tokenizer, input),
        }
    } else {
        Done
    }
}
/// Handles the `Begin` state by peeking at the next character.
///
/// * Tab, LF, FF, space, `<`, `&`, or the additional allowed character:
///   stores an empty result without consuming anything.
/// * `#`: consumes it and moves to `Octothorpe`.
/// * Anything else: creates an empty name buffer and moves to `Named`
///   (or `BogusName` when the `named-entities` feature is disabled).
fn do_begin(
    &mut self,
    tokenizer: &mut Tokenizer,
    input: &mut BufferQueue,
) -> Status {
    let next = unwrap_or_return!(tokenizer.peek(input), Stuck);

    // These characters cannot start a character reference.
    let not_a_reference = match next {
        '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => true,
        other => Some(other) == self.addnl_allowed,
    };
    if not_a_reference {
        return self.finish_none();
    }

    if next == '#' {
        tokenizer.discard_char(input);
        self.state = Octothorpe;
        return Progress;
    }

    // Without the entity table there is nothing to match against, so go
    // straight to the bogus-name scan.
    #[cfg(feature = "named-entities")]
    let name_state = Named;
    #[cfg(not(feature = "named-entities"))]
    let name_state = BogusName;

    self.state = name_state;
    self.name_buf_opt = Some(String::new());
    Progress
}
/// Handles the `Octothorpe` state: selects the numeric base.
///
/// An `x`/`X` is consumed and remembered in `hex_marker` (base 16); any
/// other character is left in the input and base 10 is selected.
fn do_octothorpe(
    &mut self,
    tokenizer: &mut Tokenizer,
    input: &mut BufferQueue,
) -> Status {
    let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
    let is_hex = c == 'x' || c == 'X';

    if is_hex {
        // Only the marker itself is consumed here; digits are read later.
        tokenizer.discard_char(input);
    }
    self.hex_marker = if is_hex { Some(c) } else { None };
    self.state = Numeric(if is_hex { 16 } else { 10 });
    Progress
}
/// Handles the `Numeric` state: consumes one digit in `base`, if present.
///
/// A digit is folded into `num`. A non-digit ends the digit run: with no
/// digits seen the `#`/marker is unconsumed, otherwise the state moves to
/// `NumericSemicolon`. The non-digit character itself is not consumed.
fn do_numeric(
    &mut self,
    tokenizer: &mut Tokenizer,
    input: &mut BufferQueue,
    base: u32,
) -> Status {
    let c = unwrap_or_return!(tokenizer.peek(input), Stuck);

    if let Some(digit) = c.to_digit(base) {
        tokenizer.discard_char(input);
        let scaled = self.num.wrapping_mul(base);
        if scaled > 0x10FFFF {
            // We might overflow, and the character is definitely invalid.
            // We still parse digits and semicolon, but don't use the result.
            self.num_too_big = true;
        }
        self.num = scaled.wrapping_add(digit);
        self.seen_digit = true;
        return Progress;
    }

    if self.seen_digit {
        self.state = NumericSemicolon;
        Progress
    } else {
        self.unconsume_numeric(tokenizer, input)
    }
}
fn do_numeric_semicolon(
&mut self,
tokenizer: &mut Tokenizer,
input: &mut BufferQueue,
) -> Status {
match unwrap_or_return!(tokenizer.peek(input), Stuck) {
';' => tokenizer.discard_char(input),
_ => tokenizer.emit_error(Error::CharRef(CharRefError::MissingSemicolon)),
};
self.finish_numeric(tokenizer)
}
/// Abandons a numeric reference that had no digits.
///
/// Pushes `#` (plus the hex marker, if one was consumed) back onto the front
/// of `input`, reports the error, and stores an empty result.
fn unconsume_numeric(
    &mut self,
    tokenizer: &mut Tokenizer,
    input: &mut BufferQueue,
) -> Status {
    let mut pushed_back = String::from('#');
    if let Some(marker) = self.hex_marker {
        pushed_back.push(marker);
    }
    input.push_front(pushed_back);
    tokenizer.emit_error(Error::CharRef(CharRefError::NumericCharRefWithoutDigits));
    self.finish_none()
}
/// Converts the accumulated number into the resulting character.
///
/// * Out of range (or flagged too big), zero, and surrogates become U+FFFD.
/// * `0x80..=0x9F` is mapped through `data::C1_REPLACEMENTS`, falling back to
///   the code point itself where the table has no entry.
/// * The listed control characters and noncharacters are kept as-is.
///
/// All of the above report `NumericCharRefInvalid`; every other value is
/// converted without an error.
fn finish_numeric(&mut self, tokenizer: &mut Tokenizer) -> Status {
    let to_char =
        |n: u32| from_u32(n).expect("invalid char missed by error handling cases");

    let code = self.num;
    let (replacement, is_error) = if self.num_too_big || code > 0x10FFFF {
        ('\u{fffd}', true)
    } else {
        match code {
            0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),
            0x80..=0x9F => {
                let mapped = data::C1_REPLACEMENTS[(code - 0x80) as usize];
                (mapped.unwrap_or_else(|| to_char(code)), true)
            }
            0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => {
                (to_char(code), true)
            }
            // Low 16 bits of 0xFFFE or 0xFFFF.
            n if (n & 0xFFFE) == 0xFFFE => (to_char(n), true),
            n => (to_char(n), false),
        }
    };

    if is_error {
        tokenizer.emit_error(Error::CharRef(CharRefError::NumericCharRefInvalid(
            code,
        )));
    }
    self.finish_one(replacement)
}
/// Handles the `Named` state: consumes one character and looks the name
/// buffer up in `data::NAMED_ENTITIES`.
///
/// A hit whose first code point is non-zero is a full match and is recorded
/// together with the buffer length; a hit with a zero first code point is
/// only a prefix of longer names. A miss ends matching via `finish_named`.
#[cfg(feature = "named-entities")]
fn do_named(
    &mut self,
    tokenizer: &mut Tokenizer,
    input: &mut BufferQueue,
) -> Status {
    let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
    self.name_buf_mut().push(c);

    if let Some(&entry) = data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
        // Either a full match or a prefix of one.
        let is_full_match = entry.0 != 0;
        if is_full_match {
            // Remember it, but keep going: a longer name may still match.
            self.name_len = self.name_buf().len();
            self.name_match = Some(entry);
        }
        Progress
    } else {
        // Can't continue the match.
        self.finish_named(tokenizer, input, Some(c))
    }
}
/// Reports an invalid named character reference to the tokenizer.
#[cfg(feature = "named-entities")]
fn emit_name_error(&mut self, tokenizer: &mut Tokenizer) {
    let error = Error::CharRef(CharRefError::InvalidNamedCharRef);
    tokenizer.emit_error(error);
}
/// Pushes every character consumed for the name back onto the front of
/// `input`, leaving the name buffer empty (`None`).
///
/// Panics if the name buffer has not been created.
fn unconsume_name(&mut self, input: &mut BufferQueue) {
    let consumed = self.name_buf_opt.take().unwrap();
    input.push_front(consumed);
}
/// Concludes a named character reference once the match cannot be extended
/// (`end_char` is the character that broke the match) or the input has ended
/// (`end_char` is `None`).
///
/// * No full match recorded: an alphanumeric `end_char` switches to
///   `BogusName` and reports `Progress`. Otherwise everything consumed is
///   pushed back onto `input` and an empty result is stored; a `;` preceded
///   by at least one other character is reported as an invalid reference.
/// * Full match recorded: characters consumed beyond the match are pushed
///   back and the matched code point(s) become the result, unless the
///   attribute-value rule quoted below requires unconsuming everything.
#[cfg(feature = "named-entities")]
fn finish_named(
    &mut self,
    tokenizer: &mut Tokenizer,
    input: &mut BufferQueue,
    end_char: Option<char>,
) -> Status {
    match self.name_match {
        None => {
            match end_char {
                Some(c) if is_ascii_alnum(c) => {
                    // Keep looking for a semicolon, to determine whether
                    // we emit a parse error.
                    self.state = BogusName;
                    return Progress;
                }
                // Check length because &; is not a parse error.
                Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),
                _ => (),
            }
            self.unconsume_name(input);
            self.finish_none()
        }
        Some((c1, c2)) => {
            // We have a complete match, but we may have consumed
            // additional characters into self.name_buf. Usually
            // at least one, but several in cases like
            //
            //     &not    => match for U+00AC
            //     &noti   => valid prefix for &notin
            //     &notit  => can't continue match
            let name_len = self.name_len;
            assert!(name_len > 0);
            let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();
            // There might not be a next character after the match, if
            // we had a full match and then hit EOF.
            let next_after = if name_len == self.name_buf().len() {
                None
            } else {
                Some(self.name_buf()[name_len..].chars().next().unwrap())
            };
            // "If the character reference is being consumed as part of an
            // attribute, and the last character matched is not a U+003B
            // SEMICOLON character (;), and the next character is either a
            // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
            // character, then, for historical reasons, all the characters
            // that were matched after the U+0026 AMPERSAND character (&)
            // must be unconsumed, and nothing is returned. However, if
            // this next character is in fact a U+003D EQUALS SIGN
            // character (=), then this is a parse error"
            let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
                (_, ';', _) => false,
                (Some(_), _, Some('=')) => {
                    tokenizer
                        .emit_error(Error::CharRef(CharRefError::EqualsSignAfterCharRefInAttr));
                    true
                }
                (Some(_), _, Some(c)) if is_ascii_alnum(c) => true,
                _ => {
                    tokenizer.emit_error(Error::CharRef(CharRefError::MissingSemicolon));
                    false
                }
            };
            if unconsume_all {
                self.unconsume_name(input);
                self.finish_none()
            } else {
                // Give back only what was read past the end of the match.
                input.push_front(String::from(&self.name_buf()[name_len..]));
                self.result = Some(CharRef {
                    chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
                    // A zero second code point means a single-character entity.
                    num_chars: if c2 == 0 { 1 } else { 2 },
                });
                Done
            }
        }
    }
}
/// Handles the `BogusName` state: consumes one character of a name that
/// cannot match any entity.
///
/// Alphanumerics keep the scan going. Any other character ends it: all
/// consumed characters are pushed back and an empty result is stored. If the
/// terminator is `;` an invalid-name error is reported first (only when the
/// `named-entities` feature is enabled).
fn do_bogus_name(
    &mut self,
    tokenizer: &mut Tokenizer,
    input: &mut BufferQueue,
) -> Status {
    let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
    self.name_buf_mut().push(c);

    if is_ascii_alnum(c) {
        return Progress;
    }

    if c == ';' {
        #[cfg(feature = "named-entities")]
        self.emit_name_error(tokenizer);
    }

    self.unconsume_name(input);
    self.finish_none()
}
/// Forces a result when the input has ended in the middle of a reference.
///
/// Loops until a result is stored, finishing according to the current
/// state: pending `#`/marker/name characters are pushed back onto `input`
/// where no reference was recognized, and the matching EOF error is reported
/// for the numeric states.
pub fn end_of_file(
    &mut self,
    tokenizer: &mut Tokenizer,
    input: &mut BufferQueue,
) {
    while self.result.is_none() {
        match self.state {
            Begin => {
                let _ = self.finish_none();
            }
            Octothorpe => {
                input.push_front(String::from("#"));
                tokenizer.emit_error(Error::CharRef(CharRefError::EofAfterNumberSign));
                let _ = self.finish_none();
            }
            // No digits yet: nothing to convert, so give the prefix back.
            Numeric(_) if !self.seen_digit => {
                let _ = self.unconsume_numeric(tokenizer, input);
            }
            Numeric(_) | NumericSemicolon => {
                tokenizer.emit_error(Error::CharRef(CharRefError::EofInNumericCharRef));
                let _ = self.finish_numeric(tokenizer);
            }
            #[cfg(feature = "named-entities")]
            Named => {
                let _ = self.finish_named(tokenizer, input, None);
            }
            BogusName => {
                self.unconsume_name(input);
                let _ = self.finish_none();
            }
        }
    }
}
}