about summary refs log tree commit diff
path: root/src/tokenizer/char_ref
diff options
context:
space:
mode:
authorMartin Fischer <martin@push-f.com>2021-04-08 08:42:01 +0200
committerMartin Fischer <martin@push-f.com>2021-04-08 15:40:37 +0200
commit57e7eefcbe6fb8c3dc4b01c707be9de4c34963a7 (patch)
tree6a9d296389bf3023396592c8514ed6712e011c7f /src/tokenizer/char_ref
import https://github.com/servo/html5ever
commit d1206daa740305f55a5fa159e43eb33afc359cb4
Diffstat (limited to 'src/tokenizer/char_ref')
-rw-r--r--src/tokenizer/char_ref/mod.rs449
1 files changed, 449 insertions, 0 deletions
diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs
new file mode 100644
index 0000000..a52485d
--- /dev/null
+++ b/src/tokenizer/char_ref/mod.rs
@@ -0,0 +1,449 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use super::{TokenSink, Tokenizer};
+use crate::buffer_queue::BufferQueue;
+use crate::data;
+use crate::tendril::StrTendril;
+use crate::util::str::is_ascii_alnum;
+
+use log::debug;
+use mac::format_if;
+use std::borrow::Cow::Borrowed;
+use std::char::from_u32;
+
+use self::State::*;
+pub use self::Status::*;
+
+//§ tokenizing-character-references
+pub struct CharRef {
+ /// The resulting character(s)
+ pub chars: [char; 2],
+
+ /// How many slots in `chars` are valid?
+ pub num_chars: u8,
+}
+
+pub enum Status {
+ Stuck,
+ Progress,
+ Done,
+}
+
+#[derive(Debug)]
+enum State {
+ Begin,
+ Octothorpe,
+ Numeric(u32), // base
+ NumericSemicolon,
+ Named,
+ BogusName,
+}
+
+pub struct CharRefTokenizer {
+ state: State,
+ addnl_allowed: Option<char>,
+ result: Option<CharRef>,
+
+ num: u32,
+ num_too_big: bool,
+ seen_digit: bool,
+ hex_marker: Option<char>,
+
+ name_buf_opt: Option<StrTendril>,
+ name_match: Option<(u32, u32)>,
+ name_len: usize,
+}
+
+impl CharRefTokenizer {
+ // NB: We assume that we have an additional allowed character iff we're
+ // tokenizing in an attribute value.
+ pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
+ CharRefTokenizer {
+ state: Begin,
+ addnl_allowed,
+ result: None,
+ num: 0,
+ num_too_big: false,
+ seen_digit: false,
+ hex_marker: None,
+ name_buf_opt: None,
+ name_match: None,
+ name_len: 0,
+ }
+ }
+
+ // A CharRefTokenizer can only tokenize one character reference,
+ // so this method consumes the tokenizer.
+ pub fn get_result(self) -> CharRef {
+ self.result.expect("get_result called before done")
+ }
+
+ fn name_buf(&self) -> &StrTendril {
+ self.name_buf_opt
+ .as_ref()
+ .expect("name_buf missing in named character reference")
+ }
+
+ fn name_buf_mut(&mut self) -> &mut StrTendril {
+ self.name_buf_opt
+ .as_mut()
+ .expect("name_buf missing in named character reference")
+ }
+
+ fn finish_none(&mut self) -> Status {
+ self.result = Some(CharRef {
+ chars: ['\0', '\0'],
+ num_chars: 0,
+ });
+ Done
+ }
+
+ fn finish_one(&mut self, c: char) -> Status {
+ self.result = Some(CharRef {
+ chars: [c, '\0'],
+ num_chars: 1,
+ });
+ Done
+ }
+}
+
+impl CharRefTokenizer {
+ pub fn step<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ ) -> Status {
+ if self.result.is_some() {
+ return Done;
+ }
+
+ debug!("char ref tokenizer stepping in state {:?}", self.state);
+ match self.state {
+ Begin => self.do_begin(tokenizer, input),
+ Octothorpe => self.do_octothorpe(tokenizer, input),
+ Numeric(base) => self.do_numeric(tokenizer, input, base),
+ NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
+ Named => self.do_named(tokenizer, input),
+ BogusName => self.do_bogus_name(tokenizer, input),
+ }
+ }
+
+ fn do_begin<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ ) -> Status {
+ match unwrap_or_return!(tokenizer.peek(input), Stuck) {
+ '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(),
+ c if Some(c) == self.addnl_allowed => self.finish_none(),
+
+ '#' => {
+ tokenizer.discard_char(input);
+ self.state = Octothorpe;
+ Progress
+ },
+
+ _ => {
+ self.state = Named;
+ self.name_buf_opt = Some(StrTendril::new());
+ Progress
+ },
+ }
+ }
+
+ fn do_octothorpe<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ ) -> Status {
+ let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
+ match c {
+ 'x' | 'X' => {
+ tokenizer.discard_char(input);
+ self.hex_marker = Some(c);
+ self.state = Numeric(16);
+ },
+
+ _ => {
+ self.hex_marker = None;
+ self.state = Numeric(10);
+ },
+ }
+ Progress
+ }
+
+ fn do_numeric<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ base: u32,
+ ) -> Status {
+ let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
+ match c.to_digit(base) {
+ Some(n) => {
+ tokenizer.discard_char(input);
+ self.num = self.num.wrapping_mul(base);
+ if self.num > 0x10FFFF {
+ // We might overflow, and the character is definitely invalid.
+ // We still parse digits and semicolon, but don't use the result.
+ self.num_too_big = true;
+ }
+ self.num = self.num.wrapping_add(n);
+ self.seen_digit = true;
+ Progress
+ },
+
+ None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
+
+ None => {
+ self.state = NumericSemicolon;
+ Progress
+ },
+ }
+ }
+
+ fn do_numeric_semicolon<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ ) -> Status {
+ match unwrap_or_return!(tokenizer.peek(input), Stuck) {
+ ';' => tokenizer.discard_char(input),
+ _ => tokenizer.emit_error(Borrowed(
+ "Semicolon missing after numeric character reference",
+ )),
+ };
+ self.finish_numeric(tokenizer)
+ }
+
+ fn unconsume_numeric<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ ) -> Status {
+ let mut unconsume = StrTendril::from_char('#');
+ match self.hex_marker {
+ Some(c) => unconsume.push_char(c),
+ None => (),
+ }
+
+ input.push_front(unconsume);
+ tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
+ self.finish_none()
+ }
+
+ fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status {
+ fn conv(n: u32) -> char {
+ from_u32(n).expect("invalid char missed by error handling cases")
+ }
+
+ let (c, error) = match self.num {
+ n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
+ 0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),
+
+ 0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
+ Some(c) => (c, true),
+ None => (conv(self.num), true),
+ },
+
+ 0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),
+
+ n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),
+
+ n => (conv(n), false),
+ };
+
+ if error {
+ let msg = format_if!(
+ tokenizer.opts.exact_errors,
+ "Invalid numeric character reference",
+ "Invalid numeric character reference value 0x{:06X}",
+ self.num
+ );
+ tokenizer.emit_error(msg);
+ }
+
+ self.finish_one(c)
+ }
+
+ fn do_named<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ ) -> Status {
+ let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
+ self.name_buf_mut().push_char(c);
+ match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
+ // We have either a full match or a prefix of one.
+ Some(&m) => {
+ if m.0 != 0 {
+ // We have a full match, but there might be a longer one to come.
+ self.name_match = Some(m);
+ self.name_len = self.name_buf().len();
+ }
+ // Otherwise we just have a prefix match.
+ Progress
+ },
+
+ // Can't continue the match.
+ None => self.finish_named(tokenizer, input, Some(c)),
+ }
+ }
+
+ fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) {
+ let msg = format_if!(
+ tokenizer.opts.exact_errors,
+ "Invalid character reference",
+ "Invalid character reference &{}",
+ self.name_buf()
+ );
+ tokenizer.emit_error(msg);
+ }
+
+ fn unconsume_name(&mut self, input: &mut BufferQueue) {
+ input.push_front(self.name_buf_opt.take().unwrap());
+ }
+
+ fn finish_named<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ end_char: Option<char>,
+ ) -> Status {
+ match self.name_match {
+ None => {
+ match end_char {
+ Some(c) if is_ascii_alnum(c) => {
+ // Keep looking for a semicolon, to determine whether
+ // we emit a parse error.
+ self.state = BogusName;
+ return Progress;
+ },
+
+ // Check length because &; is not a parse error.
+ Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),
+
+ _ => (),
+ }
+ self.unconsume_name(input);
+ self.finish_none()
+ },
+
+ Some((c1, c2)) => {
+ // We have a complete match, but we may have consumed
+ // additional characters into self.name_buf. Usually
+ // at least one, but several in cases like
+ //
+ // &not => match for U+00AC
+ // &noti => valid prefix for &notin
+ // &notit => can't continue match
+
+ let name_len = self.name_len;
+ assert!(name_len > 0);
+ let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();
+
+ // There might not be a next character after the match, if
+ // we had a full match and then hit EOF.
+ let next_after = if name_len == self.name_buf().len() {
+ None
+ } else {
+ Some(self.name_buf()[name_len..].chars().next().unwrap())
+ };
+
+ // "If the character reference is being consumed as part of an
+ // attribute, and the last character matched is not a U+003B
+ // SEMICOLON character (;), and the next character is either a
+ // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
+ // character, then, for historical reasons, all the characters
+ // that were matched after the U+0026 AMPERSAND character (&)
+ // must be unconsumed, and nothing is returned. However, if
+ // this next character is in fact a U+003D EQUALS SIGN
+ // character (=), then this is a parse error"
+
+ let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
+ (_, ';', _) => false,
+ (Some(_), _, Some('=')) => {
+ tokenizer.emit_error(Borrowed(
+ "Equals sign after character reference in attribute",
+ ));
+ true
+ },
+ (Some(_), _, Some(c)) if is_ascii_alnum(c) => true,
+ _ => {
+ tokenizer.emit_error(Borrowed(
+ "Character reference does not end with semicolon",
+ ));
+ false
+ },
+ };
+
+ if unconsume_all {
+ self.unconsume_name(input);
+ self.finish_none()
+ } else {
+ input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
+ self.result = Some(CharRef {
+ chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
+ num_chars: if c2 == 0 { 1 } else { 2 },
+ });
+ Done
+ }
+ },
+ }
+ }
+
+ fn do_bogus_name<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ ) -> Status {
+ let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
+ self.name_buf_mut().push_char(c);
+ match c {
+ _ if is_ascii_alnum(c) => return Progress,
+ ';' => self.emit_name_error(tokenizer),
+ _ => (),
+ }
+ self.unconsume_name(input);
+ self.finish_none()
+ }
+
+ pub fn end_of_file<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ ) {
+ while self.result.is_none() {
+ match self.state {
+ Begin => drop(self.finish_none()),
+
+ Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),
+
+ Numeric(_) | NumericSemicolon => {
+ tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
+ self.finish_numeric(tokenizer);
+ },
+
+ Named => drop(self.finish_named(tokenizer, input, None)),
+
+ BogusName => {
+ self.unconsume_name(input);
+ self.finish_none();
+ },
+
+ Octothorpe => {
+ input.push_front(StrTendril::from_slice("#"));
+ tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
+ self.finish_none();
+ },
+ }
+ }
+ }
+}