diff options
Diffstat (limited to 'src/tokenizer/char_ref')
| -rw-r--r-- | src/tokenizer/char_ref/mod.rs | 449 | 
1 files changed, 449 insertions, 0 deletions
| diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs new file mode 100644 index 0000000..a52485d --- /dev/null +++ b/src/tokenizer/char_ref/mod.rs @@ -0,0 +1,449 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use super::{TokenSink, Tokenizer}; +use crate::buffer_queue::BufferQueue; +use crate::data; +use crate::tendril::StrTendril; +use crate::util::str::is_ascii_alnum; + +use log::debug; +use mac::format_if; +use std::borrow::Cow::Borrowed; +use std::char::from_u32; + +use self::State::*; +pub use self::Status::*; + +//ยง tokenizing-character-references +pub struct CharRef { +    /// The resulting character(s) +    pub chars: [char; 2], + +    /// How many slots in `chars` are valid? +    pub num_chars: u8, +} + +pub enum Status { +    Stuck, +    Progress, +    Done, +} + +#[derive(Debug)] +enum State { +    Begin, +    Octothorpe, +    Numeric(u32), // base +    NumericSemicolon, +    Named, +    BogusName, +} + +pub struct CharRefTokenizer { +    state: State, +    addnl_allowed: Option<char>, +    result: Option<CharRef>, + +    num: u32, +    num_too_big: bool, +    seen_digit: bool, +    hex_marker: Option<char>, + +    name_buf_opt: Option<StrTendril>, +    name_match: Option<(u32, u32)>, +    name_len: usize, +} + +impl CharRefTokenizer { +    // NB: We assume that we have an additional allowed character iff we're +    // tokenizing in an attribute value. +    pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer { +        CharRefTokenizer { +            state: Begin, +            addnl_allowed, +            result: None, +            num: 0, +            num_too_big: false, +            seen_digit: false, +            hex_marker: None, +            name_buf_opt: None, +            name_match: None, +            name_len: 0, +        } +    } + +    // A CharRefTokenizer can only tokenize one character reference, +    // so this method consumes the tokenizer. +    pub fn get_result(self) -> CharRef { +        self.result.expect("get_result called before done") +    } + +    fn name_buf(&self) -> &StrTendril { +        self.name_buf_opt +            .as_ref() +            .expect("name_buf missing in named character reference") +    } + +    fn name_buf_mut(&mut self) -> &mut StrTendril { +        self.name_buf_opt +            .as_mut() +            .expect("name_buf missing in named character reference") +    } + +    fn finish_none(&mut self) -> Status { +        self.result = Some(CharRef { +            chars: ['\0', '\0'], +            num_chars: 0, +        }); +        Done +    } + +    fn finish_one(&mut self, c: char) -> Status { +        self.result = Some(CharRef { +            chars: [c, '\0'], +            num_chars: 1, +        }); +        Done +    } +} + +impl CharRefTokenizer { +    pub fn step<Sink: TokenSink>( +        &mut self, +        tokenizer: &mut Tokenizer<Sink>, +        input: &mut BufferQueue, +    ) -> Status { +        if self.result.is_some() { +            return Done; +        } + +        debug!("char ref tokenizer stepping in state {:?}", self.state); +        match self.state { +            Begin => self.do_begin(tokenizer, input), +            Octothorpe => self.do_octothorpe(tokenizer, input), +            Numeric(base) => self.do_numeric(tokenizer, input, base), +            NumericSemicolon => self.do_numeric_semicolon(tokenizer, input), +            Named => self.do_named(tokenizer, input), +            BogusName => self.do_bogus_name(tokenizer, input), +        } +    } + +    fn do_begin<Sink: TokenSink>( +        &mut self, +        tokenizer: &mut Tokenizer<Sink>, +        input: &mut BufferQueue, +    ) -> Status { +        match unwrap_or_return!(tokenizer.peek(input), Stuck) { +            '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(), +            c if Some(c) == self.addnl_allowed => self.finish_none(), + +            '#' => { +                tokenizer.discard_char(input); +                self.state = Octothorpe; +                Progress +            }, + +            _ => { +                self.state = Named; +                self.name_buf_opt = Some(StrTendril::new()); +                Progress +            }, +        } +    } + +    fn do_octothorpe<Sink: TokenSink>( +        &mut self, +        tokenizer: &mut Tokenizer<Sink>, +        input: &mut BufferQueue, +    ) -> Status { +        let c = unwrap_or_return!(tokenizer.peek(input), Stuck); +        match c { +            'x' | 'X' => { +                tokenizer.discard_char(input); +                self.hex_marker = Some(c); +                self.state = Numeric(16); +            }, + +            _ => { +                self.hex_marker = None; +                self.state = Numeric(10); +            }, +        } +        Progress +    } + +    fn do_numeric<Sink: TokenSink>( +        &mut self, +        tokenizer: &mut Tokenizer<Sink>, +        input: &mut BufferQueue, +        base: u32, +    ) -> Status { +        let c = unwrap_or_return!(tokenizer.peek(input), Stuck); +        match c.to_digit(base) { +            Some(n) => { +                tokenizer.discard_char(input); +                self.num = self.num.wrapping_mul(base); +                if self.num > 0x10FFFF { +                    // We might overflow, and the character is definitely invalid. +                    // We still parse digits and semicolon, but don't use the result. +                    self.num_too_big = true; +                } +                self.num = self.num.wrapping_add(n); +                self.seen_digit = true; +                Progress +            }, + +            None if !self.seen_digit => self.unconsume_numeric(tokenizer, input), + +            None => { +                self.state = NumericSemicolon; +                Progress +            }, +        } +    } + +    fn do_numeric_semicolon<Sink: TokenSink>( +        &mut self, +        tokenizer: &mut Tokenizer<Sink>, +        input: &mut BufferQueue, +    ) -> Status { +        match unwrap_or_return!(tokenizer.peek(input), Stuck) { +            ';' => tokenizer.discard_char(input), +            _ => tokenizer.emit_error(Borrowed( +                "Semicolon missing after numeric character reference", +            )), +        }; +        self.finish_numeric(tokenizer) +    } + +    fn unconsume_numeric<Sink: TokenSink>( +        &mut self, +        tokenizer: &mut Tokenizer<Sink>, +        input: &mut BufferQueue, +    ) -> Status { +        let mut unconsume = StrTendril::from_char('#'); +        match self.hex_marker { +            Some(c) => unconsume.push_char(c), +            None => (), +        } + +        input.push_front(unconsume); +        tokenizer.emit_error(Borrowed("Numeric character reference without digits")); +        self.finish_none() +    } + +    fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status { +        fn conv(n: u32) -> char { +            from_u32(n).expect("invalid char missed by error handling cases") +        } + +        let (c, error) = match self.num { +            n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true), +            0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true), + +            0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] { +                Some(c) => (c, true), +                None => (conv(self.num), true), +            }, + +            0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true), + +            n if (n & 0xFFFE) == 0xFFFE => (conv(n), true), + +            n => (conv(n), false), +        }; + +        if error { +            let msg = format_if!( +                tokenizer.opts.exact_errors, +                "Invalid numeric character reference", +                "Invalid numeric character reference value 0x{:06X}", +                self.num +            ); +            tokenizer.emit_error(msg); +        } + +        self.finish_one(c) +    } + +    fn do_named<Sink: TokenSink>( +        &mut self, +        tokenizer: &mut Tokenizer<Sink>, +        input: &mut BufferQueue, +    ) -> Status { +        let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); +        self.name_buf_mut().push_char(c); +        match data::NAMED_ENTITIES.get(&self.name_buf()[..]) { +            // We have either a full match or a prefix of one. +            Some(&m) => { +                if m.0 != 0 { +                    // We have a full match, but there might be a longer one to come. +                    self.name_match = Some(m); +                    self.name_len = self.name_buf().len(); +                } +                // Otherwise we just have a prefix match. +                Progress +            }, + +            // Can't continue the match. +            None => self.finish_named(tokenizer, input, Some(c)), +        } +    } + +    fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) { +        let msg = format_if!( +            tokenizer.opts.exact_errors, +            "Invalid character reference", +            "Invalid character reference &{}", +            self.name_buf() +        ); +        tokenizer.emit_error(msg); +    } + +    fn unconsume_name(&mut self, input: &mut BufferQueue) { +        input.push_front(self.name_buf_opt.take().unwrap()); +    } + +    fn finish_named<Sink: TokenSink>( +        &mut self, +        tokenizer: &mut Tokenizer<Sink>, +        input: &mut BufferQueue, +        end_char: Option<char>, +    ) -> Status { +        match self.name_match { +            None => { +                match end_char { +                    Some(c) if is_ascii_alnum(c) => { +                        // Keep looking for a semicolon, to determine whether +                        // we emit a parse error. +                        self.state = BogusName; +                        return Progress; +                    }, + +                    // Check length because &; is not a parse error. +                    Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer), + +                    _ => (), +                } +                self.unconsume_name(input); +                self.finish_none() +            }, + +            Some((c1, c2)) => { +                // We have a complete match, but we may have consumed +                // additional characters into self.name_buf.  Usually +                // at least one, but several in cases like +                // +                //     ¬    => match for U+00AC +                //     ¬i   => valid prefix for ¬in +                //     ¬it  => can't continue match + +                let name_len = self.name_len; +                assert!(name_len > 0); +                let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap(); + +                // There might not be a next character after the match, if +                // we had a full match and then hit EOF. +                let next_after = if name_len == self.name_buf().len() { +                    None +                } else { +                    Some(self.name_buf()[name_len..].chars().next().unwrap()) +                }; + +                // "If the character reference is being consumed as part of an +                // attribute, and the last character matched is not a U+003B +                // SEMICOLON character (;), and the next character is either a +                // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII +                // character, then, for historical reasons, all the characters +                // that were matched after the U+0026 AMPERSAND character (&) +                // must be unconsumed, and nothing is returned. However, if +                // this next character is in fact a U+003D EQUALS SIGN +                // character (=), then this is a parse error" + +                let unconsume_all = match (self.addnl_allowed, last_matched, next_after) { +                    (_, ';', _) => false, +                    (Some(_), _, Some('=')) => { +                        tokenizer.emit_error(Borrowed( +                            "Equals sign after character reference in attribute", +                        )); +                        true +                    }, +                    (Some(_), _, Some(c)) if is_ascii_alnum(c) => true, +                    _ => { +                        tokenizer.emit_error(Borrowed( +                            "Character reference does not end with semicolon", +                        )); +                        false +                    }, +                }; + +                if unconsume_all { +                    self.unconsume_name(input); +                    self.finish_none() +                } else { +                    input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..])); +                    self.result = Some(CharRef { +                        chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()], +                        num_chars: if c2 == 0 { 1 } else { 2 }, +                    }); +                    Done +                } +            }, +        } +    } + +    fn do_bogus_name<Sink: TokenSink>( +        &mut self, +        tokenizer: &mut Tokenizer<Sink>, +        input: &mut BufferQueue, +    ) -> Status { +        let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); +        self.name_buf_mut().push_char(c); +        match c { +            _ if is_ascii_alnum(c) => return Progress, +            ';' => self.emit_name_error(tokenizer), +            _ => (), +        } +        self.unconsume_name(input); +        self.finish_none() +    } + +    pub fn end_of_file<Sink: TokenSink>( +        &mut self, +        tokenizer: &mut Tokenizer<Sink>, +        input: &mut BufferQueue, +    ) { +        while self.result.is_none() { +            match self.state { +                Begin => drop(self.finish_none()), + +                Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)), + +                Numeric(_) | NumericSemicolon => { +                    tokenizer.emit_error(Borrowed("EOF in numeric character reference")); +                    self.finish_numeric(tokenizer); +                }, + +                Named => drop(self.finish_named(tokenizer, input, None)), + +                BogusName => { +                    self.unconsume_name(input); +                    self.finish_none(); +                }, + +                Octothorpe => { +                    input.push_front(StrTendril::from_slice("#")); +                    tokenizer.emit_error(Borrowed("EOF after '#' in character reference")); +                    self.finish_none(); +                }, +            } +        } +    } +} | 
