diff options
author | Markus Unterwaditzer <markus-honeypot@unterwaditzer.net> | 2021-11-27 01:08:48 +0100 |
---|---|---|
committer | Markus Unterwaditzer <markus-honeypot@unterwaditzer.net> | 2021-11-27 01:10:09 +0100 |
commit | 96808d0d940e1580cf86e433d0c844e943157e0d (patch) | |
tree | 4e4cabd0431dc2b465fad1b68ae54c27358dcd4e /src/tokenizer.rs | |
parent | 8c6e4ccbb3fc6830239429c994468bc1ccd6832a (diff) |
split up match-arms and tokenizer to isolate some tokenizer-internal state
purpose: don't want to expose self.to_reconsume to the consume() method
Diffstat (limited to 'src/tokenizer.rs')
-rw-r--r-- | src/tokenizer.rs | 244 |
1 files changed, 244 insertions, 0 deletions
diff --git a/src/tokenizer.rs b/src/tokenizer.rs new file mode 100644 index 0000000..d7e60ac --- /dev/null +++ b/src/tokenizer.rs @@ -0,0 +1,244 @@ +use crate::machine; +use crate::utils::{control_pat, noncharacter_pat, surrogate_pat, ControlToken, State}; +use crate::{DefaultEmitter, Emitter, Error, Never, Readable, Reader}; + +// this is a stack that can hold 0 to 2 Ts +#[derive(Debug, Default)] +struct Stack2<T: Copy>(Option<(T, Option<T>)>); + +impl<T: Copy> Stack2<T> { + #[inline] + fn push(&mut self, c: T) { + self.0 = match self.0 { + None => Some((c, None)), + Some((c1, None)) => Some((c1, Some(c))), + Some((_c1, Some(_c2))) => panic!("stack full!"), + } + } + + #[inline] + fn pop(&mut self) -> Option<T> { + let (new_self, rv) = match self.0 { + Some((c1, Some(c2))) => (Some((c1, None)), Some(c2)), + Some((c1, None)) => (None, Some(c1)), + None => (None, None), + }; + self.0 = new_self; + rv + } + + #[inline] + fn is_empty(&self) -> bool { + matches!(self.0, None) + } +} + +/// A HTML tokenizer. See crate-level docs for basic usage. +pub struct Tokenizer<R: Reader, E: Emitter = DefaultEmitter> { + eof: bool, + pub(crate) state: State, + pub(crate) emitter: E, + pub(crate) temporary_buffer: String, + reader: R, + to_reconsume: Stack2<Option<char>>, + pub(crate) character_reference_code: u32, + pub(crate) return_state: Option<State>, +} + +impl<R: Reader> Tokenizer<R> { + /// Create a new tokenizer from some input. + /// + /// `input` can be `&String` or `&str` at the moment, as those are the types for which + /// [`crate::Readable`] is implemented, but you can implement that trait on your own types. + /// + /// Patches are welcome for providing an efficient implementation over async streams, + /// iterators, files, etc, as long as any dependencies come behind featureflags. + pub fn new<'a, S: Readable<'a, Reader = R>>(input: S) -> Self { + Tokenizer::<S::Reader>::new_with_emitter(input, DefaultEmitter::default()) + } +} + +impl<R: Reader, E: Emitter> Tokenizer<R, E> { + /// Construct a new tokenizer from some input and a custom emitter. + /// + /// Use this method over [`Tokenizer::new`] when you want to have more control over string allocation for + /// tokens. + pub fn new_with_emitter<'a, S: Readable<'a, Reader = R>>(input: S, emitter: E) -> Self { + Tokenizer { + eof: false, + state: State::Data, + emitter, + temporary_buffer: String::new(), + to_reconsume: Stack2::default(), + reader: input.to_reader(), + character_reference_code: 0, + return_state: None, + } + } + + /// Test-internal function to override internal state. + /// + /// Only available with the `integration-tests` feature which is not public API. + #[cfg(feature = "integration-tests")] + pub fn set_state(&mut self, state: State) { + self.state = state; + } + + /// Set the statemachine to start/continue in [plaintext + /// state](https://html.spec.whatwg.org/#plaintext-state). + /// + /// This tokenizer never gets into that state naturally. + pub fn set_plaintext_state(&mut self) { + self.state = State::PlainText; + } + + /// Test-internal function to override internal state. + /// + /// Only available with the `integration-tests` feature which is not public API. + #[cfg(feature = "integration-tests")] + pub fn set_last_start_tag(&mut self, last_start_tag: Option<&str>) { + self.emitter.set_last_start_tag(last_start_tag); + } + + #[inline] + pub(crate) fn unread_char(&mut self, c: Option<char>) { + self.to_reconsume.push(c); + } + + #[inline] + fn validate_char(&mut self, c: char) { + match c as u32 { + surrogate_pat!() => { + self.emitter.emit_error(Error::SurrogateInInputStream); + } + noncharacter_pat!() => { + self.emitter.emit_error(Error::NoncharacterInInputStream); + } + // control without whitespace or nul + x @ control_pat!() + if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) => + { + self.emitter + .emit_error(Error::ControlCharacterInInputStream); + } + _ => (), + } + } + + pub(crate) fn read_char(&mut self) -> Result<Option<char>, R::Error> { + let (c_res, reconsumed) = match self.to_reconsume.pop() { + Some(c) => (Ok(c), true), + None => (self.reader.read_char(), false), + }; + + let mut c = match c_res { + Ok(Some(c)) => c, + res => return res, + }; + + if c == '\r' { + c = '\n'; + let c2 = self.reader.read_char()?; + if c2 != Some('\n') { + self.unread_char(c2); + } + } + + if !reconsumed { + self.validate_char(c); + } + + Ok(Some(c)) + } + + #[inline] + pub(crate) fn try_read_string( + &mut self, + s: &str, + case_sensitive: bool, + ) -> Result<bool, R::Error> { + debug_assert!(!s.is_empty()); + debug_assert!(self.to_reconsume.is_empty()); + self.reader.try_read_string(s, case_sensitive) + } + + pub(crate) fn is_consumed_as_part_of_an_attribute(&self) -> bool { + matches!( + self.return_state, + Some( + State::AttributeValueDoubleQuoted + | State::AttributeValueSingleQuoted + | State::AttributeValueUnquoted + ) + ) + } + + pub(crate) fn flush_code_points_consumed_as_character_reference(&mut self) { + if self.is_consumed_as_part_of_an_attribute() { + self.emitter.push_attribute_value(&self.temporary_buffer); + self.temporary_buffer.clear(); + } else { + self.flush_buffer_characters(); + } + } + + pub(crate) fn next_input_character(&mut self) -> Result<Option<char>, R::Error> { + let rv = self.read_char()?; + self.unread_char(rv); + Ok(rv) + } + + pub(crate) fn flush_buffer_characters(&mut self) { + self.emitter.emit_string(&self.temporary_buffer); + self.temporary_buffer.clear(); + } +} + +impl<R: Reader, E: Emitter> Iterator for Tokenizer<R, E> { + type Item = Result<E::Token, R::Error>; + + fn next(&mut self) -> Option<Self::Item> { + loop { + if let Some(token) = self.emitter.pop_token() { + break Some(Ok(token)); + } else if !self.eof { + match machine::consume(self) { + Ok(ControlToken::Continue) => (), + Ok(ControlToken::Eof) => { + self.eof = true; + self.emitter.emit_eof(); + } + Err(e) => break Some(Err(e)), + } + } else { + break None; + } + } + } +} + +/// A kind of tokenizer that directly yields tokens when used as an iterator, so `Token` instead of +/// `Result<Token, _>`. +/// +/// This is the return value of [`Tokenizer::infallible`]. +pub struct InfallibleTokenizer<R: Reader<Error = Never>, E: Emitter>(Tokenizer<R, E>); + +impl<R: Reader<Error = Never>, E: Emitter> Tokenizer<R, E> { + /// Statically assert that this iterator is infallible. + /// + /// Call this to get rid of error handling when parsing HTML from strings. + pub fn infallible(self) -> InfallibleTokenizer<R, E> { + InfallibleTokenizer(self) + } +} + +impl<R: Reader<Error = Never>, E: Emitter> Iterator for InfallibleTokenizer<R, E> { + type Item = E::Token; + + fn next(&mut self) -> Option<Self::Item> { + match self.0.next()? { + Ok(token) => Some(token), + Err(e) => match e {}, + } + } +} |