diff options
| author | Martin Fischer <martin@push-f.com> | 2021-11-30 17:16:17 +0100 | 
|---|---|---|
| committer | Martin Fischer <martin@push-f.com> | 2021-12-05 02:52:36 +0100 | 
| commit | 927ac122a63ad5e1b8037a895d9e9b63883bcc01 (patch) | |
| tree | aa226caead5b563bb46c72e1438e7a1a8385eae4 /src | |
| parent | 1f99ea9e16f85945e2606905ed6345519ce16e4e (diff) | |
spans: make Emitter generic over Reader
Diffstat (limited to 'src')
| -rw-r--r-- | src/emitter.rs | 45 | ||||
| -rw-r--r-- | src/machine.rs | 42 | ||||
| -rw-r--r-- | src/tokenizer.rs | 18 | 
3 files changed, 61 insertions, 44 deletions
| diff --git a/src/emitter.rs b/src/emitter.rs index 2c4ba41..20bcba4 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -1,6 +1,7 @@  use std::collections::BTreeMap;  use std::collections::BTreeSet;  use std::collections::VecDeque; +use std::marker::PhantomData;  use std::mem;  use crate::Error; @@ -28,7 +29,7 @@ use crate::State;  ///  /// The state machine needs to have a functional implementation of  /// `current_is_appropriate_end_tag_token` to do correct transitions, however. -pub trait Emitter { +pub trait Emitter<R> {      /// The token type emitted by this emitter. This controls what type of values the [`crate::Tokenizer`]      /// yields when used as an iterator.      type Token; @@ -54,13 +55,13 @@ pub trait Emitter {      fn emit_string(&mut self, c: &str);      /// Set the _current token_ to a start tag. -    fn init_start_tag(&mut self); +    fn init_start_tag(&mut self, reader: &R);      /// Set the _current token_ to an end tag. -    fn init_end_tag(&mut self); +    fn init_end_tag(&mut self, reader: &R);      /// Set the _current token_ to a comment. -    fn init_comment(&mut self); +    fn init_comment(&mut self, reader: &R);      /// Emit the _current token_, assuming it is a tag.      /// @@ -116,7 +117,7 @@ pub trait Emitter {      /// * the "public identifier" should be null (different from empty)      /// * the "system identifier" should be null (different from empty)      /// * the "force quirks" flag should be `false` -    fn init_doctype(&mut self); +    fn init_doctype(&mut self, reader: &R);      /// Set the _current attribute_ to a new one, starting with empty name and value strings.      /// @@ -128,7 +129,7 @@ pub trait Emitter {      /// emitted.      ///      /// If the current token is no tag at all, this method may panic. -    fn init_attribute(&mut self); +    fn init_attribute(&mut self, reader: &R);      /// Append a string to the current attribute's name.      /// @@ -172,17 +173,31 @@ pub trait Emitter {  }  /// The default implementation of [`crate::Emitter`], used to produce ("emit") tokens. -#[derive(Default)] -pub struct DefaultEmitter<S> { +pub struct DefaultEmitter<R, S> {      current_characters: String,      current_token: Option<Token<S>>,      last_start_tag: String,      current_attribute: Option<(String, String)>,      seen_attributes: BTreeSet<String>,      emitted_tokens: VecDeque<Token<S>>, +    reader: PhantomData<R>,  } -impl DefaultEmitter<()> { +impl<R, S> Default for DefaultEmitter<R, S> { +    fn default() -> Self { +        DefaultEmitter { +            current_characters: String::new(), +            current_token: None, +            last_start_tag: String::new(), +            current_attribute: None, +            seen_attributes: BTreeSet::new(), +            emitted_tokens: VecDeque::new(), +            reader: PhantomData::default(), +        } +    } +} + +impl<R> DefaultEmitter<R, ()> {      fn emit_token(&mut self, token: Token<()>) {          self.flush_current_characters();          self.emitted_tokens.push_front(token); @@ -226,7 +241,7 @@ impl DefaultEmitter<()> {      }  } -impl Emitter for DefaultEmitter<()> { +impl<R> Emitter<R> for DefaultEmitter<R, ()> {      type Token = Token<()>;      fn set_last_start_tag(&mut self, last_start_tag: Option<&str>) { @@ -253,15 +268,15 @@ impl Emitter for DefaultEmitter<()> {          self.current_characters.push_str(s);      } -    fn init_start_tag(&mut self) { +    fn init_start_tag(&mut self, _reader: &R) {          self.current_token = Some(Token::StartTag(Default::default()));      } -    fn init_end_tag(&mut self) { +    fn init_end_tag(&mut self, _reader: &R) {          self.current_token = Some(Token::EndTag(Default::default()));          self.seen_attributes.clear();      } -    fn init_comment(&mut self) { +    fn init_comment(&mut self, _reader: &R) {          self.current_token = Some(Token::Comment(String::new()));      }      fn emit_current_tag(&mut self) { @@ -341,7 +356,7 @@ impl Emitter for DefaultEmitter<()> {              _ => debug_assert!(false),          }      } -    fn init_doctype(&mut self) { +    fn init_doctype(&mut self, _reader: &R) {          self.current_token = Some(Token::Doctype(Doctype {              name: String::new(),              force_quirks: false, @@ -350,7 +365,7 @@ impl Emitter for DefaultEmitter<()> {          }));      } -    fn init_attribute(&mut self) { +    fn init_attribute(&mut self, _reader: &R) {          self.flush_current_attribute();          self.current_attribute = Some((String::new(), String::new()));      } diff --git a/src/machine.rs b/src/machine.rs index 5222735..931abf1 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -8,7 +8,9 @@ use crate::{Emitter, Error, Reader, Tokenizer};  // Note: This is not implemented as a method on Tokenizer because there's fields on Tokenizer that  // should not be available in this method, such as Tokenizer.to_reconsume or the Reader instance  #[inline] -pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<ControlToken, R::Error> { +pub fn consume<R: Reader, E: Emitter<R>>( +    slf: &mut Tokenizer<R, E>, +) -> Result<ControlToken, R::Error> {      macro_rules! mutate_character_reference {          (* $mul:literal + $x:ident - $sub:literal) => {              match slf @@ -122,7 +124,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr                  Ok(ControlToken::Continue)              }              Some(x) if x.is_ascii_alphabetic() => { -                slf.emitter.init_start_tag(); +                slf.emitter.init_start_tag(&slf.reader);                  slf.state = State::TagName;                  slf.unread_char(Some(x));                  Ok(ControlToken::Continue) @@ -130,7 +132,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr              c @ Some('?') => {                  slf.emitter                      .emit_error(Error::UnexpectedQuestionMarkInsteadOfTagName); -                slf.emitter.init_comment(); +                slf.emitter.init_comment(&slf.reader);                  slf.state = State::BogusComment;                  slf.unread_char(c);                  Ok(ControlToken::Continue) @@ -151,7 +153,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr          },          State::EndTagOpen => match slf.read_char()? {              Some(x) if x.is_ascii_alphabetic() => { -                slf.emitter.init_end_tag(); +                slf.emitter.init_end_tag(&slf.reader);                  slf.state = State::TagName;                  slf.unread_char(Some(x));                  Ok(ControlToken::Continue) @@ -169,7 +171,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr              Some(x) => {                  slf.emitter                      .emit_error(Error::InvalidFirstCharacterOfTagName); -                slf.emitter.init_comment(); +                slf.emitter.init_comment(&slf.reader);                  slf.state = State::BogusComment;                  slf.unread_char(Some(x));                  Ok(ControlToken::Continue) @@ -218,7 +220,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr          },          State::RcDataEndTagOpen => match slf.read_char()? {              Some(x) if x.is_ascii_alphabetic() => { -                slf.emitter.init_end_tag(); +                slf.emitter.init_end_tag(&slf.reader);                  slf.state = State::RcDataEndTagName;                  slf.unread_char(Some(x));                  Ok(ControlToken::Continue) @@ -273,7 +275,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr          },          State::RawTextEndTagOpen => match slf.read_char()? {              Some(x) if x.is_ascii_alphabetic() => { -                slf.emitter.init_end_tag(); +                slf.emitter.init_end_tag(&slf.reader);                  slf.state = State::RawTextEndTagName;                  slf.unread_char(Some(x));                  Ok(ControlToken::Continue) @@ -333,7 +335,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr          },          State::ScriptDataEndTagOpen => match slf.read_char()? {              Some(x) if x.is_ascii_alphabetic() => { -                slf.emitter.init_end_tag(); +                slf.emitter.init_end_tag(&slf.reader);                  slf.state = State::ScriptDataEndTagName;                  slf.unread_char(Some(x));                  Ok(ControlToken::Continue) @@ -501,7 +503,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr          },          State::ScriptDataEscapedEndTagOpen => match slf.read_char()? {              Some(x) if x.is_ascii_alphabetic() => { -                slf.emitter.init_end_tag(); +                slf.emitter.init_end_tag(&slf.reader);                  slf.state = State::ScriptDataEscapedEndTagName;                  slf.unread_char(Some(x));                  Ok(ControlToken::Continue) @@ -692,13 +694,13 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr              Some('=') => {                  slf.emitter                      .emit_error(Error::UnexpectedEqualsSignBeforeAttributeName); -                slf.emitter.init_attribute(); +                slf.emitter.init_attribute(&slf.reader);                  slf.emitter.push_attribute_name("=");                  slf.state = State::AttributeName;                  Ok(ControlToken::Continue)              }              Some(x) => { -                slf.emitter.init_attribute(); +                slf.emitter.init_attribute(&slf.reader);                  slf.state = State::AttributeName;                  slf.unread_char(Some(x));                  Ok(ControlToken::Continue) @@ -752,7 +754,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr                  Ok(ControlToken::Eof)              }              Some(x) => { -                slf.emitter.init_attribute(); +                slf.emitter.init_attribute(&slf.reader);                  slf.state = State::AttributeName;                  slf.unread_char(Some(x));                  Ok(ControlToken::Continue) @@ -929,7 +931,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr          },          State::MarkupDeclarationOpen => match slf.read_char()? {              Some('-') if slf.try_read_string("-", true)? => { -                slf.emitter.init_comment(); +                slf.emitter.init_comment(&slf.reader);                  slf.state = State::CommentStart;                  Ok(ControlToken::Continue)              } @@ -946,14 +948,14 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr                  // let's hope that bogus comment can just sort of skip over cdata                  slf.emitter.emit_error(Error::CdataInHtmlContent); -                slf.emitter.init_comment(); +                slf.emitter.init_comment(&slf.reader);                  slf.emitter.push_comment("[CDATA[");                  slf.state = State::BogusComment;                  Ok(ControlToken::Continue)              }              c => {                  slf.emitter.emit_error(Error::IncorrectlyOpenedComment); -                slf.emitter.init_comment(); +                slf.emitter.init_comment(&slf.reader);                  slf.state = State::BogusComment;                  slf.unread_char(c);                  Ok(ControlToken::Continue) @@ -1159,7 +1161,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr              }              None => {                  slf.emitter.emit_error(Error::EofInDoctype); -                slf.emitter.init_doctype(); +                slf.emitter.init_doctype(&slf.reader);                  slf.emitter.set_force_quirks();                  slf.emitter.emit_current_doctype();                  Ok(ControlToken::Eof) @@ -1176,14 +1178,14 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr              Some(whitespace_pat!()) => Ok(ControlToken::Continue),              Some('\0') => {                  slf.emitter.emit_error(Error::UnexpectedNullCharacter); -                slf.emitter.init_doctype(); +                slf.emitter.init_doctype(&slf.reader);                  slf.emitter.push_doctype_name("\u{fffd}");                  slf.state = State::DoctypeName;                  Ok(ControlToken::Continue)              }              Some('>') => {                  slf.emitter.emit_error(Error::MissingDoctypeName); -                slf.emitter.init_doctype(); +                slf.emitter.init_doctype(&slf.reader);                  slf.emitter.set_force_quirks();                  slf.state = State::Data;                  slf.emitter.emit_current_doctype(); @@ -1191,13 +1193,13 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr              }              None => {                  slf.emitter.emit_error(Error::EofInDoctype); -                slf.emitter.init_doctype(); +                slf.emitter.init_doctype(&slf.reader);                  slf.emitter.set_force_quirks();                  slf.emitter.emit_current_doctype();                  Ok(ControlToken::Eof)              }              Some(x) => { -                slf.emitter.init_doctype(); +                slf.emitter.init_doctype(&slf.reader);                  slf.emitter                      .push_doctype_name(ctostr!(x.to_ascii_lowercase()));                  slf.state = State::DoctypeName; diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 377dd01..efaa870 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -33,12 +33,12 @@ impl<T: Copy> Stack2<T> {  }  /// A HTML tokenizer. See crate-level docs for basic usage. -pub struct Tokenizer<R: Reader, E: Emitter = DefaultEmitter<()>> { +pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, ()>> {      eof: bool,      pub(crate) state: InternalState,      pub(crate) emitter: E,      pub(crate) temporary_buffer: String, -    reader: R, +    pub(crate) reader: R,      to_reconsume: Stack2<Option<char>>,      pub(crate) character_reference_code: u32,      pub(crate) return_state: Option<InternalState>, @@ -91,7 +91,7 @@ impl From<State> for InternalState {      }  } -impl<R: Reader, E: Emitter> Tokenizer<R, E> { +impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {      /// Construct a new tokenizer from some input and a custom emitter.      ///      /// Use this method over [`Tokenizer::new`] when you want to have more control over string allocation for @@ -239,7 +239,7 @@ impl<R: Reader, E: Emitter> Tokenizer<R, E> {      }  } -impl<R: Reader, E: Emitter> Iterator for Tokenizer<R, E> { +impl<R: Reader, E: Emitter<R>> Iterator for Tokenizer<R, E> {      type Item = Result<E::Token, R::Error>;      fn next(&mut self) -> Option<Self::Item> { @@ -266,9 +266,9 @@ impl<R: Reader, E: Emitter> Iterator for Tokenizer<R, E> {  /// `Result<Token, _>`.  ///  /// This is the return value of [`Tokenizer::infallible`]. -pub struct InfallibleTokenizer<R: Reader<Error = Never>, E: Emitter>(Tokenizer<R, E>); +pub struct InfallibleTokenizer<R: Reader<Error = Never>, E: Emitter<R>>(Tokenizer<R, E>); -impl<R: Reader<Error = Never>, E: Emitter> Tokenizer<R, E> { +impl<R: Reader<Error = Never>, E: Emitter<R>> Tokenizer<R, E> {      /// Statically assert that this iterator is infallible.      ///      /// Call this to get rid of error handling when parsing HTML from strings. @@ -277,7 +277,7 @@ impl<R: Reader<Error = Never>, E: Emitter> Tokenizer<R, E> {      }  } -impl<R: Reader<Error = Never>, E: Emitter> Iterator for InfallibleTokenizer<R, E> { +impl<R: Reader<Error = Never>, E: Emitter<R>> Iterator for InfallibleTokenizer<R, E> {      type Item = E::Token;      fn next(&mut self) -> Option<Self::Item> { @@ -288,7 +288,7 @@ impl<R: Reader<Error = Never>, E: Emitter> Iterator for InfallibleTokenizer<R, E      }  } -impl<R: Reader<Error = Never>, E: Emitter> Deref for InfallibleTokenizer<R, E> { +impl<R: Reader<Error = Never>, E: Emitter<R>> Deref for InfallibleTokenizer<R, E> {      type Target = Tokenizer<R, E>;      fn deref(&self) -> &Self::Target { @@ -296,7 +296,7 @@ impl<R: Reader<Error = Never>, E: Emitter> Deref for InfallibleTokenizer<R, E> {      }  } -impl<R: Reader<Error = Never>, E: Emitter> DerefMut for InfallibleTokenizer<R, E> { +impl<R: Reader<Error = Never>, E: Emitter<R>> DerefMut for InfallibleTokenizer<R, E> {      fn deref_mut(&mut self) -> &mut Self::Target {          &mut self.0      } | 
