Diffstat (limited to 'src')
-rw-r--r--   src/emitter.rs   | 60
-rw-r--r--   src/machine.rs   | 45
-rw-r--r--   src/offset.rs    |  2
-rw-r--r--   src/tokenizer.rs | 27
4 files changed, 70 insertions, 64 deletions
diff --git a/src/emitter.rs b/src/emitter.rs
index caf7b55..1f60f70 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -2,13 +2,11 @@ use std::collections::btree_map::Entry;
 use std::collections::BTreeMap;
 use std::collections::BTreeSet;
 use std::collections::VecDeque;
-use std::marker::PhantomData;
 use std::mem;
 use std::ops::Range;

 use crate::offset::NoopOffset;
 use crate::offset::Offset;
-use crate::offset::Position;
 use crate::Error;

 /// An emitter is an object providing methods to the tokenizer to produce ("emit") tokens.
@@ -29,7 +27,7 @@ use crate::Error;
 ///   checks that would emit errors.
 ///
 /// * If you don't care about attributes at all, you can make all related methods a noop.
-pub trait Emitter<R> {
+pub trait Emitter<O> {
     /// The token type emitted by this emitter. This controls what type of values the [`Tokenizer`](crate::Tokenizer)
     /// yields when used as an iterator.
     type Token;
@@ -39,7 +37,7 @@ pub trait Emitter<R> {
     fn emit_eof(&mut self);

     /// A (probably recoverable) parsing error has occured.
-    fn emit_error(&mut self, error: Error, reader: &R);
+    fn emit_error(&mut self, error: Error, offset: O);

     /// After every state change, the tokenizer calls this method to retrieve a new token that can
     /// be returned via the tokenizer's iterator interface.
@@ -49,13 +47,13 @@ pub trait Emitter<R> {
     fn emit_string(&mut self, c: &str);

     /// Set the _current token_ to a start tag.
-    fn init_start_tag(&mut self, reader: &R);
+    fn init_start_tag(&mut self, offset: O);

     /// Set the _current token_ to an end tag.
-    fn init_end_tag(&mut self, reader: &R);
+    fn init_end_tag(&mut self, offset: O);

     /// Set the _current token_ to a comment.
-    fn init_comment(&mut self, reader: &R);
+    fn init_comment(&mut self, data_offset: O);

     /// Emit the _current token_, assuming it is a tag.
     ///
@@ -84,7 +82,7 @@ pub trait Emitter<R> {
     ///
     /// If the current token is an end tag, the emitter should emit the
     /// [`Error::EndTagWithTrailingSolidus`] error.
-    fn set_self_closing(&mut self, reader: &R);
+    fn set_self_closing(&mut self, offset: O);

     /// Assuming the _current token_ is a doctype, set its "force quirks" flag to true.
     ///
@@ -112,7 +110,7 @@ pub trait Emitter<R> {
     /// * the "public identifier" should be null (different from empty)
     /// * the "system identifier" should be null (different from empty)
     /// * the "force quirks" flag should be `false`
-    fn init_doctype(&mut self, reader: &R);
+    fn init_doctype(&mut self, offset: O);

     /// Set the _current attribute_ to a new one, starting with empty name and value strings.
     ///
@@ -121,14 +119,14 @@ pub trait Emitter<R> {
     /// [`Error::DuplicateAttribute`] error should be emitted.
     ///
     /// If the current token is no tag at all, this method may panic.
-    fn init_attribute_name(&mut self, reader: &R);
+    fn init_attribute_name(&mut self, offset: O);

     /// Called before the first push_attribute_value call.
     /// If the value is wrappend in double or single quotes `quoted` is set to true, otherwise false.
     ///
     /// If there is no current attribute, this method may panic.
     #[allow(unused_variables)]
-    fn init_attribute_value(&mut self, reader: &R, quoted: bool) {}
+    fn init_attribute_value(&mut self, offset: O, quoted: bool) {}

     /// Append a string to the current attribute's name.
     ///
@@ -162,17 +160,16 @@ pub trait Emitter<R> {
 }

 /// The default implementation of [`Emitter`], used to produce tokens.
-pub struct DefaultEmitter<R, O = NoopOffset> {
+pub struct DefaultEmitter<O = NoopOffset> {
     current_characters: String,
     current_token: Option<Token<O>>,
     current_attribute: Option<(String, Attribute<O>)>,
     seen_attributes: BTreeSet<String>,
     emitted_tokens: VecDeque<Token<O>>,
-    reader: PhantomData<R>,
     attr_in_end_tag_span: Option<Range<O>>,
 }

-impl<R, O> Default for DefaultEmitter<R, O> {
+impl<O> Default for DefaultEmitter<O> {
     fn default() -> Self {
         DefaultEmitter {
             current_characters: String::new(),
@@ -180,13 +177,12 @@ impl<R, O> Default for DefaultEmitter<R, O> {
             current_attribute: None,
             seen_attributes: BTreeSet::new(),
             emitted_tokens: VecDeque::new(),
-            reader: PhantomData::default(),
             attr_in_end_tag_span: None,
         }
     }
 }

-impl<R, O> DefaultEmitter<R, O> {
+impl<O> DefaultEmitter<O> {
     fn emit_token(&mut self, token: Token<O>) {
         self.flush_current_characters();
         self.emitted_tokens.push_front(token);
@@ -235,15 +231,15 @@ impl<R, O> DefaultEmitter<R, O> {
     }
 }

-impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> {
+impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
     type Token = Token<O>;

     fn emit_eof(&mut self) {
         self.flush_current_characters();
     }

-    fn emit_error(&mut self, error: Error, reader: &R) {
-        self.push_error(error, reader.position()..reader.position());
+    fn emit_error(&mut self, error: Error, offset: O) {
+        self.push_error(error, offset..offset);
     }

     fn pop_token(&mut self) -> Option<Self::Token> {
@@ -254,26 +250,26 @@ impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> {
         self.current_characters.push_str(s);
     }

-    fn init_start_tag(&mut self, reader: &R) {
+    fn init_start_tag(&mut self, offset: O) {
         self.current_token = Some(Token::StartTag(StartTag {
-            name_span: reader.position()..reader.position(),
+            name_span: offset..offset,
             self_closing: false,
             name: String::new(),
             attributes: Default::default(),
         }));
     }

-    fn init_end_tag(&mut self, reader: &R) {
+    fn init_end_tag(&mut self, offset: O) {
         self.current_token = Some(Token::EndTag(EndTag {
-            name_span: reader.position()..reader.position(),
+            name_span: offset..offset,
             name: String::new(),
         }));
         self.seen_attributes.clear();
     }

-    fn init_comment(&mut self, reader: &R) {
+    fn init_comment(&mut self, data_offset: O) {
         self.current_token = Some(Token::Comment(Comment {
             data: String::new(),
-            data_offset: reader.position(),
+            data_offset,
         }));
     }

     fn emit_current_tag(&mut self) {
@@ -304,7 +300,7 @@ impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> {
         self.emit_token(doctype);
     }

-    fn set_self_closing(&mut self, reader: &R) {
+    fn set_self_closing(&mut self, offset: O) {
         let tag = self.current_token.as_mut().unwrap();
         match tag {
             Token::StartTag(StartTag {
@@ -314,7 +310,7 @@ impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> {
                 *self_closing = true;
             }
             Token::EndTag(_) => {
-                self.emit_error(Error::EndTagWithTrailingSolidus, reader);
+                self.emit_error(Error::EndTagWithTrailingSolidus, offset);
             }
             _ => {
                 debug_assert!(false);
@@ -362,7 +358,7 @@ impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> {
             _ => debug_assert!(false),
         }
     }
-    fn init_doctype(&mut self, _reader: &R) {
+    fn init_doctype(&mut self, _offset: O) {
         self.current_token = Some(Token::Doctype(Doctype {
             name: String::new(),
             force_quirks: false,
@@ -371,20 +367,20 @@ impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> {
         }));
     }

-    fn init_attribute_name(&mut self, reader: &R) {
+    fn init_attribute_name(&mut self, offset: O) {
         self.flush_current_attribute();
         self.current_attribute = Some((
             String::new(),
             Attribute {
-                name_span: reader.position()..reader.position(),
+                name_span: offset..offset,
                 value: String::new(),
                 value_span: Range::default(),
             },
         ));
     }

-    fn init_attribute_value(&mut self, reader: &R, quoted: bool) {
+    fn init_attribute_value(&mut self, offset: O, quoted: bool) {
         self.current_attribute.as_mut().unwrap().1.value_span =
-            reader.position() + quoted as usize..reader.position() + quoted as usize;
+            offset + quoted as usize..offset + quoted as usize;
     }

     fn push_attribute_name(&mut self, s: &str) {
diff --git a/src/machine.rs b/src/machine.rs
index c11720d..deb3983 100644
--- a/src/machine.rs
+++ b/src/machine.rs
@@ -1,4 +1,5 @@
 use crate::entities::try_read_character_reference;
+use crate::offset::{Offset, Position};
 use crate::utils::{
     ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat,
     ControlToken, State,
@@ -8,10 +9,11 @@ use crate::{reader::Reader, Emitter, Error, Tokenizer};
 // Note: This is not implemented as a method on Tokenizer because there's fields on Tokenizer that
 // should not be available in this method, such as Tokenizer.to_reconsume or the Reader instance
 #[inline]
-pub fn consume<R, E>(slf: &mut Tokenizer<R, E>) -> Result<ControlToken, R::Error>
+pub fn consume<O, R, E>(slf: &mut Tokenizer<R, O, E>) -> Result<ControlToken, R::Error>
 where
-    R: Reader,
-    E: Emitter<R>,
+    O: Offset,
+    R: Reader + Position<O>,
+    E: Emitter<O>,
 {
     macro_rules! mutate_character_reference {
         (* $mul:literal + $x:ident - $sub:literal) => {
@@ -133,7 +135,7 @@ where
             }
             c @ Some('?') => {
                 slf.emit_error(Error::UnexpectedQuestionMarkInsteadOfTagName);
-                slf.emitter.init_comment(&slf.reader);
+                slf.emitter.init_comment(slf.reader.position());
                 slf.state = State::BogusComment;
                 slf.unread_char(c);
                 Ok(ControlToken::Continue)
@@ -170,7 +172,7 @@ where
             }
             Some(x) => {
                 slf.emit_error(Error::InvalidFirstCharacterOfTagName);
-                slf.emitter.init_comment(&slf.reader);
+                slf.emitter.init_comment(slf.reader.position());
                 slf.state = State::BogusComment;
                 slf.unread_char(Some(x));
                 Ok(ControlToken::Continue)
@@ -686,13 +688,13 @@ where
             }
             Some('=') => {
                 slf.emit_error(Error::UnexpectedEqualsSignBeforeAttributeName);
-                slf.emitter.init_attribute_name(&slf.reader);
+                slf.emitter.init_attribute_name(slf.reader.position());
                 slf.emitter.push_attribute_name("=");
                 slf.state = State::AttributeName;
                 Ok(ControlToken::Continue)
             }
             Some(x) => {
-                slf.emitter.init_attribute_name(&slf.reader);
+                slf.emitter.init_attribute_name(slf.reader.position());
                 slf.state = State::AttributeName;
                 slf.unread_char(Some(x));
                 Ok(ControlToken::Continue)
@@ -745,7 +747,7 @@ where
                 Ok(ControlToken::Eof)
             }
             Some(x) => {
-                slf.emitter.init_attribute_name(&slf.reader);
+                slf.emitter.init_attribute_name(slf.reader.position());
                 slf.state = State::AttributeName;
                 slf.unread_char(Some(x));
                 Ok(ControlToken::Continue)
@@ -754,12 +756,14 @@
         State::BeforeAttributeValue => match slf.read_char()? {
             Some(whitespace_pat!()) => Ok(ControlToken::Continue),
             Some('"') => {
-                slf.emitter.init_attribute_value(&slf.reader, true);
+                slf.emitter
+                    .init_attribute_value(slf.reader.position(), true);
                 slf.state = State::AttributeValueDoubleQuoted;
                 Ok(ControlToken::Continue)
             }
             Some('\'') => {
-                slf.emitter.init_attribute_value(&slf.reader, true);
+                slf.emitter
+                    .init_attribute_value(slf.reader.position(), true);
                 slf.state = State::AttributeValueSingleQuoted;
                 Ok(ControlToken::Continue)
             }
@@ -770,7 +774,8 @@
                 Ok(ControlToken::Continue)
             }
             c => {
-                slf.emitter.init_attribute_value(&slf.reader, false);
+                slf.emitter
+                    .init_attribute_value(slf.reader.position(), false);
                 slf.state = State::AttributeValueUnquoted;
                 slf.unread_char(c);
                 Ok(ControlToken::Continue)
@@ -885,7 +890,7 @@
         },
         State::SelfClosingStartTag => match slf.read_char()? {
             Some('>') => {
-                slf.emitter.set_self_closing(&slf.reader);
+                slf.emitter.set_self_closing(slf.reader.position());
                 slf.state = State::Data;
                 slf.emit_current_tag();
                 Ok(ControlToken::Continue)
@@ -923,7 +928,7 @@
         },
         State::MarkupDeclarationOpen => match slf.read_char()? {
             Some('-') if slf.try_read_string("-", true)? => {
-                slf.emitter.init_comment(&slf.reader);
+                slf.emitter.init_comment(slf.reader.position());
                 slf.state = State::CommentStart;
                 Ok(ControlToken::Continue)
             }
@@ -940,14 +945,14 @@

                 // let's hope that bogus comment can just sort of skip over cdata
                 slf.emit_error(Error::CdataInHtmlContent);
-                slf.emitter.init_comment(&slf.reader);
+                slf.emitter.init_comment(slf.reader.position());
                 slf.emitter.push_comment("[CDATA[");
                 slf.state = State::BogusComment;
                 Ok(ControlToken::Continue)
             }
             c => {
                 slf.emit_error(Error::IncorrectlyOpenedComment);
-                slf.emitter.init_comment(&slf.reader);
+                slf.emitter.init_comment(slf.reader.position());
                 slf.state = State::BogusComment;
                 slf.unread_char(c);
                 Ok(ControlToken::Continue)
@@ -1153,7 +1158,7 @@
             }
             None => {
                 slf.emit_error(Error::EofInDoctype);
-                slf.emitter.init_doctype(&slf.reader);
+                slf.emitter.init_doctype(slf.reader.position());
                 slf.emitter.set_force_quirks();
                 slf.emitter.emit_current_doctype();
                 Ok(ControlToken::Eof)
@@ -1169,14 +1174,14 @@
             Some(whitespace_pat!()) => Ok(ControlToken::Continue),
             Some('\0') => {
                 slf.emit_error(Error::UnexpectedNullCharacter);
-                slf.emitter.init_doctype(&slf.reader);
+                slf.emitter.init_doctype(slf.reader.position());
                 slf.emitter.push_doctype_name("\u{fffd}");
                 slf.state = State::DoctypeName;
                 Ok(ControlToken::Continue)
             }
             Some('>') => {
                 slf.emit_error(Error::MissingDoctypeName);
-                slf.emitter.init_doctype(&slf.reader);
+                slf.emitter.init_doctype(slf.reader.position());
                 slf.emitter.set_force_quirks();
                 slf.state = State::Data;
                 slf.emitter.emit_current_doctype();
@@ -1184,13 +1189,13 @@
             }
             None => {
                 slf.emit_error(Error::EofInDoctype);
-                slf.emitter.init_doctype(&slf.reader);
+                slf.emitter.init_doctype(slf.reader.position());
                 slf.emitter.set_force_quirks();
                 slf.emitter.emit_current_doctype();
                 Ok(ControlToken::Eof)
             }
             Some(x) => {
-                slf.emitter.init_doctype(&slf.reader);
+                slf.emitter.init_doctype(slf.reader.position());
                 slf.emitter
                     .push_doctype_name(ctostr!(x.to_ascii_lowercase()));
                 slf.state = State::DoctypeName;
diff --git a/src/offset.rs b/src/offset.rs
index f1f436d..8809366 100644
--- a/src/offset.rs
+++ b/src/offset.rs
@@ -1,6 +1,6 @@
 //! Source code offsets.
 //!
-//! The [`DefaultEmitter`](crate::DefaultEmitter) is generic over an [`Offset`].
+//! The [`Emitter`](crate::Emitter) is generic over an [`Offset`].
 //! This library comes with two Offset implementations:
 //!
 //! * [`NoopOffset`] for when you don't want to track source offsets
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 7eb33f7..02a4d62 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1,5 +1,7 @@
+use std::marker::PhantomData;
+
 use crate::machine;
-use crate::offset::NoopOffset;
+use crate::offset::{NoopOffset, Offset, Position};
 use crate::reader::{IntoReader, Reader};
 use crate::utils::{
     control_pat, noncharacter_pat, surrogate_pat, ControlToken, State as InternalState,
@@ -33,12 +35,13 @@ impl<T: Copy> Stack2<T> {
 }

 /// A HTML tokenizer. See crate-level docs for basic usage.
-pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, NoopOffset>> {
+pub struct Tokenizer<R: Reader, O = NoopOffset, E: Emitter<O> = DefaultEmitter<O>> {
     eof: bool,
     pub(crate) state: InternalState,
     pub(crate) emitter: E,
     pub(crate) temporary_buffer: String,
     pub(crate) reader: R,
+    _offset: PhantomData<O>,
     to_reconsume: Stack2<Option<char>>,
     pub(crate) character_reference_code: u32,
     pub(crate) return_state: Option<InternalState>,
@@ -47,7 +50,7 @@ pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, NoopOffset>> {
     is_start_tag: bool,
 }

-impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
+impl<R: Reader, O, E: Emitter<O>> Tokenizer<R, O, E> {
     /// Creates a new tokenizer from some input and an emitter.
     ///
     /// TODO: add warning about you needing to do the state switching
@@ -55,6 +58,7 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
         Tokenizer {
             reader: reader.into_reader(),
             emitter,
+            _offset: PhantomData,
             state: InternalState::Data,
             to_reconsume: Stack2::default(),
             return_state: None,
@@ -102,7 +106,7 @@ impl From<State> for InternalState {
     }
 }

-impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
+impl<R: Reader + Position<O>, O, E: Emitter<O>> Tokenizer<R, O, E> {
     /// Test-internal function to override internal state.
     ///
     /// Only available with the `integration-tests` feature which is not public API.
@@ -119,7 +123,7 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
     /// Just a helper method for the machine.
     #[inline]
     pub(crate) fn emit_error(&mut self, error: Error) {
-        self.emitter.emit_error(error, &self.reader);
+        self.emitter.emit_error(error, self.reader.position());
     }

     /// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise.
@@ -136,14 +140,14 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {

     #[inline]
     pub(crate) fn init_start_tag(&mut self) {
-        self.emitter.init_start_tag(&self.reader);
+        self.emitter.init_start_tag(self.reader.position());
         self.current_tag_name.clear();
         self.is_start_tag = true;
     }

     #[inline]
     pub(crate) fn init_end_tag(&mut self) {
-        self.emitter.init_end_tag(&self.reader);
+        self.emitter.init_end_tag(self.reader.position());
         self.current_tag_name.clear();
         self.is_start_tag = false;
     }
@@ -270,10 +274,11 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
     }
 }

-impl<R, E> Iterator for Tokenizer<R, E>
+impl<O, R, E> Iterator for Tokenizer<R, O, E>
 where
-    R: Reader,
-    E: Emitter<R>,
+    O: Offset,
+    R: Reader + Position<O>,
+    E: Emitter<O>,
 {
     type Item = Result<E::Token, R::Error>;
@@ -297,7 +302,7 @@ where
     }
 }

-impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
+impl<R: Reader, O, E: Emitter<O>> Tokenizer<R, O, E> {
     /// Test-internal function to override internal state.
     ///
     /// Only available with the `integration-tests` feature which is not public API.
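For downstream code, the practical effect of this change shows up in the trait bounds: an emitter now receives a plain offset value (O) rather than a reference to the reader, and the reader supplies that value through Position<O>. Below is a minimal sketch written against the new bounds; it assumes the crate is imported as html5tokenizer with Emitter and Tokenizer at the crate root and the offset/reader modules public (as the doc links above suggest), and the helper function name is made up for illustration.

use html5tokenizer::offset::{Offset, Position};
use html5tokenizer::reader::Reader;
use html5tokenizer::{Emitter, Tokenizer};

// Collect every token, using the same bounds the new Iterator impl requires:
// the reader reports positions of type O, and the emitter consumes O values
// instead of &R references.
fn collect_tokens<O, R, E>(tokenizer: Tokenizer<R, O, E>) -> Result<Vec<E::Token>, R::Error>
where
    O: Offset,
    R: Reader + Position<O>,
    E: Emitter<O>,
{
    tokenizer.collect()
}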
