diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/emitter.rs | 81 | ||||
| -rw-r--r-- | src/lib.rs | 2 | ||||
| -rw-r--r-- | src/offset.rs (renamed from src/spans.rs) | 69 | ||||
| -rw-r--r-- | src/tokenizer.rs | 4 | 
4 files changed, 79 insertions, 77 deletions
| diff --git a/src/emitter.rs b/src/emitter.rs index 18b2539..b3fdb99 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -4,9 +4,11 @@ use std::collections::BTreeSet;  use std::collections::VecDeque;  use std::marker::PhantomData;  use std::mem; +use std::ops::Range; -use crate::spans::Position; -use crate::spans::Span; +use crate::offset::NoopOffset; +use crate::offset::Offset; +use crate::offset::Position;  use crate::Error;  /// An emitter is an object providing methods to the tokenizer to produce ("emit") tokens. @@ -160,17 +162,17 @@ pub trait Emitter<R> {  }  /// The default implementation of [`Emitter`], used to produce tokens. -pub struct DefaultEmitter<R, S = ()> { +pub struct DefaultEmitter<R, O = NoopOffset> {      current_characters: String, -    current_token: Option<Token<S>>, -    current_attribute: Option<(String, Attribute<S>)>, +    current_token: Option<Token<O>>, +    current_attribute: Option<(String, Attribute<O>)>,      seen_attributes: BTreeSet<String>, -    emitted_tokens: VecDeque<Token<S>>, +    emitted_tokens: VecDeque<Token<O>>,      reader: PhantomData<R>, -    attr_in_end_tag_span: Option<S>, +    attr_in_end_tag_span: Option<Range<O>>,  } -impl<R, S> Default for DefaultEmitter<R, S> { +impl<R, O> Default for DefaultEmitter<R, O> {      fn default() -> Self {          DefaultEmitter {              current_characters: String::new(), @@ -184,13 +186,16 @@ impl<R, S> Default for DefaultEmitter<R, S> {      }  } -impl<R, S: Span> DefaultEmitter<R, S> { -    fn emit_token(&mut self, token: Token<S>) { +impl<R, O> DefaultEmitter<R, O> { +    fn emit_token(&mut self, token: Token<O>) {          self.flush_current_characters();          self.emitted_tokens.push_front(token);      } -    fn flush_current_attribute(&mut self) { +    fn flush_current_attribute(&mut self) +    where +        O: Clone, +    {          if let Some((k, v)) = self.current_attribute.take() {              match self.current_token {                  Some(Token::StartTag(ref mut tag)) => match tag.attributes.entry(k) { @@ -223,22 +228,22 @@ impl<R, S: Span> DefaultEmitter<R, S> {          self.emit_token(Token::String(s));      } -    fn push_error(&mut self, error: Error, span: S) { +    fn push_error(&mut self, error: Error, span: Range<O>) {          // bypass character flushing in self.emit_token: we don't need the error location to be          // that exact          self.emitted_tokens.push_front(Token::Error { error, span });      }  } -impl<R: Position<S::Offset>, S: Span> Emitter<R> for DefaultEmitter<R, S> { -    type Token = Token<S>; +impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> { +    type Token = Token<O>;      fn emit_eof(&mut self) {          self.flush_current_characters();      }      fn emit_error(&mut self, error: Error, reader: &R) { -        self.push_error(error, S::new(reader.position(), reader.position())); +        self.push_error(error, reader.position()..reader.position());      }      fn pop_token(&mut self) -> Option<Self::Token> { @@ -251,7 +256,7 @@ impl<R: Position<S::Offset>, S: Span> Emitter<R> for DefaultEmitter<R, S> {      fn init_start_tag(&mut self, reader: &R) {          self.current_token = Some(Token::StartTag(StartTag { -            name_span: S::new(reader.position(), reader.position()), +            name_span: reader.position()..reader.position(),              self_closing: false,              name: String::new(),              attributes: Default::default(), @@ -259,7 +264,7 @@ impl<R: Position<S::Offset>, S: Span> Emitter<R> for DefaultEmitter<R, S> {      }      fn init_end_tag(&mut self, reader: &R) {          self.current_token = Some(Token::EndTag(EndTag { -            name_span: S::new(reader.position(), reader.position()), +            name_span: reader.position()..reader.position(),              name: String::new(),          }));          self.seen_attributes.clear(); @@ -327,7 +332,7 @@ impl<R: Position<S::Offset>, S: Span> Emitter<R> for DefaultEmitter<R, S> {                  ..              })) => {                  name.push_str(s); -                name_span.push_str(s); +                name_span.end += s.len();              }              Some(Token::EndTag(EndTag {                  ref mut name, @@ -335,7 +340,7 @@ impl<R: Position<S::Offset>, S: Span> Emitter<R> for DefaultEmitter<R, S> {                  ..              })) => {                  name.push_str(s); -                name_span.push_str(s); +                name_span.end += s.len();              }              _ => debug_assert!(false),          } @@ -368,28 +373,26 @@ impl<R: Position<S::Offset>, S: Span> Emitter<R> for DefaultEmitter<R, S> {          self.current_attribute = Some((              String::new(),              Attribute { -                name_span: S::new(reader.position(), reader.position()), +                name_span: reader.position()..reader.position(),                  value: String::new(), -                value_span: S::default(), +                value_span: Range::default(),              },          ));      }      fn init_attribute_value(&mut self, reader: &R, quoted: bool) { -        self.current_attribute.as_mut().unwrap().1.value_span = S::new( -            reader.position() + quoted as usize, -            reader.position() + quoted as usize, -        ); +        self.current_attribute.as_mut().unwrap().1.value_span = +            reader.position() + quoted as usize..reader.position() + quoted as usize;      }      fn push_attribute_name(&mut self, s: &str) {          let current_attr = self.current_attribute.as_mut().unwrap();          current_attr.0.push_str(s); -        current_attr.1.name_span.push_str(s); +        current_attr.1.name_span.end += s.len();      }      fn push_attribute_value(&mut self, s: &str) {          let current_attr = self.current_attribute.as_mut().unwrap();          current_attr.1.value.push_str(s); -        current_attr.1.value_span.push_str(s); +        current_attr.1.value_span.end += s.len();      }      fn set_doctype_public_identifier(&mut self, value: &str) {          if let Some(Token::Doctype(Doctype { @@ -439,7 +442,7 @@ impl<R: Position<S::Offset>, S: Span> Emitter<R> for DefaultEmitter<R, S> {  /// An HTML start tag, such as `<p>` or `<a>`.  #[derive(Debug, Eq, PartialEq)] -pub struct StartTag<S> { +pub struct StartTag<O> {      /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be      /// expected.      pub self_closing: bool, @@ -451,33 +454,33 @@ pub struct StartTag<S> {      ///      /// Duplicate attributes are ignored after the first one as per WHATWG spec. Implement your own      /// [`Emitter`] to tweak this behavior. -    pub attributes: BTreeMap<String, Attribute<S>>, +    pub attributes: BTreeMap<String, Attribute<O>>,      /// The source code span of the tag name. -    pub name_span: S, +    pub name_span: Range<O>,  }  /// A HTML attribute value (plus spans).  #[derive(Debug, Eq, PartialEq)] -pub struct Attribute<S> { +pub struct Attribute<O> {      /// The value of the attribute.      pub value: String,      /// The source code span of the attribute name. -    pub name_span: S, +    pub name_span: Range<O>,      /// The source code span of the attribute value. -    pub value_span: S, +    pub value_span: Range<O>,  }  /// A HTML end/close tag, such as `</p>` or `</a>`.  #[derive(Debug, Eq, PartialEq)] -pub struct EndTag<S> { +pub struct EndTag<O> {      /// The ending tag's name, such as `"p"` or `"a"`.      pub name: String,      /// The source code span of the tag name. -    pub name_span: S, +    pub name_span: Range<O>,  }  /// A doctype. Some examples: @@ -504,11 +507,11 @@ pub struct Doctype {  /// The token type used by default. You can define your own token type by implementing the  /// [`Emitter`] trait.  #[derive(Debug, Eq, PartialEq)] -pub enum Token<S> { +pub enum Token<O> {      /// A HTML start tag. -    StartTag(StartTag<S>), +    StartTag(StartTag<O>),      /// A HTML end tag. -    EndTag(EndTag<S>), +    EndTag(EndTag<O>),      /// A literal string.      String(String),      /// A HTML comment. @@ -523,6 +526,6 @@ pub enum Token<S> {          /// What kind of error occured.          error: Error,          /// The source code span of the error. -        span: S, +        span: Range<O>,      },  } @@ -7,8 +7,8 @@ mod emitter;  mod entities;  mod error;  mod machine; +pub mod offset;  pub mod reader; -pub mod spans;  mod tokenizer;  mod utils; diff --git a/src/spans.rs b/src/offset.rs index 14392cd..f1f436d 100644 --- a/src/spans.rs +++ b/src/offset.rs @@ -1,19 +1,39 @@ -//! Source code spans. +//! Source code offsets.  //! -//! The [`DefaultEmitter`](crate::DefaultEmitter) is generic over a [`Span`]. -//! This library comes with two Span implementations: +//! The [`DefaultEmitter`](crate::DefaultEmitter) is generic over an [`Offset`]. +//! This library comes with two Offset implementations:  //! -//! * one for `()` which acts as the no-op implementation for when you don't want to track spans -//! * one for [`Range<usize>`] for when you do want to track spans +//! * [`NoopOffset`] for when you don't want to track source offsets +//! * `usize` for when you do want to track source offsets  //!  //! To use the latter your reader however has to implement [`Position<usize>`].  //! You can easily use any existing reader by wrapping it in the [`PosTrackingReader`] struct  //! which implements the [`Position<usize>`] trait and takes care of tracking the current position. -use std::ops::{Add, Range}; +use std::fmt::Debug; +use std::ops::{Add, AddAssign, Sub};  use crate::reader::{IntoReader, Reader}; +/// A byte offset in the source. +pub trait Offset: +    Default +    + Copy +    + Eq +    + Ord +    + Add<usize, Output = Self> +    + Sub<usize, Output = Self> +    + AddAssign<usize> +    + Debug +{ +} + +impl Offset for usize {} + +impl Offset for NoopOffset {} + +/// A zero-sized no-op implementation of [`Offset`] (for when you don't want to track offsets). +#[derive(Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]  pub struct NoopOffset;  /// A trait to be implemented by readers that track their own position. @@ -48,31 +68,10 @@ impl<R> PosTrackingReader<R> {  impl<R> Position<usize> for PosTrackingReader<R> {      fn position(&self) -> usize { -        self.position +        self.position - 1      }  } -/// A byte range in the source code. -pub trait Span: Default + Clone { -    type Offset: Add<usize, Output = Self::Offset>; - -    /// Constructs a new span from the given byte offsets. -    fn new(start: Self::Offset, end: Self::Offset) -> Self; - -    /// Extends the span by the length of the given string. -    fn push_str(&mut self, str: &str); -} - -impl Span for () { -    type Offset = NoopOffset; - -    fn new(_start: Self::Offset, _end: Self::Offset) -> Self { -        () -    } - -    fn push_str(&mut self, _str: &str) {} -} -  impl Add<usize> for NoopOffset {      type Output = Self; @@ -81,16 +80,16 @@ impl Add<usize> for NoopOffset {      }  } -impl Span for Range<usize> { -    type Offset = usize; +impl Sub<usize> for NoopOffset { +    type Output = Self; -    fn new(start: Self::Offset, end: Self::Offset) -> Self { -        start - 1..end - 1 +    fn sub(self, _rhs: usize) -> NoopOffset { +        self      } +} -    fn push_str(&mut self, str: &str) { -        self.end += str.len(); -    } +impl AddAssign<usize> for NoopOffset { +    fn add_assign(&mut self, _rhs: usize) {}  }  impl<R: Reader> Reader for PosTrackingReader<R> { diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 7b8b1ce..141efb9 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1,6 +1,6 @@  use crate::machine; +use crate::offset::NoopOffset;  use crate::reader::{IntoReader, Reader}; -use crate::spans::Position;  use crate::utils::{      control_pat, noncharacter_pat, surrogate_pat, ControlToken, State as InternalState,  }; @@ -33,7 +33,7 @@ impl<T: Copy> Stack2<T> {  }  /// A HTML tokenizer. See crate-level docs for basic usage. -pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, ()>> { +pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, NoopOffset>> {      eof: bool,      pub(crate) state: InternalState,      pub(crate) emitter: E, | 
