diff options
-rw-r--r-- | src/emitter.rs | 81 | ||||
-rw-r--r-- | src/lib.rs | 2 | ||||
-rw-r--r-- | src/offset.rs (renamed from src/spans.rs) | 69 | ||||
-rw-r--r-- | src/tokenizer.rs | 4 | ||||
-rw-r--r-- | tests/test_spans.rs | 6 |
5 files changed, 82 insertions, 80 deletions
diff --git a/src/emitter.rs b/src/emitter.rs index 18b2539..b3fdb99 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -4,9 +4,11 @@ use std::collections::BTreeSet; use std::collections::VecDeque; use std::marker::PhantomData; use std::mem; +use std::ops::Range; -use crate::spans::Position; -use crate::spans::Span; +use crate::offset::NoopOffset; +use crate::offset::Offset; +use crate::offset::Position; use crate::Error; /// An emitter is an object providing methods to the tokenizer to produce ("emit") tokens. @@ -160,17 +162,17 @@ pub trait Emitter<R> { } /// The default implementation of [`Emitter`], used to produce tokens. -pub struct DefaultEmitter<R, S = ()> { +pub struct DefaultEmitter<R, O = NoopOffset> { current_characters: String, - current_token: Option<Token<S>>, - current_attribute: Option<(String, Attribute<S>)>, + current_token: Option<Token<O>>, + current_attribute: Option<(String, Attribute<O>)>, seen_attributes: BTreeSet<String>, - emitted_tokens: VecDeque<Token<S>>, + emitted_tokens: VecDeque<Token<O>>, reader: PhantomData<R>, - attr_in_end_tag_span: Option<S>, + attr_in_end_tag_span: Option<Range<O>>, } -impl<R, S> Default for DefaultEmitter<R, S> { +impl<R, O> Default for DefaultEmitter<R, O> { fn default() -> Self { DefaultEmitter { current_characters: String::new(), @@ -184,13 +186,16 @@ impl<R, S> Default for DefaultEmitter<R, S> { } } -impl<R, S: Span> DefaultEmitter<R, S> { - fn emit_token(&mut self, token: Token<S>) { +impl<R, O> DefaultEmitter<R, O> { + fn emit_token(&mut self, token: Token<O>) { self.flush_current_characters(); self.emitted_tokens.push_front(token); } - fn flush_current_attribute(&mut self) { + fn flush_current_attribute(&mut self) + where + O: Clone, + { if let Some((k, v)) = self.current_attribute.take() { match self.current_token { Some(Token::StartTag(ref mut tag)) => match tag.attributes.entry(k) { @@ -223,22 +228,22 @@ impl<R, S: Span> DefaultEmitter<R, S> { self.emit_token(Token::String(s)); } - fn push_error(&mut self, error: Error, span: S) { + fn push_error(&mut self, error: Error, span: Range<O>) { // bypass character flushing in self.emit_token: we don't need the error location to be // that exact self.emitted_tokens.push_front(Token::Error { error, span }); } } -impl<R: Position<S::Offset>, S: Span> Emitter<R> for DefaultEmitter<R, S> { - type Token = Token<S>; +impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> { + type Token = Token<O>; fn emit_eof(&mut self) { self.flush_current_characters(); } fn emit_error(&mut self, error: Error, reader: &R) { - self.push_error(error, S::new(reader.position(), reader.position())); + self.push_error(error, reader.position()..reader.position()); } fn pop_token(&mut self) -> Option<Self::Token> { @@ -251,7 +256,7 @@ impl<R: Position<S::Offset>, S: Span> Emitter<R> for DefaultEmitter<R, S> { fn init_start_tag(&mut self, reader: &R) { self.current_token = Some(Token::StartTag(StartTag { - name_span: S::new(reader.position(), reader.position()), + name_span: reader.position()..reader.position(), self_closing: false, name: String::new(), attributes: Default::default(), @@ -259,7 +264,7 @@ impl<R: Position<S::Offset>, S: Span> Emitter<R> for DefaultEmitter<R, S> { } fn init_end_tag(&mut self, reader: &R) { self.current_token = Some(Token::EndTag(EndTag { - name_span: S::new(reader.position(), reader.position()), + name_span: reader.position()..reader.position(), name: String::new(), })); self.seen_attributes.clear(); @@ -327,7 +332,7 @@ impl<R: Position<S::Offset>, S: Span> Emitter<R> for DefaultEmitter<R, S> { .. })) => { name.push_str(s); - name_span.push_str(s); + name_span.end += s.len(); } Some(Token::EndTag(EndTag { ref mut name, @@ -335,7 +340,7 @@ impl<R: Position<S::Offset>, S: Span> Emitter<R> for DefaultEmitter<R, S> { .. })) => { name.push_str(s); - name_span.push_str(s); + name_span.end += s.len(); } _ => debug_assert!(false), } @@ -368,28 +373,26 @@ impl<R: Position<S::Offset>, S: Span> Emitter<R> for DefaultEmitter<R, S> { self.current_attribute = Some(( String::new(), Attribute { - name_span: S::new(reader.position(), reader.position()), + name_span: reader.position()..reader.position(), value: String::new(), - value_span: S::default(), + value_span: Range::default(), }, )); } fn init_attribute_value(&mut self, reader: &R, quoted: bool) { - self.current_attribute.as_mut().unwrap().1.value_span = S::new( - reader.position() + quoted as usize, - reader.position() + quoted as usize, - ); + self.current_attribute.as_mut().unwrap().1.value_span = + reader.position() + quoted as usize..reader.position() + quoted as usize; } fn push_attribute_name(&mut self, s: &str) { let current_attr = self.current_attribute.as_mut().unwrap(); current_attr.0.push_str(s); - current_attr.1.name_span.push_str(s); + current_attr.1.name_span.end += s.len(); } fn push_attribute_value(&mut self, s: &str) { let current_attr = self.current_attribute.as_mut().unwrap(); current_attr.1.value.push_str(s); - current_attr.1.value_span.push_str(s); + current_attr.1.value_span.end += s.len(); } fn set_doctype_public_identifier(&mut self, value: &str) { if let Some(Token::Doctype(Doctype { @@ -439,7 +442,7 @@ impl<R: Position<S::Offset>, S: Span> Emitter<R> for DefaultEmitter<R, S> { /// An HTML start tag, such as `<p>` or `<a>`. #[derive(Debug, Eq, PartialEq)] -pub struct StartTag<S> { +pub struct StartTag<O> { /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be /// expected. pub self_closing: bool, @@ -451,33 +454,33 @@ pub struct StartTag<S> { /// /// Duplicate attributes are ignored after the first one as per WHATWG spec. Implement your own /// [`Emitter`] to tweak this behavior. - pub attributes: BTreeMap<String, Attribute<S>>, + pub attributes: BTreeMap<String, Attribute<O>>, /// The source code span of the tag name. - pub name_span: S, + pub name_span: Range<O>, } /// A HTML attribute value (plus spans). #[derive(Debug, Eq, PartialEq)] -pub struct Attribute<S> { +pub struct Attribute<O> { /// The value of the attribute. pub value: String, /// The source code span of the attribute name. - pub name_span: S, + pub name_span: Range<O>, /// The source code span of the attribute value. - pub value_span: S, + pub value_span: Range<O>, } /// A HTML end/close tag, such as `</p>` or `</a>`. #[derive(Debug, Eq, PartialEq)] -pub struct EndTag<S> { +pub struct EndTag<O> { /// The ending tag's name, such as `"p"` or `"a"`. pub name: String, /// The source code span of the tag name. - pub name_span: S, + pub name_span: Range<O>, } /// A doctype. Some examples: @@ -504,11 +507,11 @@ pub struct Doctype { /// The token type used by default. You can define your own token type by implementing the /// [`Emitter`] trait. #[derive(Debug, Eq, PartialEq)] -pub enum Token<S> { +pub enum Token<O> { /// A HTML start tag. - StartTag(StartTag<S>), + StartTag(StartTag<O>), /// A HTML end tag. - EndTag(EndTag<S>), + EndTag(EndTag<O>), /// A literal string. String(String), /// A HTML comment. @@ -523,6 +526,6 @@ pub enum Token<S> { /// What kind of error occured. error: Error, /// The source code span of the error. - span: S, + span: Range<O>, }, } @@ -7,8 +7,8 @@ mod emitter; mod entities; mod error; mod machine; +pub mod offset; pub mod reader; -pub mod spans; mod tokenizer; mod utils; diff --git a/src/spans.rs b/src/offset.rs index 14392cd..f1f436d 100644 --- a/src/spans.rs +++ b/src/offset.rs @@ -1,19 +1,39 @@ -//! Source code spans. +//! Source code offsets. //! -//! The [`DefaultEmitter`](crate::DefaultEmitter) is generic over a [`Span`]. -//! This library comes with two Span implementations: +//! The [`DefaultEmitter`](crate::DefaultEmitter) is generic over an [`Offset`]. +//! This library comes with two Offset implementations: //! -//! * one for `()` which acts as the no-op implementation for when you don't want to track spans -//! * one for [`Range<usize>`] for when you do want to track spans +//! * [`NoopOffset`] for when you don't want to track source offsets +//! * `usize` for when you do want to track source offsets //! //! To use the latter your reader however has to implement [`Position<usize>`]. //! You can easily use any existing reader by wrapping it in the [`PosTrackingReader`] struct //! which implements the [`Position<usize>`] trait and takes care of tracking the current position. -use std::ops::{Add, Range}; +use std::fmt::Debug; +use std::ops::{Add, AddAssign, Sub}; use crate::reader::{IntoReader, Reader}; +/// A byte offset in the source. +pub trait Offset: + Default + + Copy + + Eq + + Ord + + Add<usize, Output = Self> + + Sub<usize, Output = Self> + + AddAssign<usize> + + Debug +{ +} + +impl Offset for usize {} + +impl Offset for NoopOffset {} + +/// A zero-sized no-op implementation of [`Offset`] (for when you don't want to track offsets). +#[derive(Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)] pub struct NoopOffset; /// A trait to be implemented by readers that track their own position. @@ -48,31 +68,10 @@ impl<R> PosTrackingReader<R> { impl<R> Position<usize> for PosTrackingReader<R> { fn position(&self) -> usize { - self.position + self.position - 1 } } -/// A byte range in the source code. -pub trait Span: Default + Clone { - type Offset: Add<usize, Output = Self::Offset>; - - /// Constructs a new span from the given byte offsets. - fn new(start: Self::Offset, end: Self::Offset) -> Self; - - /// Extends the span by the length of the given string. - fn push_str(&mut self, str: &str); -} - -impl Span for () { - type Offset = NoopOffset; - - fn new(_start: Self::Offset, _end: Self::Offset) -> Self { - () - } - - fn push_str(&mut self, _str: &str) {} -} - impl Add<usize> for NoopOffset { type Output = Self; @@ -81,16 +80,16 @@ impl Add<usize> for NoopOffset { } } -impl Span for Range<usize> { - type Offset = usize; +impl Sub<usize> for NoopOffset { + type Output = Self; - fn new(start: Self::Offset, end: Self::Offset) -> Self { - start - 1..end - 1 + fn sub(self, _rhs: usize) -> NoopOffset { + self } +} - fn push_str(&mut self, str: &str) { - self.end += str.len(); - } +impl AddAssign<usize> for NoopOffset { + fn add_assign(&mut self, _rhs: usize) {} } impl<R: Reader> Reader for PosTrackingReader<R> { diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 7b8b1ce..141efb9 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1,6 +1,6 @@ use crate::machine; +use crate::offset::NoopOffset; use crate::reader::{IntoReader, Reader}; -use crate::spans::Position; use crate::utils::{ control_pat, noncharacter_pat, surrogate_pat, ControlToken, State as InternalState, }; @@ -33,7 +33,7 @@ impl<T: Copy> Stack2<T> { } /// A HTML tokenizer. See crate-level docs for basic usage. -pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, ()>> { +pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, NoopOffset>> { eof: bool, pub(crate) state: InternalState, pub(crate) emitter: E, diff --git a/tests/test_spans.rs b/tests/test_spans.rs index f8e54a2..b41b1b9 100644 --- a/tests/test_spans.rs +++ b/tests/test_spans.rs @@ -6,14 +6,14 @@ use codespan_reporting::{ files::SimpleFiles, term::{self, termcolor::Buffer}, }; -use html5tokenizer::{spans::PosTrackingReader, DefaultEmitter, Token, Tokenizer}; +use html5tokenizer::{offset::PosTrackingReader, DefaultEmitter, Token, Tokenizer}; use insta::assert_snapshot; use pretty_assertions::assert_eq; -fn tokenizer(html: &'static str) -> impl Iterator<Item = Token<Range<usize>>> { +fn tokenizer(html: &'static str) -> impl Iterator<Item = Token<usize>> { Tokenizer::new( PosTrackingReader::new(html), - DefaultEmitter::<_, Range<usize>>::default(), + DefaultEmitter::<_, usize>::default(), ) .flatten() } |