From c169e78f120ea9be451f337306b8bff6c1fb4955 Mon Sep 17 00:00:00 2001 From: Martin Fischer Date: Thu, 17 Aug 2023 08:59:05 +0200 Subject: refactor!: remove Span trait, just use Range `std::mem::size_of::>()` is 0 so there's no need to abstract over Range. --- src/emitter.rs | 81 +++++++++++++++++++----------------- src/lib.rs | 2 +- src/offset.rs | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/spans.rs | 118 ---------------------------------------------------- src/tokenizer.rs | 4 +- tests/test_spans.rs | 6 +-- 6 files changed, 165 insertions(+), 163 deletions(-) create mode 100644 src/offset.rs delete mode 100644 src/spans.rs diff --git a/src/emitter.rs b/src/emitter.rs index 18b2539..b3fdb99 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -4,9 +4,11 @@ use std::collections::BTreeSet; use std::collections::VecDeque; use std::marker::PhantomData; use std::mem; +use std::ops::Range; -use crate::spans::Position; -use crate::spans::Span; +use crate::offset::NoopOffset; +use crate::offset::Offset; +use crate::offset::Position; use crate::Error; /// An emitter is an object providing methods to the tokenizer to produce ("emit") tokens. @@ -160,17 +162,17 @@ pub trait Emitter { } /// The default implementation of [`Emitter`], used to produce tokens. -pub struct DefaultEmitter { +pub struct DefaultEmitter { current_characters: String, - current_token: Option>, - current_attribute: Option<(String, Attribute)>, + current_token: Option>, + current_attribute: Option<(String, Attribute)>, seen_attributes: BTreeSet, - emitted_tokens: VecDeque>, + emitted_tokens: VecDeque>, reader: PhantomData, - attr_in_end_tag_span: Option, + attr_in_end_tag_span: Option>, } -impl Default for DefaultEmitter { +impl Default for DefaultEmitter { fn default() -> Self { DefaultEmitter { current_characters: String::new(), @@ -184,13 +186,16 @@ impl Default for DefaultEmitter { } } -impl DefaultEmitter { - fn emit_token(&mut self, token: Token) { +impl DefaultEmitter { + fn emit_token(&mut self, token: Token) { self.flush_current_characters(); self.emitted_tokens.push_front(token); } - fn flush_current_attribute(&mut self) { + fn flush_current_attribute(&mut self) + where + O: Clone, + { if let Some((k, v)) = self.current_attribute.take() { match self.current_token { Some(Token::StartTag(ref mut tag)) => match tag.attributes.entry(k) { @@ -223,22 +228,22 @@ impl DefaultEmitter { self.emit_token(Token::String(s)); } - fn push_error(&mut self, error: Error, span: S) { + fn push_error(&mut self, error: Error, span: Range) { // bypass character flushing in self.emit_token: we don't need the error location to be // that exact self.emitted_tokens.push_front(Token::Error { error, span }); } } -impl, S: Span> Emitter for DefaultEmitter { - type Token = Token; +impl> Emitter for DefaultEmitter { + type Token = Token; fn emit_eof(&mut self) { self.flush_current_characters(); } fn emit_error(&mut self, error: Error, reader: &R) { - self.push_error(error, S::new(reader.position(), reader.position())); + self.push_error(error, reader.position()..reader.position()); } fn pop_token(&mut self) -> Option { @@ -251,7 +256,7 @@ impl, S: Span> Emitter for DefaultEmitter { fn init_start_tag(&mut self, reader: &R) { self.current_token = Some(Token::StartTag(StartTag { - name_span: S::new(reader.position(), reader.position()), + name_span: reader.position()..reader.position(), self_closing: false, name: String::new(), attributes: Default::default(), @@ -259,7 +264,7 @@ impl, S: Span> Emitter for DefaultEmitter { } fn init_end_tag(&mut self, reader: &R) { self.current_token = Some(Token::EndTag(EndTag { - name_span: S::new(reader.position(), reader.position()), + name_span: reader.position()..reader.position(), name: String::new(), })); self.seen_attributes.clear(); @@ -327,7 +332,7 @@ impl, S: Span> Emitter for DefaultEmitter { .. })) => { name.push_str(s); - name_span.push_str(s); + name_span.end += s.len(); } Some(Token::EndTag(EndTag { ref mut name, @@ -335,7 +340,7 @@ impl, S: Span> Emitter for DefaultEmitter { .. })) => { name.push_str(s); - name_span.push_str(s); + name_span.end += s.len(); } _ => debug_assert!(false), } @@ -368,28 +373,26 @@ impl, S: Span> Emitter for DefaultEmitter { self.current_attribute = Some(( String::new(), Attribute { - name_span: S::new(reader.position(), reader.position()), + name_span: reader.position()..reader.position(), value: String::new(), - value_span: S::default(), + value_span: Range::default(), }, )); } fn init_attribute_value(&mut self, reader: &R, quoted: bool) { - self.current_attribute.as_mut().unwrap().1.value_span = S::new( - reader.position() + quoted as usize, - reader.position() + quoted as usize, - ); + self.current_attribute.as_mut().unwrap().1.value_span = + reader.position() + quoted as usize..reader.position() + quoted as usize; } fn push_attribute_name(&mut self, s: &str) { let current_attr = self.current_attribute.as_mut().unwrap(); current_attr.0.push_str(s); - current_attr.1.name_span.push_str(s); + current_attr.1.name_span.end += s.len(); } fn push_attribute_value(&mut self, s: &str) { let current_attr = self.current_attribute.as_mut().unwrap(); current_attr.1.value.push_str(s); - current_attr.1.value_span.push_str(s); + current_attr.1.value_span.end += s.len(); } fn set_doctype_public_identifier(&mut self, value: &str) { if let Some(Token::Doctype(Doctype { @@ -439,7 +442,7 @@ impl, S: Span> Emitter for DefaultEmitter { /// An HTML start tag, such as `

` or ``. #[derive(Debug, Eq, PartialEq)] -pub struct StartTag { +pub struct StartTag { /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be /// expected. pub self_closing: bool, @@ -451,33 +454,33 @@ pub struct StartTag { /// /// Duplicate attributes are ignored after the first one as per WHATWG spec. Implement your own /// [`Emitter`] to tweak this behavior. - pub attributes: BTreeMap>, + pub attributes: BTreeMap>, /// The source code span of the tag name. - pub name_span: S, + pub name_span: Range, } /// A HTML attribute value (plus spans). #[derive(Debug, Eq, PartialEq)] -pub struct Attribute { +pub struct Attribute { /// The value of the attribute. pub value: String, /// The source code span of the attribute name. - pub name_span: S, + pub name_span: Range, /// The source code span of the attribute value. - pub value_span: S, + pub value_span: Range, } /// A HTML end/close tag, such as `

` or ``. #[derive(Debug, Eq, PartialEq)] -pub struct EndTag { +pub struct EndTag { /// The ending tag's name, such as `"p"` or `"a"`. pub name: String, /// The source code span of the tag name. - pub name_span: S, + pub name_span: Range, } /// A doctype. Some examples: @@ -504,11 +507,11 @@ pub struct Doctype { /// The token type used by default. You can define your own token type by implementing the /// [`Emitter`] trait. #[derive(Debug, Eq, PartialEq)] -pub enum Token { +pub enum Token { /// A HTML start tag. - StartTag(StartTag), + StartTag(StartTag), /// A HTML end tag. - EndTag(EndTag), + EndTag(EndTag), /// A literal string. String(String), /// A HTML comment. @@ -523,6 +526,6 @@ pub enum Token { /// What kind of error occured. error: Error, /// The source code span of the error. - span: S, + span: Range, }, } diff --git a/src/lib.rs b/src/lib.rs index 12eb6a2..fd0349c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,8 +7,8 @@ mod emitter; mod entities; mod error; mod machine; +pub mod offset; pub mod reader; -pub mod spans; mod tokenizer; mod utils; diff --git a/src/offset.rs b/src/offset.rs new file mode 100644 index 0000000..f1f436d --- /dev/null +++ b/src/offset.rs @@ -0,0 +1,117 @@ +//! Source code offsets. +//! +//! The [`DefaultEmitter`](crate::DefaultEmitter) is generic over an [`Offset`]. +//! This library comes with two Offset implementations: +//! +//! * [`NoopOffset`] for when you don't want to track source offsets +//! * `usize` for when you do want to track source offsets +//! +//! To use the latter your reader however has to implement [`Position`]. +//! You can easily use any existing reader by wrapping it in the [`PosTrackingReader`] struct +//! which implements the [`Position`] trait and takes care of tracking the current position. + +use std::fmt::Debug; +use std::ops::{Add, AddAssign, Sub}; + +use crate::reader::{IntoReader, Reader}; + +/// A byte offset in the source. +pub trait Offset: + Default + + Copy + + Eq + + Ord + + Add + + Sub + + AddAssign + + Debug +{ +} + +impl Offset for usize {} + +impl Offset for NoopOffset {} + +/// A zero-sized no-op implementation of [`Offset`] (for when you don't want to track offsets). +#[derive(Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)] +pub struct NoopOffset; + +/// A trait to be implemented by readers that track their own position. +pub trait Position { + /// Returns the byte index of the current position. + fn position(&self) -> T; +} + +impl Position for R { + fn position(&self) -> NoopOffset { + NoopOffset + } +} + +/// Wraps a [`Reader`] so that it implements [`Position`]. +pub struct PosTrackingReader { + /// The wrapped reader. + reader: R, + /// The current position. + position: usize, +} + +impl PosTrackingReader { + /// Wraps the given [`Reader`] so that it implements [`Position`] with the position starting from 0. + pub fn new<'a>(into_reader: impl IntoReader<'a, Reader = R>) -> Self { + Self { + reader: into_reader.into_reader(), + position: 0, + } + } +} + +impl Position for PosTrackingReader { + fn position(&self) -> usize { + self.position - 1 + } +} + +impl Add for NoopOffset { + type Output = Self; + + fn add(self, _rhs: usize) -> NoopOffset { + self + } +} + +impl Sub for NoopOffset { + type Output = Self; + + fn sub(self, _rhs: usize) -> NoopOffset { + self + } +} + +impl AddAssign for NoopOffset { + fn add_assign(&mut self, _rhs: usize) {} +} + +impl Reader for PosTrackingReader { + type Error = R::Error; + + fn read_char(&mut self) -> Result, Self::Error> { + match self.reader.read_char()? { + Some(char) => { + self.position += char.len_utf8(); + Ok(Some(char)) + } + None => Ok(None), + } + } + + fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result { + match self.reader.try_read_string(s, case_sensitive)? { + true => { + self.position += s.len(); + Ok(true) + } + false => Ok(false), + } + } +} diff --git a/src/spans.rs b/src/spans.rs deleted file mode 100644 index 14392cd..0000000 --- a/src/spans.rs +++ /dev/null @@ -1,118 +0,0 @@ -//! Source code spans. -//! -//! The [`DefaultEmitter`](crate::DefaultEmitter) is generic over a [`Span`]. -//! This library comes with two Span implementations: -//! -//! * one for `()` which acts as the no-op implementation for when you don't want to track spans -//! * one for [`Range`] for when you do want to track spans -//! -//! To use the latter your reader however has to implement [`Position`]. -//! You can easily use any existing reader by wrapping it in the [`PosTrackingReader`] struct -//! which implements the [`Position`] trait and takes care of tracking the current position. - -use std::ops::{Add, Range}; - -use crate::reader::{IntoReader, Reader}; - -pub struct NoopOffset; - -/// A trait to be implemented by readers that track their own position. -pub trait Position { - /// Returns the byte index of the current position. - fn position(&self) -> T; -} - -impl Position for R { - fn position(&self) -> NoopOffset { - NoopOffset - } -} - -/// Wraps a [`Reader`] so that it implements [`Position`]. -pub struct PosTrackingReader { - /// The wrapped reader. - reader: R, - /// The current position. - position: usize, -} - -impl PosTrackingReader { - /// Wraps the given [`Reader`] so that it implements [`Position`] with the position starting from 0. - pub fn new<'a>(into_reader: impl IntoReader<'a, Reader = R>) -> Self { - Self { - reader: into_reader.into_reader(), - position: 0, - } - } -} - -impl Position for PosTrackingReader { - fn position(&self) -> usize { - self.position - } -} - -/// A byte range in the source code. -pub trait Span: Default + Clone { - type Offset: Add; - - /// Constructs a new span from the given byte offsets. - fn new(start: Self::Offset, end: Self::Offset) -> Self; - - /// Extends the span by the length of the given string. - fn push_str(&mut self, str: &str); -} - -impl Span for () { - type Offset = NoopOffset; - - fn new(_start: Self::Offset, _end: Self::Offset) -> Self { - () - } - - fn push_str(&mut self, _str: &str) {} -} - -impl Add for NoopOffset { - type Output = Self; - - fn add(self, _rhs: usize) -> NoopOffset { - self - } -} - -impl Span for Range { - type Offset = usize; - - fn new(start: Self::Offset, end: Self::Offset) -> Self { - start - 1..end - 1 - } - - fn push_str(&mut self, str: &str) { - self.end += str.len(); - } -} - -impl Reader for PosTrackingReader { - type Error = R::Error; - - fn read_char(&mut self) -> Result, Self::Error> { - match self.reader.read_char()? { - Some(char) => { - self.position += char.len_utf8(); - Ok(Some(char)) - } - None => Ok(None), - } - } - - fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result { - match self.reader.try_read_string(s, case_sensitive)? { - true => { - self.position += s.len(); - Ok(true) - } - false => Ok(false), - } - } -} diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 7b8b1ce..141efb9 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1,6 +1,6 @@ use crate::machine; +use crate::offset::NoopOffset; use crate::reader::{IntoReader, Reader}; -use crate::spans::Position; use crate::utils::{ control_pat, noncharacter_pat, surrogate_pat, ControlToken, State as InternalState, }; @@ -33,7 +33,7 @@ impl Stack2 { } /// A HTML tokenizer. See crate-level docs for basic usage. -pub struct Tokenizer = DefaultEmitter> { +pub struct Tokenizer = DefaultEmitter> { eof: bool, pub(crate) state: InternalState, pub(crate) emitter: E, diff --git a/tests/test_spans.rs b/tests/test_spans.rs index f8e54a2..b41b1b9 100644 --- a/tests/test_spans.rs +++ b/tests/test_spans.rs @@ -6,14 +6,14 @@ use codespan_reporting::{ files::SimpleFiles, term::{self, termcolor::Buffer}, }; -use html5tokenizer::{spans::PosTrackingReader, DefaultEmitter, Token, Tokenizer}; +use html5tokenizer::{offset::PosTrackingReader, DefaultEmitter, Token, Tokenizer}; use insta::assert_snapshot; use pretty_assertions::assert_eq; -fn tokenizer(html: &'static str) -> impl Iterator>> { +fn tokenizer(html: &'static str) -> impl Iterator> { Tokenizer::new( PosTrackingReader::new(html), - DefaultEmitter::<_, Range>::default(), + DefaultEmitter::<_, usize>::default(), ) .flatten() } -- cgit v1.2.3