diff options
author | Martin Fischer <martin@push-f.com> | 2023-08-17 09:40:47 +0200 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2023-08-19 13:41:55 +0200 |
commit | c15895d44d17984386d3684e2aa85aca386ba3bf (patch) | |
tree | a7c92e5eff97bd7645c7d309c8bf94ea891459ad | |
parent | d5c9a851756b1e84b022c2fbf984137aae68e2c9 (diff) |
refactor!: make Emitter generic over offset instead of reader
Emitters should not have access to the reader at all. Also, the
current position of the reader, at the time an Emitter method is
called, very much depends on machine implementation details such
as whether `Tokenizer::unread_char` is used. Having the Emitter
methods take offsets lets the machine take care of providing
the right offsets, as evidenced by the next commit.
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | examples/tokenize.rs | 2 | ||||
-rw-r--r-- | src/emitter.rs | 60 | ||||
-rw-r--r-- | src/machine.rs | 45 | ||||
-rw-r--r-- | src/offset.rs | 2 | ||||
-rw-r--r-- | src/tokenizer.rs | 27 | ||||
-rw-r--r-- | tests/test_spans.rs | 2 |
7 files changed, 73 insertions, 67 deletions
@@ -11,7 +11,7 @@ use std::fmt::Write; use html5tokenizer::{DefaultEmitter, Tokenizer, Token}; let html = "<title >hello world</title>"; -let emitter = DefaultEmitter::<_>::default(); +let emitter = DefaultEmitter::default(); let mut new_html = String::new(); for token in Tokenizer::new(html, emitter).flatten() { diff --git a/examples/tokenize.rs b/examples/tokenize.rs index 54ba0ec..da99dd3 100644 --- a/examples/tokenize.rs +++ b/examples/tokenize.rs @@ -7,7 +7,7 @@ use std::io::BufReader; fn main() { for token in Tokenizer::new( BufReader::new(std::io::stdin().lock()), - DefaultEmitter::<_>::default(), + DefaultEmitter::default(), ) .flatten() { diff --git a/src/emitter.rs b/src/emitter.rs index caf7b55..1f60f70 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -2,13 +2,11 @@ use std::collections::btree_map::Entry; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::collections::VecDeque; -use std::marker::PhantomData; use std::mem; use std::ops::Range; use crate::offset::NoopOffset; use crate::offset::Offset; -use crate::offset::Position; use crate::Error; /// An emitter is an object providing methods to the tokenizer to produce ("emit") tokens. @@ -29,7 +27,7 @@ use crate::Error; /// checks that would emit errors. /// /// * If you don't care about attributes at all, you can make all related methods a noop. -pub trait Emitter<R> { +pub trait Emitter<O> { /// The token type emitted by this emitter. This controls what type of values the [`Tokenizer`](crate::Tokenizer) /// yields when used as an iterator. type Token; @@ -39,7 +37,7 @@ pub trait Emitter<R> { fn emit_eof(&mut self); /// A (probably recoverable) parsing error has occured. - fn emit_error(&mut self, error: Error, reader: &R); + fn emit_error(&mut self, error: Error, offset: O); /// After every state change, the tokenizer calls this method to retrieve a new token that can /// be returned via the tokenizer's iterator interface. 
@@ -49,13 +47,13 @@ pub trait Emitter<R> { fn emit_string(&mut self, c: &str); /// Set the _current token_ to a start tag. - fn init_start_tag(&mut self, reader: &R); + fn init_start_tag(&mut self, offset: O); /// Set the _current token_ to an end tag. - fn init_end_tag(&mut self, reader: &R); + fn init_end_tag(&mut self, offset: O); /// Set the _current token_ to a comment. - fn init_comment(&mut self, reader: &R); + fn init_comment(&mut self, data_offset: O); /// Emit the _current token_, assuming it is a tag. /// @@ -84,7 +82,7 @@ pub trait Emitter<R> { /// /// If the current token is an end tag, the emitter should emit the /// [`Error::EndTagWithTrailingSolidus`] error. - fn set_self_closing(&mut self, reader: &R); + fn set_self_closing(&mut self, offset: O); /// Assuming the _current token_ is a doctype, set its "force quirks" flag to true. /// @@ -112,7 +110,7 @@ pub trait Emitter<R> { /// * the "public identifier" should be null (different from empty) /// * the "system identifier" should be null (different from empty) /// * the "force quirks" flag should be `false` - fn init_doctype(&mut self, reader: &R); + fn init_doctype(&mut self, offset: O); /// Set the _current attribute_ to a new one, starting with empty name and value strings. /// @@ -121,14 +119,14 @@ pub trait Emitter<R> { /// [`Error::DuplicateAttribute`] error should be emitted. /// /// If the current token is no tag at all, this method may panic. - fn init_attribute_name(&mut self, reader: &R); + fn init_attribute_name(&mut self, offset: O); /// Called before the first push_attribute_value call. /// If the value is wrappend in double or single quotes `quoted` is set to true, otherwise false. /// /// If there is no current attribute, this method may panic. #[allow(unused_variables)] - fn init_attribute_value(&mut self, reader: &R, quoted: bool) {} + fn init_attribute_value(&mut self, offset: O, quoted: bool) {} /// Append a string to the current attribute's name. 
/// @@ -162,17 +160,16 @@ pub trait Emitter<R> { } /// The default implementation of [`Emitter`], used to produce tokens. -pub struct DefaultEmitter<R, O = NoopOffset> { +pub struct DefaultEmitter<O = NoopOffset> { current_characters: String, current_token: Option<Token<O>>, current_attribute: Option<(String, Attribute<O>)>, seen_attributes: BTreeSet<String>, emitted_tokens: VecDeque<Token<O>>, - reader: PhantomData<R>, attr_in_end_tag_span: Option<Range<O>>, } -impl<R, O> Default for DefaultEmitter<R, O> { +impl<O> Default for DefaultEmitter<O> { fn default() -> Self { DefaultEmitter { current_characters: String::new(), @@ -180,13 +177,12 @@ impl<R, O> Default for DefaultEmitter<R, O> { current_attribute: None, seen_attributes: BTreeSet::new(), emitted_tokens: VecDeque::new(), - reader: PhantomData::default(), attr_in_end_tag_span: None, } } } -impl<R, O> DefaultEmitter<R, O> { +impl<O> DefaultEmitter<O> { fn emit_token(&mut self, token: Token<O>) { self.flush_current_characters(); self.emitted_tokens.push_front(token); @@ -235,15 +231,15 @@ impl<R, O> DefaultEmitter<R, O> { } } -impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> { +impl<O: Offset> Emitter<O> for DefaultEmitter<O> { type Token = Token<O>; fn emit_eof(&mut self) { self.flush_current_characters(); } - fn emit_error(&mut self, error: Error, reader: &R) { - self.push_error(error, reader.position()..reader.position()); + fn emit_error(&mut self, error: Error, offset: O) { + self.push_error(error, offset..offset); } fn pop_token(&mut self) -> Option<Self::Token> { @@ -254,26 +250,26 @@ impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> { self.current_characters.push_str(s); } - fn init_start_tag(&mut self, reader: &R) { + fn init_start_tag(&mut self, offset: O) { self.current_token = Some(Token::StartTag(StartTag { - name_span: reader.position()..reader.position(), + name_span: offset..offset, self_closing: false, name: String::new(), attributes: Default::default(), 
})); } - fn init_end_tag(&mut self, reader: &R) { + fn init_end_tag(&mut self, offset: O) { self.current_token = Some(Token::EndTag(EndTag { - name_span: reader.position()..reader.position(), + name_span: offset..offset, name: String::new(), })); self.seen_attributes.clear(); } - fn init_comment(&mut self, reader: &R) { + fn init_comment(&mut self, data_offset: O) { self.current_token = Some(Token::Comment(Comment { data: String::new(), - data_offset: reader.position(), + data_offset, })); } fn emit_current_tag(&mut self) { @@ -304,7 +300,7 @@ impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> { self.emit_token(doctype); } - fn set_self_closing(&mut self, reader: &R) { + fn set_self_closing(&mut self, offset: O) { let tag = self.current_token.as_mut().unwrap(); match tag { Token::StartTag(StartTag { @@ -314,7 +310,7 @@ impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> { *self_closing = true; } Token::EndTag(_) => { - self.emit_error(Error::EndTagWithTrailingSolidus, reader); + self.emit_error(Error::EndTagWithTrailingSolidus, offset); } _ => { debug_assert!(false); @@ -362,7 +358,7 @@ impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> { _ => debug_assert!(false), } } - fn init_doctype(&mut self, _reader: &R) { + fn init_doctype(&mut self, _offset: O) { self.current_token = Some(Token::Doctype(Doctype { name: String::new(), force_quirks: false, @@ -371,20 +367,20 @@ impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> { })); } - fn init_attribute_name(&mut self, reader: &R) { + fn init_attribute_name(&mut self, offset: O) { self.flush_current_attribute(); self.current_attribute = Some(( String::new(), Attribute { - name_span: reader.position()..reader.position(), + name_span: offset..offset, value: String::new(), value_span: Range::default(), }, )); } - fn init_attribute_value(&mut self, reader: &R, quoted: bool) { + fn init_attribute_value(&mut self, offset: O, quoted: bool) { 
self.current_attribute.as_mut().unwrap().1.value_span = - reader.position() + quoted as usize..reader.position() + quoted as usize; + offset + quoted as usize..offset + quoted as usize; } fn push_attribute_name(&mut self, s: &str) { diff --git a/src/machine.rs b/src/machine.rs index c11720d..deb3983 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -1,4 +1,5 @@ use crate::entities::try_read_character_reference; +use crate::offset::{Offset, Position}; use crate::utils::{ ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat, ControlToken, State, @@ -8,10 +9,11 @@ use crate::{reader::Reader, Emitter, Error, Tokenizer}; // Note: This is not implemented as a method on Tokenizer because there's fields on Tokenizer that // should not be available in this method, such as Tokenizer.to_reconsume or the Reader instance #[inline] -pub fn consume<R, E>(slf: &mut Tokenizer<R, E>) -> Result<ControlToken, R::Error> +pub fn consume<O, R, E>(slf: &mut Tokenizer<R, O, E>) -> Result<ControlToken, R::Error> where - R: Reader, - E: Emitter<R>, + O: Offset, + R: Reader + Position<O>, + E: Emitter<O>, { macro_rules! 
mutate_character_reference { (* $mul:literal + $x:ident - $sub:literal) => { @@ -133,7 +135,7 @@ where } c @ Some('?') => { slf.emit_error(Error::UnexpectedQuestionMarkInsteadOfTagName); - slf.emitter.init_comment(&slf.reader); + slf.emitter.init_comment(slf.reader.position()); slf.state = State::BogusComment; slf.unread_char(c); Ok(ControlToken::Continue) @@ -170,7 +172,7 @@ where } Some(x) => { slf.emit_error(Error::InvalidFirstCharacterOfTagName); - slf.emitter.init_comment(&slf.reader); + slf.emitter.init_comment(slf.reader.position()); slf.state = State::BogusComment; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -686,13 +688,13 @@ where } Some('=') => { slf.emit_error(Error::UnexpectedEqualsSignBeforeAttributeName); - slf.emitter.init_attribute_name(&slf.reader); + slf.emitter.init_attribute_name(slf.reader.position()); slf.emitter.push_attribute_name("="); slf.state = State::AttributeName; Ok(ControlToken::Continue) } Some(x) => { - slf.emitter.init_attribute_name(&slf.reader); + slf.emitter.init_attribute_name(slf.reader.position()); slf.state = State::AttributeName; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -745,7 +747,7 @@ where Ok(ControlToken::Eof) } Some(x) => { - slf.emitter.init_attribute_name(&slf.reader); + slf.emitter.init_attribute_name(slf.reader.position()); slf.state = State::AttributeName; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -754,12 +756,14 @@ where State::BeforeAttributeValue => match slf.read_char()? 
{ Some(whitespace_pat!()) => Ok(ControlToken::Continue), Some('"') => { - slf.emitter.init_attribute_value(&slf.reader, true); + slf.emitter + .init_attribute_value(slf.reader.position(), true); slf.state = State::AttributeValueDoubleQuoted; Ok(ControlToken::Continue) } Some('\'') => { - slf.emitter.init_attribute_value(&slf.reader, true); + slf.emitter + .init_attribute_value(slf.reader.position(), true); slf.state = State::AttributeValueSingleQuoted; Ok(ControlToken::Continue) } @@ -770,7 +774,8 @@ where Ok(ControlToken::Continue) } c => { - slf.emitter.init_attribute_value(&slf.reader, false); + slf.emitter + .init_attribute_value(slf.reader.position(), false); slf.state = State::AttributeValueUnquoted; slf.unread_char(c); Ok(ControlToken::Continue) @@ -885,7 +890,7 @@ where }, State::SelfClosingStartTag => match slf.read_char()? { Some('>') => { - slf.emitter.set_self_closing(&slf.reader); + slf.emitter.set_self_closing(slf.reader.position()); slf.state = State::Data; slf.emit_current_tag(); Ok(ControlToken::Continue) @@ -923,7 +928,7 @@ where }, State::MarkupDeclarationOpen => match slf.read_char()? { Some('-') if slf.try_read_string("-", true)? 
=> { - slf.emitter.init_comment(&slf.reader); + slf.emitter.init_comment(slf.reader.position()); slf.state = State::CommentStart; Ok(ControlToken::Continue) } @@ -940,14 +945,14 @@ where // let's hope that bogus comment can just sort of skip over cdata slf.emit_error(Error::CdataInHtmlContent); - slf.emitter.init_comment(&slf.reader); + slf.emitter.init_comment(slf.reader.position()); slf.emitter.push_comment("[CDATA["); slf.state = State::BogusComment; Ok(ControlToken::Continue) } c => { slf.emit_error(Error::IncorrectlyOpenedComment); - slf.emitter.init_comment(&slf.reader); + slf.emitter.init_comment(slf.reader.position()); slf.state = State::BogusComment; slf.unread_char(c); Ok(ControlToken::Continue) @@ -1153,7 +1158,7 @@ where } None => { slf.emit_error(Error::EofInDoctype); - slf.emitter.init_doctype(&slf.reader); + slf.emitter.init_doctype(slf.reader.position()); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); Ok(ControlToken::Eof) @@ -1169,14 +1174,14 @@ where Some(whitespace_pat!()) => Ok(ControlToken::Continue), Some('\0') => { slf.emit_error(Error::UnexpectedNullCharacter); - slf.emitter.init_doctype(&slf.reader); + slf.emitter.init_doctype(slf.reader.position()); slf.emitter.push_doctype_name("\u{fffd}"); slf.state = State::DoctypeName; Ok(ControlToken::Continue) } Some('>') => { slf.emit_error(Error::MissingDoctypeName); - slf.emitter.init_doctype(&slf.reader); + slf.emitter.init_doctype(slf.reader.position()); slf.emitter.set_force_quirks(); slf.state = State::Data; slf.emitter.emit_current_doctype(); @@ -1184,13 +1189,13 @@ where } None => { slf.emit_error(Error::EofInDoctype); - slf.emitter.init_doctype(&slf.reader); + slf.emitter.init_doctype(slf.reader.position()); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); Ok(ControlToken::Eof) } Some(x) => { - slf.emitter.init_doctype(&slf.reader); + slf.emitter.init_doctype(slf.reader.position()); slf.emitter .push_doctype_name(ctostr!(x.to_ascii_lowercase())); 
slf.state = State::DoctypeName; diff --git a/src/offset.rs b/src/offset.rs index f1f436d..8809366 100644 --- a/src/offset.rs +++ b/src/offset.rs @@ -1,6 +1,6 @@ //! Source code offsets. //! -//! The [`DefaultEmitter`](crate::DefaultEmitter) is generic over an [`Offset`]. +//! The [`Emitter`](crate::Emitter) is generic over an [`Offset`]. //! This library comes with two Offset implementations: //! //! * [`NoopOffset`] for when you don't want to track source offsets diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 7eb33f7..02a4d62 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1,5 +1,7 @@ +use std::marker::PhantomData; + use crate::machine; -use crate::offset::NoopOffset; +use crate::offset::{NoopOffset, Offset, Position}; use crate::reader::{IntoReader, Reader}; use crate::utils::{ control_pat, noncharacter_pat, surrogate_pat, ControlToken, State as InternalState, @@ -33,12 +35,13 @@ impl<T: Copy> Stack2<T> { } /// A HTML tokenizer. See crate-level docs for basic usage. -pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, NoopOffset>> { +pub struct Tokenizer<R: Reader, O = NoopOffset, E: Emitter<O> = DefaultEmitter<O>> { eof: bool, pub(crate) state: InternalState, pub(crate) emitter: E, pub(crate) temporary_buffer: String, pub(crate) reader: R, + _offset: PhantomData<O>, to_reconsume: Stack2<Option<char>>, pub(crate) character_reference_code: u32, pub(crate) return_state: Option<InternalState>, @@ -47,7 +50,7 @@ pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, NoopOffset>> { is_start_tag: bool, } -impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { +impl<R: Reader, O, E: Emitter<O>> Tokenizer<R, O, E> { /// Creates a new tokenizer from some input and an emitter. 
/// /// TODO: add warning about you needing to do the state switching @@ -55,6 +58,7 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { Tokenizer { reader: reader.into_reader(), emitter, + _offset: PhantomData, state: InternalState::Data, to_reconsume: Stack2::default(), return_state: None, @@ -102,7 +106,7 @@ impl From<State> for InternalState { } } -impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { +impl<R: Reader + Position<O>, O, E: Emitter<O>> Tokenizer<R, O, E> { /// Test-internal function to override internal state. /// /// Only available with the `integration-tests` feature which is not public API. @@ -119,7 +123,7 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { /// Just a helper method for the machine. #[inline] pub(crate) fn emit_error(&mut self, error: Error) { - self.emitter.emit_error(error, &self.reader); + self.emitter.emit_error(error, self.reader.position()); } /// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise. @@ -136,14 +140,14 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { #[inline] pub(crate) fn init_start_tag(&mut self) { - self.emitter.init_start_tag(&self.reader); + self.emitter.init_start_tag(self.reader.position()); self.current_tag_name.clear(); self.is_start_tag = true; } #[inline] pub(crate) fn init_end_tag(&mut self) { - self.emitter.init_end_tag(&self.reader); + self.emitter.init_end_tag(self.reader.position()); self.current_tag_name.clear(); self.is_start_tag = false; } @@ -270,10 +274,11 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { } } -impl<R, E> Iterator for Tokenizer<R, E> +impl<O, R, E> Iterator for Tokenizer<R, O, E> where - R: Reader, - E: Emitter<R>, + O: Offset, + R: Reader + Position<O>, + E: Emitter<O>, { type Item = Result<E::Token, R::Error>; @@ -297,7 +302,7 @@ where } } -impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { +impl<R: Reader, O, E: Emitter<O>> Tokenizer<R, O, E> { /// Test-internal function to override internal state. 
/// /// Only available with the `integration-tests` feature which is not public API. diff --git a/tests/test_spans.rs b/tests/test_spans.rs index 21882a3..970099a 100644 --- a/tests/test_spans.rs +++ b/tests/test_spans.rs @@ -13,7 +13,7 @@ use pretty_assertions::assert_eq; fn tokenizer(html: &'static str) -> impl Iterator<Item = Token<usize>> { Tokenizer::new( PosTrackingReader::new(html), - DefaultEmitter::<_, usize>::default(), + DefaultEmitter::<usize>::default(), ) .flatten() } |