diff options
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | examples/tokenize.rs | 2 | ||||
-rw-r--r-- | src/emitter.rs | 60 | ||||
-rw-r--r-- | src/machine.rs | 45 | ||||
-rw-r--r-- | src/offset.rs | 2 | ||||
-rw-r--r-- | src/tokenizer.rs | 27 | ||||
-rw-r--r-- | tests/test_spans.rs | 2 |
7 files changed, 73 insertions, 67 deletions
@@ -11,7 +11,7 @@ use std::fmt::Write; use html5tokenizer::{DefaultEmitter, Tokenizer, Token}; let html = "<title >hello world</title>"; -let emitter = DefaultEmitter::<_>::default(); +let emitter = DefaultEmitter::default(); let mut new_html = String::new(); for token in Tokenizer::new(html, emitter).flatten() { diff --git a/examples/tokenize.rs b/examples/tokenize.rs index 54ba0ec..da99dd3 100644 --- a/examples/tokenize.rs +++ b/examples/tokenize.rs @@ -7,7 +7,7 @@ use std::io::BufReader; fn main() { for token in Tokenizer::new( BufReader::new(std::io::stdin().lock()), - DefaultEmitter::<_>::default(), + DefaultEmitter::default(), ) .flatten() { diff --git a/src/emitter.rs b/src/emitter.rs index caf7b55..1f60f70 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -2,13 +2,11 @@ use std::collections::btree_map::Entry; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::collections::VecDeque; -use std::marker::PhantomData; use std::mem; use std::ops::Range; use crate::offset::NoopOffset; use crate::offset::Offset; -use crate::offset::Position; use crate::Error; /// An emitter is an object providing methods to the tokenizer to produce ("emit") tokens. @@ -29,7 +27,7 @@ use crate::Error; /// checks that would emit errors. /// /// * If you don't care about attributes at all, you can make all related methods a noop. -pub trait Emitter<R> { +pub trait Emitter<O> { /// The token type emitted by this emitter. This controls what type of values the [`Tokenizer`](crate::Tokenizer) /// yields when used as an iterator. type Token; @@ -39,7 +37,7 @@ pub trait Emitter<R> { fn emit_eof(&mut self); /// A (probably recoverable) parsing error has occured. - fn emit_error(&mut self, error: Error, reader: &R); + fn emit_error(&mut self, error: Error, offset: O); /// After every state change, the tokenizer calls this method to retrieve a new token that can /// be returned via the tokenizer's iterator interface. @@ -49,13 +47,13 @@ pub trait Emitter<R> { fn emit_string(&mut self, c: &str); /// Set the _current token_ to a start tag. - fn init_start_tag(&mut self, reader: &R); + fn init_start_tag(&mut self, offset: O); /// Set the _current token_ to an end tag. - fn init_end_tag(&mut self, reader: &R); + fn init_end_tag(&mut self, offset: O); /// Set the _current token_ to a comment. - fn init_comment(&mut self, reader: &R); + fn init_comment(&mut self, data_offset: O); /// Emit the _current token_, assuming it is a tag. /// @@ -84,7 +82,7 @@ pub trait Emitter<R> { /// /// If the current token is an end tag, the emitter should emit the /// [`Error::EndTagWithTrailingSolidus`] error. - fn set_self_closing(&mut self, reader: &R); + fn set_self_closing(&mut self, offset: O); /// Assuming the _current token_ is a doctype, set its "force quirks" flag to true. /// @@ -112,7 +110,7 @@ pub trait Emitter<R> { /// * the "public identifier" should be null (different from empty) /// * the "system identifier" should be null (different from empty) /// * the "force quirks" flag should be `false` - fn init_doctype(&mut self, reader: &R); + fn init_doctype(&mut self, offset: O); /// Set the _current attribute_ to a new one, starting with empty name and value strings. /// @@ -121,14 +119,14 @@ pub trait Emitter<R> { /// [`Error::DuplicateAttribute`] error should be emitted. /// /// If the current token is no tag at all, this method may panic. - fn init_attribute_name(&mut self, reader: &R); + fn init_attribute_name(&mut self, offset: O); /// Called before the first push_attribute_value call. /// If the value is wrappend in double or single quotes `quoted` is set to true, otherwise false. /// /// If there is no current attribute, this method may panic. #[allow(unused_variables)] - fn init_attribute_value(&mut self, reader: &R, quoted: bool) {} + fn init_attribute_value(&mut self, offset: O, quoted: bool) {} /// Append a string to the current attribute's name. /// @@ -162,17 +160,16 @@ pub trait Emitter<R> { } /// The default implementation of [`Emitter`], used to produce tokens. -pub struct DefaultEmitter<R, O = NoopOffset> { +pub struct DefaultEmitter<O = NoopOffset> { current_characters: String, current_token: Option<Token<O>>, current_attribute: Option<(String, Attribute<O>)>, seen_attributes: BTreeSet<String>, emitted_tokens: VecDeque<Token<O>>, - reader: PhantomData<R>, attr_in_end_tag_span: Option<Range<O>>, } -impl<R, O> Default for DefaultEmitter<R, O> { +impl<O> Default for DefaultEmitter<O> { fn default() -> Self { DefaultEmitter { current_characters: String::new(), @@ -180,13 +177,12 @@ impl<R, O> Default for DefaultEmitter<R, O> { current_attribute: None, seen_attributes: BTreeSet::new(), emitted_tokens: VecDeque::new(), - reader: PhantomData::default(), attr_in_end_tag_span: None, } } } -impl<R, O> DefaultEmitter<R, O> { +impl<O> DefaultEmitter<O> { fn emit_token(&mut self, token: Token<O>) { self.flush_current_characters(); self.emitted_tokens.push_front(token); @@ -235,15 +231,15 @@ impl<R, O> DefaultEmitter<R, O> { } } -impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> { +impl<O: Offset> Emitter<O> for DefaultEmitter<O> { type Token = Token<O>; fn emit_eof(&mut self) { self.flush_current_characters(); } - fn emit_error(&mut self, error: Error, reader: &R) { - self.push_error(error, reader.position()..reader.position()); + fn emit_error(&mut self, error: Error, offset: O) { + self.push_error(error, offset..offset); } fn pop_token(&mut self) -> Option<Self::Token> { @@ -254,26 +250,26 @@ impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> { self.current_characters.push_str(s); } - fn init_start_tag(&mut self, reader: &R) { + fn init_start_tag(&mut self, offset: O) { self.current_token = Some(Token::StartTag(StartTag { - name_span: reader.position()..reader.position(), + name_span: offset..offset, self_closing: false, name: String::new(), attributes: Default::default(), })); } - fn init_end_tag(&mut self, reader: &R) { + fn init_end_tag(&mut self, offset: O) { self.current_token = Some(Token::EndTag(EndTag { - name_span: reader.position()..reader.position(), + name_span: offset..offset, name: String::new(), })); self.seen_attributes.clear(); } - fn init_comment(&mut self, reader: &R) { + fn init_comment(&mut self, data_offset: O) { self.current_token = Some(Token::Comment(Comment { data: String::new(), - data_offset: reader.position(), + data_offset, })); } fn emit_current_tag(&mut self) { @@ -304,7 +300,7 @@ impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> { self.emit_token(doctype); } - fn set_self_closing(&mut self, reader: &R) { + fn set_self_closing(&mut self, offset: O) { let tag = self.current_token.as_mut().unwrap(); match tag { Token::StartTag(StartTag { @@ -314,7 +310,7 @@ impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> { *self_closing = true; } Token::EndTag(_) => { - self.emit_error(Error::EndTagWithTrailingSolidus, reader); + self.emit_error(Error::EndTagWithTrailingSolidus, offset); } _ => { debug_assert!(false); @@ -362,7 +358,7 @@ impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> { _ => debug_assert!(false), } } - fn init_doctype(&mut self, _reader: &R) { + fn init_doctype(&mut self, _offset: O) { self.current_token = Some(Token::Doctype(Doctype { name: String::new(), force_quirks: false, @@ -371,20 +367,20 @@ impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> { })); } - fn init_attribute_name(&mut self, reader: &R) { + fn init_attribute_name(&mut self, offset: O) { self.flush_current_attribute(); self.current_attribute = Some(( String::new(), Attribute { - name_span: reader.position()..reader.position(), + name_span: offset..offset, value: String::new(), value_span: Range::default(), }, )); } - fn init_attribute_value(&mut self, reader: &R, quoted: bool) { + fn init_attribute_value(&mut self, offset: O, quoted: bool) { self.current_attribute.as_mut().unwrap().1.value_span = - reader.position() + quoted as usize..reader.position() + quoted as usize; + offset + quoted as usize..offset + quoted as usize; } fn push_attribute_name(&mut self, s: &str) { diff --git a/src/machine.rs b/src/machine.rs index c11720d..deb3983 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -1,4 +1,5 @@ use crate::entities::try_read_character_reference; +use crate::offset::{Offset, Position}; use crate::utils::{ ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat, ControlToken, State, @@ -8,10 +9,11 @@ use crate::{reader::Reader, Emitter, Error, Tokenizer}; // Note: This is not implemented as a method on Tokenizer because there's fields on Tokenizer that // should not be available in this method, such as Tokenizer.to_reconsume or the Reader instance #[inline] -pub fn consume<R, E>(slf: &mut Tokenizer<R, E>) -> Result<ControlToken, R::Error> +pub fn consume<O, R, E>(slf: &mut Tokenizer<R, O, E>) -> Result<ControlToken, R::Error> where - R: Reader, - E: Emitter<R>, + O: Offset, + R: Reader + Position<O>, + E: Emitter<O>, { macro_rules! mutate_character_reference { (* $mul:literal + $x:ident - $sub:literal) => { @@ -133,7 +135,7 @@ where } c @ Some('?') => { slf.emit_error(Error::UnexpectedQuestionMarkInsteadOfTagName); - slf.emitter.init_comment(&slf.reader); + slf.emitter.init_comment(slf.reader.position()); slf.state = State::BogusComment; slf.unread_char(c); Ok(ControlToken::Continue) @@ -170,7 +172,7 @@ where } Some(x) => { slf.emit_error(Error::InvalidFirstCharacterOfTagName); - slf.emitter.init_comment(&slf.reader); + slf.emitter.init_comment(slf.reader.position()); slf.state = State::BogusComment; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -686,13 +688,13 @@ where } Some('=') => { slf.emit_error(Error::UnexpectedEqualsSignBeforeAttributeName); - slf.emitter.init_attribute_name(&slf.reader); + slf.emitter.init_attribute_name(slf.reader.position()); slf.emitter.push_attribute_name("="); slf.state = State::AttributeName; Ok(ControlToken::Continue) } Some(x) => { - slf.emitter.init_attribute_name(&slf.reader); + slf.emitter.init_attribute_name(slf.reader.position()); slf.state = State::AttributeName; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -745,7 +747,7 @@ where Ok(ControlToken::Eof) } Some(x) => { - slf.emitter.init_attribute_name(&slf.reader); + slf.emitter.init_attribute_name(slf.reader.position()); slf.state = State::AttributeName; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -754,12 +756,14 @@ where State::BeforeAttributeValue => match slf.read_char()? { Some(whitespace_pat!()) => Ok(ControlToken::Continue), Some('"') => { - slf.emitter.init_attribute_value(&slf.reader, true); + slf.emitter + .init_attribute_value(slf.reader.position(), true); slf.state = State::AttributeValueDoubleQuoted; Ok(ControlToken::Continue) } Some('\'') => { - slf.emitter.init_attribute_value(&slf.reader, true); + slf.emitter + .init_attribute_value(slf.reader.position(), true); slf.state = State::AttributeValueSingleQuoted; Ok(ControlToken::Continue) } @@ -770,7 +774,8 @@ where Ok(ControlToken::Continue) } c => { - slf.emitter.init_attribute_value(&slf.reader, false); + slf.emitter + .init_attribute_value(slf.reader.position(), false); slf.state = State::AttributeValueUnquoted; slf.unread_char(c); Ok(ControlToken::Continue) @@ -885,7 +890,7 @@ where }, State::SelfClosingStartTag => match slf.read_char()? { Some('>') => { - slf.emitter.set_self_closing(&slf.reader); + slf.emitter.set_self_closing(slf.reader.position()); slf.state = State::Data; slf.emit_current_tag(); Ok(ControlToken::Continue) @@ -923,7 +928,7 @@ where }, State::MarkupDeclarationOpen => match slf.read_char()? { Some('-') if slf.try_read_string("-", true)? => { - slf.emitter.init_comment(&slf.reader); + slf.emitter.init_comment(slf.reader.position()); slf.state = State::CommentStart; Ok(ControlToken::Continue) } @@ -940,14 +945,14 @@ where // let's hope that bogus comment can just sort of skip over cdata slf.emit_error(Error::CdataInHtmlContent); - slf.emitter.init_comment(&slf.reader); + slf.emitter.init_comment(slf.reader.position()); slf.emitter.push_comment("[CDATA["); slf.state = State::BogusComment; Ok(ControlToken::Continue) } c => { slf.emit_error(Error::IncorrectlyOpenedComment); - slf.emitter.init_comment(&slf.reader); + slf.emitter.init_comment(slf.reader.position()); slf.state = State::BogusComment; slf.unread_char(c); Ok(ControlToken::Continue) @@ -1153,7 +1158,7 @@ where } None => { slf.emit_error(Error::EofInDoctype); - slf.emitter.init_doctype(&slf.reader); + slf.emitter.init_doctype(slf.reader.position()); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); Ok(ControlToken::Eof) @@ -1169,14 +1174,14 @@ where Some(whitespace_pat!()) => Ok(ControlToken::Continue), Some('\0') => { slf.emit_error(Error::UnexpectedNullCharacter); - slf.emitter.init_doctype(&slf.reader); + slf.emitter.init_doctype(slf.reader.position()); slf.emitter.push_doctype_name("\u{fffd}"); slf.state = State::DoctypeName; Ok(ControlToken::Continue) } Some('>') => { slf.emit_error(Error::MissingDoctypeName); - slf.emitter.init_doctype(&slf.reader); + slf.emitter.init_doctype(slf.reader.position()); slf.emitter.set_force_quirks(); slf.state = State::Data; slf.emitter.emit_current_doctype(); @@ -1184,13 +1189,13 @@ where } None => { slf.emit_error(Error::EofInDoctype); - slf.emitter.init_doctype(&slf.reader); + slf.emitter.init_doctype(slf.reader.position()); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); Ok(ControlToken::Eof) } Some(x) => { - slf.emitter.init_doctype(&slf.reader); + slf.emitter.init_doctype(slf.reader.position()); slf.emitter .push_doctype_name(ctostr!(x.to_ascii_lowercase())); slf.state = State::DoctypeName; diff --git a/src/offset.rs b/src/offset.rs index f1f436d..8809366 100644 --- a/src/offset.rs +++ b/src/offset.rs @@ -1,6 +1,6 @@ //! Source code offsets. //! -//! The [`DefaultEmitter`](crate::DefaultEmitter) is generic over an [`Offset`]. +//! The [`Emitter`](crate::Emitter) is generic over an [`Offset`]. //! This library comes with two Offset implementations: //! //! * [`NoopOffset`] for when you don't want to track source offsets diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 7eb33f7..02a4d62 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1,5 +1,7 @@ +use std::marker::PhantomData; + use crate::machine; -use crate::offset::NoopOffset; +use crate::offset::{NoopOffset, Offset, Position}; use crate::reader::{IntoReader, Reader}; use crate::utils::{ control_pat, noncharacter_pat, surrogate_pat, ControlToken, State as InternalState, @@ -33,12 +35,13 @@ impl<T: Copy> Stack2<T> { } /// A HTML tokenizer. See crate-level docs for basic usage. -pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, NoopOffset>> { +pub struct Tokenizer<R: Reader, O = NoopOffset, E: Emitter<O> = DefaultEmitter<O>> { eof: bool, pub(crate) state: InternalState, pub(crate) emitter: E, pub(crate) temporary_buffer: String, pub(crate) reader: R, + _offset: PhantomData<O>, to_reconsume: Stack2<Option<char>>, pub(crate) character_reference_code: u32, pub(crate) return_state: Option<InternalState>, @@ -47,7 +50,7 @@ pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, NoopOffset>> { is_start_tag: bool, } -impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { +impl<R: Reader, O, E: Emitter<O>> Tokenizer<R, O, E> { /// Creates a new tokenizer from some input and an emitter. /// /// TODO: add warning about you needing to do the state switching @@ -55,6 +58,7 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { Tokenizer { reader: reader.into_reader(), emitter, + _offset: PhantomData, state: InternalState::Data, to_reconsume: Stack2::default(), return_state: None, @@ -102,7 +106,7 @@ impl From<State> for InternalState { } } -impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { +impl<R: Reader + Position<O>, O, E: Emitter<O>> Tokenizer<R, O, E> { /// Test-internal function to override internal state. /// /// Only available with the `integration-tests` feature which is not public API. @@ -119,7 +123,7 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { /// Just a helper method for the machine. #[inline] pub(crate) fn emit_error(&mut self, error: Error) { - self.emitter.emit_error(error, &self.reader); + self.emitter.emit_error(error, self.reader.position()); } /// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise. @@ -136,14 +140,14 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { #[inline] pub(crate) fn init_start_tag(&mut self) { - self.emitter.init_start_tag(&self.reader); + self.emitter.init_start_tag(self.reader.position()); self.current_tag_name.clear(); self.is_start_tag = true; } #[inline] pub(crate) fn init_end_tag(&mut self) { - self.emitter.init_end_tag(&self.reader); + self.emitter.init_end_tag(self.reader.position()); self.current_tag_name.clear(); self.is_start_tag = false; } @@ -270,10 +274,11 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { } } -impl<R, E> Iterator for Tokenizer<R, E> +impl<O, R, E> Iterator for Tokenizer<R, O, E> where - R: Reader, - E: Emitter<R>, + O: Offset, + R: Reader + Position<O>, + E: Emitter<O>, { type Item = Result<E::Token, R::Error>; @@ -297,7 +302,7 @@ where } } -impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { +impl<R: Reader, O, E: Emitter<O>> Tokenizer<R, O, E> { /// Test-internal function to override internal state. /// /// Only available with the `integration-tests` feature which is not public API. diff --git a/tests/test_spans.rs b/tests/test_spans.rs index 21882a3..970099a 100644 --- a/tests/test_spans.rs +++ b/tests/test_spans.rs @@ -13,7 +13,7 @@ use pretty_assertions::assert_eq; fn tokenizer(html: &'static str) -> impl Iterator<Item = Token<usize>> { Tokenizer::new( PosTrackingReader::new(html), - DefaultEmitter::<_, usize>::default(), + DefaultEmitter::<usize>::default(), ) .flatten() } |