diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/emitter.rs | 45 | ||||
-rw-r--r-- | src/machine.rs | 42 | ||||
-rw-r--r-- | src/tokenizer.rs | 18 |
3 files changed, 61 insertions, 44 deletions
diff --git a/src/emitter.rs b/src/emitter.rs index 2c4ba41..20bcba4 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -1,6 +1,7 @@ use std::collections::BTreeMap; use std::collections::BTreeSet; use std::collections::VecDeque; +use std::marker::PhantomData; use std::mem; use crate::Error; @@ -28,7 +29,7 @@ use crate::State; /// /// The state machine needs to have a functional implementation of /// `current_is_appropriate_end_tag_token` to do correct transitions, however. -pub trait Emitter { +pub trait Emitter<R> { /// The token type emitted by this emitter. This controls what type of values the [`crate::Tokenizer`] /// yields when used as an iterator. type Token; @@ -54,13 +55,13 @@ pub trait Emitter { fn emit_string(&mut self, c: &str); /// Set the _current token_ to a start tag. - fn init_start_tag(&mut self); + fn init_start_tag(&mut self, reader: &R); /// Set the _current token_ to an end tag. - fn init_end_tag(&mut self); + fn init_end_tag(&mut self, reader: &R); /// Set the _current token_ to a comment. - fn init_comment(&mut self); + fn init_comment(&mut self, reader: &R); /// Emit the _current token_, assuming it is a tag. /// @@ -116,7 +117,7 @@ pub trait Emitter { /// * the "public identifier" should be null (different from empty) /// * the "system identifier" should be null (different from empty) /// * the "force quirks" flag should be `false` - fn init_doctype(&mut self); + fn init_doctype(&mut self, reader: &R); /// Set the _current attribute_ to a new one, starting with empty name and value strings. /// @@ -128,7 +129,7 @@ pub trait Emitter { /// emitted. /// /// If the current token is no tag at all, this method may panic. - fn init_attribute(&mut self); + fn init_attribute(&mut self, reader: &R); /// Append a string to the current attribute's name. /// @@ -172,17 +173,31 @@ pub trait Emitter { } /// The default implementation of [`crate::Emitter`], used to produce ("emit") tokens. -#[derive(Default)] -pub struct DefaultEmitter<S> { +pub struct DefaultEmitter<R, S> { current_characters: String, current_token: Option<Token<S>>, last_start_tag: String, current_attribute: Option<(String, String)>, seen_attributes: BTreeSet<String>, emitted_tokens: VecDeque<Token<S>>, + reader: PhantomData<R>, } -impl DefaultEmitter<()> { +impl<R, S> Default for DefaultEmitter<R, S> { + fn default() -> Self { + DefaultEmitter { + current_characters: String::new(), + current_token: None, + last_start_tag: String::new(), + current_attribute: None, + seen_attributes: BTreeSet::new(), + emitted_tokens: VecDeque::new(), + reader: PhantomData::default(), + } + } +} + +impl<R> DefaultEmitter<R, ()> { fn emit_token(&mut self, token: Token<()>) { self.flush_current_characters(); self.emitted_tokens.push_front(token); @@ -226,7 +241,7 @@ impl DefaultEmitter<()> { } } -impl Emitter for DefaultEmitter<()> { +impl<R> Emitter<R> for DefaultEmitter<R, ()> { type Token = Token<()>; fn set_last_start_tag(&mut self, last_start_tag: Option<&str>) { @@ -253,15 +268,15 @@ impl Emitter for DefaultEmitter<()> { self.current_characters.push_str(s); } - fn init_start_tag(&mut self) { + fn init_start_tag(&mut self, _reader: &R) { self.current_token = Some(Token::StartTag(Default::default())); } - fn init_end_tag(&mut self) { + fn init_end_tag(&mut self, _reader: &R) { self.current_token = Some(Token::EndTag(Default::default())); self.seen_attributes.clear(); } - fn init_comment(&mut self) { + fn init_comment(&mut self, _reader: &R) { self.current_token = Some(Token::Comment(String::new())); } fn emit_current_tag(&mut self) { @@ -341,7 +356,7 @@ impl Emitter for DefaultEmitter<()> { _ => debug_assert!(false), } } - fn init_doctype(&mut self) { + fn init_doctype(&mut self, _reader: &R) { self.current_token = Some(Token::Doctype(Doctype { name: String::new(), force_quirks: false, @@ -350,7 +365,7 @@ impl Emitter for DefaultEmitter<()> { })); } - fn init_attribute(&mut self) { + fn init_attribute(&mut self, _reader: &R) { self.flush_current_attribute(); self.current_attribute = Some((String::new(), String::new())); } diff --git a/src/machine.rs b/src/machine.rs index 5222735..931abf1 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -8,7 +8,9 @@ use crate::{Emitter, Error, Reader, Tokenizer}; // Note: This is not implemented as a method on Tokenizer because there's fields on Tokenizer that // should not be available in this method, such as Tokenizer.to_reconsume or the Reader instance #[inline] -pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<ControlToken, R::Error> { +pub fn consume<R: Reader, E: Emitter<R>>( + slf: &mut Tokenizer<R, E>, +) -> Result<ControlToken, R::Error> { macro_rules! mutate_character_reference { (* $mul:literal + $x:ident - $sub:literal) => { match slf @@ -122,7 +124,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr Ok(ControlToken::Continue) } Some(x) if x.is_ascii_alphabetic() => { - slf.emitter.init_start_tag(); + slf.emitter.init_start_tag(&slf.reader); slf.state = State::TagName; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -130,7 +132,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr c @ Some('?') => { slf.emitter .emit_error(Error::UnexpectedQuestionMarkInsteadOfTagName); - slf.emitter.init_comment(); + slf.emitter.init_comment(&slf.reader); slf.state = State::BogusComment; slf.unread_char(c); Ok(ControlToken::Continue) @@ -151,7 +153,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr }, State::EndTagOpen => match slf.read_char()? { Some(x) if x.is_ascii_alphabetic() => { - slf.emitter.init_end_tag(); + slf.emitter.init_end_tag(&slf.reader); slf.state = State::TagName; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -169,7 +171,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr Some(x) => { slf.emitter .emit_error(Error::InvalidFirstCharacterOfTagName); - slf.emitter.init_comment(); + slf.emitter.init_comment(&slf.reader); slf.state = State::BogusComment; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -218,7 +220,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr }, State::RcDataEndTagOpen => match slf.read_char()? { Some(x) if x.is_ascii_alphabetic() => { - slf.emitter.init_end_tag(); + slf.emitter.init_end_tag(&slf.reader); slf.state = State::RcDataEndTagName; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -273,7 +275,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr }, State::RawTextEndTagOpen => match slf.read_char()? { Some(x) if x.is_ascii_alphabetic() => { - slf.emitter.init_end_tag(); + slf.emitter.init_end_tag(&slf.reader); slf.state = State::RawTextEndTagName; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -333,7 +335,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr }, State::ScriptDataEndTagOpen => match slf.read_char()? { Some(x) if x.is_ascii_alphabetic() => { - slf.emitter.init_end_tag(); + slf.emitter.init_end_tag(&slf.reader); slf.state = State::ScriptDataEndTagName; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -501,7 +503,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr }, State::ScriptDataEscapedEndTagOpen => match slf.read_char()? { Some(x) if x.is_ascii_alphabetic() => { - slf.emitter.init_end_tag(); + slf.emitter.init_end_tag(&slf.reader); slf.state = State::ScriptDataEscapedEndTagName; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -692,13 +694,13 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr Some('=') => { slf.emitter .emit_error(Error::UnexpectedEqualsSignBeforeAttributeName); - slf.emitter.init_attribute(); + slf.emitter.init_attribute(&slf.reader); slf.emitter.push_attribute_name("="); slf.state = State::AttributeName; Ok(ControlToken::Continue) } Some(x) => { - slf.emitter.init_attribute(); + slf.emitter.init_attribute(&slf.reader); slf.state = State::AttributeName; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -752,7 +754,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr Ok(ControlToken::Eof) } Some(x) => { - slf.emitter.init_attribute(); + slf.emitter.init_attribute(&slf.reader); slf.state = State::AttributeName; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -929,7 +931,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr }, State::MarkupDeclarationOpen => match slf.read_char()? { Some('-') if slf.try_read_string("-", true)? => { - slf.emitter.init_comment(); + slf.emitter.init_comment(&slf.reader); slf.state = State::CommentStart; Ok(ControlToken::Continue) } @@ -946,14 +948,14 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr // let's hope that bogus comment can just sort of skip over cdata slf.emitter.emit_error(Error::CdataInHtmlContent); - slf.emitter.init_comment(); + slf.emitter.init_comment(&slf.reader); slf.emitter.push_comment("[CDATA["); slf.state = State::BogusComment; Ok(ControlToken::Continue) } c => { slf.emitter.emit_error(Error::IncorrectlyOpenedComment); - slf.emitter.init_comment(); + slf.emitter.init_comment(&slf.reader); slf.state = State::BogusComment; slf.unread_char(c); Ok(ControlToken::Continue) @@ -1159,7 +1161,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr } None => { slf.emitter.emit_error(Error::EofInDoctype); - slf.emitter.init_doctype(); + slf.emitter.init_doctype(&slf.reader); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); Ok(ControlToken::Eof) @@ -1176,14 +1178,14 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr Some(whitespace_pat!()) => Ok(ControlToken::Continue), Some('\0') => { slf.emitter.emit_error(Error::UnexpectedNullCharacter); - slf.emitter.init_doctype(); + slf.emitter.init_doctype(&slf.reader); slf.emitter.push_doctype_name("\u{fffd}"); slf.state = State::DoctypeName; Ok(ControlToken::Continue) } Some('>') => { slf.emitter.emit_error(Error::MissingDoctypeName); - slf.emitter.init_doctype(); + slf.emitter.init_doctype(&slf.reader); slf.emitter.set_force_quirks(); slf.state = State::Data; slf.emitter.emit_current_doctype(); @@ -1191,13 +1193,13 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr } None => { slf.emitter.emit_error(Error::EofInDoctype); - slf.emitter.init_doctype(); + slf.emitter.init_doctype(&slf.reader); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); Ok(ControlToken::Eof) } Some(x) => { - slf.emitter.init_doctype(); + slf.emitter.init_doctype(&slf.reader); slf.emitter .push_doctype_name(ctostr!(x.to_ascii_lowercase())); slf.state = State::DoctypeName; diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 377dd01..efaa870 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -33,12 +33,12 @@ impl<T: Copy> Stack2<T> { } /// A HTML tokenizer. See crate-level docs for basic usage. -pub struct Tokenizer<R: Reader, E: Emitter = DefaultEmitter<()>> { +pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, ()>> { eof: bool, pub(crate) state: InternalState, pub(crate) emitter: E, pub(crate) temporary_buffer: String, - reader: R, + pub(crate) reader: R, to_reconsume: Stack2<Option<char>>, pub(crate) character_reference_code: u32, pub(crate) return_state: Option<InternalState>, @@ -91,7 +91,7 @@ impl From<State> for InternalState { } } -impl<R: Reader, E: Emitter> Tokenizer<R, E> { +impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { /// Construct a new tokenizer from some input and a custom emitter. /// /// Use this method over [`Tokenizer::new`] when you want to have more control over string allocation for @@ -239,7 +239,7 @@ impl<R: Reader, E: Emitter> Tokenizer<R, E> { } } -impl<R: Reader, E: Emitter> Iterator for Tokenizer<R, E> { +impl<R: Reader, E: Emitter<R>> Iterator for Tokenizer<R, E> { type Item = Result<E::Token, R::Error>; fn next(&mut self) -> Option<Self::Item> { @@ -266,9 +266,9 @@ impl<R: Reader, E: Emitter> Iterator for Tokenizer<R, E> { /// `Result<Token, _>`. /// /// This is the return value of [`Tokenizer::infallible`]. -pub struct InfallibleTokenizer<R: Reader<Error = Never>, E: Emitter>(Tokenizer<R, E>); +pub struct InfallibleTokenizer<R: Reader<Error = Never>, E: Emitter<R>>(Tokenizer<R, E>); -impl<R: Reader<Error = Never>, E: Emitter> Tokenizer<R, E> { +impl<R: Reader<Error = Never>, E: Emitter<R>> Tokenizer<R, E> { /// Statically assert that this iterator is infallible. /// /// Call this to get rid of error handling when parsing HTML from strings. @@ -277,7 +277,7 @@ impl<R: Reader<Error = Never>, E: Emitter> Tokenizer<R, E> { } } -impl<R: Reader<Error = Never>, E: Emitter> Iterator for InfallibleTokenizer<R, E> { +impl<R: Reader<Error = Never>, E: Emitter<R>> Iterator for InfallibleTokenizer<R, E> { type Item = E::Token; fn next(&mut self) -> Option<Self::Item> { @@ -288,7 +288,7 @@ impl<R: Reader<Error = Never>, E: Emitter> Iterator for InfallibleTokenizer<R, E } } -impl<R: Reader<Error = Never>, E: Emitter> Deref for InfallibleTokenizer<R, E> { +impl<R: Reader<Error = Never>, E: Emitter<R>> Deref for InfallibleTokenizer<R, E> { type Target = Tokenizer<R, E>; fn deref(&self) -> &Self::Target { @@ -296,7 +296,7 @@ impl<R: Reader<Error = Never>, E: Emitter> Deref for InfallibleTokenizer<R, E> { } } -impl<R: Reader<Error = Never>, E: Emitter> DerefMut for InfallibleTokenizer<R, E> { +impl<R: Reader<Error = Never>, E: Emitter<R>> DerefMut for InfallibleTokenizer<R, E> { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 } |