about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Martin Fischer <martin@push-f.com> 2021-11-30 17:16:17 +0100
committer Martin Fischer <martin@push-f.com> 2021-12-05 02:52:36 +0100
commit 927ac122a63ad5e1b8037a895d9e9b63883bcc01 (patch)
tree aa226caead5b563bb46c72e1438e7a1a8385eae4
parent 1f99ea9e16f85945e2606905ed6345519ce16e4e (diff)
spans: make Emitter generic over Reader
-rw-r--r-- src/emitter.rs 45
-rw-r--r-- src/machine.rs 42
-rw-r--r-- src/tokenizer.rs 18
3 files changed, 61 insertions, 44 deletions
diff --git a/src/emitter.rs b/src/emitter.rs
index 2c4ba41..20bcba4 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -1,6 +1,7 @@
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::collections::VecDeque;
+use std::marker::PhantomData;
use std::mem;
use crate::Error;
@@ -28,7 +29,7 @@ use crate::State;
///
/// The state machine needs to have a functional implementation of
/// `current_is_appropriate_end_tag_token` to do correct transitions, however.
-pub trait Emitter {
+pub trait Emitter<R> {
/// The token type emitted by this emitter. This controls what type of values the [`crate::Tokenizer`]
/// yields when used as an iterator.
type Token;
@@ -54,13 +55,13 @@ pub trait Emitter {
fn emit_string(&mut self, c: &str);
/// Set the _current token_ to a start tag.
- fn init_start_tag(&mut self);
+ fn init_start_tag(&mut self, reader: &R);
/// Set the _current token_ to an end tag.
- fn init_end_tag(&mut self);
+ fn init_end_tag(&mut self, reader: &R);
/// Set the _current token_ to a comment.
- fn init_comment(&mut self);
+ fn init_comment(&mut self, reader: &R);
/// Emit the _current token_, assuming it is a tag.
///
@@ -116,7 +117,7 @@ pub trait Emitter {
/// * the "public identifier" should be null (different from empty)
/// * the "system identifier" should be null (different from empty)
/// * the "force quirks" flag should be `false`
- fn init_doctype(&mut self);
+ fn init_doctype(&mut self, reader: &R);
/// Set the _current attribute_ to a new one, starting with empty name and value strings.
///
@@ -128,7 +129,7 @@ pub trait Emitter {
/// emitted.
///
/// If the current token is no tag at all, this method may panic.
- fn init_attribute(&mut self);
+ fn init_attribute(&mut self, reader: &R);
/// Append a string to the current attribute's name.
///
@@ -172,17 +173,31 @@ pub trait Emitter {
}
/// The default implementation of [`crate::Emitter`], used to produce ("emit") tokens.
-#[derive(Default)]
-pub struct DefaultEmitter<S> {
+pub struct DefaultEmitter<R, S> {
current_characters: String,
current_token: Option<Token<S>>,
last_start_tag: String,
current_attribute: Option<(String, String)>,
seen_attributes: BTreeSet<String>,
emitted_tokens: VecDeque<Token<S>>,
+ reader: PhantomData<R>,
}
-impl DefaultEmitter<()> {
+impl<R, S> Default for DefaultEmitter<R, S> {
+ fn default() -> Self {
+ DefaultEmitter {
+ current_characters: String::new(),
+ current_token: None,
+ last_start_tag: String::new(),
+ current_attribute: None,
+ seen_attributes: BTreeSet::new(),
+ emitted_tokens: VecDeque::new(),
+ reader: PhantomData::default(),
+ }
+ }
+}
+
+impl<R> DefaultEmitter<R, ()> {
fn emit_token(&mut self, token: Token<()>) {
self.flush_current_characters();
self.emitted_tokens.push_front(token);
@@ -226,7 +241,7 @@ impl DefaultEmitter<()> {
}
}
-impl Emitter for DefaultEmitter<()> {
+impl<R> Emitter<R> for DefaultEmitter<R, ()> {
type Token = Token<()>;
fn set_last_start_tag(&mut self, last_start_tag: Option<&str>) {
@@ -253,15 +268,15 @@ impl Emitter for DefaultEmitter<()> {
self.current_characters.push_str(s);
}
- fn init_start_tag(&mut self) {
+ fn init_start_tag(&mut self, _reader: &R) {
self.current_token = Some(Token::StartTag(Default::default()));
}
- fn init_end_tag(&mut self) {
+ fn init_end_tag(&mut self, _reader: &R) {
self.current_token = Some(Token::EndTag(Default::default()));
self.seen_attributes.clear();
}
- fn init_comment(&mut self) {
+ fn init_comment(&mut self, _reader: &R) {
self.current_token = Some(Token::Comment(String::new()));
}
fn emit_current_tag(&mut self) {
@@ -341,7 +356,7 @@ impl Emitter for DefaultEmitter<()> {
_ => debug_assert!(false),
}
}
- fn init_doctype(&mut self) {
+ fn init_doctype(&mut self, _reader: &R) {
self.current_token = Some(Token::Doctype(Doctype {
name: String::new(),
force_quirks: false,
@@ -350,7 +365,7 @@ impl Emitter for DefaultEmitter<()> {
}));
}
- fn init_attribute(&mut self) {
+ fn init_attribute(&mut self, _reader: &R) {
self.flush_current_attribute();
self.current_attribute = Some((String::new(), String::new()));
}
diff --git a/src/machine.rs b/src/machine.rs
index 5222735..931abf1 100644
--- a/src/machine.rs
+++ b/src/machine.rs
@@ -8,7 +8,9 @@ use crate::{Emitter, Error, Reader, Tokenizer};
// Note: This is not implemented as a method on Tokenizer because there's fields on Tokenizer that
// should not be available in this method, such as Tokenizer.to_reconsume or the Reader instance
#[inline]
-pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<ControlToken, R::Error> {
+pub fn consume<R: Reader, E: Emitter<R>>(
+ slf: &mut Tokenizer<R, E>,
+) -> Result<ControlToken, R::Error> {
macro_rules! mutate_character_reference {
(* $mul:literal + $x:ident - $sub:literal) => {
match slf
@@ -122,7 +124,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr
Ok(ControlToken::Continue)
}
Some(x) if x.is_ascii_alphabetic() => {
- slf.emitter.init_start_tag();
+ slf.emitter.init_start_tag(&slf.reader);
slf.state = State::TagName;
slf.unread_char(Some(x));
Ok(ControlToken::Continue)
@@ -130,7 +132,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr
c @ Some('?') => {
slf.emitter
.emit_error(Error::UnexpectedQuestionMarkInsteadOfTagName);
- slf.emitter.init_comment();
+ slf.emitter.init_comment(&slf.reader);
slf.state = State::BogusComment;
slf.unread_char(c);
Ok(ControlToken::Continue)
@@ -151,7 +153,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr
},
State::EndTagOpen => match slf.read_char()? {
Some(x) if x.is_ascii_alphabetic() => {
- slf.emitter.init_end_tag();
+ slf.emitter.init_end_tag(&slf.reader);
slf.state = State::TagName;
slf.unread_char(Some(x));
Ok(ControlToken::Continue)
@@ -169,7 +171,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr
Some(x) => {
slf.emitter
.emit_error(Error::InvalidFirstCharacterOfTagName);
- slf.emitter.init_comment();
+ slf.emitter.init_comment(&slf.reader);
slf.state = State::BogusComment;
slf.unread_char(Some(x));
Ok(ControlToken::Continue)
@@ -218,7 +220,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr
},
State::RcDataEndTagOpen => match slf.read_char()? {
Some(x) if x.is_ascii_alphabetic() => {
- slf.emitter.init_end_tag();
+ slf.emitter.init_end_tag(&slf.reader);
slf.state = State::RcDataEndTagName;
slf.unread_char(Some(x));
Ok(ControlToken::Continue)
@@ -273,7 +275,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr
},
State::RawTextEndTagOpen => match slf.read_char()? {
Some(x) if x.is_ascii_alphabetic() => {
- slf.emitter.init_end_tag();
+ slf.emitter.init_end_tag(&slf.reader);
slf.state = State::RawTextEndTagName;
slf.unread_char(Some(x));
Ok(ControlToken::Continue)
@@ -333,7 +335,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr
},
State::ScriptDataEndTagOpen => match slf.read_char()? {
Some(x) if x.is_ascii_alphabetic() => {
- slf.emitter.init_end_tag();
+ slf.emitter.init_end_tag(&slf.reader);
slf.state = State::ScriptDataEndTagName;
slf.unread_char(Some(x));
Ok(ControlToken::Continue)
@@ -501,7 +503,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr
},
State::ScriptDataEscapedEndTagOpen => match slf.read_char()? {
Some(x) if x.is_ascii_alphabetic() => {
- slf.emitter.init_end_tag();
+ slf.emitter.init_end_tag(&slf.reader);
slf.state = State::ScriptDataEscapedEndTagName;
slf.unread_char(Some(x));
Ok(ControlToken::Continue)
@@ -692,13 +694,13 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr
Some('=') => {
slf.emitter
.emit_error(Error::UnexpectedEqualsSignBeforeAttributeName);
- slf.emitter.init_attribute();
+ slf.emitter.init_attribute(&slf.reader);
slf.emitter.push_attribute_name("=");
slf.state = State::AttributeName;
Ok(ControlToken::Continue)
}
Some(x) => {
- slf.emitter.init_attribute();
+ slf.emitter.init_attribute(&slf.reader);
slf.state = State::AttributeName;
slf.unread_char(Some(x));
Ok(ControlToken::Continue)
@@ -752,7 +754,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr
Ok(ControlToken::Eof)
}
Some(x) => {
- slf.emitter.init_attribute();
+ slf.emitter.init_attribute(&slf.reader);
slf.state = State::AttributeName;
slf.unread_char(Some(x));
Ok(ControlToken::Continue)
@@ -929,7 +931,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr
},
State::MarkupDeclarationOpen => match slf.read_char()? {
Some('-') if slf.try_read_string("-", true)? => {
- slf.emitter.init_comment();
+ slf.emitter.init_comment(&slf.reader);
slf.state = State::CommentStart;
Ok(ControlToken::Continue)
}
@@ -946,14 +948,14 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr
// let's hope that bogus comment can just sort of skip over cdata
slf.emitter.emit_error(Error::CdataInHtmlContent);
- slf.emitter.init_comment();
+ slf.emitter.init_comment(&slf.reader);
slf.emitter.push_comment("[CDATA[");
slf.state = State::BogusComment;
Ok(ControlToken::Continue)
}
c => {
slf.emitter.emit_error(Error::IncorrectlyOpenedComment);
- slf.emitter.init_comment();
+ slf.emitter.init_comment(&slf.reader);
slf.state = State::BogusComment;
slf.unread_char(c);
Ok(ControlToken::Continue)
@@ -1159,7 +1161,7 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr
}
None => {
slf.emitter.emit_error(Error::EofInDoctype);
- slf.emitter.init_doctype();
+ slf.emitter.init_doctype(&slf.reader);
slf.emitter.set_force_quirks();
slf.emitter.emit_current_doctype();
Ok(ControlToken::Eof)
@@ -1176,14 +1178,14 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr
Some(whitespace_pat!()) => Ok(ControlToken::Continue),
Some('\0') => {
slf.emitter.emit_error(Error::UnexpectedNullCharacter);
- slf.emitter.init_doctype();
+ slf.emitter.init_doctype(&slf.reader);
slf.emitter.push_doctype_name("\u{fffd}");
slf.state = State::DoctypeName;
Ok(ControlToken::Continue)
}
Some('>') => {
slf.emitter.emit_error(Error::MissingDoctypeName);
- slf.emitter.init_doctype();
+ slf.emitter.init_doctype(&slf.reader);
slf.emitter.set_force_quirks();
slf.state = State::Data;
slf.emitter.emit_current_doctype();
@@ -1191,13 +1193,13 @@ pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<Contr
}
None => {
slf.emitter.emit_error(Error::EofInDoctype);
- slf.emitter.init_doctype();
+ slf.emitter.init_doctype(&slf.reader);
slf.emitter.set_force_quirks();
slf.emitter.emit_current_doctype();
Ok(ControlToken::Eof)
}
Some(x) => {
- slf.emitter.init_doctype();
+ slf.emitter.init_doctype(&slf.reader);
slf.emitter
.push_doctype_name(ctostr!(x.to_ascii_lowercase()));
slf.state = State::DoctypeName;
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 377dd01..efaa870 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -33,12 +33,12 @@ impl<T: Copy> Stack2<T> {
}
/// A HTML tokenizer. See crate-level docs for basic usage.
-pub struct Tokenizer<R: Reader, E: Emitter = DefaultEmitter<()>> {
+pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, ()>> {
eof: bool,
pub(crate) state: InternalState,
pub(crate) emitter: E,
pub(crate) temporary_buffer: String,
- reader: R,
+ pub(crate) reader: R,
to_reconsume: Stack2<Option<char>>,
pub(crate) character_reference_code: u32,
pub(crate) return_state: Option<InternalState>,
@@ -91,7 +91,7 @@ impl From<State> for InternalState {
}
}
-impl<R: Reader, E: Emitter> Tokenizer<R, E> {
+impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
/// Construct a new tokenizer from some input and a custom emitter.
///
/// Use this method over [`Tokenizer::new`] when you want to have more control over string allocation for
@@ -239,7 +239,7 @@ impl<R: Reader, E: Emitter> Tokenizer<R, E> {
}
}
-impl<R: Reader, E: Emitter> Iterator for Tokenizer<R, E> {
+impl<R: Reader, E: Emitter<R>> Iterator for Tokenizer<R, E> {
type Item = Result<E::Token, R::Error>;
fn next(&mut self) -> Option<Self::Item> {
@@ -266,9 +266,9 @@ impl<R: Reader, E: Emitter> Iterator for Tokenizer<R, E> {
/// `Result<Token, _>`.
///
/// This is the return value of [`Tokenizer::infallible`].
-pub struct InfallibleTokenizer<R: Reader<Error = Never>, E: Emitter>(Tokenizer<R, E>);
+pub struct InfallibleTokenizer<R: Reader<Error = Never>, E: Emitter<R>>(Tokenizer<R, E>);
-impl<R: Reader<Error = Never>, E: Emitter> Tokenizer<R, E> {
+impl<R: Reader<Error = Never>, E: Emitter<R>> Tokenizer<R, E> {
/// Statically assert that this iterator is infallible.
///
/// Call this to get rid of error handling when parsing HTML from strings.
@@ -277,7 +277,7 @@ impl<R: Reader<Error = Never>, E: Emitter> Tokenizer<R, E> {
}
}
-impl<R: Reader<Error = Never>, E: Emitter> Iterator for InfallibleTokenizer<R, E> {
+impl<R: Reader<Error = Never>, E: Emitter<R>> Iterator for InfallibleTokenizer<R, E> {
type Item = E::Token;
fn next(&mut self) -> Option<Self::Item> {
@@ -288,7 +288,7 @@ impl<R: Reader<Error = Never>, E: Emitter> Iterator for InfallibleTokenizer<R, E
}
}
-impl<R: Reader<Error = Never>, E: Emitter> Deref for InfallibleTokenizer<R, E> {
+impl<R: Reader<Error = Never>, E: Emitter<R>> Deref for InfallibleTokenizer<R, E> {
type Target = Tokenizer<R, E>;
fn deref(&self) -> &Self::Target {
@@ -296,7 +296,7 @@ impl<R: Reader<Error = Never>, E: Emitter> Deref for InfallibleTokenizer<R, E> {
}
}
-impl<R: Reader<Error = Never>, E: Emitter> DerefMut for InfallibleTokenizer<R, E> {
+impl<R: Reader<Error = Never>, E: Emitter<R>> DerefMut for InfallibleTokenizer<R, E> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}