about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- README.md 2
-rw-r--r-- examples/tokenize.rs 2
-rw-r--r-- src/emitter.rs 60
-rw-r--r-- src/machine.rs 45
-rw-r--r-- src/offset.rs 2
-rw-r--r-- src/tokenizer.rs 27
-rw-r--r-- tests/test_spans.rs 2
7 files changed, 73 insertions, 67 deletions
diff --git a/README.md b/README.md
index 70b5be6..98a09c5 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ use std::fmt::Write;
use html5tokenizer::{DefaultEmitter, Tokenizer, Token};
let html = "<title >hello world</title>";
-let emitter = DefaultEmitter::<_>::default();
+let emitter = DefaultEmitter::default();
let mut new_html = String::new();
for token in Tokenizer::new(html, emitter).flatten() {
diff --git a/examples/tokenize.rs b/examples/tokenize.rs
index 54ba0ec..da99dd3 100644
--- a/examples/tokenize.rs
+++ b/examples/tokenize.rs
@@ -7,7 +7,7 @@ use std::io::BufReader;
fn main() {
for token in Tokenizer::new(
BufReader::new(std::io::stdin().lock()),
- DefaultEmitter::<_>::default(),
+ DefaultEmitter::default(),
)
.flatten()
{
diff --git a/src/emitter.rs b/src/emitter.rs
index caf7b55..1f60f70 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -2,13 +2,11 @@ use std::collections::btree_map::Entry;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::collections::VecDeque;
-use std::marker::PhantomData;
use std::mem;
use std::ops::Range;
use crate::offset::NoopOffset;
use crate::offset::Offset;
-use crate::offset::Position;
use crate::Error;
/// An emitter is an object providing methods to the tokenizer to produce ("emit") tokens.
@@ -29,7 +27,7 @@ use crate::Error;
/// checks that would emit errors.
///
/// * If you don't care about attributes at all, you can make all related methods a noop.
-pub trait Emitter<R> {
+pub trait Emitter<O> {
/// The token type emitted by this emitter. This controls what type of values the [`Tokenizer`](crate::Tokenizer)
/// yields when used as an iterator.
type Token;
@@ -39,7 +37,7 @@ pub trait Emitter<R> {
fn emit_eof(&mut self);
/// A (probably recoverable) parsing error has occurred.
- fn emit_error(&mut self, error: Error, reader: &R);
+ fn emit_error(&mut self, error: Error, offset: O);
/// After every state change, the tokenizer calls this method to retrieve a new token that can
/// be returned via the tokenizer's iterator interface.
@@ -49,13 +47,13 @@ pub trait Emitter<R> {
fn emit_string(&mut self, c: &str);
/// Set the _current token_ to a start tag.
- fn init_start_tag(&mut self, reader: &R);
+ fn init_start_tag(&mut self, offset: O);
/// Set the _current token_ to an end tag.
- fn init_end_tag(&mut self, reader: &R);
+ fn init_end_tag(&mut self, offset: O);
/// Set the _current token_ to a comment.
- fn init_comment(&mut self, reader: &R);
+ fn init_comment(&mut self, data_offset: O);
/// Emit the _current token_, assuming it is a tag.
///
@@ -84,7 +82,7 @@ pub trait Emitter<R> {
///
/// If the current token is an end tag, the emitter should emit the
/// [`Error::EndTagWithTrailingSolidus`] error.
- fn set_self_closing(&mut self, reader: &R);
+ fn set_self_closing(&mut self, offset: O);
/// Assuming the _current token_ is a doctype, set its "force quirks" flag to true.
///
@@ -112,7 +110,7 @@ pub trait Emitter<R> {
/// * the "public identifier" should be null (different from empty)
/// * the "system identifier" should be null (different from empty)
/// * the "force quirks" flag should be `false`
- fn init_doctype(&mut self, reader: &R);
+ fn init_doctype(&mut self, offset: O);
/// Set the _current attribute_ to a new one, starting with empty name and value strings.
///
@@ -121,14 +119,14 @@ pub trait Emitter<R> {
/// [`Error::DuplicateAttribute`] error should be emitted.
///
/// If the current token is no tag at all, this method may panic.
- fn init_attribute_name(&mut self, reader: &R);
+ fn init_attribute_name(&mut self, offset: O);
/// Called before the first push_attribute_value call.
/// If the value is wrapped in double or single quotes, `quoted` is set to true, otherwise false.
///
/// If there is no current attribute, this method may panic.
#[allow(unused_variables)]
- fn init_attribute_value(&mut self, reader: &R, quoted: bool) {}
+ fn init_attribute_value(&mut self, offset: O, quoted: bool) {}
/// Append a string to the current attribute's name.
///
@@ -162,17 +160,16 @@ pub trait Emitter<R> {
}
/// The default implementation of [`Emitter`], used to produce tokens.
-pub struct DefaultEmitter<R, O = NoopOffset> {
+pub struct DefaultEmitter<O = NoopOffset> {
current_characters: String,
current_token: Option<Token<O>>,
current_attribute: Option<(String, Attribute<O>)>,
seen_attributes: BTreeSet<String>,
emitted_tokens: VecDeque<Token<O>>,
- reader: PhantomData<R>,
attr_in_end_tag_span: Option<Range<O>>,
}
-impl<R, O> Default for DefaultEmitter<R, O> {
+impl<O> Default for DefaultEmitter<O> {
fn default() -> Self {
DefaultEmitter {
current_characters: String::new(),
@@ -180,13 +177,12 @@ impl<R, O> Default for DefaultEmitter<R, O> {
current_attribute: None,
seen_attributes: BTreeSet::new(),
emitted_tokens: VecDeque::new(),
- reader: PhantomData::default(),
attr_in_end_tag_span: None,
}
}
}
-impl<R, O> DefaultEmitter<R, O> {
+impl<O> DefaultEmitter<O> {
fn emit_token(&mut self, token: Token<O>) {
self.flush_current_characters();
self.emitted_tokens.push_front(token);
@@ -235,15 +231,15 @@ impl<R, O> DefaultEmitter<R, O> {
}
}
-impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> {
+impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
type Token = Token<O>;
fn emit_eof(&mut self) {
self.flush_current_characters();
}
- fn emit_error(&mut self, error: Error, reader: &R) {
- self.push_error(error, reader.position()..reader.position());
+ fn emit_error(&mut self, error: Error, offset: O) {
+ self.push_error(error, offset..offset);
}
fn pop_token(&mut self) -> Option<Self::Token> {
@@ -254,26 +250,26 @@ impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> {
self.current_characters.push_str(s);
}
- fn init_start_tag(&mut self, reader: &R) {
+ fn init_start_tag(&mut self, offset: O) {
self.current_token = Some(Token::StartTag(StartTag {
- name_span: reader.position()..reader.position(),
+ name_span: offset..offset,
self_closing: false,
name: String::new(),
attributes: Default::default(),
}));
}
- fn init_end_tag(&mut self, reader: &R) {
+ fn init_end_tag(&mut self, offset: O) {
self.current_token = Some(Token::EndTag(EndTag {
- name_span: reader.position()..reader.position(),
+ name_span: offset..offset,
name: String::new(),
}));
self.seen_attributes.clear();
}
- fn init_comment(&mut self, reader: &R) {
+ fn init_comment(&mut self, data_offset: O) {
self.current_token = Some(Token::Comment(Comment {
data: String::new(),
- data_offset: reader.position(),
+ data_offset,
}));
}
fn emit_current_tag(&mut self) {
@@ -304,7 +300,7 @@ impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> {
self.emit_token(doctype);
}
- fn set_self_closing(&mut self, reader: &R) {
+ fn set_self_closing(&mut self, offset: O) {
let tag = self.current_token.as_mut().unwrap();
match tag {
Token::StartTag(StartTag {
@@ -314,7 +310,7 @@ impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> {
*self_closing = true;
}
Token::EndTag(_) => {
- self.emit_error(Error::EndTagWithTrailingSolidus, reader);
+ self.emit_error(Error::EndTagWithTrailingSolidus, offset);
}
_ => {
debug_assert!(false);
@@ -362,7 +358,7 @@ impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> {
_ => debug_assert!(false),
}
}
- fn init_doctype(&mut self, _reader: &R) {
+ fn init_doctype(&mut self, _offset: O) {
self.current_token = Some(Token::Doctype(Doctype {
name: String::new(),
force_quirks: false,
@@ -371,20 +367,20 @@ impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> {
}));
}
- fn init_attribute_name(&mut self, reader: &R) {
+ fn init_attribute_name(&mut self, offset: O) {
self.flush_current_attribute();
self.current_attribute = Some((
String::new(),
Attribute {
- name_span: reader.position()..reader.position(),
+ name_span: offset..offset,
value: String::new(),
value_span: Range::default(),
},
));
}
- fn init_attribute_value(&mut self, reader: &R, quoted: bool) {
+ fn init_attribute_value(&mut self, offset: O, quoted: bool) {
self.current_attribute.as_mut().unwrap().1.value_span =
- reader.position() + quoted as usize..reader.position() + quoted as usize;
+ offset + quoted as usize..offset + quoted as usize;
}
fn push_attribute_name(&mut self, s: &str) {
diff --git a/src/machine.rs b/src/machine.rs
index c11720d..deb3983 100644
--- a/src/machine.rs
+++ b/src/machine.rs
@@ -1,4 +1,5 @@
use crate::entities::try_read_character_reference;
+use crate::offset::{Offset, Position};
use crate::utils::{
ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat,
ControlToken, State,
@@ -8,10 +9,11 @@ use crate::{reader::Reader, Emitter, Error, Tokenizer};
// Note: This is not implemented as a method on Tokenizer because there's fields on Tokenizer that
// should not be available in this method, such as Tokenizer.to_reconsume or the Reader instance
#[inline]
-pub fn consume<R, E>(slf: &mut Tokenizer<R, E>) -> Result<ControlToken, R::Error>
+pub fn consume<O, R, E>(slf: &mut Tokenizer<R, O, E>) -> Result<ControlToken, R::Error>
where
- R: Reader,
- E: Emitter<R>,
+ O: Offset,
+ R: Reader + Position<O>,
+ E: Emitter<O>,
{
macro_rules! mutate_character_reference {
(* $mul:literal + $x:ident - $sub:literal) => {
@@ -133,7 +135,7 @@ where
}
c @ Some('?') => {
slf.emit_error(Error::UnexpectedQuestionMarkInsteadOfTagName);
- slf.emitter.init_comment(&slf.reader);
+ slf.emitter.init_comment(slf.reader.position());
slf.state = State::BogusComment;
slf.unread_char(c);
Ok(ControlToken::Continue)
@@ -170,7 +172,7 @@ where
}
Some(x) => {
slf.emit_error(Error::InvalidFirstCharacterOfTagName);
- slf.emitter.init_comment(&slf.reader);
+ slf.emitter.init_comment(slf.reader.position());
slf.state = State::BogusComment;
slf.unread_char(Some(x));
Ok(ControlToken::Continue)
@@ -686,13 +688,13 @@ where
}
Some('=') => {
slf.emit_error(Error::UnexpectedEqualsSignBeforeAttributeName);
- slf.emitter.init_attribute_name(&slf.reader);
+ slf.emitter.init_attribute_name(slf.reader.position());
slf.emitter.push_attribute_name("=");
slf.state = State::AttributeName;
Ok(ControlToken::Continue)
}
Some(x) => {
- slf.emitter.init_attribute_name(&slf.reader);
+ slf.emitter.init_attribute_name(slf.reader.position());
slf.state = State::AttributeName;
slf.unread_char(Some(x));
Ok(ControlToken::Continue)
@@ -745,7 +747,7 @@ where
Ok(ControlToken::Eof)
}
Some(x) => {
- slf.emitter.init_attribute_name(&slf.reader);
+ slf.emitter.init_attribute_name(slf.reader.position());
slf.state = State::AttributeName;
slf.unread_char(Some(x));
Ok(ControlToken::Continue)
@@ -754,12 +756,14 @@ where
State::BeforeAttributeValue => match slf.read_char()? {
Some(whitespace_pat!()) => Ok(ControlToken::Continue),
Some('"') => {
- slf.emitter.init_attribute_value(&slf.reader, true);
+ slf.emitter
+ .init_attribute_value(slf.reader.position(), true);
slf.state = State::AttributeValueDoubleQuoted;
Ok(ControlToken::Continue)
}
Some('\'') => {
- slf.emitter.init_attribute_value(&slf.reader, true);
+ slf.emitter
+ .init_attribute_value(slf.reader.position(), true);
slf.state = State::AttributeValueSingleQuoted;
Ok(ControlToken::Continue)
}
@@ -770,7 +774,8 @@ where
Ok(ControlToken::Continue)
}
c => {
- slf.emitter.init_attribute_value(&slf.reader, false);
+ slf.emitter
+ .init_attribute_value(slf.reader.position(), false);
slf.state = State::AttributeValueUnquoted;
slf.unread_char(c);
Ok(ControlToken::Continue)
@@ -885,7 +890,7 @@ where
},
State::SelfClosingStartTag => match slf.read_char()? {
Some('>') => {
- slf.emitter.set_self_closing(&slf.reader);
+ slf.emitter.set_self_closing(slf.reader.position());
slf.state = State::Data;
slf.emit_current_tag();
Ok(ControlToken::Continue)
@@ -923,7 +928,7 @@ where
},
State::MarkupDeclarationOpen => match slf.read_char()? {
Some('-') if slf.try_read_string("-", true)? => {
- slf.emitter.init_comment(&slf.reader);
+ slf.emitter.init_comment(slf.reader.position());
slf.state = State::CommentStart;
Ok(ControlToken::Continue)
}
@@ -940,14 +945,14 @@ where
// let's hope that bogus comment can just sort of skip over cdata
slf.emit_error(Error::CdataInHtmlContent);
- slf.emitter.init_comment(&slf.reader);
+ slf.emitter.init_comment(slf.reader.position());
slf.emitter.push_comment("[CDATA[");
slf.state = State::BogusComment;
Ok(ControlToken::Continue)
}
c => {
slf.emit_error(Error::IncorrectlyOpenedComment);
- slf.emitter.init_comment(&slf.reader);
+ slf.emitter.init_comment(slf.reader.position());
slf.state = State::BogusComment;
slf.unread_char(c);
Ok(ControlToken::Continue)
@@ -1153,7 +1158,7 @@ where
}
None => {
slf.emit_error(Error::EofInDoctype);
- slf.emitter.init_doctype(&slf.reader);
+ slf.emitter.init_doctype(slf.reader.position());
slf.emitter.set_force_quirks();
slf.emitter.emit_current_doctype();
Ok(ControlToken::Eof)
@@ -1169,14 +1174,14 @@ where
Some(whitespace_pat!()) => Ok(ControlToken::Continue),
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emitter.init_doctype(&slf.reader);
+ slf.emitter.init_doctype(slf.reader.position());
slf.emitter.push_doctype_name("\u{fffd}");
slf.state = State::DoctypeName;
Ok(ControlToken::Continue)
}
Some('>') => {
slf.emit_error(Error::MissingDoctypeName);
- slf.emitter.init_doctype(&slf.reader);
+ slf.emitter.init_doctype(slf.reader.position());
slf.emitter.set_force_quirks();
slf.state = State::Data;
slf.emitter.emit_current_doctype();
@@ -1184,13 +1189,13 @@ where
}
None => {
slf.emit_error(Error::EofInDoctype);
- slf.emitter.init_doctype(&slf.reader);
+ slf.emitter.init_doctype(slf.reader.position());
slf.emitter.set_force_quirks();
slf.emitter.emit_current_doctype();
Ok(ControlToken::Eof)
}
Some(x) => {
- slf.emitter.init_doctype(&slf.reader);
+ slf.emitter.init_doctype(slf.reader.position());
slf.emitter
.push_doctype_name(ctostr!(x.to_ascii_lowercase()));
slf.state = State::DoctypeName;
diff --git a/src/offset.rs b/src/offset.rs
index f1f436d..8809366 100644
--- a/src/offset.rs
+++ b/src/offset.rs
@@ -1,6 +1,6 @@
//! Source code offsets.
//!
-//! The [`DefaultEmitter`](crate::DefaultEmitter) is generic over an [`Offset`].
+//! The [`Emitter`](crate::Emitter) is generic over an [`Offset`].
//! This library comes with two Offset implementations:
//!
//! * [`NoopOffset`] for when you don't want to track source offsets
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 7eb33f7..02a4d62 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1,5 +1,7 @@
+use std::marker::PhantomData;
+
use crate::machine;
-use crate::offset::NoopOffset;
+use crate::offset::{NoopOffset, Offset, Position};
use crate::reader::{IntoReader, Reader};
use crate::utils::{
control_pat, noncharacter_pat, surrogate_pat, ControlToken, State as InternalState,
@@ -33,12 +35,13 @@ impl<T: Copy> Stack2<T> {
}
/// An HTML tokenizer. See crate-level docs for basic usage.
-pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, NoopOffset>> {
+pub struct Tokenizer<R: Reader, O = NoopOffset, E: Emitter<O> = DefaultEmitter<O>> {
eof: bool,
pub(crate) state: InternalState,
pub(crate) emitter: E,
pub(crate) temporary_buffer: String,
pub(crate) reader: R,
+ _offset: PhantomData<O>,
to_reconsume: Stack2<Option<char>>,
pub(crate) character_reference_code: u32,
pub(crate) return_state: Option<InternalState>,
@@ -47,7 +50,7 @@ pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, NoopOffset>> {
is_start_tag: bool,
}
-impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
+impl<R: Reader, O, E: Emitter<O>> Tokenizer<R, O, E> {
/// Creates a new tokenizer from some input and an emitter.
///
/// TODO: add warning about you needing to do the state switching
@@ -55,6 +58,7 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
Tokenizer {
reader: reader.into_reader(),
emitter,
+ _offset: PhantomData,
state: InternalState::Data,
to_reconsume: Stack2::default(),
return_state: None,
@@ -102,7 +106,7 @@ impl From<State> for InternalState {
}
}
-impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
+impl<R: Reader + Position<O>, O, E: Emitter<O>> Tokenizer<R, O, E> {
/// Test-internal function to override internal state.
///
/// Only available with the `integration-tests` feature which is not public API.
@@ -119,7 +123,7 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
/// Just a helper method for the machine.
#[inline]
pub(crate) fn emit_error(&mut self, error: Error) {
- self.emitter.emit_error(error, &self.reader);
+ self.emitter.emit_error(error, self.reader.position());
}
/// Assuming the _current token_ is an end tag, return true if all of these hold. Return false otherwise.
@@ -136,14 +140,14 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
#[inline]
pub(crate) fn init_start_tag(&mut self) {
- self.emitter.init_start_tag(&self.reader);
+ self.emitter.init_start_tag(self.reader.position());
self.current_tag_name.clear();
self.is_start_tag = true;
}
#[inline]
pub(crate) fn init_end_tag(&mut self) {
- self.emitter.init_end_tag(&self.reader);
+ self.emitter.init_end_tag(self.reader.position());
self.current_tag_name.clear();
self.is_start_tag = false;
}
@@ -270,10 +274,11 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
}
}
-impl<R, E> Iterator for Tokenizer<R, E>
+impl<O, R, E> Iterator for Tokenizer<R, O, E>
where
- R: Reader,
- E: Emitter<R>,
+ O: Offset,
+ R: Reader + Position<O>,
+ E: Emitter<O>,
{
type Item = Result<E::Token, R::Error>;
@@ -297,7 +302,7 @@ where
}
}
-impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> {
+impl<R: Reader, O, E: Emitter<O>> Tokenizer<R, O, E> {
/// Test-internal function to override internal state.
///
/// Only available with the `integration-tests` feature which is not public API.
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
index 21882a3..970099a 100644
--- a/tests/test_spans.rs
+++ b/tests/test_spans.rs
@@ -13,7 +13,7 @@ use pretty_assertions::assert_eq;
fn tokenizer(html: &'static str) -> impl Iterator<Item = Token<usize>> {
Tokenizer::new(
PosTrackingReader::new(html),
- DefaultEmitter::<_, usize>::default(),
+ DefaultEmitter::<usize>::default(),
)
.flatten()
}