aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/emitter.rs81
-rw-r--r--src/lib.rs2
-rw-r--r--src/offset.rs (renamed from src/spans.rs)69
-rw-r--r--src/tokenizer.rs4
4 files changed, 79 insertions, 77 deletions
diff --git a/src/emitter.rs b/src/emitter.rs
index 18b2539..b3fdb99 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -4,9 +4,11 @@ use std::collections::BTreeSet;
use std::collections::VecDeque;
use std::marker::PhantomData;
use std::mem;
+use std::ops::Range;
-use crate::spans::Position;
-use crate::spans::Span;
+use crate::offset::NoopOffset;
+use crate::offset::Offset;
+use crate::offset::Position;
use crate::Error;
/// An emitter is an object providing methods to the tokenizer to produce ("emit") tokens.
@@ -160,17 +162,17 @@ pub trait Emitter<R> {
}
/// The default implementation of [`Emitter`], used to produce tokens.
-pub struct DefaultEmitter<R, S = ()> {
+pub struct DefaultEmitter<R, O = NoopOffset> {
current_characters: String,
- current_token: Option<Token<S>>,
- current_attribute: Option<(String, Attribute<S>)>,
+ current_token: Option<Token<O>>,
+ current_attribute: Option<(String, Attribute<O>)>,
seen_attributes: BTreeSet<String>,
- emitted_tokens: VecDeque<Token<S>>,
+ emitted_tokens: VecDeque<Token<O>>,
reader: PhantomData<R>,
- attr_in_end_tag_span: Option<S>,
+ attr_in_end_tag_span: Option<Range<O>>,
}
-impl<R, S> Default for DefaultEmitter<R, S> {
+impl<R, O> Default for DefaultEmitter<R, O> {
fn default() -> Self {
DefaultEmitter {
current_characters: String::new(),
@@ -184,13 +186,16 @@ impl<R, S> Default for DefaultEmitter<R, S> {
}
}
-impl<R, S: Span> DefaultEmitter<R, S> {
- fn emit_token(&mut self, token: Token<S>) {
+impl<R, O> DefaultEmitter<R, O> {
+ fn emit_token(&mut self, token: Token<O>) {
self.flush_current_characters();
self.emitted_tokens.push_front(token);
}
- fn flush_current_attribute(&mut self) {
+ fn flush_current_attribute(&mut self)
+ where
+ O: Clone,
+ {
if let Some((k, v)) = self.current_attribute.take() {
match self.current_token {
Some(Token::StartTag(ref mut tag)) => match tag.attributes.entry(k) {
@@ -223,22 +228,22 @@ impl<R, S: Span> DefaultEmitter<R, S> {
self.emit_token(Token::String(s));
}
- fn push_error(&mut self, error: Error, span: S) {
+ fn push_error(&mut self, error: Error, span: Range<O>) {
// bypass character flushing in self.emit_token: we don't need the error location to be
// that exact
self.emitted_tokens.push_front(Token::Error { error, span });
}
}
-impl<R: Position<S::Offset>, S: Span> Emitter<R> for DefaultEmitter<R, S> {
- type Token = Token<S>;
+impl<O: Offset, R: Position<O>> Emitter<R> for DefaultEmitter<R, O> {
+ type Token = Token<O>;
fn emit_eof(&mut self) {
self.flush_current_characters();
}
fn emit_error(&mut self, error: Error, reader: &R) {
- self.push_error(error, S::new(reader.position(), reader.position()));
+ self.push_error(error, reader.position()..reader.position());
}
fn pop_token(&mut self) -> Option<Self::Token> {
@@ -251,7 +256,7 @@ impl<R: Position<S::Offset>, S: Span> Emitter<R> for DefaultEmitter<R, S> {
fn init_start_tag(&mut self, reader: &R) {
self.current_token = Some(Token::StartTag(StartTag {
- name_span: S::new(reader.position(), reader.position()),
+ name_span: reader.position()..reader.position(),
self_closing: false,
name: String::new(),
attributes: Default::default(),
@@ -259,7 +264,7 @@ impl<R: Position<S::Offset>, S: Span> Emitter<R> for DefaultEmitter<R, S> {
}
fn init_end_tag(&mut self, reader: &R) {
self.current_token = Some(Token::EndTag(EndTag {
- name_span: S::new(reader.position(), reader.position()),
+ name_span: reader.position()..reader.position(),
name: String::new(),
}));
self.seen_attributes.clear();
@@ -327,7 +332,7 @@ impl<R: Position<S::Offset>, S: Span> Emitter<R> for DefaultEmitter<R, S> {
..
})) => {
name.push_str(s);
- name_span.push_str(s);
+ name_span.end += s.len();
}
Some(Token::EndTag(EndTag {
ref mut name,
@@ -335,7 +340,7 @@ impl<R: Position<S::Offset>, S: Span> Emitter<R> for DefaultEmitter<R, S> {
..
})) => {
name.push_str(s);
- name_span.push_str(s);
+ name_span.end += s.len();
}
_ => debug_assert!(false),
}
@@ -368,28 +373,26 @@ impl<R: Position<S::Offset>, S: Span> Emitter<R> for DefaultEmitter<R, S> {
self.current_attribute = Some((
String::new(),
Attribute {
- name_span: S::new(reader.position(), reader.position()),
+ name_span: reader.position()..reader.position(),
value: String::new(),
- value_span: S::default(),
+ value_span: Range::default(),
},
));
}
fn init_attribute_value(&mut self, reader: &R, quoted: bool) {
- self.current_attribute.as_mut().unwrap().1.value_span = S::new(
- reader.position() + quoted as usize,
- reader.position() + quoted as usize,
- );
+ self.current_attribute.as_mut().unwrap().1.value_span =
+ reader.position() + quoted as usize..reader.position() + quoted as usize;
}
fn push_attribute_name(&mut self, s: &str) {
let current_attr = self.current_attribute.as_mut().unwrap();
current_attr.0.push_str(s);
- current_attr.1.name_span.push_str(s);
+ current_attr.1.name_span.end += s.len();
}
fn push_attribute_value(&mut self, s: &str) {
let current_attr = self.current_attribute.as_mut().unwrap();
current_attr.1.value.push_str(s);
- current_attr.1.value_span.push_str(s);
+ current_attr.1.value_span.end += s.len();
}
fn set_doctype_public_identifier(&mut self, value: &str) {
if let Some(Token::Doctype(Doctype {
@@ -439,7 +442,7 @@ impl<R: Position<S::Offset>, S: Span> Emitter<R> for DefaultEmitter<R, S> {
/// An HTML start tag, such as `<p>` or `<a>`.
#[derive(Debug, Eq, PartialEq)]
-pub struct StartTag<S> {
+pub struct StartTag<O> {
/// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be
/// expected.
pub self_closing: bool,
@@ -451,33 +454,33 @@ pub struct StartTag<S> {
///
/// Duplicate attributes are ignored after the first one as per WHATWG spec. Implement your own
/// [`Emitter`] to tweak this behavior.
- pub attributes: BTreeMap<String, Attribute<S>>,
+ pub attributes: BTreeMap<String, Attribute<O>>,
/// The source code span of the tag name.
- pub name_span: S,
+ pub name_span: Range<O>,
}
/// A HTML attribute value (plus spans).
#[derive(Debug, Eq, PartialEq)]
-pub struct Attribute<S> {
+pub struct Attribute<O> {
/// The value of the attribute.
pub value: String,
/// The source code span of the attribute name.
- pub name_span: S,
+ pub name_span: Range<O>,
/// The source code span of the attribute value.
- pub value_span: S,
+ pub value_span: Range<O>,
}
/// A HTML end/close tag, such as `</p>` or `</a>`.
#[derive(Debug, Eq, PartialEq)]
-pub struct EndTag<S> {
+pub struct EndTag<O> {
/// The ending tag's name, such as `"p"` or `"a"`.
pub name: String,
/// The source code span of the tag name.
- pub name_span: S,
+ pub name_span: Range<O>,
}
/// A doctype. Some examples:
@@ -504,11 +507,11 @@ pub struct Doctype {
/// The token type used by default. You can define your own token type by implementing the
/// [`Emitter`] trait.
#[derive(Debug, Eq, PartialEq)]
-pub enum Token<S> {
+pub enum Token<O> {
/// A HTML start tag.
- StartTag(StartTag<S>),
+ StartTag(StartTag<O>),
/// A HTML end tag.
- EndTag(EndTag<S>),
+ EndTag(EndTag<O>),
/// A literal string.
String(String),
/// A HTML comment.
@@ -523,6 +526,6 @@ pub enum Token<S> {
/// What kind of error occurred.
error: Error,
/// The source code span of the error.
- span: S,
+ span: Range<O>,
},
}
diff --git a/src/lib.rs b/src/lib.rs
index 12eb6a2..fd0349c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -7,8 +7,8 @@ mod emitter;
mod entities;
mod error;
mod machine;
+pub mod offset;
pub mod reader;
-pub mod spans;
mod tokenizer;
mod utils;
diff --git a/src/spans.rs b/src/offset.rs
index 14392cd..f1f436d 100644
--- a/src/spans.rs
+++ b/src/offset.rs
@@ -1,19 +1,39 @@
-//! Source code spans.
+//! Source code offsets.
//!
-//! The [`DefaultEmitter`](crate::DefaultEmitter) is generic over a [`Span`].
-//! This library comes with two Span implementations:
+//! The [`DefaultEmitter`](crate::DefaultEmitter) is generic over an [`Offset`].
+//! This library comes with two Offset implementations:
//!
-//! * one for `()` which acts as the no-op implementation for when you don't want to track spans
-//! * one for [`Range<usize>`] for when you do want to track spans
+//! * [`NoopOffset`] for when you don't want to track source offsets
+//! * `usize` for when you do want to track source offsets
//!
//! To use the latter, however, your reader has to implement [`Position<usize>`].
//! You can easily use any existing reader by wrapping it in the [`PosTrackingReader`] struct
//! which implements the [`Position<usize>`] trait and takes care of tracking the current position.
-use std::ops::{Add, Range};
+use std::fmt::Debug;
+use std::ops::{Add, AddAssign, Sub};
use crate::reader::{IntoReader, Reader};
+/// A byte offset in the source.
+pub trait Offset:
+ Default
+ + Copy
+ + Eq
+ + Ord
+ + Add<usize, Output = Self>
+ + Sub<usize, Output = Self>
+ + AddAssign<usize>
+ + Debug
+{
+}
+
+impl Offset for usize {}
+
+impl Offset for NoopOffset {}
+
+/// A zero-sized no-op implementation of [`Offset`] (for when you don't want to track offsets).
+#[derive(Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
pub struct NoopOffset;
/// A trait to be implemented by readers that track their own position.
@@ -48,31 +68,10 @@ impl<R> PosTrackingReader<R> {
impl<R> Position<usize> for PosTrackingReader<R> {
fn position(&self) -> usize {
- self.position
+ self.position - 1
}
}
-/// A byte range in the source code.
-pub trait Span: Default + Clone {
- type Offset: Add<usize, Output = Self::Offset>;
-
- /// Constructs a new span from the given byte offsets.
- fn new(start: Self::Offset, end: Self::Offset) -> Self;
-
- /// Extends the span by the length of the given string.
- fn push_str(&mut self, str: &str);
-}
-
-impl Span for () {
- type Offset = NoopOffset;
-
- fn new(_start: Self::Offset, _end: Self::Offset) -> Self {
- ()
- }
-
- fn push_str(&mut self, _str: &str) {}
-}
-
impl Add<usize> for NoopOffset {
type Output = Self;
@@ -81,16 +80,16 @@ impl Add<usize> for NoopOffset {
}
}
-impl Span for Range<usize> {
- type Offset = usize;
+impl Sub<usize> for NoopOffset {
+ type Output = Self;
- fn new(start: Self::Offset, end: Self::Offset) -> Self {
- start - 1..end - 1
+ fn sub(self, _rhs: usize) -> NoopOffset {
+ self
}
+}
- fn push_str(&mut self, str: &str) {
- self.end += str.len();
- }
+impl AddAssign<usize> for NoopOffset {
+ fn add_assign(&mut self, _rhs: usize) {}
}
impl<R: Reader> Reader for PosTrackingReader<R> {
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 7b8b1ce..141efb9 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1,6 +1,6 @@
use crate::machine;
+use crate::offset::NoopOffset;
use crate::reader::{IntoReader, Reader};
-use crate::spans::Position;
use crate::utils::{
control_pat, noncharacter_pat, surrogate_pat, ControlToken, State as InternalState,
};
@@ -33,7 +33,7 @@ impl<T: Copy> Stack2<T> {
}
/// A HTML tokenizer. See crate-level docs for basic usage.
-pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, ()>> {
+pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, NoopOffset>> {
eof: bool,
pub(crate) state: InternalState,
pub(crate) emitter: E,