aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMartin Fischer <martin@push-f.com>2021-11-30 17:09:08 +0100
committerMartin Fischer <martin@push-f.com>2021-12-05 02:52:36 +0100
commit1f99ea9e16f85945e2606905ed6345519ce16e4e (patch)
treee2c689ac735f08c8d683d84be0292f1e20d7051a
parent91c0008023746a9ffdd01b9b87f89a2ef4ebb01e (diff)
spans: make Emitter generic over Span
-rw-r--r--src/emitter.rs32
-rw-r--r--src/tokenizer.rs2
-rw-r--r--tests/test_html5lib.rs11
3 files changed, 28 insertions, 17 deletions
diff --git a/src/emitter.rs b/src/emitter.rs
index 0a80544..2c4ba41 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -173,17 +173,17 @@ pub trait Emitter {
/// The default implementation of [`crate::Emitter`], used to produce ("emit") tokens.
#[derive(Default)]
-pub struct DefaultEmitter {
+pub struct DefaultEmitter<S> {
current_characters: String,
- current_token: Option<Token>,
+ current_token: Option<Token<S>>,
last_start_tag: String,
current_attribute: Option<(String, String)>,
seen_attributes: BTreeSet<String>,
- emitted_tokens: VecDeque<Token>,
+ emitted_tokens: VecDeque<Token<S>>,
}
-impl DefaultEmitter {
- fn emit_token(&mut self, token: Token) {
+impl DefaultEmitter<()> {
+ fn emit_token(&mut self, token: Token<()>) {
self.flush_current_characters();
self.emitted_tokens.push_front(token);
}
@@ -226,8 +226,8 @@ impl DefaultEmitter {
}
}
-impl Emitter for DefaultEmitter {
- type Token = Token;
+impl Emitter for DefaultEmitter<()> {
+ type Token = Token<()>;
fn set_last_start_tag(&mut self, last_start_tag: Option<&str>) {
self.last_start_tag.clear();
@@ -417,7 +417,7 @@ impl Emitter for DefaultEmitter {
/// A HTML start/open tag, such as `<p>` or `<a>`.
#[derive(Debug, Default, Eq, PartialEq)]
-pub struct StartTag {
+pub struct StartTag<S> {
/// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be
/// expected.
pub self_closing: bool,
@@ -430,9 +430,12 @@ pub struct StartTag {
/// Duplicate attributes are ignored after the first one as per WHATWG spec. Implement your own
/// [`Emitter`] to tweak this behavior.
pub attributes: BTreeMap<String, String>,
+
+ /// The source code span of the tag name.
+ pub name_span: S,
}
-impl StartTag {
+impl<S> StartTag<S> {
/// Returns the next tokenizer state according to
/// [Parsing HTML fragments](https://html.spec.whatwg.org/multipage/parsing.html#concept-frag-parse-context).
/// If `scripting` is set to true [`State::RawText`] is returned if this is a `<noscript>` tag,
@@ -451,9 +454,12 @@ impl StartTag {
/// A HTML end/close tag, such as `</p>` or `</a>`.
#[derive(Debug, Default, Eq, PartialEq)]
-pub struct EndTag {
+pub struct EndTag<S> {
/// The ending tag's name, such as `"p"` or `"a"`.
pub name: String,
+
+ /// The source code span of the tag name.
+ pub name_span: S,
}
/// A doctype. Some examples:
@@ -480,11 +486,11 @@ pub struct Doctype {
/// The token type used by default. You can define your own token type by implementing the
/// [`crate::Emitter`] trait and using [`crate::Tokenizer::new_with_emitter`].
#[derive(Debug, Eq, PartialEq)]
-pub enum Token {
+pub enum Token<S> {
/// A HTML start tag.
- StartTag(StartTag),
+ StartTag(StartTag<S>),
/// A HTML end tag.
- EndTag(EndTag),
+ EndTag(EndTag<S>),
/// A literal string.
String(String),
/// A HTML comment.
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index b5a2edf..377dd01 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -33,7 +33,7 @@ impl<T: Copy> Stack2<T> {
}
/// A HTML tokenizer. See crate-level docs for basic usage.
-pub struct Tokenizer<R: Reader, E: Emitter = DefaultEmitter> {
+pub struct Tokenizer<R: Reader, E: Emitter = DefaultEmitter<()>> {
eof: bool,
pub(crate) state: InternalState,
pub(crate) emitter: E,
diff --git a/tests/test_html5lib.rs b/tests/test_html5lib.rs
index cb11a00..5668217 100644
--- a/tests/test_html5lib.rs
+++ b/tests/test_html5lib.rs
@@ -10,7 +10,7 @@ compile_error!(
"integration tests need the integration-tests feature enabled. Run cargo test --all-features"
);
-struct ExpectedOutputTokens(Vec<Token>);
+struct ExpectedOutputTokens(Vec<Token<()>>);
impl<'de> Deserialize<'de> for ExpectedOutputTokens {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
@@ -80,19 +80,24 @@ impl<'de> Deserialize<'de> for ExpectedOutputTokens {
self_closing: false,
name,
attributes,
+ name_span: (),
}),
OutputToken::StartTag2(_, name, attributes, self_closing) => {
Token::StartTag(StartTag {
self_closing,
name,
attributes,
+ name_span: (),
})
}
- OutputToken::EndTag(_, name) => Token::EndTag(EndTag { name }),
+ OutputToken::EndTag(_, name) => Token::EndTag(EndTag {
+ name,
+ name_span: (),
+ }),
OutputToken::Comment(_, data) => Token::Comment(data),
OutputToken::Character(_, data) => Token::String(data),
})
- .collect::<Vec<Token>>(),
+ .collect::<Vec<Token<()>>>(),
))
}
}