diff options
author | Martin Fischer <martin@push-f.com> | 2023-09-09 19:10:49 +0200 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2023-09-09 23:02:47 +0200 |
commit | 1d8e6239875c810197a0679a20412726afb8ff66 (patch) | |
tree | 3daa917d720235117770040405a78dad548032e9 /src/emitter.rs | |
parent | 77eb3eefeeab3ba7ab3c6387e5916192b95b482d (diff) |
refactor: merge token types with attr to new token module
Diffstat (limited to 'src/emitter.rs')
-rw-r--r-- | src/emitter.rs | 145 |
1 files changed, 6 insertions, 139 deletions
diff --git a/src/emitter.rs b/src/emitter.rs index f27c778..23f9ede 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -4,9 +4,9 @@ use std::collections::VecDeque; use std::mem; use std::ops::Range; -use crate::attr::AttrValueSyntax; use crate::offset::NoopOffset; use crate::offset::Offset; +use crate::token::{AttrValueSyntax, Comment, Doctype, EndTag, StartTag, Token}; use crate::Error; /// An emitter is an object providing methods to the tokenizer to produce ("emit") tokens. @@ -214,7 +214,7 @@ pub trait Emitter<O> { pub struct DefaultEmitter<O = NoopOffset> { current_characters: String, current_token: Option<Token<O>>, - current_attribute: Option<(String, crate::attr::AttrInternal<O>)>, + current_attribute: Option<(String, crate::token::AttrInternal<O>)>, seen_attributes: BTreeSet<String>, emitted_tokens: VecDeque<Token<O>>, attr_in_end_tag_span: Option<Range<O>>, @@ -465,7 +465,7 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> { self.flush_current_attribute(); self.current_attribute = Some(( String::new(), - crate::attr::AttrInternal { + crate::token::AttrInternal { name_span: offset..O::default(), value: String::new(), value_span: O::default()..O::default(), @@ -564,147 +564,14 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> { } } -/// An HTML start tag, such as `<p>` or `<a>`. -#[derive(Debug, Eq, PartialEq)] -pub struct StartTag<O> { - /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be - /// expected. - pub self_closing: bool, - - /// The tag name. - /// Uppercase ASCII characters (A-Z) have been converted to lowercase. - pub name: String, - - /// A mapping for any HTML attributes this start tag may have. - /// - /// Duplicate attributes are ignored after the first one as per WHATWG spec. - pub attributes: crate::attr::AttributeMap<O>, - - /// The source code span of the tag. - pub span: Range<O>, - - /// The span of the tag name. - pub name_span: Range<O>, -} - -/// An HTML end/close tag, such as `</p>` or `</a>`. -#[derive(Debug, Eq, PartialEq)] -pub struct EndTag<O> { - /// The tag name. - /// Uppercase ASCII characters (A-Z) have been converted to lowercase. - pub name: String, - - /// The source code span of the tag. - pub span: Range<O>, - - /// The span of the tag name. - pub name_span: Range<O>, -} - -/// An HTML comment. -#[derive(PartialEq, Eq, Debug)] -pub struct Comment<O> { - /// The text within the comment. - pub data: String, - /// The source offset of the comment data. - pub data_span: Range<O>, -} - -impl<O: Offset> Comment<O> { - /// Returns the span for the comment data. - pub fn data_span(&self) -> Range<O> { - self.data_span.clone() - } -} - -/// A doctype. Some examples: -/// -/// * `<!DOCTYPE {name}>` -/// * `<!DOCTYPE {name} PUBLIC '{public_id}'>` -/// * `<!DOCTYPE {name} SYSTEM '{system_id}'>` -/// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>` -#[derive(Debug, Eq, PartialEq)] -pub struct Doctype<O> { - /// The [force-quirks flag]. - /// - /// [force-quirks flag]: https://html.spec.whatwg.org/multipage/parsing.html#force-quirks-flag - pub force_quirks: bool, - - /// The doctype's name. Uppercase ASCII characters (A-Z) have been - /// converted to lowercase. For HTML documents this should be "html". - pub name: Option<String>, - - /// The doctype's public identifier. - pub public_id: Option<String>, - - /// The doctype's system identifier. - pub system_id: Option<String>, - - /// The source code span of the doctype. - pub span: Range<O>, - - /// The span of the name. - name_span: Range<O>, - - /// The span of the public identifier. - public_id_span: Range<O>, - - /// The span of the system identifier. - system_id_span: Range<O>, -} - -impl<O: Offset> Doctype<O> { - /// Returns the span of the name. - pub fn name_span(&self) -> Option<Range<O>> { - self.name.as_ref()?; - Some(self.name_span.clone()) - } - - /// Returns the span of the public identifier. - pub fn public_id_span(&self) -> Option<Range<O>> { - self.public_id.as_ref()?; - Some(self.public_id_span.clone()) - } - - /// Returns the span of the system identifier. - pub fn system_id_span(&self) -> Option<Range<O>> { - self.system_id.as_ref()?; - Some(self.system_id_span.clone()) - } -} - -/// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer. -#[derive(Debug, Eq, PartialEq)] -pub enum Token<O> { - /// An HTML start tag. - StartTag(StartTag<O>), - /// An HTML end tag. - EndTag(EndTag<O>), - /// A literal string. Character references have been resolved. - String(String), - /// An HTML comment. - Comment(Comment<O>), - /// An HTML doctype declaration. - Doctype(Doctype<O>), - /// An HTML parsing error. - /// - /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with - /// more tokens afterward. - Error { - /// What kind of error occurred. - error: Error, - /// The source code span of the error. - span: Range<O>, - }, -} - /// The majority of our testing of the [`DefaultEmitter`] is done against the /// html5lib-tests in the html5lib integration test. This module only tests /// details that aren't present in the html5lib test data. #[cfg(test)] mod tests { - use super::{DefaultEmitter, Token}; - use crate::{attr::AttrValueSyntax, Event, Tokenizer}; + use super::DefaultEmitter; + use crate::token::{AttrValueSyntax, Token}; + use crate::{Event, Tokenizer}; #[test] fn test_attribute_value_syntax() { |