summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMartin Fischer <martin@push-f.com>2023-09-09 19:10:49 +0200
committerMartin Fischer <martin@push-f.com>2023-09-09 23:02:47 +0200
commit1d8e6239875c810197a0679a20412726afb8ff66 (patch)
tree3daa917d720235117770040405a78dad548032e9
parent77eb3eefeeab3ba7ab3c6387e5916192b95b482d (diff)
refactor: merge token types with attr to new token module
-rw-r--r--src/emitter.rs145
-rw-r--r--src/lib.rs11
-rw-r--r--src/machine.rs2
-rw-r--r--src/token.rs (renamed from src/attr.rs)137
4 files changed, 152 insertions, 143 deletions
diff --git a/src/emitter.rs b/src/emitter.rs
index f27c778..23f9ede 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -4,9 +4,9 @@ use std::collections::VecDeque;
use std::mem;
use std::ops::Range;
-use crate::attr::AttrValueSyntax;
use crate::offset::NoopOffset;
use crate::offset::Offset;
+use crate::token::{AttrValueSyntax, Comment, Doctype, EndTag, StartTag, Token};
use crate::Error;
/// An emitter is an object providing methods to the tokenizer to produce ("emit") tokens.
@@ -214,7 +214,7 @@ pub trait Emitter<O> {
pub struct DefaultEmitter<O = NoopOffset> {
current_characters: String,
current_token: Option<Token<O>>,
- current_attribute: Option<(String, crate::attr::AttrInternal<O>)>,
+ current_attribute: Option<(String, crate::token::AttrInternal<O>)>,
seen_attributes: BTreeSet<String>,
emitted_tokens: VecDeque<Token<O>>,
attr_in_end_tag_span: Option<Range<O>>,
@@ -465,7 +465,7 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
self.flush_current_attribute();
self.current_attribute = Some((
String::new(),
- crate::attr::AttrInternal {
+ crate::token::AttrInternal {
name_span: offset..O::default(),
value: String::new(),
value_span: O::default()..O::default(),
@@ -564,147 +564,14 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
}
}
-/// An HTML start tag, such as `<p>` or `<a>`.
-#[derive(Debug, Eq, PartialEq)]
-pub struct StartTag<O> {
- /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be
- /// expected.
- pub self_closing: bool,
-
- /// The tag name.
- /// Uppercase ASCII characters (A-Z) have been converted to lowercase.
- pub name: String,
-
- /// A mapping for any HTML attributes this start tag may have.
- ///
- /// Duplicate attributes are ignored after the first one as per WHATWG spec.
- pub attributes: crate::attr::AttributeMap<O>,
-
- /// The source code span of the tag.
- pub span: Range<O>,
-
- /// The span of the tag name.
- pub name_span: Range<O>,
-}
-
-/// An HTML end/close tag, such as `</p>` or `</a>`.
-#[derive(Debug, Eq, PartialEq)]
-pub struct EndTag<O> {
- /// The tag name.
- /// Uppercase ASCII characters (A-Z) have been converted to lowercase.
- pub name: String,
-
- /// The source code span of the tag.
- pub span: Range<O>,
-
- /// The span of the tag name.
- pub name_span: Range<O>,
-}
-
-/// An HTML comment.
-#[derive(PartialEq, Eq, Debug)]
-pub struct Comment<O> {
- /// The text within the comment.
- pub data: String,
- /// The source offset of the comment data.
- pub data_span: Range<O>,
-}
-
-impl<O: Offset> Comment<O> {
- /// Returns the span for the comment data.
- pub fn data_span(&self) -> Range<O> {
- self.data_span.clone()
- }
-}
-
-/// A doctype. Some examples:
-///
-/// * `<!DOCTYPE {name}>`
-/// * `<!DOCTYPE {name} PUBLIC '{public_id}'>`
-/// * `<!DOCTYPE {name} SYSTEM '{system_id}'>`
-/// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>`
-#[derive(Debug, Eq, PartialEq)]
-pub struct Doctype<O> {
- /// The [force-quirks flag].
- ///
- /// [force-quirks flag]: https://html.spec.whatwg.org/multipage/parsing.html#force-quirks-flag
- pub force_quirks: bool,
-
- /// The doctype's name. Uppercase ASCII characters (A-Z) have been
- /// converted to lowercase. For HTML documents this should be "html".
- pub name: Option<String>,
-
- /// The doctype's public identifier.
- pub public_id: Option<String>,
-
- /// The doctype's system identifier.
- pub system_id: Option<String>,
-
- /// The source code span of the doctype.
- pub span: Range<O>,
-
- /// The span of the name.
- name_span: Range<O>,
-
- /// The span of the public identifier.
- public_id_span: Range<O>,
-
- /// The span of the system identifier.
- system_id_span: Range<O>,
-}
-
-impl<O: Offset> Doctype<O> {
- /// Returns the span of the name.
- pub fn name_span(&self) -> Option<Range<O>> {
- self.name.as_ref()?;
- Some(self.name_span.clone())
- }
-
- /// Returns the span of the public identifier.
- pub fn public_id_span(&self) -> Option<Range<O>> {
- self.public_id.as_ref()?;
- Some(self.public_id_span.clone())
- }
-
- /// Returns the span of the system identifier.
- pub fn system_id_span(&self) -> Option<Range<O>> {
- self.system_id.as_ref()?;
- Some(self.system_id_span.clone())
- }
-}
-
-/// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer.
-#[derive(Debug, Eq, PartialEq)]
-pub enum Token<O> {
- /// An HTML start tag.
- StartTag(StartTag<O>),
- /// An HTML end tag.
- EndTag(EndTag<O>),
- /// A literal string. Character references have been resolved.
- String(String),
- /// An HTML comment.
- Comment(Comment<O>),
- /// An HTML doctype declaration.
- Doctype(Doctype<O>),
- /// An HTML parsing error.
- ///
- /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with
- /// more tokens afterward.
- Error {
- /// What kind of error occurred.
- error: Error,
- /// The source code span of the error.
- span: Range<O>,
- },
-}
-
/// The majority of our testing of the [`DefaultEmitter`] is done against the
/// html5lib-tests in the html5lib integration test. This module only tests
/// details that aren't present in the html5lib test data.
#[cfg(test)]
mod tests {
- use super::{DefaultEmitter, Token};
- use crate::{attr::AttrValueSyntax, Event, Tokenizer};
+ use super::DefaultEmitter;
+ use crate::token::{AttrValueSyntax, Token};
+ use crate::{Event, Tokenizer};
#[test]
fn test_attribute_value_syntax() {
diff --git a/src/lib.rs b/src/lib.rs
index 97358d0..2918c80 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -15,13 +15,20 @@ mod naive_parser;
mod tokenizer;
mod utils;
-pub mod attr;
+/// Types for HTML attributes.
+pub mod attr {
+ pub use crate::token::{
+ AttrIntoIter, AttrIter, AttrValueSyntax, Attribute, AttributeMap, AttributeOwned,
+ };
+}
pub mod offset;
pub mod reader;
+pub mod token;
-pub use emitter::{Comment, DefaultEmitter, Doctype, Emitter, EndTag, StartTag, Token};
+pub use emitter::{DefaultEmitter, Emitter};
pub use error::Error;
pub use naive_parser::NaiveParser;
+pub use token::{Comment, Doctype, EndTag, StartTag, Token};
pub use tokenizer::{CdataAction, Event, State, Tokenizer};
#[cfg(feature = "integration-tests")]
diff --git a/src/machine.rs b/src/machine.rs
index d175b8b..6d4dc10 100644
--- a/src/machine.rs
+++ b/src/machine.rs
@@ -1,6 +1,6 @@
-use crate::attr::AttrValueSyntax;
use crate::entities::try_read_character_reference;
use crate::offset::{Offset, Position};
+use crate::token::AttrValueSyntax;
use crate::tokenizer::CdataAction;
use crate::utils::{
ascii_digit_pat, control_pat, ctostr, noncharacter_pat, surrogate_pat, whitespace_pat, State,
diff --git a/src/attr.rs b/src/token.rs
index 096235e..48c90f7 100644
--- a/src/attr.rs
+++ b/src/token.rs
@@ -1,10 +1,145 @@
-//! Types for HTML attributes.
+//! Provides the [`Token`] type.
use std::collections::{btree_map, BTreeMap};
use std::iter::FromIterator;
use std::ops::{Index, Range};
use crate::offset::Offset;
+use crate::Error;
+
+/// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer.
+#[derive(Debug, Eq, PartialEq)]
+pub enum Token<O> {
+ /// An HTML start tag.
+ StartTag(StartTag<O>),
+ /// An HTML end tag.
+ EndTag(EndTag<O>),
+ /// A literal string. Character references have been resolved.
+ String(String),
+ /// An HTML comment.
+ Comment(Comment<O>),
+ /// An HTML doctype declaration.
+ Doctype(Doctype<O>),
+ /// An HTML parsing error.
+ ///
+ /// Can be skipped over; the tokenizer is supposed to recover from the error and continue with
+ /// more tokens afterward.
+ Error {
+ /// What kind of error occurred.
+ error: Error,
+ /// The source code span of the error.
+ span: Range<O>,
+ },
+}
+
+/// An HTML start tag, such as `<p>` or `<a>`.
+#[derive(Debug, Eq, PartialEq)]
+pub struct StartTag<O> {
+ /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be
+ /// expected.
+ pub self_closing: bool,
+
+ /// The tag name.
+ /// Uppercase ASCII characters (A-Z) have been converted to lowercase.
+ pub name: String,
+
+ /// A mapping for any HTML attributes this start tag may have.
+ ///
+ /// Duplicate attributes are ignored after the first one as per WHATWG spec.
+ pub attributes: AttributeMap<O>,
+
+ /// The source code span of the tag.
+ pub span: Range<O>,
+
+ /// The span of the tag name.
+ pub name_span: Range<O>,
+}
+
+/// An HTML end/close tag, such as `</p>` or `</a>`.
+#[derive(Debug, Eq, PartialEq)]
+pub struct EndTag<O> {
+ /// The tag name.
+ /// Uppercase ASCII characters (A-Z) have been converted to lowercase.
+ pub name: String,
+
+ /// The source code span of the tag.
+ pub span: Range<O>,
+
+ /// The span of the tag name.
+ pub name_span: Range<O>,
+}
+
+/// An HTML comment.
+#[derive(PartialEq, Eq, Debug)]
+pub struct Comment<O> {
+ /// The text within the comment.
+ pub data: String,
+ /// The source code span of the comment data.
+ pub data_span: Range<O>,
+}
+
+impl<O: Offset> Comment<O> {
+ /// Returns the span for the comment data.
+ pub fn data_span(&self) -> Range<O> {
+ self.data_span.clone()
+ }
+}
+
+/// A doctype. Some examples:
+///
+/// * `<!DOCTYPE {name}>`
+/// * `<!DOCTYPE {name} PUBLIC '{public_id}'>`
+/// * `<!DOCTYPE {name} SYSTEM '{system_id}'>`
+/// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>`
+#[derive(Debug, Eq, PartialEq)]
+pub struct Doctype<O> {
+ /// The [force-quirks flag].
+ ///
+ /// [force-quirks flag]: https://html.spec.whatwg.org/multipage/parsing.html#force-quirks-flag
+ pub force_quirks: bool,
+
+ /// The doctype's name. Uppercase ASCII characters (A-Z) have been
+ /// converted to lowercase. For HTML documents this should be "html".
+ pub name: Option<String>,
+
+ /// The doctype's public identifier.
+ pub public_id: Option<String>,
+
+ /// The doctype's system identifier.
+ pub system_id: Option<String>,
+
+ /// The source code span of the doctype.
+ pub span: Range<O>,
+
+ /// The span of the name.
+ pub(crate) name_span: Range<O>,
+
+ /// The span of the public identifier.
+ pub(crate) public_id_span: Range<O>,
+
+ /// The span of the system identifier.
+ pub(crate) system_id_span: Range<O>,
+}
+
+impl<O: Offset> Doctype<O> {
+ /// Returns the span of the name.
+ pub fn name_span(&self) -> Option<Range<O>> {
+ self.name.as_ref()?;
+ Some(self.name_span.clone())
+ }
+
+ /// Returns the span of the public identifier.
+ pub fn public_id_span(&self) -> Option<Range<O>> {
+ self.public_id.as_ref()?;
+ Some(self.public_id_span.clone())
+ }
+
+ /// Returns the span of the system identifier.
+ pub fn system_id_span(&self) -> Option<Range<O>> {
+ self.system_id.as_ref()?;
+ Some(self.system_id_span.clone())
+ }
+}
/// A map of HTML attributes.
///