aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/lib.rs1
-rw-r--r--src/spans.rs265
2 files changed, 266 insertions, 0 deletions
diff --git a/src/lib.rs b/src/lib.rs
index b222193..40f9588 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,6 +9,7 @@ mod error;
mod machine;
mod never;
mod reader;
+mod spans;
mod tokenizer;
mod utils;
diff --git a/src/spans.rs b/src/spans.rs
new file mode 100644
index 0000000..ea3409c
--- /dev/null
+++ b/src/spans.rs
@@ -0,0 +1,265 @@
+use std::{
+ collections::{BTreeSet, VecDeque},
+ marker::PhantomData,
+ mem,
+};
+
+use crate::{Doctype, Emitter, EndTag, Error, StartTag, Token};
+
+/// The default implementation of [`crate::Emitter`], used to produce ("emit") tokens; `S` is the parameter handed to [`Token`], `R` is the reader type and is only used as a marker.
+pub struct DefaultEmitter<S, R> {
+ current_characters: String, // character data buffered by `emit_string`; flushed as one `Token::String` before the next token
+ current_token: Option<Token<S>>, // tag, comment or doctype under construction (set by `init_*`, taken by `emit_current_*`)
+ last_start_tag: String, // name of the last start tag emitted (or set via `set_last_start_tag`); empty when there is none
+ current_attribute: Option<(String, String)>, // (name, value) of the attribute under construction
+ seen_attributes: BTreeSet<String>, // attribute names seen on the current end tag; only used for error reporting
+ emitted_tokens: VecDeque<Token<S>>, // output queue: pushed at the front, popped from the back (FIFO)
+ reader: PhantomData<R>, // marker only; no reader value is stored
+}
+
+// Implemented by hand rather than derived: `#[derive(Default)]` would add `S: Default` and
+// `R: Default` bounds, which none of the fields actually need.
+impl<S, R> Default for DefaultEmitter<S, R> {
+    /// Creates an emitter with empty buffers, no token or attribute under construction and an
+    /// empty output queue.
+    fn default() -> Self {
+        Self {
+            emitted_tokens: VecDeque::new(),
+            current_token: None,
+            current_attribute: None,
+            current_characters: String::new(),
+            last_start_tag: String::new(),
+            seen_attributes: BTreeSet::new(),
+            reader: PhantomData,
+        }
+    }
+}
+
+impl<R> DefaultEmitter<(), R> {
+    /// Queues `token` for [`Emitter::pop_token`], first flushing any buffered character data so
+    /// that text preceding the token is queued ahead of it.
+    fn emit_token(&mut self, token: Token<()>) {
+        self.flush_current_characters();
+        self.emitted_tokens.push_front(token);
+    }
+
+    /// Moves the attribute under construction, if any, into the current token.
+    ///
+    /// * Start tag: the attribute is stored unless the name is already present, in which case
+    ///   the existing value is kept and [`Error::DuplicateAttribute`] is emitted.
+    /// * End tag: only the name is recorded in `seen_attributes`; a repeated name emits
+    ///   [`Error::DuplicateAttribute`].
+    /// * Anything else trips a debug assertion and the attribute is dropped.
+    fn flush_current_attribute(&mut self) {
+        let (name, value) = match self.current_attribute.take() {
+            Some(pair) => pair,
+            None => return,
+        };
+
+        let duplicate = match &mut self.current_token {
+            Some(Token::StartTag(tag)) => {
+                let mut already_present = false;
+                tag.attributes
+                    .entry(name)
+                    .and_modify(|_| already_present = true)
+                    .or_insert(value);
+                already_present
+            }
+            // `insert` returns `false` when the name was already in the set.
+            Some(Token::EndTag(_)) => !self.seen_attributes.insert(name),
+            _ => {
+                debug_assert!(false);
+                false
+            }
+        };
+
+        if duplicate {
+            self.emit_error(Error::DuplicateAttribute);
+        }
+    }
+
+    /// Emits the buffered character data as a single [`Token::String`], leaving the buffer empty.
+    /// Does nothing when the buffer is already empty.
+    fn flush_current_characters(&mut self) {
+        if !self.current_characters.is_empty() {
+            // The buffer is emptied before `emit_token` runs, so its nested call back into this
+            // function returns immediately.
+            let text = mem::take(&mut self.current_characters);
+            self.emit_token(Token::String(text));
+        }
+    }
+}
+
+impl<R> Emitter<R> for DefaultEmitter<(), R> {
+    type Token = Token<()>;
+
+    /// Replaces the remembered start-tag name; `None` leaves it empty.
+    fn set_last_start_tag(&mut self, last_start_tag: Option<&str>) {
+        self.last_start_tag.clear();
+        if let Some(name) = last_start_tag {
+            self.last_start_tag.push_str(name);
+        }
+    }
+
+    /// Flushes any character data that is still buffered.
+    fn emit_eof(&mut self) {
+        self.flush_current_characters();
+    }
+
+    /// Queues an error token.
+    fn emit_error(&mut self, error: Error) {
+        // Deliberately skips `self.emit_token` and its character flush: the error's position
+        // relative to surrounding text does not need to be that exact.
+        let token = Token::Error(error);
+        self.emitted_tokens.push_front(token);
+    }
+
+    /// Removes and returns the oldest queued token, if any.
+    fn pop_token(&mut self) -> Option<Self::Token> {
+        self.emitted_tokens.pop_back()
+    }
+
+    /// Appends `s` to the buffered character data; nothing is queued until the next flush.
+    fn emit_string(&mut self, s: &str) {
+        self.current_characters.push_str(s);
+    }
+
+    /// Starts a fresh start tag as the current token. The reader is not consulted.
+    fn init_start_tag(&mut self, _reader: &R) {
+        let tag = StartTag::default();
+        self.current_token = Some(Token::StartTag(tag));
+    }
+
+    /// Starts a fresh end tag as the current token and forgets previously seen attribute names.
+    /// The reader is not consulted.
+    fn init_end_tag(&mut self, _reader: &R) {
+        self.seen_attributes.clear();
+        let tag = EndTag::default();
+        self.current_token = Some(Token::EndTag(tag));
+    }
+
+    /// Starts an empty comment as the current token. The reader is not consulted.
+    fn init_comment(&mut self, _reader: &R) {
+        let comment = Token::Comment(String::new());
+        self.current_token = Some(comment);
+    }
+
+    /// Finishes the current start or end tag and queues it.
+    ///
+    /// A start tag's name is remembered as the last start tag. An end tag on which attributes
+    /// were seen emits [`Error::EndTagWithAttributes`] first.
+    ///
+    /// # Panics
+    ///
+    /// Panics if there is no current token.
+    fn emit_current_tag(&mut self) {
+        self.flush_current_attribute();
+        let token = self.current_token.take().unwrap();
+        match &token {
+            Token::StartTag(tag) => {
+                self.set_last_start_tag(Some(&tag.name));
+            }
+            Token::EndTag(_) => {
+                if !self.seen_attributes.is_empty() {
+                    self.emit_error(Error::EndTagWithAttributes);
+                }
+                self.seen_attributes.clear();
+            }
+            _ => debug_assert!(false),
+        }
+        self.emit_token(token);
+    }
+
+    /// Queues the current token, which is expected to be a comment.
+    ///
+    /// # Panics
+    ///
+    /// Panics if there is no current token.
+    fn emit_current_comment(&mut self) {
+        let token = self.current_token.take().unwrap();
+        debug_assert!(matches!(token, Token::Comment(_)));
+        self.emit_token(token);
+    }
+
+    /// Queues the current token, which is expected to be a doctype.
+    ///
+    /// # Panics
+    ///
+    /// Panics if there is no current token.
+    fn emit_current_doctype(&mut self) {
+        let token = self.current_token.take().unwrap();
+        debug_assert!(matches!(token, Token::Doctype(_)));
+        self.emit_token(token);
+    }
+
+    /// Marks the current start tag as self-closing; on an end tag this instead emits
+    /// [`Error::EndTagWithTrailingSolidus`].
+    ///
+    /// # Panics
+    ///
+    /// Panics if there is no current token.
+    fn set_self_closing(&mut self) {
+        match self.current_token.as_mut().unwrap() {
+            Token::StartTag(tag) => {
+                tag.self_closing = true;
+            }
+            Token::EndTag(_) => {
+                self.emit_error(Error::EndTagWithTrailingSolidus);
+            }
+            _ => {
+                debug_assert!(false);
+            }
+        }
+    }
+
+    /// Sets the force-quirks flag on the current doctype.
+    fn set_force_quirks(&mut self) {
+        if let Some(Token::Doctype(doctype)) = &mut self.current_token {
+            doctype.force_quirks = true;
+        } else {
+            debug_assert!(false);
+        }
+    }
+
+    /// Appends `s` to the name of the current start or end tag.
+    fn push_tag_name(&mut self, s: &str) {
+        let name = match &mut self.current_token {
+            Some(Token::StartTag(tag)) => &mut tag.name,
+            Some(Token::EndTag(tag)) => &mut tag.name,
+            _ => {
+                debug_assert!(false);
+                return;
+            }
+        };
+        name.push_str(s);
+    }
+
+    /// Appends `s` to the text of the current comment.
+    fn push_comment(&mut self, s: &str) {
+        if let Some(Token::Comment(text)) = &mut self.current_token {
+            text.push_str(s);
+        } else {
+            debug_assert!(false);
+        }
+    }
+
+    /// Appends `s` to the name of the current doctype.
+    fn push_doctype_name(&mut self, s: &str) {
+        if let Some(Token::Doctype(doctype)) = &mut self.current_token {
+            doctype.name.push_str(s);
+        } else {
+            debug_assert!(false);
+        }
+    }
+
+    /// Starts a doctype with an empty name, no identifiers and force-quirks off as the current
+    /// token. The reader is not consulted.
+    fn init_doctype(&mut self, _reader: &R) {
+        let doctype = Doctype {
+            name: String::new(),
+            force_quirks: false,
+            public_identifier: None,
+            system_identifier: None,
+        };
+        self.current_token = Some(Token::Doctype(doctype));
+    }
+
+    /// Commits any attribute under construction, then starts a new, empty one. The reader is
+    /// not consulted.
+    fn init_attribute(&mut self, _reader: &R) {
+        self.flush_current_attribute();
+        self.current_attribute = Some((String::new(), String::new()));
+    }
+
+    /// Appends `s` to the name of the attribute under construction.
+    ///
+    /// # Panics
+    ///
+    /// Panics if no attribute is under construction.
+    fn push_attribute_name(&mut self, s: &str) {
+        let (name, _) = self.current_attribute.as_mut().unwrap();
+        name.push_str(s);
+    }
+
+    /// Appends `s` to the value of the attribute under construction.
+    ///
+    /// # Panics
+    ///
+    /// Panics if no attribute is under construction.
+    fn push_attribute_value(&mut self, s: &str) {
+        let (_, value) = self.current_attribute.as_mut().unwrap();
+        value.push_str(s);
+    }
+
+    /// Sets the public identifier of the current doctype to a copy of `value`.
+    fn set_doctype_public_identifier(&mut self, value: &str) {
+        match &mut self.current_token {
+            Some(Token::Doctype(doctype)) => {
+                doctype.public_identifier = Some(value.to_owned());
+            }
+            _ => debug_assert!(false),
+        }
+    }
+
+    /// Sets the system identifier of the current doctype to a copy of `value`.
+    fn set_doctype_system_identifier(&mut self, value: &str) {
+        match &mut self.current_token {
+            Some(Token::Doctype(doctype)) => {
+                doctype.system_identifier = Some(value.to_owned());
+            }
+            _ => debug_assert!(false),
+        }
+    }
+
+    /// Appends `s` to the current doctype's public identifier, which must already be set.
+    fn push_doctype_public_identifier(&mut self, s: &str) {
+        match &mut self.current_token {
+            Some(Token::Doctype(Doctype {
+                public_identifier: Some(id),
+                ..
+            })) => id.push_str(s),
+            _ => debug_assert!(false),
+        }
+    }
+
+    /// Appends `s` to the current doctype's system identifier, which must already be set.
+    fn push_doctype_system_identifier(&mut self, s: &str) {
+        match &mut self.current_token {
+            Some(Token::Doctype(Doctype {
+                system_identifier: Some(id),
+                ..
+            })) => id.push_str(s),
+            _ => debug_assert!(false),
+        }
+    }
+
+    /// Returns `true` when the current token is an end tag whose name equals the remembered,
+    /// non-empty last start-tag name.
+    fn current_is_appropriate_end_tag_token(&mut self) -> bool {
+        matches!(
+            &self.current_token,
+            Some(Token::EndTag(tag))
+                if !self.last_start_tag.is_empty() && self.last_start_tag == tag.name
+        )
+    }
+}