summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Martin Fischer <martin@push-f.com> 2021-11-30 17:27:05 +0100
committer Martin Fischer <martin@push-f.com> 2021-12-05 02:52:36 +0100
commit b35817f2bbde6ba24d4286c9a1782815c28613a0 (patch)
tree 73feb5ec1b722e30a86d0dd1c17435a7b75c322a
parent 927ac122a63ad5e1b8037a895d9e9b63883bcc01 (diff)
spans: copy DefaultEmitter to new span module
-rw-r--r-- src/lib.rs 1
-rw-r--r-- src/spans.rs 265
2 files changed, 266 insertions, 0 deletions
diff --git a/src/lib.rs b/src/lib.rs
index b222193..40f9588 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,6 +9,7 @@ mod error;
mod machine;
mod never;
mod reader;
+mod spans;
mod tokenizer;
mod utils;
diff --git a/src/spans.rs b/src/spans.rs
new file mode 100644
index 0000000..ea3409c
--- /dev/null
+++ b/src/spans.rs
@@ -0,0 +1,265 @@
+use std::{
+ collections::{BTreeSet, VecDeque},
+ marker::PhantomData,
+ mem,
+};
+
+use crate::{Doctype, Emitter, EndTag, Error, StartTag, Token};
+
+/// The default implementation of [`crate::Emitter`], used to produce ("emit") tokens.
+///
+/// `S` is the type parameter handed to [`Token`]; `R` is the reader type, which is only
+/// tracked at the type level and never stored.
+pub struct DefaultEmitter<S, R> {
+    /// Text collected by `emit_string` that has not been queued as a `Token::String` yet.
+    current_characters: String,
+    /// The tag, comment or doctype that is currently being assembled, if any.
+    current_token: Option<Token<S>>,
+    /// Name of the start tag that was emitted last; empty when there is none.
+    last_start_tag: String,
+    /// `(name, value)` of the attribute that is currently being assembled, if any.
+    current_attribute: Option<(String, String)>,
+    /// Attribute names encountered on the current end tag, kept for duplicate detection.
+    seen_attributes: BTreeSet<String>,
+    /// Finished tokens waiting for `pop_token`: pushed at the front, popped from the back.
+    emitted_tokens: VecDeque<Token<S>>,
+    /// Zero-sized marker that ties the emitter to the reader type `R`.
+    reader: PhantomData<R>,
+}
+
+// Written by hand rather than derived: `#[derive(Default)]` would add `S: Default` and
+// `R: Default` bounds, which none of the fields actually need.
+impl<S, R> Default for DefaultEmitter<S, R> {
+    /// Creates an emitter with empty buffers, no token under construction and an empty
+    /// output queue.
+    fn default() -> Self {
+        Self {
+            current_characters: String::new(),
+            current_token: None,
+            last_start_tag: String::new(),
+            current_attribute: None,
+            seen_attributes: BTreeSet::new(),
+            emitted_tokens: VecDeque::new(),
+            reader: PhantomData,
+        }
+    }
+}
+
+impl<R> DefaultEmitter<(), R> {
+    /// Queues `token` for `pop_token`, after first queueing any buffered character data so
+    /// that the text comes out ahead of the token.
+    fn emit_token(&mut self, token: Token<()>) {
+        self.flush_current_characters();
+        self.emitted_tokens.push_front(token);
+    }
+
+    /// Moves the attribute under construction (if any) onto the current tag.
+    ///
+    /// On a start tag the first occurrence of a name wins and is stored in the tag's
+    /// attributes. On an end tag only the name is recorded. In both cases a repeated name
+    /// results in `Error::DuplicateAttribute` being emitted.
+    fn flush_current_attribute(&mut self) {
+        let (name, value) = match self.current_attribute.take() {
+            Some(attribute) => attribute,
+            None => return,
+        };
+
+        let is_duplicate = match self.current_token {
+            Some(Token::StartTag(ref mut tag)) => {
+                let mut already_present = false;
+                tag.attributes
+                    .entry(name)
+                    .and_modify(|_| already_present = true)
+                    .or_insert(value);
+                already_present
+            }
+            // `insert` reports `false` when the name was already in the set.
+            Some(Token::EndTag(_)) => !self.seen_attributes.insert(name),
+            _ => {
+                // Attributes are only expected while a tag is being built.
+                debug_assert!(false);
+                false
+            }
+        };
+
+        if is_duplicate {
+            self.emit_error(Error::DuplicateAttribute);
+        }
+    }
+
+    /// Queues the buffered character data as a single `Token::String` and leaves the buffer
+    /// empty. Does nothing when no text is buffered.
+    fn flush_current_characters(&mut self) {
+        if !self.current_characters.is_empty() {
+            let text = mem::take(&mut self.current_characters);
+            // The buffer is empty at this point, so the token can be queued directly.
+            self.emitted_tokens.push_front(Token::String(text));
+        }
+    }
+}
+
+impl<R> Emitter<R> for DefaultEmitter<(), R> {
+    type Token = Token<()>;
+
+    /// Replaces the remembered start tag name; `None` leaves it empty.
+    fn set_last_start_tag(&mut self, last_start_tag: Option<&str>) {
+        self.last_start_tag.clear();
+        if let Some(name) = last_start_tag {
+            self.last_start_tag.push_str(name);
+        }
+    }
+
+    /// Queues any buffered character data; no separate end-of-file token is produced.
+    fn emit_eof(&mut self) {
+        self.flush_current_characters();
+    }
+
+    /// Queues `error` as a `Token::Error`.
+    fn emit_error(&mut self, error: Error) {
+        // Deliberately not routed through `self.emit_token`: buffered characters stay
+        // buffered, since the error location does not need to be that exact.
+        let token = Token::Error(error);
+        self.emitted_tokens.push_front(token);
+    }
+
+    /// Hands out the oldest queued token, or `None` when the queue is empty.
+    fn pop_token(&mut self) -> Option<Self::Token> {
+        self.emitted_tokens.pop_back()
+    }
+
+    /// Appends `s` to the buffered character data.
+    fn emit_string(&mut self, s: &str) {
+        self.current_characters += s;
+    }
+
+    /// Starts a fresh, default-initialised start tag as the current token.
+    fn init_start_tag(&mut self, _reader: &R) {
+        let tag = StartTag::default();
+        self.current_token = Some(Token::StartTag(tag));
+    }
+
+    /// Starts a fresh, default-initialised end tag and forgets previously seen attribute
+    /// names.
+    fn init_end_tag(&mut self, _reader: &R) {
+        self.seen_attributes.clear();
+        let tag = EndTag::default();
+        self.current_token = Some(Token::EndTag(tag));
+    }
+
+    /// Starts an empty comment as the current token.
+    fn init_comment(&mut self, _reader: &R) {
+        let comment = Token::Comment(String::new());
+        self.current_token = Some(comment);
+    }
+
+    /// Finishes the current tag and queues it.
+    ///
+    /// A start tag becomes the new "last start tag". An end tag that carried attributes
+    /// triggers `Error::EndTagWithAttributes`.
+    ///
+    /// Panics if there is no current token.
+    fn emit_current_tag(&mut self) {
+        self.flush_current_attribute();
+        let finished = self.current_token.take().unwrap();
+        match &finished {
+            Token::StartTag(tag) => self.set_last_start_tag(Some(&tag.name)),
+            Token::EndTag(_) => {
+                if !self.seen_attributes.is_empty() {
+                    self.emit_error(Error::EndTagWithAttributes);
+                }
+                self.seen_attributes.clear();
+            }
+            _ => debug_assert!(false),
+        }
+        self.emit_token(finished);
+    }
+
+    /// Queues the current token, which is expected to be a comment.
+    ///
+    /// Panics if there is no current token.
+    fn emit_current_comment(&mut self) {
+        let finished = self.current_token.take().unwrap();
+        debug_assert!(matches!(&finished, Token::Comment(..)));
+        self.emit_token(finished);
+    }
+
+    /// Queues the current token, which is expected to be a doctype.
+    ///
+    /// Panics if there is no current token.
+    fn emit_current_doctype(&mut self) {
+        let finished = self.current_token.take().unwrap();
+        debug_assert!(matches!(&finished, Token::Doctype(..)));
+        self.emit_token(finished);
+    }
+
+    /// Marks the current start tag as self-closing; on an end tag this emits
+    /// `Error::EndTagWithTrailingSolidus` instead.
+    ///
+    /// Panics if there is no current token.
+    fn set_self_closing(&mut self) {
+        let current = self.current_token.as_mut().unwrap();
+        match current {
+            Token::StartTag(tag) => tag.self_closing = true,
+            Token::EndTag(_) => self.emit_error(Error::EndTagWithTrailingSolidus),
+            _ => debug_assert!(false),
+        }
+    }
+
+    /// Sets the force-quirks flag of the current doctype.
+    fn set_force_quirks(&mut self) {
+        if let Some(Token::Doctype(doctype)) = self.current_token.as_mut() {
+            doctype.force_quirks = true;
+        } else {
+            debug_assert!(false);
+        }
+    }
+
+    /// Appends `s` to the name of the current start or end tag.
+    fn push_tag_name(&mut self, s: &str) {
+        match self.current_token.as_mut() {
+            Some(Token::StartTag(tag)) => tag.name.push_str(s),
+            Some(Token::EndTag(tag)) => tag.name.push_str(s),
+            _ => debug_assert!(false),
+        }
+    }
+
+    /// Appends `s` to the text of the current comment.
+    fn push_comment(&mut self, s: &str) {
+        if let Some(Token::Comment(text)) = self.current_token.as_mut() {
+            text.push_str(s);
+        } else {
+            debug_assert!(false);
+        }
+    }
+
+    /// Appends `s` to the name of the current doctype.
+    fn push_doctype_name(&mut self, s: &str) {
+        if let Some(Token::Doctype(doctype)) = self.current_token.as_mut() {
+            doctype.name.push_str(s);
+        } else {
+            debug_assert!(false);
+        }
+    }
+
+    /// Starts a doctype with an empty name, no identifiers and force-quirks switched off.
+    fn init_doctype(&mut self, _reader: &R) {
+        let doctype = Doctype {
+            name: String::new(),
+            public_identifier: None,
+            system_identifier: None,
+            force_quirks: false,
+        };
+        self.current_token = Some(Token::Doctype(doctype));
+    }
+
+    /// Stores the attribute built so far (if any) and begins a new, empty one.
+    fn init_attribute(&mut self, _reader: &R) {
+        self.flush_current_attribute();
+        let blank = (String::new(), String::new());
+        self.current_attribute = Some(blank);
+    }
+
+    /// Appends `s` to the name of the attribute under construction.
+    ///
+    /// Panics if no attribute has been started.
+    fn push_attribute_name(&mut self, s: &str) {
+        let (name, _) = self.current_attribute.as_mut().unwrap();
+        name.push_str(s);
+    }
+
+    /// Appends `s` to the value of the attribute under construction.
+    ///
+    /// Panics if no attribute has been started.
+    fn push_attribute_value(&mut self, s: &str) {
+        let (_, value) = self.current_attribute.as_mut().unwrap();
+        value.push_str(s);
+    }
+
+    /// Sets the public identifier of the current doctype to a copy of `value`.
+    fn set_doctype_public_identifier(&mut self, value: &str) {
+        match self.current_token.as_mut() {
+            Some(Token::Doctype(doctype)) => {
+                doctype.public_identifier = Some(String::from(value));
+            }
+            _ => debug_assert!(false),
+        }
+    }
+
+    /// Sets the system identifier of the current doctype to a copy of `value`.
+    fn set_doctype_system_identifier(&mut self, value: &str) {
+        match self.current_token.as_mut() {
+            Some(Token::Doctype(doctype)) => {
+                doctype.system_identifier = Some(String::from(value));
+            }
+            _ => debug_assert!(false),
+        }
+    }
+
+    /// Appends `s` to the public identifier of the current doctype, which must already
+    /// have been set.
+    fn push_doctype_public_identifier(&mut self, s: &str) {
+        match self.current_token.as_mut() {
+            Some(Token::Doctype(Doctype {
+                public_identifier: Some(identifier),
+                ..
+            })) => identifier.push_str(s),
+            _ => debug_assert!(false),
+        }
+    }
+
+    /// Appends `s` to the system identifier of the current doctype, which must already
+    /// have been set.
+    fn push_doctype_system_identifier(&mut self, s: &str) {
+        match self.current_token.as_mut() {
+            Some(Token::Doctype(Doctype {
+                system_identifier: Some(identifier),
+                ..
+            })) => identifier.push_str(s),
+            _ => debug_assert!(false),
+        }
+    }
+
+    /// Returns `true` when the current token is an end tag whose name equals the
+    /// remembered, non-empty last start tag name.
+    fn current_is_appropriate_end_tag_token(&mut self) -> bool {
+        matches!(
+            &self.current_token,
+            Some(Token::EndTag(tag))
+                if !self.last_start_tag.is_empty() && self.last_start_tag == tag.name
+        )
+    }
+}