diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/lib.rs | 1 | ||||
-rw-r--r-- | src/spans.rs | 265 |
2 files changed, 266 insertions, 0 deletions
@@ -9,6 +9,7 @@ mod error; mod machine; mod never; mod reader; +mod spans; mod tokenizer; mod utils; diff --git a/src/spans.rs b/src/spans.rs new file mode 100644 index 0000000..ea3409c --- /dev/null +++ b/src/spans.rs @@ -0,0 +1,265 @@ +use std::{ + collections::{BTreeSet, VecDeque}, + marker::PhantomData, + mem, +}; + +use crate::{Doctype, Emitter, EndTag, Error, StartTag, Token}; + +/// The default implementation of [`crate::Emitter`], used to produce ("emit") tokens. +pub struct DefaultEmitter<S, R> { + current_characters: String, + current_token: Option<Token<S>>, + last_start_tag: String, + current_attribute: Option<(String, String)>, + seen_attributes: BTreeSet<String>, + emitted_tokens: VecDeque<Token<S>>, + reader: PhantomData<R>, +} + +impl<S, R> Default for DefaultEmitter<S, R> { + fn default() -> Self { + DefaultEmitter { + current_characters: String::new(), + current_token: None, + last_start_tag: String::new(), + current_attribute: None, + seen_attributes: BTreeSet::new(), + emitted_tokens: VecDeque::new(), + reader: PhantomData::default(), + } + } +} + +impl<R> DefaultEmitter<(), R> { + fn emit_token(&mut self, token: Token<()>) { + self.flush_current_characters(); + self.emitted_tokens.push_front(token); + } + + fn flush_current_attribute(&mut self) { + if let Some((k, v)) = self.current_attribute.take() { + match self.current_token { + Some(Token::StartTag(ref mut tag)) => { + let mut error = None; + tag.attributes + .entry(k) + .and_modify(|_| { + error = Some(Error::DuplicateAttribute); + }) + .or_insert(v); + + if let Some(e) = error { + self.emit_error(e); + } + } + Some(Token::EndTag(_)) => { + if !self.seen_attributes.insert(k) { + self.emit_error(Error::DuplicateAttribute); + } + } + _ => { + debug_assert!(false); + } + } + } + } + + fn flush_current_characters(&mut self) { + if self.current_characters.is_empty() { + return; + } + + let s = mem::take(&mut self.current_characters); + self.emit_token(Token::String(s)); + } +} + +impl<R> Emitter<R> for DefaultEmitter<(), R> { + type Token = Token<()>; + + fn set_last_start_tag(&mut self, last_start_tag: Option<&str>) { + self.last_start_tag.clear(); + self.last_start_tag + .push_str(last_start_tag.unwrap_or_default()); + } + + fn emit_eof(&mut self) { + self.flush_current_characters(); + } + + fn emit_error(&mut self, error: Error) { + // bypass character flushing in self.emit_token: we don't need the error location to be + // that exact + self.emitted_tokens.push_front(Token::Error(error)); + } + + fn pop_token(&mut self) -> Option<Self::Token> { + self.emitted_tokens.pop_back() + } + + fn emit_string(&mut self, s: &str) { + self.current_characters.push_str(s); + } + + fn init_start_tag(&mut self, _reader: &R) { + self.current_token = Some(Token::StartTag(Default::default())); + } + fn init_end_tag(&mut self, _reader: &R) { + self.current_token = Some(Token::EndTag(Default::default())); + self.seen_attributes.clear(); + } + + fn init_comment(&mut self, _reader: &R) { + self.current_token = Some(Token::Comment(String::new())); + } + fn emit_current_tag(&mut self) { + self.flush_current_attribute(); + let mut token = self.current_token.take().unwrap(); + match token { + Token::EndTag(_) => { + if !self.seen_attributes.is_empty() { + self.emit_error(Error::EndTagWithAttributes); + } + self.seen_attributes.clear(); + } + Token::StartTag(ref mut _tag) => { + self.set_last_start_tag(Some(&_tag.name)); + } + _ => debug_assert!(false), + } + self.emit_token(token); + } + fn emit_current_comment(&mut self) { + let comment = self.current_token.take().unwrap(); + debug_assert!(matches!(comment, Token::Comment(_))); + self.emit_token(comment); + } + + fn emit_current_doctype(&mut self) { + let doctype = self.current_token.take().unwrap(); + debug_assert!(matches!(doctype, Token::Doctype(_))); + self.emit_token(doctype); + } + + fn set_self_closing(&mut self) { + let tag = self.current_token.as_mut().unwrap(); + match tag { + Token::StartTag(StartTag { + ref mut self_closing, + .. + }) => { + *self_closing = true; + } + Token::EndTag(_) => { + self.emit_error(Error::EndTagWithTrailingSolidus); + } + _ => { + debug_assert!(false); + } + } + } + fn set_force_quirks(&mut self) { + match self.current_token { + Some(Token::Doctype(ref mut doctype)) => doctype.force_quirks = true, + _ => debug_assert!(false), + } + } + fn push_tag_name(&mut self, s: &str) { + match self.current_token { + Some(Token::StartTag(StartTag { ref mut name, .. })) => { + name.push_str(s); + } + Some(Token::EndTag(EndTag { ref mut name, .. })) => { + name.push_str(s); + } + _ => debug_assert!(false), + } + } + + fn push_comment(&mut self, s: &str) { + match self.current_token { + Some(Token::Comment(ref mut data)) => data.push_str(s), + _ => debug_assert!(false), + } + } + + fn push_doctype_name(&mut self, s: &str) { + match self.current_token { + Some(Token::Doctype(ref mut doctype)) => doctype.name.push_str(s), + _ => debug_assert!(false), + } + } + fn init_doctype(&mut self, _reader: &R) { + self.current_token = Some(Token::Doctype(Doctype { + name: String::new(), + force_quirks: false, + public_identifier: None, + system_identifier: None, + })); + } + + fn init_attribute(&mut self, _reader: &R) { + self.flush_current_attribute(); + self.current_attribute = Some((String::new(), String::new())); + } + fn push_attribute_name(&mut self, s: &str) { + self.current_attribute.as_mut().unwrap().0.push_str(s); + } + fn push_attribute_value(&mut self, s: &str) { + self.current_attribute.as_mut().unwrap().1.push_str(s); + } + fn set_doctype_public_identifier(&mut self, value: &str) { + if let Some(Token::Doctype(Doctype { + ref mut public_identifier, + .. + })) = self.current_token + { + *public_identifier = Some(value.to_owned()); + } else { + debug_assert!(false); + } + } + fn set_doctype_system_identifier(&mut self, value: &str) { + if let Some(Token::Doctype(Doctype { + ref mut system_identifier, + .. + })) = self.current_token + { + *system_identifier = Some(value.to_owned()); + } else { + debug_assert!(false); + } + } + fn push_doctype_public_identifier(&mut self, s: &str) { + if let Some(Token::Doctype(Doctype { + public_identifier: Some(ref mut id), + .. + })) = self.current_token + { + id.push_str(s); + } else { + debug_assert!(false); + } + } + fn push_doctype_system_identifier(&mut self, s: &str) { + if let Some(Token::Doctype(Doctype { + system_identifier: Some(ref mut id), + .. + })) = self.current_token + { + id.push_str(s); + } else { + debug_assert!(false); + } + } + + fn current_is_appropriate_end_tag_token(&mut self) -> bool { + match self.current_token { + Some(Token::EndTag(ref tag)) => { + !self.last_start_tag.is_empty() && self.last_start_tag == tag.name + } + _ => false, + } + } +} |