diff options
-rw-r--r-- | src/emitter.rs | 83 | ||||
-rw-r--r-- | src/spans.rs | 347 | ||||
-rw-r--r-- | tests/test_spans.rs | 9 |
3 files changed, 108 insertions, 331 deletions
diff --git a/src/emitter.rs b/src/emitter.rs index fe98c43..e872b1f 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -5,6 +5,7 @@ use std::collections::VecDeque; use std::marker::PhantomData; use std::mem; +use crate::spans::Span; use crate::Error; use crate::State; @@ -189,6 +190,7 @@ pub struct DefaultEmitter<R, S> { seen_attributes: BTreeSet<String>, emitted_tokens: VecDeque<Token<S>>, reader: PhantomData<R>, + attr_in_end_tag_span: Option<S>, } impl<R, S> Default for DefaultEmitter<R, S> { @@ -201,12 +203,13 @@ impl<R, S> Default for DefaultEmitter<R, S> { seen_attributes: BTreeSet::new(), emitted_tokens: VecDeque::new(), reader: PhantomData::default(), + attr_in_end_tag_span: None, } } } -impl<R> DefaultEmitter<R, ()> { - fn emit_token(&mut self, token: Token<()>) { +impl<R, S: Span<R>> DefaultEmitter<R, S> { + fn emit_token(&mut self, token: Token<S>) { self.flush_current_characters(); self.emitted_tokens.push_front(token); } @@ -219,12 +222,13 @@ impl<R> DefaultEmitter<R, ()> { vacant.insert(v); } Entry::Occupied(_) => { - self.push_error(Error::DuplicateAttribute); + self.push_error(Error::DuplicateAttribute, v.name_span); } }, Some(Token::EndTag(_)) => { + self.attr_in_end_tag_span = Some(v.name_span.clone()); if !self.seen_attributes.insert(k) { - self.push_error(Error::DuplicateAttribute); + self.push_error(Error::DuplicateAttribute, v.name_span); } } _ => { @@ -243,16 +247,15 @@ impl<R> DefaultEmitter<R, ()> { self.emit_token(Token::String(s)); } - fn push_error(&mut self, error: Error) { + fn push_error(&mut self, error: Error, span: S) { // bypass character flushing in self.emit_token: we don't need the error location to be // that exact - self.emitted_tokens - .push_front(Token::Error { error, span: () }); + self.emitted_tokens.push_front(Token::Error { error, span }); } } -impl<R> Emitter<R> for DefaultEmitter<R, ()> { - type Token = Token<()>; +impl<R, S: Span<R>> Emitter<R> for DefaultEmitter<R, S> { + type Token = Token<S>; fn set_last_start_tag(&mut self, last_start_tag: Option<&str>) { self.last_start_tag.clear(); @@ -264,8 +267,8 @@ impl<R> Emitter<R> for DefaultEmitter<R, ()> { self.flush_current_characters(); } - fn emit_error(&mut self, error: Error, _reader: &R) { - self.push_error(error); + fn emit_error(&mut self, error: Error, reader: &R) { + self.push_error(error, S::from_reader(reader)); } fn pop_token(&mut self) -> Option<Self::Token> { @@ -276,11 +279,17 @@ impl<R> Emitter<R> for DefaultEmitter<R, ()> { self.current_characters.push_str(s); } - fn init_start_tag(&mut self, _reader: &R) { - self.current_token = Some(Token::StartTag(Default::default())); + fn init_start_tag(&mut self, reader: &R) { + self.current_token = Some(Token::StartTag(StartTag { + name_span: S::from_reader(reader), + ..Default::default() + })); } - fn init_end_tag(&mut self, _reader: &R) { - self.current_token = Some(Token::EndTag(Default::default())); + fn init_end_tag(&mut self, reader: &R) { + self.current_token = Some(Token::EndTag(EndTag { + name_span: S::from_reader(reader), + ..Default::default() + })); self.seen_attributes.clear(); } @@ -293,7 +302,8 @@ impl<R> Emitter<R> for DefaultEmitter<R, ()> { match token { Token::EndTag(_) => { if !self.seen_attributes.is_empty() { - self.push_error(Error::EndTagWithAttributes); + let span = self.attr_in_end_tag_span.take().unwrap(); + self.push_error(Error::EndTagWithAttributes, span); } self.seen_attributes.clear(); } @@ -316,7 +326,7 @@ impl<R> Emitter<R> for DefaultEmitter<R, ()> { self.emit_token(doctype); } - fn set_self_closing(&mut self, _reader: &R) { + fn set_self_closing(&mut self, reader: &R) { let tag = self.current_token.as_mut().unwrap(); match tag { Token::StartTag(StartTag { @@ -326,7 +336,7 @@ impl<R> Emitter<R> for DefaultEmitter<R, ()> { *self_closing = true; } Token::EndTag(_) => { - self.push_error(Error::EndTagWithTrailingSolidus); + self.emit_error(Error::EndTagWithTrailingSolidus, reader); } _ => { debug_assert!(false); @@ -341,11 +351,21 @@ impl<R> Emitter<R> for DefaultEmitter<R, ()> { } fn push_tag_name(&mut self, s: &str) { match self.current_token { - Some(Token::StartTag(StartTag { ref mut name, .. })) => { + Some(Token::StartTag(StartTag { + ref mut name, + ref mut name_span, + .. + })) => { name.push_str(s); + name_span.push_str(s); } - Some(Token::EndTag(EndTag { ref mut name, .. })) => { + Some(Token::EndTag(EndTag { + ref mut name, + ref mut name_span, + .. + })) => { name.push_str(s); + name_span.push_str(s); } _ => debug_assert!(false), } @@ -373,15 +393,30 @@ impl<R> Emitter<R> for DefaultEmitter<R, ()> { })); } - fn init_attribute_name(&mut self, _reader: &R) { + fn init_attribute_name(&mut self, reader: &R) { self.flush_current_attribute(); - self.current_attribute = Some((String::new(), Attribute::default())); + self.current_attribute = Some(( + String::new(), + Attribute { + name_span: S::from_reader(reader), + ..Default::default() + }, + )); + } + fn init_attribute_value(&mut self, reader: &R, quoted: bool) { + self.current_attribute.as_mut().unwrap().1.value_span = + S::from_reader_with_offset(reader, quoted as usize); } + fn push_attribute_name(&mut self, s: &str) { - self.current_attribute.as_mut().unwrap().0.push_str(s); + let current_attr = self.current_attribute.as_mut().unwrap(); + current_attr.0.push_str(s); + current_attr.1.name_span.push_str(s); } fn push_attribute_value(&mut self, s: &str) { - self.current_attribute.as_mut().unwrap().1.value.push_str(s); + let current_attr = self.current_attribute.as_mut().unwrap(); + current_attr.1.value.push_str(s); + current_attr.1.value_span.push_str(s); } fn set_doctype_public_identifier(&mut self, value: &str) { if let Some(Token::Doctype(Doctype { diff --git a/src/spans.rs b/src/spans.rs index c582457..88d5eed 100644 --- a/src/spans.rs +++ b/src/spans.rs @@ -1,13 +1,18 @@ //! Source code spans. -use std::{ - collections::{btree_map::Entry, BTreeSet, VecDeque}, - marker::PhantomData, - mem, -}; +//! +//! The [`DefaultEmitter`](crate::DefaultEmitter) is generic over a [`Span`]. +//! This library comes with two Span implementations: +//! +//! * one for `()` which acts as the no-op implementation for when you don't want to track spans +//! * one for [`Range<usize>`] for when you do want to track spans +//! +//! To use the latter your reader however has to implement [`GetPos`]. +//! You can easily use any existing reader by wrapping it in the [`PosTracker`] struct +//! which implements the [`GetPos`] trait and takes care of tracking the current position. -use crate::{Attribute, Doctype, Emitter, EndTag, Error, Reader, StartTag, Token}; +use std::ops::Range; -type Span = std::ops::Range<usize>; +use crate::Reader; /// A trait to be implemented by readers that track their own position. pub trait GetPos { @@ -29,320 +34,60 @@ impl<R> GetPos for PosTracker<R> { } } -impl<R: Reader> Reader for PosTracker<R> { - type Error = R::Error; - - fn read_char(&mut self) -> Result<Option<char>, Self::Error> { - match self.reader.read_char()? { - Some(char) => { - self.position += char.len_utf8(); - Ok(Some(char)) - } - None => Ok(None), - } - } +/// Represents a character range in the source code. +pub trait Span<R>: Default + Clone { + /// Initializes a new span at the current position of the reader. + fn from_reader(reader: &R) -> Self; - fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result<bool, Self::Error> { - match self.reader.try_read_string(s, case_sensitive)? { - true => { - self.position += s.len(); - Ok(true) - } - false => Ok(false), - } - } -} - -/// The default implementation of [`crate::Emitter`], used to produce ("emit") tokens. -pub struct SpanEmitter<R> { - current_characters: String, - current_token: Option<Token<Span>>, - last_start_tag: String, - current_attribute: Option<(String, Attribute<Span>)>, - seen_attributes: BTreeSet<String>, - emitted_tokens: VecDeque<Token<Span>>, - reader: PhantomData<R>, - attr_in_end_tag_span: Option<Span>, -} + /// Initializes a new span at the current position of the reader with the given offset. + fn from_reader_with_offset(reader: &R, offset: usize) -> Self; -impl<R> Default for SpanEmitter<R> { - fn default() -> Self { - SpanEmitter { - current_characters: String::new(), - current_token: None, - last_start_tag: String::new(), - current_attribute: None, - seen_attributes: BTreeSet::new(), - emitted_tokens: VecDeque::new(), - reader: PhantomData::default(), - attr_in_end_tag_span: None, - } - } + /// Extends the span by the length of the given string. + fn push_str(&mut self, str: &str); } -impl<R: GetPos> SpanEmitter<R> { - fn emit_token(&mut self, token: Token<Span>) { - self.flush_current_characters(); - self.emitted_tokens.push_front(token); - } - - fn flush_current_attribute(&mut self) { - if let Some((k, v)) = self.current_attribute.take() { - match self.current_token { - Some(Token::StartTag(ref mut tag)) => match tag.attributes.entry(k) { - Entry::Vacant(vacant) => { - vacant.insert(v); - } - Entry::Occupied(_) => { - self.emit_error_span(Error::DuplicateAttribute, v.name_span); - } - }, - Some(Token::EndTag(_)) => { - self.attr_in_end_tag_span = Some(v.name_span.clone()); - if !self.seen_attributes.insert(k) { - self.emit_error_span(Error::DuplicateAttribute, v.name_span); - } - } - _ => { - debug_assert!(false); - } - } - } - } - - fn flush_current_characters(&mut self) { - if self.current_characters.is_empty() { - return; - } +impl<R> Span<R> for () { + fn from_reader(_reader: &R) -> Self {} - let s = mem::take(&mut self.current_characters); - self.emit_token(Token::String(s)); - } + fn from_reader_with_offset(_reader: &R, _offset: usize) -> Self {} - fn emit_error_span(&mut self, error: Error, span: Span) { - // bypass character flushing in self.emit_token: we don't need the error location to be - // that exact - self.emitted_tokens.push_front(Token::Error { error, span }); - } + fn push_str(&mut self, _str: &str) {} } -impl<R: GetPos> Emitter<R> for SpanEmitter<R> { - type Token = Token<Span>; - - fn set_last_start_tag(&mut self, last_start_tag: Option<&str>) { - self.last_start_tag.clear(); - self.last_start_tag - .push_str(last_start_tag.unwrap_or_default()); +impl<P: GetPos> Span<P> for Range<usize> { + fn from_reader(reader: &P) -> Self { + reader.get_pos() - 1..reader.get_pos() - 1 } - fn emit_eof(&mut self) { - self.flush_current_characters(); + fn from_reader_with_offset(reader: &P, offset: usize) -> Self { + reader.get_pos() - 1 + offset..reader.get_pos() - 1 + offset } - fn emit_error(&mut self, error: Error, reader: &R) { - self.emit_error_span(error, reader.get_pos() - 1..reader.get_pos() - 1) - } - - fn pop_token(&mut self) -> Option<Self::Token> { - self.emitted_tokens.pop_back() - } - - fn emit_string(&mut self, s: &str) { - self.current_characters.push_str(s); - } - - fn init_start_tag(&mut self, reader: &R) { - self.current_token = Some(Token::StartTag(StartTag { - name_span: reader.get_pos() - 1..reader.get_pos() - 1, - ..Default::default() - })); - } - fn init_end_tag(&mut self, reader: &R) { - self.current_token = Some(Token::EndTag(EndTag { - name_span: reader.get_pos() - 1..reader.get_pos() - 1, - ..Default::default() - })); - self.seen_attributes.clear(); - } - - fn init_comment(&mut self, _reader: &R) { - self.current_token = Some(Token::Comment(String::new())); - } - fn emit_current_tag(&mut self) { - self.flush_current_attribute(); - let mut token = self.current_token.take().unwrap(); - match token { - Token::EndTag(_) => { - if !self.seen_attributes.is_empty() { - let span = self.attr_in_end_tag_span.take().unwrap(); - self.emit_error_span(Error::EndTagWithAttributes, span); - } - self.seen_attributes.clear(); - } - Token::StartTag(ref mut _tag) => { - self.set_last_start_tag(Some(&_tag.name)); - } - _ => debug_assert!(false), - } - self.emit_token(token); - } - fn emit_current_comment(&mut self) { - let comment = self.current_token.take().unwrap(); - debug_assert!(matches!(comment, Token::Comment(_))); - self.emit_token(comment); + fn push_str(&mut self, str: &str) { + self.end += str.len(); } +} - fn emit_current_doctype(&mut self) { - let doctype = self.current_token.take().unwrap(); - debug_assert!(matches!(doctype, Token::Doctype(_))); - self.emit_token(doctype); - } +impl<R: Reader> Reader for PosTracker<R> { + type Error = R::Error; - fn set_self_closing(&mut self, reader: &R) { - let tag = self.current_token.as_mut().unwrap(); - match tag { - Token::StartTag(StartTag { - ref mut self_closing, - .. - }) => { - *self_closing = true; - } - Token::EndTag(_) => { - self.emit_error(Error::EndTagWithTrailingSolidus, reader); - } - _ => { - debug_assert!(false); - } - } - } - fn set_force_quirks(&mut self) { - match self.current_token { - Some(Token::Doctype(ref mut doctype)) => doctype.force_quirks = true, - _ => debug_assert!(false), - } - } - fn push_tag_name(&mut self, s: &str) { - match self.current_token { - Some(Token::StartTag(StartTag { - ref mut name, - ref mut name_span, - .. - })) => { - name.push_str(s); - name_span.end += s.len(); - } - Some(Token::EndTag(EndTag { - ref mut name, - ref mut name_span, - .. - })) => { - name.push_str(s); - name_span.end += s.len(); + fn read_char(&mut self) -> Result<Option<char>, Self::Error> { + match self.reader.read_char()? { + Some(char) => { + self.position += char.len_utf8(); + Ok(Some(char)) } - _ => debug_assert!(false), - } - } - - fn push_comment(&mut self, s: &str) { - match self.current_token { - Some(Token::Comment(ref mut data)) => data.push_str(s), - _ => debug_assert!(false), - } - } - - fn push_doctype_name(&mut self, s: &str) { - match self.current_token { - Some(Token::Doctype(ref mut doctype)) => doctype.name.push_str(s), - _ => debug_assert!(false), - } - } - fn init_doctype(&mut self, _reader: &R) { - self.current_token = Some(Token::Doctype(Doctype { - name: String::new(), - force_quirks: false, - public_identifier: None, - system_identifier: None, - })); - } - - fn init_attribute_name(&mut self, reader: &R) { - self.flush_current_attribute(); - self.current_attribute = Some(( - String::new(), - Attribute { - name_span: reader.get_pos() - 1..reader.get_pos() - 1, - ..Default::default() - }, - )); - } - - fn init_attribute_value(&mut self, reader: &R, quoted: bool) { - let current_attr = self.current_attribute.as_mut().unwrap(); - let offset = if quoted { 0 } else { 1 }; - current_attr.1.value_span = reader.get_pos() - offset..reader.get_pos() - offset; - } - - fn push_attribute_name(&mut self, s: &str) { - let current_attr = self.current_attribute.as_mut().unwrap(); - current_attr.0.push_str(s); - current_attr.1.name_span.end += s.len(); - } - fn push_attribute_value(&mut self, s: &str) { - let current_attr = self.current_attribute.as_mut().unwrap(); - current_attr.1.value.push_str(s); - current_attr.1.value_span.end += s.len(); - } - fn set_doctype_public_identifier(&mut self, value: &str) { - if let Some(Token::Doctype(Doctype { - ref mut public_identifier, - .. - })) = self.current_token - { - *public_identifier = Some(value.to_owned()); - } else { - debug_assert!(false); - } - } - fn set_doctype_system_identifier(&mut self, value: &str) { - if let Some(Token::Doctype(Doctype { - ref mut system_identifier, - .. - })) = self.current_token - { - *system_identifier = Some(value.to_owned()); - } else { - debug_assert!(false); - } - } - fn push_doctype_public_identifier(&mut self, s: &str) { - if let Some(Token::Doctype(Doctype { - public_identifier: Some(ref mut id), - .. - })) = self.current_token - { - id.push_str(s); - } else { - debug_assert!(false); - } - } - fn push_doctype_system_identifier(&mut self, s: &str) { - if let Some(Token::Doctype(Doctype { - system_identifier: Some(ref mut id), - .. - })) = self.current_token - { - id.push_str(s); - } else { - debug_assert!(false); + None => Ok(None), } } - fn current_is_appropriate_end_tag_token(&mut self) -> bool { - match self.current_token { - Some(Token::EndTag(ref tag)) => { - !self.last_start_tag.is_empty() && self.last_start_tag == tag.name + fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result<bool, Self::Error> { + match self.reader.try_read_string(s, case_sensitive)? { + true => { + self.position += s.len(); + Ok(true) } - _ => false, + false => Ok(false), } } } diff --git a/tests/test_spans.rs b/tests/test_spans.rs index aeb4a94..5b1e814 100644 --- a/tests/test_spans.rs +++ b/tests/test_spans.rs @@ -1,4 +1,4 @@ -use std::include_str; +use std::{include_str, ops::Range}; use codespan_reporting::{ self, @@ -6,10 +6,7 @@ use codespan_reporting::{ files::SimpleFiles, term::{self, termcolor::Buffer}, }; -use html5gum::{ - spans::{PosTracker, SpanEmitter}, - Readable, Token, Tokenizer, -}; +use html5gum::{spans::PosTracker, DefaultEmitter, Readable, StringReader, Token, Tokenizer}; #[test] fn test() { @@ -24,7 +21,7 @@ fn test() { reader: html.to_reader(), position: 0, }, - SpanEmitter::default(), + DefaultEmitter::<PosTracker<StringReader>, Range<usize>>::default(), ) .infallible() { |