diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/basic_emitter.rs | 178 | ||||
| -rw-r--r-- | src/naive_parser.rs | 2 | ||||
| -rw-r--r-- | src/tokenizer.rs | 7 | 
3 files changed, 157 insertions, 30 deletions
| diff --git a/src/basic_emitter.rs b/src/basic_emitter.rs index bcb3f41..e67447b 100644 --- a/src/basic_emitter.rs +++ b/src/basic_emitter.rs @@ -1,20 +1,36 @@ +use std::collections::btree_map::Entry; +use std::collections::BTreeSet;  use std::collections::VecDeque;  use std::ops::Range; +use crate::let_else::assume; +use crate::offset::NoopOffset;  use crate::offset::Offset; +use crate::token::{Doctype, EndTag, StartTag, Token};  use crate::Emitter;  use crate::Error; -use crate::Token;  /// An [`Emitter`] implementation that yields [`Token`]. -pub struct BasicEmitter<O> { +pub struct BasicEmitter<O = NoopOffset> { +    current_token: Option<Token>, +    current_attribute_name: String, +    current_attr_internal: crate::token::AttrInternal, +    seen_attributes: BTreeSet<String>, +    emitted_tokens: VecDeque<Token>,      errors: VecDeque<(Error, Range<O>)>, +    attr_name_span: Range<O>,  }  impl<O: Default> Default for BasicEmitter<O> {      fn default() -> Self {          BasicEmitter { +            current_token: None, +            current_attribute_name: String::new(), +            current_attr_internal: Default::default(), +            seen_attributes: BTreeSet::new(), +            emitted_tokens: VecDeque::new(),              errors: VecDeque::new(), +            attr_name_span: Default::default(),          }      }  } @@ -30,97 +46,209 @@ impl<O> Iterator for BasicEmitter<O> {      type Item = Token;      fn next(&mut self) -> Option<Self::Item> { -        todo!() +        self.emitted_tokens.pop_back()      }  }  #[allow(unused_variables)]  impl<O: Offset> Emitter<O> for BasicEmitter<O> {      fn report_error(&mut self, error: Error, span: Range<O>) { -        todo!() +        self.errors.push_back((error, span));      }      fn emit_char(&mut self, c: char) { -        todo!() +        self.emit_token(Token::Char(c));      }      fn emit_eof(&mut self) { -        todo!() +        self.emit_token(Token::EndOfFile);      }      fn init_start_tag(&mut self, tag_offset: O, name_offset: O) { -        todo!() +        self.current_token = Some(Token::StartTag(StartTag { +            self_closing: false, +            name: String::new(), +            attributes: Default::default(), +        }));      }      fn init_end_tag(&mut self, tag_offset: O, name_offset: O) { -        todo!() +        self.current_token = Some(Token::EndTag(EndTag { +            name: String::new(), +        })); +        self.seen_attributes.clear();      }      fn push_tag_name(&mut self, s: &str) { -        todo!() +        assume!( +            Some(Token::StartTag(StartTag { name, .. }) | Token::EndTag(EndTag { name, .. })), +            &mut self.current_token +        ); +        name.push_str(s);      }      fn init_attribute_name(&mut self, offset: O) { -        todo!() +        self.flush_current_attribute(); +        self.attr_name_span.start = offset;      }      fn push_attribute_name(&mut self, s: &str) { -        todo!() +        self.current_attribute_name.push_str(s); +    } + +    fn terminate_attribute_name(&mut self, offset: O) { +        self.attr_name_span.end = offset;      }      fn push_attribute_value(&mut self, s: &str) { -        todo!() +        self.current_attr_internal.value.push_str(s);      }      fn set_self_closing(&mut self, slash_span: Range<O>) { -        todo!() +        let token = self.current_token.as_mut().unwrap(); + +        match token { +            Token::StartTag(tag) => { +                tag.self_closing = true; +            } +            Token::EndTag(_) => { +                self.report_error(Error::EndTagWithTrailingSolidus, slash_span); +            } +            other => debug_assert!(false, "unexpected current_token: {other:?}"), +        }      }      fn emit_current_tag(&mut self, offset: O) { -        todo!() +        self.flush_current_attribute(); +        let mut token = self.current_token.take().unwrap(); +        match &mut token { +            Token::EndTag(_) => { +                if !self.seen_attributes.is_empty() { +                    self.report_error(Error::EndTagWithAttributes, self.attr_name_span.clone()); +                } +                self.seen_attributes.clear(); +            } +            Token::StartTag(_) => {} +            other => { +                debug_assert!(false, "unexpected current_token: {other:?}"); +                return; +            } +        } +        self.emit_token(token);      }      fn init_comment(&mut self, data_start_offset: O) { -        todo!() +        self.current_token = Some(Token::Comment(String::new()));      }      fn push_comment(&mut self, s: &str) { -        todo!() +        assume!(Some(Token::Comment(data)), &mut self.current_token); +        data.push_str(s);      }      fn emit_current_comment(&mut self, data_end_offset: O) { -        todo!() +        let token = self.current_token.take().unwrap(); +        self.emit_token(token);      }      fn init_doctype(&mut self, offset: O) { -        todo!() +        self.current_token = Some(Token::Doctype(Doctype { +            name: None, +            force_quirks: false, +            public_id: None, +            system_id: None, +        })); +    } + +    fn init_doctype_name(&mut self, offset: O) { +        assume!(Some(Token::Doctype(doctype)), &mut self.current_token); +        doctype.name = Some("".into());      }      fn push_doctype_name(&mut self, s: &str) { -        todo!() +        assume!( +            Some(Token::Doctype(Doctype { +                name: Some(name), +                .. +            })), +            &mut self.current_token +        ); +        name.push_str(s);      }      fn init_doctype_public_id(&mut self, offset: O) { -        todo!() +        assume!(Some(Token::Doctype(doctype)), &mut self.current_token); +        doctype.public_id = Some("".to_owned());      }      fn push_doctype_public_id(&mut self, s: &str) { -        todo!() +        assume!( +            Some(Token::Doctype(Doctype { +                public_id: Some(public_id), +                .. +            })), +            &mut self.current_token +        ); +        public_id.push_str(s);      }      fn init_doctype_system_id(&mut self, offset: O) { -        todo!() +        assume!(Some(Token::Doctype(doctype)), &mut self.current_token); +        doctype.system_id = Some("".to_owned());      }      fn push_doctype_system_id(&mut self, s: &str) { -        todo!() +        assume!( +            Some(Token::Doctype(Doctype { +                system_id: Some(id), +                .. +            })), +            &mut self.current_token +        ); +        id.push_str(s);      }      fn set_force_quirks(&mut self) { -        todo!() +        assume!(Some(Token::Doctype(doctype)), &mut self.current_token); +        doctype.force_quirks = true;      }      fn emit_current_doctype(&mut self, offset: O) { -        todo!() +        let token = self.current_token.take().unwrap(); +        self.emit_token(token); +    } +} + +impl<O> BasicEmitter<O> { +    fn emit_token(&mut self, token: Token) { +        self.emitted_tokens.push_front(token); +    } + +    fn flush_current_attribute(&mut self) +    where +        O: Offset, +    { +        if self.current_attribute_name.is_empty() { +            return; +        } +        let name = std::mem::take(&mut self.current_attribute_name); +        let attr_internal = std::mem::take(&mut self.current_attr_internal); + +        match &mut self.current_token { +            Some(Token::StartTag(tag)) => match tag.attributes.inner.entry(name) { +                Entry::Vacant(vacant) => { +                    vacant.insert(attr_internal); +                } +                Entry::Occupied(_) => { +                    self.report_error(Error::DuplicateAttribute, self.attr_name_span.clone()); +                } +            }, +            Some(Token::EndTag(_)) => { +                if !self.seen_attributes.insert(name) { +                    self.report_error(Error::DuplicateAttribute, self.attr_name_span.clone()); +                } +            } +            other => debug_assert!(false, "unexpected current_token: {other:?}"), +        }      }  } diff --git a/src/naive_parser.rs b/src/naive_parser.rs index 4f8dc0d..70b6522 100644 --- a/src/naive_parser.rs +++ b/src/naive_parser.rs @@ -12,7 +12,7 @@ use crate::{BasicEmitter, Emitter, Event, State, Tokenizer};  ///  /// * it naively emits any CDATA sections as bogus comments, for example:  /// -///   ```no_run TODO: run again once BasicEmitter has been implemented +///   ```  ///   # use html5tokenizer::{NaiveParser, Token};  ///   let html = "<svg><![CDATA[I love SVG]]>";  ///   let mut tokens = NaiveParser::new(html).flatten(); diff --git a/src/tokenizer.rs b/src/tokenizer.rs index decd4df..3359637 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -15,12 +15,11 @@ pub use machine::State as InternalState;  /// Iterating over the tokenizer directly without calling [`Tokenizer::set_state`]  /// results in wrong state transitions:  /// -/// ```ignore TODO: unignore once the BasicEmitter has been implemented -/// # use html5tokenizer::{DefaultEmitter, Event, Tokenizer, Token}; -/// let emitter = DefaultEmitter::default(); +/// ``` +/// # use html5tokenizer::{BasicEmitter, Event, Tokenizer, Token}; +/// let emitter = BasicEmitter::default();  /// let html = "<script><b>";  /// let mut tokens = Tokenizer::new(html, emitter).flatten(); -/// let mut tokens = tokens.map(|event| match event { Event::Token((token, _)) => Event::Token(token), Event::CdataOpen => Event::CdataOpen }); // TODO: remove once BasicEmitter can be used instead  /// assert!(matches!(tokens.next(), Some(Event::Token(Token::StartTag(_)))));  /// assert!(matches!(tokens.next(), Some(Event::Token(Token::StartTag(_)))));  /// ``` | 
