diff options
| -rw-r--r-- | CHANGELOG.md | 2 | ||||
| -rw-r--r-- | README.md | 2 | ||||
| -rw-r--r-- | examples/spans.rs | 5 | ||||
| -rw-r--r-- | examples/tokenize.rs | 4 | ||||
| -rw-r--r-- | integration_tests/tests/test_html5lib.rs | 6 | ||||
| -rw-r--r-- | src/basic_emitter.rs | 126 | ||||
| -rw-r--r-- | src/lib.rs | 2 | ||||
| -rw-r--r-- | src/naive_parser.rs | 13 | ||||
| -rw-r--r-- | tests/test_spans.rs | 11 | 
9 files changed, 152 insertions, 19 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index 06831c3..146d627 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,7 @@    * Removed the `Error` variant.        (Errors now have to be queried separately with -    `TracingEmitter::drain_errors`.) +    `BasicEmitter::drain_errors` or `TracingEmitter::drain_errors`.)    * Replaced the `String` variant with a new `Char` variant.        (The tokenizer now emits chars instead of strings.) @@ -7,7 +7,7 @@ Spec-compliant HTML parsing [requires both tokenization and tree-construction][p  While this crate implements a spec-compliant HTML tokenizer it does not implement any  tree-construction. Instead it just provides a `NaiveParser` that may be used as follows: -```rust +```rust no_run TODO: run again once BasicEmitter has been implemented  use std::fmt::Write;  use html5tokenizer::{NaiveParser, Token}; diff --git a/examples/spans.rs b/examples/spans.rs index fc3c6a1..c1fe23b 100644 --- a/examples/spans.rs +++ b/examples/spans.rs @@ -4,11 +4,12 @@ use codespan_reporting::{      term,      term::termcolor::{ColorChoice, StandardStream},  }; -use html5tokenizer::{offset::PosTrackingReader, NaiveParser, Token}; +use html5tokenizer::{offset::PosTrackingReader, NaiveParser, Token, TracingEmitter};  fn main() {      let html = r#"<img src=example.jpg alt="some description">"#; -    let parser = NaiveParser::new(PosTrackingReader::new(html)); +    let parser = +        NaiveParser::new_with_emitter(PosTrackingReader::new(html), TracingEmitter::default());      let Token::StartTag(tag) = parser.flatten().next().unwrap() else {          panic!() diff --git a/examples/tokenize.rs b/examples/tokenize.rs index 791db0f..a7c2214 100644 --- a/examples/tokenize.rs +++ b/examples/tokenize.rs @@ -1,13 +1,13 @@  //! Let's you easily try out the tokenizer with e.g.  //! printf '<h1>Hello world!</h1>' | cargo run --example=tokenize -use html5tokenizer::{Tokenizer, TracingEmitter}; +use html5tokenizer::{BasicEmitter, Tokenizer};  use std::io::BufReader;  fn main() {      let mut tokenizer = Tokenizer::new(          BufReader::new(std::io::stdin().lock()), -        TracingEmitter::default(), +        BasicEmitter::default(),      );      while let Some(token) = tokenizer.next() {          for (error, _) in tokenizer.emitter_mut().drain_errors() { diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs index 36fb880..0040a01 100644 --- a/integration_tests/tests/test_html5lib.rs +++ b/integration_tests/tests/test_html5lib.rs @@ -68,13 +68,15 @@ fn test_tokenizer_file(path: &Path) {  fn run_test(fname: &str, test_i: usize, test: Test) {      for state in &test.initial_states { +        // TODO: test BasicEmitter here once it's implemented +          run_test_inner(              fname,              test_i,              &test,              state,              Tokenizer::new(&test.input, TracingEmitter::default()), -            "string", +            "TracingEmitter string",          );          run_test_inner( @@ -86,7 +88,7 @@ fn run_test(fname: &str, test_i: usize, test: Test) {                  BufReader::new(test.input.as_bytes()),                  TracingEmitter::default(),              ), -            "bufread", +            "TracingEmitter bufread",          );      }  } diff --git a/src/basic_emitter.rs b/src/basic_emitter.rs new file mode 100644 index 0000000..046b645 --- /dev/null +++ b/src/basic_emitter.rs @@ -0,0 +1,126 @@ +use std::collections::VecDeque; +use std::ops::Range; + +use crate::offset::Offset; +use crate::Emitter; +use crate::Error; +use crate::Token; + +/// An [`Emitter`] implementation that yields [`Token`]. +pub struct BasicEmitter<O> { +    errors: VecDeque<(Error, Range<O>)>, +} + +impl<O: Default> Default for BasicEmitter<O> { +    fn default() -> Self { +        BasicEmitter { +            errors: VecDeque::new(), +        } +    } +} + +impl<O> BasicEmitter<O> { +    /// Removes all encountered tokenizer errors and returns them as an iterator. +    pub fn drain_errors(&mut self) -> impl Iterator<Item = (Error, Range<O>)> + '_ { +        self.errors.drain(0..) +    } +} + +impl<O> Iterator for BasicEmitter<O> { +    type Item = Token<O>; + +    fn next(&mut self) -> Option<Self::Item> { +        todo!() +    } +} + +#[allow(unused_variables)] +impl<O: Offset> Emitter<O> for BasicEmitter<O> { +    fn report_error(&mut self, error: Error, span: Range<O>) { +        todo!() +    } + +    fn emit_char(&mut self, c: char) { +        todo!() +    } + +    fn emit_eof(&mut self) { +        todo!() +    } + +    fn init_start_tag(&mut self, tag_offset: O, name_offset: O) { +        todo!() +    } + +    fn init_end_tag(&mut self, tag_offset: O, name_offset: O) { +        todo!() +    } + +    fn push_tag_name(&mut self, s: &str) { +        todo!() +    } + +    fn init_attribute_name(&mut self, offset: O) { +        todo!() +    } + +    fn push_attribute_name(&mut self, s: &str) { +        todo!() +    } + +    fn push_attribute_value(&mut self, s: &str) { +        todo!() +    } + +    fn set_self_closing(&mut self, slash_span: Range<O>) { +        todo!() +    } + +    fn emit_current_tag(&mut self, offset: O) { +        todo!() +    } + +    fn init_comment(&mut self, data_start_offset: O) { +        todo!() +    } + +    fn push_comment(&mut self, s: &str) { +        todo!() +    } + +    fn emit_current_comment(&mut self, data_end_offset: O) { +        todo!() +    } + +    fn init_doctype(&mut self, offset: O) { +        todo!() +    } + +    fn push_doctype_name(&mut self, s: &str) { +        todo!() +    } + +    fn init_doctype_public_id(&mut self, offset: O) { +        todo!() +    } + +    fn push_doctype_public_id(&mut self, s: &str) { +        todo!() +    } + +    fn init_doctype_system_id(&mut self, offset: O) { +        todo!() +    } + +    fn push_doctype_system_id(&mut self, s: &str) { +        todo!() +    } + +    fn set_force_quirks(&mut self) { +        todo!() +    } + +    fn emit_current_doctype(&mut self, offset: O) { +        todo!() +    } +} @@ -7,6 +7,7 @@  #![doc = concat!("[the LICENSE file]: ", file_url!("LICENSE"))]  #![doc = include_str!("../README.md")] +mod basic_emitter;  mod emitter;  mod entities;  mod error; @@ -25,6 +26,7 @@ pub mod offset;  pub mod reader;  pub mod token; +pub use basic_emitter::BasicEmitter;  pub use emitter::Emitter;  pub use error::Error;  pub use naive_parser::NaiveParser; diff --git a/src/naive_parser.rs b/src/naive_parser.rs index 91edbc0..4f8dc0d 100644 --- a/src/naive_parser.rs +++ b/src/naive_parser.rs @@ -1,8 +1,7 @@  use crate::offset::{Offset, Position};  use crate::reader::{IntoReader, Reader};  use crate::tokenizer::CdataAction; -use crate::tracing_emitter::TracingEmitter; -use crate::{Emitter, Event, State, Tokenizer}; +use crate::{BasicEmitter, Emitter, Event, State, Tokenizer};  /// A naive HTML parser (**not** spec-compliant since it doesn't do tree construction).  /// @@ -13,7 +12,7 @@ use crate::{Emitter, Event, State, Tokenizer};  ///  /// * it naively emits any CDATA sections as bogus comments, for example:  /// -///   ``` +///   ```no_run TODO: run again once BasicEmitter has been implemented  ///   # use html5tokenizer::{NaiveParser, Token};  ///   let html = "<svg><![CDATA[I love SVG]]>";  ///   let mut tokens = NaiveParser::new(html).flatten(); @@ -30,18 +29,18 @@ pub struct NaiveParser<R: Reader, O: Offset, E: Emitter<O>> {      tokenizer: Tokenizer<R, O, E>,  } -impl<R, O> NaiveParser<R, O, TracingEmitter<O>> +impl<R, O> NaiveParser<R, O, BasicEmitter<O>>  where      R: Reader + Position<O>,      O: Offset,  { -    /// Constructs a new naive parser. +    /// Constructs a new naive parser using the [`BasicEmitter`].      // TODO: add example for NaiveParser::new -    pub fn new<'a, IR>(reader: IR) -> NaiveParser<R, O, TracingEmitter<O>> +    pub fn new<'a, IR>(reader: IR) -> NaiveParser<R, O, BasicEmitter<O>>      where          IR: IntoReader<'a, Reader = R>,      { -        NaiveParser::new_with_emitter(reader, TracingEmitter::default()) +        NaiveParser::new_with_emitter(reader, BasicEmitter::default())      }  } diff --git a/tests/test_spans.rs b/tests/test_spans.rs index eb93d43..fdb9a78 100644 --- a/tests/test_spans.rs +++ b/tests/test_spans.rs @@ -10,7 +10,7 @@ use codespan_reporting::{  use html5tokenizer::{      offset::PosTrackingReader,      reader::{IntoReader, Reader}, -    NaiveParser, Token, +    NaiveParser, Token, TracingEmitter,  };  use insta::assert_snapshot;  use similar_asserts::assert_eq; @@ -27,9 +27,12 @@ fn parser<R>(reader: impl IntoReader<'static, Reader = R>) -> Parser  where      R: Reader<Error = Infallible> + 'static,  { -    NaiveParser::new(PosTrackingReader::new( -        Box::new(reader.into_reader()) as Box<dyn Reader<Error = Infallible>> -    )) +    NaiveParser::new_with_emitter( +        PosTrackingReader::new( +            Box::new(reader.into_reader()) as Box<dyn Reader<Error = Infallible>> +        ), +        TracingEmitter::default(), +    )  }  fn test_and_annotate<S: AsRef<str> + Clone>(  | 
