diff options
-rw-r--r-- | README.md | 5 | ||||
-rw-r--r-- | examples/tokenize.rs | 11 | ||||
-rw-r--r-- | integration_tests/tests/test_html5lib.rs | 9 | ||||
-rw-r--r-- | src/emitter.rs | 2 | ||||
-rw-r--r-- | src/reader.rs | 4 | ||||
-rw-r--r-- | src/tokenizer.rs | 36 | ||||
-rw-r--r-- | tests/test_spans.rs | 2 |
7 files changed, 37 insertions, 32 deletions
@@ -8,12 +8,13 @@ ```rust use std::fmt::Write; -use html5tokenizer::{Tokenizer, Token}; +use html5tokenizer::{DefaultEmitter, Tokenizer, Token}; let html = "<title >hello world</title>"; +let emitter = DefaultEmitter::<_, ()>::default(); let mut new_html = String::new(); -for token in Tokenizer::new(html).flatten() { +for token in Tokenizer::new(html, emitter).flatten() { match token { Token::StartTag(tag) => { write!(new_html, "<{}>", tag.name).unwrap(); diff --git a/examples/tokenize.rs b/examples/tokenize.rs index ceb5751..5776362 100644 --- a/examples/tokenize.rs +++ b/examples/tokenize.rs @@ -1,9 +1,16 @@ //! Let's you easily try out the tokenizer with e.g. //! printf '<h1>Hello world!</h1>' | cargo run --example=tokenize -use html5tokenizer::{BufReadReader, Tokenizer}; + +use html5tokenizer::{DefaultEmitter, Tokenizer}; +use std::io::BufReader; fn main() { - for token in Tokenizer::new(BufReadReader::new(std::io::stdin().lock())).flatten() { + for token in Tokenizer::new( + BufReader::new(std::io::stdin().lock()), + DefaultEmitter::<_, ()>::default(), + ) + .flatten() + { println!("{:?}", token); } } diff --git a/integration_tests/tests/test_html5lib.rs b/integration_tests/tests/test_html5lib.rs index 23adec0..f5a69c3 100644 --- a/integration_tests/tests/test_html5lib.rs +++ b/integration_tests/tests/test_html5lib.rs @@ -3,7 +3,7 @@ use std::{fs::File, io::BufReader, path::Path}; use html5lib_tests::{ parse_tests, Error as TestError, InitialState, Output, Test, Token as TestToken, }; -use html5tokenizer::{InternalState, Reader, Token, Tokenizer}; +use html5tokenizer::{DefaultEmitter, InternalState, Reader, Token, Tokenizer}; use pretty_assertions::assert_eq; /// Path to a local checkout of [html5lib-tests], relative to the @@ -69,7 +69,7 @@ fn run_test(fname: &str, test_i: usize, test: Test) { test_i, &test, state, - Tokenizer::new(&test.input), + Tokenizer::new(&test.input, DefaultEmitter::default()), "string", ); @@ -78,7 +78,10 @@ fn run_test(fname: &str, test_i: usize, test: Test) { test_i, &test, state, - Tokenizer::new(BufReader::new(test.input.as_bytes())), + Tokenizer::new( + BufReader::new(test.input.as_bytes()), + DefaultEmitter::default(), + ), "bufread", ); } diff --git a/src/emitter.rs b/src/emitter.rs index be712df..110ed5d 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -534,7 +534,7 @@ pub struct Doctype { } /// The token type used by default. You can define your own token type by implementing the -/// [`crate::Emitter`] trait and using [`crate::Tokenizer::new_with_emitter`]. +/// [`crate::Emitter`] trait. #[derive(Debug, Eq, PartialEq)] pub enum Token<S> { /// A HTML start tag. diff --git a/src/reader.rs b/src/reader.rs index f756c65..19929d4 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -30,8 +30,8 @@ pub trait Reader { /// An object that can be converted into a [`crate::Reader`]. /// -/// For example, any utf8-string can be converted into a `StringReader`, such that -/// `Tokenizer::new("mystring")` and `Tokenizer::new(&String::new("foo"))` work. +/// For example, any utf8-string can be converted into a `StringReader`. +// TODO: , such that [give concrete examples of not-yet-implemented parser API] work. pub trait IntoReader<'a> { /// The reader type into which this type should be converted. type Reader: Reader + 'a; diff --git a/src/tokenizer.rs b/src/tokenizer.rs index d7db3b6..5abd6ba 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -42,10 +42,21 @@ pub struct Tokenizer<R: Reader, E: Emitter<R> = DefaultEmitter<R, ()>> { pub(crate) return_state: Option<InternalState>, } -impl<R: Reader> Tokenizer<R> { - /// Create a new tokenizer from some input. - pub fn new<'a, S: IntoReader<'a, Reader = R>>(input: S) -> Self { - Tokenizer::<S::Reader>::new_with_emitter(input, DefaultEmitter::default()) +impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { + /// Creates a new tokenizer from some input and an emitter. + /// + /// TODO: add warning about you needing to do the state switching + pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self { + Tokenizer { + reader: reader.into_reader(), + emitter, + state: InternalState::Data, + to_reconsume: Stack2::default(), + return_state: None, + temporary_buffer: String::new(), + character_reference_code: 0, + eof: false, + } } } @@ -84,23 +95,6 @@ impl From<State> for InternalState { } impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { - /// Construct a new tokenizer from some input and a custom emitter. - /// - /// Use this method over [`Tokenizer::new`] when you want to have more control over string allocation for - /// tokens. - pub fn new_with_emitter<'a, S: IntoReader<'a, Reader = R>>(input: S, emitter: E) -> Self { - Tokenizer { - eof: false, - state: InternalState::Data, - emitter, - temporary_buffer: String::new(), - to_reconsume: Stack2::default(), - reader: input.into_reader(), - character_reference_code: 0, - return_state: None, - } - } - /// Test-internal function to override internal state. /// /// Only available with the `integration-tests` feature which is not public API. diff --git a/tests/test_spans.rs b/tests/test_spans.rs index 64a64cf..8a820de 100644 --- a/tests/test_spans.rs +++ b/tests/test_spans.rs @@ -16,7 +16,7 @@ fn test() { let file_id = files.add("test.html", html); let mut labels = Vec::new(); - for token in Tokenizer::new_with_emitter( + for token in Tokenizer::new( PosTrackingReader::new(html), DefaultEmitter::<_, Range<usize>>::default(), ) |