diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/lib.rs | 2 | ||||
| -rw-r--r-- | src/naive_parser.rs | 70 | ||||
| -rw-r--r-- | src/reader.rs | 2 | ||||
| -rw-r--r-- | src/tokenizer.rs | 20 | 
4 files changed, 91 insertions, 3 deletions
diff --git a/src/lib.rs b/src/lib.rs
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,6 +9,7 @@ mod emitter;
 mod entities;
 mod error;
 mod machine;
+mod naive_parser;
 pub mod offset;
 pub mod reader;
 mod tokenizer;
@@ -16,6 +17,7 @@ mod utils;
 
 pub use emitter::{Comment, Doctype, Emitter, EndTag, StartTag, Token};
 pub use error::Error;
+pub use naive_parser::NaiveParser;
 pub use tokenizer::{State, Tokenizer};
 
 #[cfg(feature = "integration-tests")]
diff --git a/src/naive_parser.rs b/src/naive_parser.rs
new file mode 100644
index 0000000..e229592
--- /dev/null
+++ b/src/naive_parser.rs
@@ -0,0 +1,70 @@
+use crate::emitter::DefaultEmitter;
+use crate::offset::{Offset, Position};
+use crate::reader::{IntoReader, Reader};
+use crate::{Emitter, State, Tokenizer};
+
+/// A naive HTML parser (**not** spec-compliant since it doesn't do tree construction).
+///
+/// * it **does not** correct [misnested tags]
+/// * it **does not** recognize implicitly self-closing elements like
+///  `<img>`, it will simply emit a start token
+/// * it naively emits any CDATA sections as bogus comments
+///
+/// It has similar caveats to the [HTMLParser] from the Python standard library.
+/// It should suffice for web scraping but you wouldn't use it to implement a browser.
+///
+/// [misnested tags]: https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
+/// [HTMLParser]: https://docs.python.org/3/library/html.parser.html
+pub struct NaiveParser<R: Reader, O: Offset, E: Emitter<O>> {
+    tokenizer: Tokenizer<R, O, E>,
+}
+
+impl<R: Reader, O: Offset> NaiveParser<R, O, DefaultEmitter<O>> {
+    /// Constructs a new naive parser.
+    // TODO: add example for NaiveParser::new
+    pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self {
+        let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default());
+        tokenizer.naively_switch_state = true;
+        NaiveParser { tokenizer }
+    }
+}
+
+impl<R: Reader + Position<usize>> NaiveParser<R, usize, DefaultEmitter<usize>> {
+    /// Constructs a new naive parser with source code offsets and spans.
+    // TODO: add example for NaiveParser::new_with_spans
+    pub fn new_with_spans<'a>(reader: impl IntoReader<'a, Reader = R>) -> Self {
+        let mut tokenizer = Tokenizer::new(reader, DefaultEmitter::default());
+        tokenizer.naively_switch_state = true;
+        NaiveParser { tokenizer }
+    }
+}
+
+impl<R: Reader, O: Offset, E: Emitter<O>> NaiveParser<R, O, E> {
+    /// Constructs a new naive parser with a custom emitter.
+    // TODO: add example for NaiveParser::new_with_emitter
+    pub fn new_with_emitter<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self {
+        let mut tokenizer = Tokenizer::new(reader, emitter);
+        tokenizer.naively_switch_state = true;
+        NaiveParser { tokenizer }
+    }
+}
+
+impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Iterator for NaiveParser<R, O, E> {
+    type Item = Result<E::Token, R::Error>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.tokenizer.next()
+    }
+}
+
+pub(crate) fn naive_next_state(tag_name: &str) -> State {
+    // These transitions are defined in https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments.
+    // TODO: investigate what state logic Python's HTMLParser is using
+    match tag_name {
+        "title" | "textarea" => State::RcData,
+        "style" | "xmp" | "iframe" | "noembed" | "noframes" => State::RawText,
+        "script" => State::ScriptData,
+        "plaintext" => State::PlainText,
+        _other => State::Data,
+    }
+}
diff --git a/src/reader.rs b/src/reader.rs
index e0161e5..b6e0905 100644
--- a/src/reader.rs
+++ b/src/reader.rs
@@ -33,7 +33,7 @@ pub trait Reader {
 /// An object that can be converted into a [`Reader`].
 ///
 /// For example, any utf8-string can be converted into a `StringReader`.
-// TODO: , such that [give concrete examples of not-yet-implemented parser API] work.
+// TODO: , such that [give concrete examples of NaiveParser::new] work.
 pub trait IntoReader<'a> {
     /// The reader type into which this type should be converted.
     type Reader: Reader + 'a;
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 3a75e60..7cc4712 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1,4 +1,5 @@
 use crate::machine;
+use crate::naive_parser::naive_next_state;
 use crate::offset::{Offset, Position};
 use crate::reader::{IntoReader, Reader};
 use crate::utils::{
@@ -32,7 +33,12 @@ impl<T: Copy> Stack2<T> {
     }
 }
 
-/// An HTML tokenizer. See crate-level docs for basic usage.
+/// An HTML tokenizer.
+///
+/// Note that for proper HTML parsing, you'll have to implement [tree construction]
+/// based on this Tokenizer yourself (since this crate currently does not implement it).
+///
+/// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
 pub struct Tokenizer<R: Reader, O, E: Emitter<O>> {
     eof: bool,
     pub(crate) state: InternalState,
@@ -46,12 +52,18 @@ pub struct Tokenizer<R: Reader, O, E: Emitter<O>> {
     last_start_tag_name: String,
     is_start_tag: bool,
     pub(crate) doctype_offset: O,
+    /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`]
+    /// (it cannot call [`Tokenizer::set_state`] using the emitted start tags since they can be of an arbitrary type).
+    pub(crate) naively_switch_state: bool,
 }
 
 impl<R: Reader, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
     /// Creates a new tokenizer from some input and an emitter.
     ///
-    /// TODO: add warning about you needing to do the state switching
+    /// Note that properly parsing HTML with this tokenizer requires you to
+    /// implement [tree construction] and call [`Tokenizer::set_state`] accordingly.
+    ///
+    /// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
     pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self {
         Tokenizer {
             reader: reader.into_reader(),
@@ -66,6 +78,7 @@ impl<R: Reader, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
             last_start_tag_name: String::new(),
             is_start_tag: false,
             doctype_offset: O::default(),
+            naively_switch_state: false,
         }
     }
 }
@@ -175,6 +188,9 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
     pub(crate) fn emit_current_tag(&mut self) {
         self.emitter.emit_current_tag(self.reader.position() - 1);
         if self.is_start_tag {
+            if self.naively_switch_state {
+                self.state = naive_next_state(&self.current_tag_name).into();
+            }
             std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name);
         }
     }
