| author | Martin Fischer <martin@push-f.com> | 2021-04-08 08:42:01 +0200 |
| --- | --- | --- |
| committer | Martin Fischer <martin@push-f.com> | 2021-04-08 15:40:37 +0200 |
| commit | 57e7eefcbe6fb8c3dc4b01c707be9de4c34963a7 (patch) | |
| tree | 6a9d296389bf3023396592c8514ed6712e011c7f /src/driver.rs | |
import https://github.com/servo/html5ever
commit d1206daa740305f55a5fa159e43eb33afc359cb4
Diffstat (limited to 'src/driver.rs')
| -rw-r--r-- | src/driver.rs | 137 |
1 file changed, 137 insertions, 0 deletions
```diff
diff --git a/src/driver.rs b/src/driver.rs
new file mode 100644
index 0000000..26db9b8
--- /dev/null
+++ b/src/driver.rs
@@ -0,0 +1,137 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! High-level interface to the parser.
+
+use crate::buffer_queue::BufferQueue;
+use crate::tokenizer::{Tokenizer, TokenizerOpts, TokenizerResult};
+use crate::tree_builder::{create_element, TreeBuilder, TreeBuilderOpts, TreeSink};
+use crate::{Attribute, QualName};
+
+use std::borrow::Cow;
+
+use crate::tendril;
+use crate::tendril::stream::{TendrilSink, Utf8LossyDecoder};
+use crate::tendril::StrTendril;
+
+/// All-encompassing options struct for the parser.
+#[derive(Clone, Default)]
+pub struct ParseOpts {
+    /// Tokenizer options.
+    pub tokenizer: TokenizerOpts,
+
+    /// Tree builder options.
+    pub tree_builder: TreeBuilderOpts,
+}
+
+/// Parse an HTML document
+///
+/// The returned value implements `tendril::TendrilSink`
+/// so that Unicode input may be provided incrementally,
+/// or all at once with the `one` method.
+///
+/// If your input is bytes, use `Parser::from_utf8`.
+pub fn parse_document<Sink>(sink: Sink, opts: ParseOpts) -> Parser<Sink>
+where
+    Sink: TreeSink,
+{
+    let tb = TreeBuilder::new(sink, opts.tree_builder);
+    let tok = Tokenizer::new(tb, opts.tokenizer);
+    Parser {
+        tokenizer: tok,
+        input_buffer: BufferQueue::new(),
+    }
+}
+
+/// Parse an HTML fragment
+///
+/// The returned value implements `tendril::TendrilSink`
+/// so that Unicode input may be provided incrementally,
+/// or all at once with the `one` method.
+///
+/// If your input is bytes, use `Parser::from_utf8`.
+pub fn parse_fragment<Sink>(
+    mut sink: Sink,
+    opts: ParseOpts,
+    context_name: QualName,
+    context_attrs: Vec<Attribute>,
+) -> Parser<Sink>
+where
+    Sink: TreeSink,
+{
+    let context_elem = create_element(&mut sink, context_name, context_attrs);
+    parse_fragment_for_element(sink, opts, context_elem, None)
+}
+
+/// Like `parse_fragment`, but with an existing context element
+/// and optionally a form element.
+pub fn parse_fragment_for_element<Sink>(
+    sink: Sink,
+    opts: ParseOpts,
+    context_element: Sink::Handle,
+    form_element: Option<Sink::Handle>,
+) -> Parser<Sink>
+where
+    Sink: TreeSink,
+{
+    let tb = TreeBuilder::new_for_fragment(sink, context_element, form_element, opts.tree_builder);
+    let tok_opts = TokenizerOpts {
+        initial_state: Some(tb.tokenizer_state_for_context_elem()),
+        ..opts.tokenizer
+    };
+    let tok = Tokenizer::new(tb, tok_opts);
+    Parser {
+        tokenizer: tok,
+        input_buffer: BufferQueue::new(),
+    }
+}
+
+/// An HTML parser,
+/// ready to receive Unicode input through the `tendril::TendrilSink` trait’s methods.
+pub struct Parser<Sink>
+where
+    Sink: TreeSink,
+{
+    pub tokenizer: Tokenizer<TreeBuilder<Sink::Handle, Sink>>,
+    pub input_buffer: BufferQueue,
+}
+
+impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for Parser<Sink> {
+    fn process(&mut self, t: StrTendril) {
+        self.input_buffer.push_back(t);
+        // FIXME: Properly support </script> somehow.
+        while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {}
+    }
+
+    // FIXME: Is it too noisy to report every character decoding error?
+    fn error(&mut self, desc: Cow<'static, str>) {
+        self.tokenizer.sink.sink.parse_error(desc)
+    }
+
+    type Output = Sink::Output;
+
+    fn finish(mut self) -> Self::Output {
+        // FIXME: Properly support </script> somehow.
+        while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {}
+        assert!(self.input_buffer.is_empty());
+        self.tokenizer.end();
+        self.tokenizer.sink.sink.finish()
+    }
+}
+
+impl<Sink: TreeSink> Parser<Sink> {
+    /// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes.
+    ///
+    /// Use this when your input is bytes that are known to be in the UTF-8 encoding.
+    /// Decoding is lossy, like `String::from_utf8_lossy`.
+    #[allow(clippy::wrong_self_convention)]
+    pub fn from_utf8(self) -> Utf8LossyDecoder<Self> {
+        Utf8LossyDecoder::new(self)
+    }
+}
```
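As a usage note for the driver API added in this commit, here is a minimal sketch of how `parse_document` and `Parser::from_utf8` are meant to be driven through `tendril::TendrilSink`. It assumes the crate is consumed under the name `html5ever` and that a `TreeSink` implementation such as `RcDom` from the separate `markup5ever_rcdom` crate is available; neither assumption is part of this commit.

```rust
// A minimal usage sketch (not part of this commit). Assumptions: the crate
// is used under the name `html5ever`, and `RcDom` from the separate
// `markup5ever_rcdom` crate serves as the `TreeSink` implementation.
use html5ever::driver::{parse_document, ParseOpts};
use html5ever::tendril::TendrilSink;
use markup5ever_rcdom::RcDom;

fn main() {
    // `parse_document` returns a `Parser`, which implements `TendrilSink`,
    // so a complete Unicode string can be fed in one call with `one`.
    let dom = parse_document(RcDom::default(), ParseOpts::default())
        .one("<!DOCTYPE html><title>Hi</title><p>Hello, world!");

    // For UTF-8 byte input, wrap the parser with `from_utf8` first;
    // decoding is lossy, like `String::from_utf8_lossy`.
    let dom_from_bytes = parse_document(RcDom::default(), ParseOpts::default())
        .from_utf8()
        .one(&b"<!DOCTYPE html><title>Hi</title><p>Hello, world!"[..]);

    // Each result is the sink's `Output`; for `RcDom` that is the built DOM.
    let _ = (dom, dom_from_bytes);
}
```

The `one` call corresponds to `process` followed by `finish` in the `TendrilSink` impl above; feeding input incrementally through `process`, or from a reader via `read_from` on the `from_utf8` wrapper, goes through the same `BufferQueue` path.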