diff options
author | Markus Unterwaditzer <markus@unterwaditzer.net> | 2021-11-26 18:38:58 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-11-26 18:38:58 +0100 |
commit | dced8066f77f570dd3e396ec3570c71aa86c454e (patch) | |
tree | 2fb81d5f411319948c135dfaaa78bc5da6783621 /src/reader.rs | |
parent | 029c13426634e58adb3996c9a5e4d79c3e8437f2 (diff) |
Read html from io::BufRead (#8)
Diffstat (limited to 'src/reader.rs')
-rw-r--r-- | src/reader.rs | 151 |
1 files changed, 143 insertions, 8 deletions
diff --git a/src/reader.rs b/src/reader.rs index 7246129..b7a63b2 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -1,12 +1,18 @@ +use crate::Never; +use std::io::{self, BufRead, BufReader, Read}; + /// An object that provides characters to the tokenizer. /// /// See [`crate::Tokenizer::new`] for more information. pub trait Reader { + /// The error returned by this reader. + type Error: std::error::Error; + /// Return a new character from the input stream. /// /// The input stream does **not** have to be preprocessed in any way, it can contain standalone /// surrogates and have inconsistent newlines. - fn read_char(&mut self) -> Option<char>; + fn read_char(&mut self) -> Result<Option<char>, Self::Error>; /// Attempt to read an entire string at once, either case-insensitively or not. /// @@ -18,7 +24,7 @@ pub trait Reader { /// /// If the next characters equal to `s`, this function consumes the respective characters from /// the input stream and returns `true`. If not, it does nothing and returns `false`. - fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> bool; + fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result<bool, Self::Error>; } /// An object that can be converted into a [`crate::Reader`]. @@ -43,6 +49,33 @@ impl<'a, R: 'a + Reader> Readable<'a> for R { /// A helper struct to seek forwards and backwards in strings. Used by the tokenizer to read HTML /// from strings. +/// +/// Example: +/// +/// ```rust +/// use std::fmt::Write; +/// use html5gum::{Tokenizer, Token}; +/// +/// let html = "<title >hello world</title>"; +/// let mut new_html = String::new(); +/// +/// for token in Tokenizer::new(html).infallible() { +/// match token { +/// Token::StartTag(tag) => { +/// write!(new_html, "<{}>", tag.name).unwrap(); +/// } +/// Token::String(hello_world) => { +/// write!(new_html, "{}", hello_world).unwrap(); +/// } +/// Token::EndTag(tag) => { +/// write!(new_html, "</{}>", tag.name).unwrap(); +/// } +/// _ => panic!("unexpected input"), +/// } +/// } +/// +/// assert_eq!(new_html, "<title>hello world</title>"); +/// ``` pub struct StringReader<'a> { input: &'a str, cursor: std::str::Chars<'a>, @@ -61,24 +94,29 @@ impl<'a> StringReader<'a> { } impl<'a> Reader for StringReader<'a> { - fn read_char(&mut self) -> Option<char> { - let c = self.cursor.next()?; + type Error = Never; + + fn read_char(&mut self) -> Result<Option<char>, Self::Error> { + let c = match self.cursor.next() { + Some(c) => c, + None => return Ok(None), + }; self.pos += c.len_utf8(); - Some(c) + Ok(Some(c)) } - fn try_read_string(&mut self, s1: &str, case_sensitive: bool) -> bool { + fn try_read_string(&mut self, s1: &str, case_sensitive: bool) -> Result<bool, Self::Error> { // we do not need to call validate_char here because `s` hopefully does not contain invalid // characters if let Some(s2) = self.input.get(self.pos..self.pos + s1.len()) { if s1 == s2 || (!case_sensitive && s1.eq_ignore_ascii_case(s2)) { self.pos += s1.len(); self.cursor = self.input[self.pos..].chars(); - return true; + return Ok(true); } } - false + Ok(false) } } @@ -97,3 +135,100 @@ impl<'a> Readable<'a> for &'a String { StringReader::new(self.as_str()) } } + +/// A [`BufReadReader`] can be used to construct a tokenizer from any type that implements +/// `BufRead`. +/// +/// Example: +/// +/// ```rust +/// use std::io::BufReader; +/// use std::fmt::Write; +/// use html5gum::{Token, BufReadReader, Tokenizer}; +/// +/// let tokenizer = Tokenizer::new(BufReader::new("<title>hello world</title>".as_bytes())); +/// // or alternatively: +/// // tokenizer = Tokenizer::new(BufReadReader::new(BufReader::new("...".as_bytes()))); +/// +/// let mut new_html = String::new(); +/// +/// for token in tokenizer { +/// let token = token.unwrap(); +/// +/// match token { +/// Token::StartTag(tag) => { +/// write!(new_html, "<{}>", tag.name).unwrap(); +/// } +/// Token::String(hello_world) => { +/// write!(new_html, "{}", hello_world).unwrap(); +/// } +/// Token::EndTag(tag) => { +/// write!(new_html, "</{}>", tag.name).unwrap(); +/// } +/// _ => panic!("unexpected input"), +/// } +/// +/// } +/// +/// assert_eq!(new_html, "<title>hello world</title>"); +/// ``` +pub struct BufReadReader<R: BufRead> { + line: String, + line_pos: usize, + reader: R, +} + +impl<R: BufRead> BufReadReader<R> { + /// Construct a new `BufReadReader` from any type that implements `BufRead`. + pub fn new(reader: R) -> Self { + BufReadReader { + line: String::new(), + line_pos: 0, + reader, + } + } + + #[inline] + fn get_remaining_line(&mut self) -> Result<&str, io::Error> { + if self.line_pos < self.line.len() { + return Ok(&self.line[self.line_pos..]); + } + + self.line.clear(); + self.line_pos = 0; + self.reader.read_line(&mut self.line)?; + Ok(&self.line) + } +} + +impl<R: BufRead> Reader for BufReadReader<R> { + type Error = io::Error; + + fn read_char(&mut self) -> Result<Option<char>, Self::Error> { + let rv = self.get_remaining_line()?.chars().next(); + self.line_pos += rv.map(char::len_utf8).unwrap_or(1); + Ok(rv) + } + + fn try_read_string(&mut self, s1: &str, case_sensitive: bool) -> Result<bool, Self::Error> { + debug_assert!(!s1.contains('\r')); + debug_assert!(!s1.contains('\n')); + + if let Some(s2) = self.get_remaining_line()?.get(..s1.len()) { + if s1 == s2 || (!case_sensitive && s1.eq_ignore_ascii_case(s2)) { + self.line_pos += s1.len(); + return Ok(true); + } + } + + Ok(false) + } +} + +impl<'a, R: Read + 'a> Readable<'a> for BufReader<R> { + type Reader = BufReadReader<BufReader<R>>; + + fn to_reader(self) -> Self::Reader { + BufReadReader::new(self) + } +} |