//! Provides the [`Reader`] trait (and implementations) used by the tokenizer.

use std::convert::Infallible;
use std::io::{self, BufRead, BufReader, Read};

/// An object that provides characters to the tokenizer.
///
/// Patches are welcome for providing an efficient implementation over async streams,
/// iterators, files, etc, as long as any dependencies come behind feature flags.
pub trait Reader {
    /// The error returned by this reader.
    type Error: std::error::Error;

    /// Return a new character from the input stream, or `Ok(None)` once the input is exhausted.
    ///
    /// The input stream does **not** have to be preprocessed in any way, it can contain standalone
    /// surrogates and have inconsistent newlines.
    fn read_char(&mut self) -> Result<Option<char>, Self::Error>;

    /// Attempt to read an entire string at once, either case-insensitively or not.
    ///
    /// `case_sensitive=false` means that characters of the input stream should be compared while
    /// ignoring ASCII-casing.
    ///
    /// It can be assumed that this function is never called with a string that contains `\r` or
    /// `\n`.
    ///
    /// If the next characters are equal to `s`, this function consumes the respective characters
    /// from the input stream and returns `true`. If not, it does nothing and returns `false`.
    fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result<bool, Self::Error>;
}

/// An object that can be converted into a [`Reader`].
///
/// For example, any utf8-string can be converted into a `StringReader`.
// TODO: , such that [give concrete examples of NaiveParser::new] work.
pub trait IntoReader<'a> {
    /// The reader type into which this type should be converted.
    type Reader: Reader + 'a;

    /// Convert self into some sort of reader.
    fn into_reader(self) -> Self::Reader;
}

/// Every [`Reader`] is trivially convertible into itself.
impl<'a, R: 'a + Reader> IntoReader<'a> for R {
    type Reader = Self;

    fn into_reader(self) -> Self::Reader {
        self
    }
}

/// A helper struct to seek forwards and backwards in strings. Used by the tokenizer to read HTML
/// from strings.
pub struct StringReader<'a> {
    /// The complete input.
    input: &'a str,
    /// Iterator over the part of `input` that has not been consumed yet.
    cursor: std::str::Chars<'a>,
    /// Byte offset of `cursor` into `input`; always on a character boundary.
    pos: usize,
}

impl<'a> StringReader<'a> {
    /// Create a reader positioned at the start of `input`.
    fn new(input: &'a str) -> Self {
        StringReader {
            input,
            cursor: input.chars(),
            pos: 0,
        }
    }
}

impl<'a> Reader for StringReader<'a> {
    type Error = Infallible;

    fn read_char(&mut self) -> Result<Option<char>, Self::Error> {
        let next = self.cursor.next();
        if let Some(c) = next {
            // Keep the byte offset in sync with the iterator.
            self.pos += c.len_utf8();
        }
        Ok(next)
    }

    fn try_read_string(&mut self, s1: &str, case_sensitive: bool) -> Result<bool, Self::Error> {
        // `get` returns `None` when the range runs past the end of the input or would split a
        // character, in which case the input cannot match `s1`.
        if let Some(s2) = self.input.get(self.pos..self.pos + s1.len()) {
            if s1 == s2 || (!case_sensitive && s1.eq_ignore_ascii_case(s2)) {
                self.pos += s1.len();
                // Re-seat the iterator right behind the matched text.
                self.cursor = self.input[self.pos..].chars();
                return Ok(true);
            }
        }

        Ok(false)
    }
}

impl<'a> IntoReader<'a> for &'a str {
    type Reader = StringReader<'a>;

    fn into_reader(self) -> Self::Reader {
        StringReader::new(self)
    }
}

impl<'a> IntoReader<'a> for &'a String {
    type Reader = StringReader<'a>;

    fn into_reader(self) -> Self::Reader {
        StringReader::new(self.as_str())
    }
}

/// Maximum number of bytes a single UTF-8 encoded character can occupy.
const MAX_UTF8_CHAR_LEN: usize = 4;

/// Decode the first character of `bytes`.
///
/// Returns `None` if `bytes` is empty. Otherwise returns how many bytes were consumed together
/// with either the decoded character or, if `bytes` starts with a malformed UTF-8 sequence, the
/// decoding error. The consumed count is always at least one, so callers always make progress.
fn decode_first_char(bytes: &[u8]) -> Option<(usize, Result<char, std::str::Utf8Error>)> {
    if bytes.is_empty() {
        return None;
    }

    // One character never spans more than four bytes, so there is no need to validate the rest.
    let window = &bytes[..bytes.len().min(MAX_UTF8_CHAR_LEN)];
    let valid = match std::str::from_utf8(window) {
        Ok(text) => text,
        Err(error) => match error.valid_up_to() {
            0 => {
                // `error_len()` is `None` when the input ends in the middle of a sequence; the
                // truncated tail is then skipped as a whole.
                let skipped = error.error_len().unwrap_or(window.len());
                return Some((skipped, Err(error)));
            }
            good => std::str::from_utf8(&window[..good])
                .expect("the first `valid_up_to()` bytes are valid UTF-8"),
        },
    };

    let c = valid.chars().next()?;
    Some((c.len_utf8(), Ok(c)))
}

/// A [`BufRead`]-based [`Reader`] implementation that attempts to read UTF-8.
///
/// Input is buffered one line at a time as raw bytes and decoded character by character. A
/// malformed byte sequence is reported as a single [`io::ErrorKind::InvalidData`] error wrapping
/// the [`std::str::Utf8Error`]; the offending bytes are skipped, so reading can continue with the
/// data that follows them.
pub struct BufReadReader<R: BufRead> {
    /// Raw bytes of the line currently being consumed.
    line: Vec<u8>,
    /// Offset of the next unread byte in `line`.
    line_pos: usize,
    reader: R,
}

impl<R: BufRead> BufReadReader<R> {
    /// Construct a new `BufReadReader` from any type that implements `BufRead`.
    pub fn new(reader: R) -> Self {
        BufReadReader {
            line: Vec::new(),
            line_pos: 0,
            reader,
        }
    }

    /// Return the unread part of the current line, fetching the next line from the underlying
    /// reader once the current one is used up. An empty slice means end of input.
    #[inline]
    fn get_remaining_line(&mut self) -> Result<&[u8], io::Error> {
        if self.line_pos >= self.line.len() {
            self.line.clear();
            self.line_pos = 0;
            self.reader.read_until(b'\n', &mut self.line)?;
        }

        Ok(&self.line[self.line_pos..])
    }
}

impl<R: BufRead> Reader for BufReadReader<R> {
    type Error = io::Error;

    fn read_char(&mut self) -> Result<Option<char>, Self::Error> {
        let (consumed, decoded) = match decode_first_char(self.get_remaining_line()?) {
            Some(step) => step,
            None => return Ok(None),
        };

        // Advance past invalid bytes as well, so that one error does not repeat forever.
        self.line_pos += consumed;
        decoded
            .map(Some)
            .map_err(|error| io::Error::new(io::ErrorKind::InvalidData, error))
    }

    fn try_read_string(&mut self, s1: &str, case_sensitive: bool) -> Result<bool, Self::Error> {
        debug_assert!(!s1.contains('\r'));
        debug_assert!(!s1.contains('\n'));

        // `s1` contains no newline, so a match never spans more than the current line. Comparing
        // bytes is sufficient: only bytes that are valid UTF-8 can be equal to those of `s1`.
        let needle = s1.as_bytes();
        if let Some(candidate) = self.get_remaining_line()?.get(..needle.len()) {
            if needle == candidate || (!case_sensitive && needle.eq_ignore_ascii_case(candidate)) {
                self.line_pos += needle.len();
                return Ok(true);
            }
        }

        Ok(false)
    }
}

impl<'a, R: Read + 'a> IntoReader<'a> for BufReader<R> {
    type Reader = BufReadReader<BufReader<R>>;

    fn into_reader(self) -> Self::Reader {
        BufReadReader::new(self)
    }
}

#[cfg(test)]
mod tests {
    use std::io::{BufReader, ErrorKind};
    use std::str::Utf8Error;

    use super::{IntoReader, Reader};

    #[test]
    fn buf_read_reader_invalid_utf8() {
        let mut reader = BufReader::new(b" \xc3\x28" as &[u8]).into_reader();
        assert_eq!(reader.read_char().unwrap(), Some(' '));
        let error = reader.read_char().unwrap_err();
        assert!(matches!(error.kind(), ErrorKind::InvalidData));
        error.into_inner().unwrap().downcast::<Utf8Error>().unwrap();
        assert_eq!(reader.read_char().unwrap(), Some('('));
        assert_eq!(reader.read_char().unwrap(), None);
    }
}