diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/lib.rs | 3 | ||||
| -rw-r--r-- | src/reader.rs | 135 | 
2 files changed, 111 insertions, 27 deletions
| @@ -1,6 +1,7 @@  #![warn(missing_docs)]  // This is an HTML parser. HTML can be untrusted input from the internet. -#![forbid(unsafe_code)] +#![forbid(clippy::undocumented_unsafe_blocks)] +#![forbid(clippy::multiple_unsafe_ops_per_block)]  #![doc = concat!("[changelog]: ", file_url!("CHANGELOG.md"))]  #![doc = concat!("[the LICENSE file]: ", file_url!("LICENSE"))]  #![doc = include_str!("../README.md")] diff --git a/src/reader.rs b/src/reader.rs index a987b28..15b9224 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -1,7 +1,8 @@  //! Provides the [`Reader`] trait (and implementations) used by the tokenizer. +use std::collections::VecDeque;  use std::convert::Infallible; -use std::io::{self, BufRead, BufReader, Read}; +use std::io::{self, BufReader, Read};  /// An object that provides characters to the tokenizer.  /// @@ -27,6 +28,7 @@ pub trait Reader {      ///      /// If the next characters equal to `s`, this function consumes the respective characters from      /// the input stream and returns `true`. If not, it does nothing and returns `false`. +    // TODO: document a maximum s length that may be assumed (depends on longest named character reference ... which may change?)      fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result<bool, Self::Error>;  } @@ -112,57 +114,139 @@ impl<'a> IntoReader<'a> for &'a String {      }  } -/// A [`BufRead`]-based [`Reader`] implementation that attempts to read UTF-8. -pub struct BufReadReader<R: BufRead> { -    line: String, -    line_pos: usize, +/// Just the same as [`std::sys_common::io::DEFAULT_BUF_SIZE`] (which isn't public). +const BUF_SIZE: usize = 8 * 1024; + +/// A [`Read`]-based buffered [`Reader`] implementation that attempts to read UTF-8. +pub struct BufReadReader<R: Read> {      reader: R, +    /// The buffer into which bytes will be read from the reader. +    buffer: [u8; BUF_SIZE], +    /// Number of bytes in the buffer that have been read. +    read: usize, +    /// Position in the buffer up until the bytes have been parsed to chars. +    pos: usize, +    /// The characters parsed from the buffer in read order. +    chars: VecDeque<char>, +    /// An error that has occurred after reading the current content of chars. +    error: Option<io::Error>, +    /// Indicates if the end-of-file has been reached (we won't read anymore). +    eof: bool,  } -impl<R: BufRead> BufReadReader<R> { -    /// Construct a new `BufReadReader` from any type that implements `BufRead`. +impl<R: Read> BufReadReader<R> { +    /// Construct a new `BufReadReader` from any type that implements [`Read`].      pub fn new(reader: R) -> Self {          BufReadReader { -            line: String::new(), -            line_pos: 0,              reader, +            buffer: [0; BUF_SIZE], +            read: 0, +            pos: 0, +            chars: VecDeque::new(), +            error: None, +            eof: false,          }      }      #[inline] -    fn get_remaining_line(&mut self) -> Result<&str, io::Error> { -        if self.line_pos < self.line.len() { -            return Ok(&self.line[self.line_pos..]); +    fn read(&mut self) -> Result<(), io::Error> { +        debug_assert!(!self.eof); +        debug_assert!(self.error.is_none()); + +        if self.pos == self.read { +            self.read = match self.reader.read(&mut self.buffer)? { +                0 => { +                    self.eof = true; +                    return Ok(()); +                } +                n => n, +            }; +            self.pos = 0; +        } + +        let unprocessed = &self.buffer[self.pos..self.read]; + +        let (valid_str, err) = match std::str::from_utf8(unprocessed) { +            Ok(s) => (s, None), +            Err(err) => ( +                // SAFETY: The UTF-8 checking has already been done by the previous from_utf8 call. +                unsafe { std::str::from_utf8_unchecked(&unprocessed[..err.valid_up_to()]) }, +                Some(err), +            ), +        }; +        for c in valid_str.chars() { +            self.chars.push_back(c);          } +        self.pos += valid_str.len(); -        self.line.clear(); -        self.line_pos = 0; -        self.reader.read_line(&mut self.line)?; -        Ok(&self.line) +        if let Some(err) = err { +            self.error = Some(io::Error::new(io::ErrorKind::InvalidData, err)); + +            match err.error_len() { +                None => self.eof = true, +                Some(error_len) => self.pos += error_len, +            } +        } +        Ok(())      }  } -impl<R: BufRead> Reader for BufReadReader<R> { +impl<R: Read> Reader for BufReadReader<R> {      type Error = io::Error;      fn read_char(&mut self) -> Result<Option<char>, Self::Error> { -        let rv = self.get_remaining_line()?.chars().next(); -        self.line_pos += rv.map(char::len_utf8).unwrap_or(1); -        Ok(rv) +        if let Some(char) = self.chars.pop_front() { +            return Ok(Some(char)); +        } +        if let Some(error) = self.error.take() { +            return Err(error); +        } +        if self.eof { +            return Ok(None); +        } + +        self.read()?; + +        if let Some(char) = self.chars.pop_front() { +            return Ok(Some(char)); +        } +        if let Some(error) = self.error.take() { +            return Err(error); +        } +        debug_assert!(self.eof); +        Ok(None)      }      fn try_read_string(&mut self, s1: &str, case_sensitive: bool) -> Result<bool, Self::Error> {          debug_assert!(!s1.contains('\r'));          debug_assert!(!s1.contains('\n')); +        debug_assert!(s1.len() <= self.buffer.len()); -        if let Some(s2) = self.get_remaining_line()?.get(..s1.len()) { -            if s1 == s2 || (!case_sensitive && s1.eq_ignore_ascii_case(s2)) { -                self.line_pos += s1.len(); -                return Ok(true); +        while self.chars.len() < s1.len() { +            if self.error.is_some() { +                return Ok(false);              } +            if self.eof { +                return Ok(false); +            } +            self.read()?;          } -        Ok(false) +        for (c, expected) in std::iter::zip(self.chars.iter(), s1.chars()) { +            if case_sensitive { +                if *c != expected { +                    return Ok(false); +                } +            } else { +                if !c.eq_ignore_ascii_case(&expected) { +                    return Ok(false); +                } +            } +        } + +        self.chars.drain(..s1.len()); + +        Ok(true)      }  } @@ -182,7 +266,6 @@ mod tests {      use super::{IntoReader, Reader};      #[test] -    #[should_panic] // FIXME      fn buf_read_reader_invalid_utf8() {          let mut reader = BufReader::new(b" \xc3\x28" as &[u8]).into_reader();          assert_eq!(reader.read_char().unwrap(), Some(' ')); | 
