diff options
author | Martin Fischer <martin@push-f.com> | 2023-09-03 19:14:14 +0200 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2023-09-03 23:00:05 +0200 |
commit | 7cd148aa33a0de7ae5629a51db9157f66db9ac67 (patch) | |
tree | 274b2e114a9cd4f2d21a82ea3a3f83665bac8086 /src | |
parent | 509ac6a8e3151c065a7ee609fcdabf8847fc0498 (diff) |
fix: BufReadReader skips line on invalid UTF-8
Diffstat (limited to 'src')
-rw-r--r-- | src/lib.rs | 3 | ||||
-rw-r--r-- | src/reader.rs | 135 |
2 files changed, 111 insertions, 27 deletions
@@ -1,6 +1,7 @@ #![warn(missing_docs)] // This is an HTML parser. HTML can be untrusted input from the internet. -#![forbid(unsafe_code)] +#![forbid(clippy::undocumented_unsafe_blocks)] +#![forbid(clippy::multiple_unsafe_ops_per_block)] #![doc = concat!("[changelog]: ", file_url!("CHANGELOG.md"))] #![doc = concat!("[the LICENSE file]: ", file_url!("LICENSE"))] #![doc = include_str!("../README.md")] diff --git a/src/reader.rs b/src/reader.rs index a987b28..15b9224 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -1,7 +1,8 @@ //! Provides the [`Reader`] trait (and implementations) used by the tokenizer. +use std::collections::VecDeque; use std::convert::Infallible; -use std::io::{self, BufRead, BufReader, Read}; +use std::io::{self, BufReader, Read}; /// An object that provides characters to the tokenizer. /// @@ -27,6 +28,7 @@ pub trait Reader { /// /// If the next characters equal to `s`, this function consumes the respective characters from /// the input stream and returns `true`. If not, it does nothing and returns `false`. + // TODO: document a maximum s length that may be assumed (depends on longest named character reference ... which may change?) fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result<bool, Self::Error>; } @@ -112,57 +114,139 @@ impl<'a> IntoReader<'a> for &'a String { } } -/// A [`BufRead`]-based [`Reader`] implementation that attempts to read UTF-8. -pub struct BufReadReader<R: BufRead> { - line: String, - line_pos: usize, +/// Just the same as [`std::sys_common::io::DEFAULT_BUF_SIZE`] (which isn't public). +const BUF_SIZE: usize = 8 * 1024; + +/// A [`Read`]-based buffered [`Reader`] implementation that attempts to read UTF-8. +pub struct BufReadReader<R: Read> { reader: R, + /// The buffer into which bytes will be read from the reader. + buffer: [u8; BUF_SIZE], + /// Number of bytes in the buffer that have been read. + read: usize, + /// Position in the buffer up until the bytes have been parsed to chars. + pos: usize, + /// The characters parsed from the buffer in read order. + chars: VecDeque<char>, + /// An error that has occurred after reading the current content of chars. + error: Option<io::Error>, + /// Indicates if the end-of-file has been reached (we won't read anymore). + eof: bool, } -impl<R: BufRead> BufReadReader<R> { - /// Construct a new `BufReadReader` from any type that implements `BufRead`. +impl<R: Read> BufReadReader<R> { + /// Construct a new `BufReadReader` from any type that implements [`Read`]. pub fn new(reader: R) -> Self { BufReadReader { - line: String::new(), - line_pos: 0, reader, + buffer: [0; BUF_SIZE], + read: 0, + pos: 0, + chars: VecDeque::new(), + error: None, + eof: false, } } #[inline] - fn get_remaining_line(&mut self) -> Result<&str, io::Error> { - if self.line_pos < self.line.len() { - return Ok(&self.line[self.line_pos..]); + fn read(&mut self) -> Result<(), io::Error> { + debug_assert!(!self.eof); + debug_assert!(self.error.is_none()); + + if self.pos == self.read { + self.read = match self.reader.read(&mut self.buffer)? { + 0 => { + self.eof = true; + return Ok(()); + } + n => n, + }; + self.pos = 0; + } + + let unprocessed = &self.buffer[self.pos..self.read]; + + let (valid_str, err) = match std::str::from_utf8(unprocessed) { + Ok(s) => (s, None), + Err(err) => ( + // SAFETY: The UTF-8 checking has already been done by the previous from_utf8 call. + unsafe { std::str::from_utf8_unchecked(&unprocessed[..err.valid_up_to()]) }, + Some(err), + ), + }; + for c in valid_str.chars() { + self.chars.push_back(c); } + self.pos += valid_str.len(); - self.line.clear(); - self.line_pos = 0; - self.reader.read_line(&mut self.line)?; - Ok(&self.line) + if let Some(err) = err { + self.error = Some(io::Error::new(io::ErrorKind::InvalidData, err)); + + match err.error_len() { + None => self.eof = true, + Some(error_len) => self.pos += error_len, + } + } + Ok(()) } } -impl<R: BufRead> Reader for BufReadReader<R> { +impl<R: Read> Reader for BufReadReader<R> { type Error = io::Error; fn read_char(&mut self) -> Result<Option<char>, Self::Error> { - let rv = self.get_remaining_line()?.chars().next(); - self.line_pos += rv.map(char::len_utf8).unwrap_or(1); - Ok(rv) + if let Some(char) = self.chars.pop_front() { + return Ok(Some(char)); + } + if let Some(error) = self.error.take() { + return Err(error); + } + if self.eof { + return Ok(None); + } + + self.read()?; + + if let Some(char) = self.chars.pop_front() { + return Ok(Some(char)); + } + if let Some(error) = self.error.take() { + return Err(error); + } + debug_assert!(self.eof); + Ok(None) } fn try_read_string(&mut self, s1: &str, case_sensitive: bool) -> Result<bool, Self::Error> { debug_assert!(!s1.contains('\r')); debug_assert!(!s1.contains('\n')); + debug_assert!(s1.len() <= self.buffer.len()); - if let Some(s2) = self.get_remaining_line()?.get(..s1.len()) { - if s1 == s2 || (!case_sensitive && s1.eq_ignore_ascii_case(s2)) { - self.line_pos += s1.len(); - return Ok(true); + while self.chars.len() < s1.len() { + if self.error.is_some() { + return Ok(false); } + if self.eof { + return Ok(false); + } + self.read()?; } - Ok(false) + for (c, expected) in std::iter::zip(self.chars.iter(), s1.chars()) { + if case_sensitive { + if *c != expected { + return Ok(false); + } + } else { + if !c.eq_ignore_ascii_case(&expected) { + return Ok(false); + } + } + } + + self.chars.drain(..s1.len()); + + Ok(true) } } @@ -182,7 +266,6 @@ mod tests { use super::{IntoReader, Reader}; #[test] - #[should_panic] // FIXME fn buf_read_reader_invalid_utf8() { let mut reader = BufReader::new(b" \xc3\x28" as &[u8]).into_reader(); assert_eq!(reader.read_char().unwrap(), Some(' ')); |