summary refs log tree commit diff
diff options
context:
space:
mode:
author Martin Fischer <martin@push-f.com> 2023-09-03 19:14:14 +0200
committer Martin Fischer <martin@push-f.com> 2023-09-03 23:00:05 +0200
commit 7cd148aa33a0de7ae5629a51db9157f66db9ac67 (patch)
tree 274b2e114a9cd4f2d21a82ea3a3f83665bac8086
parent 509ac6a8e3151c065a7ee609fcdabf8847fc0498 (diff)
fix: BufReadReader skips line on invalid UTF-8
-rw-r--r-- CHANGELOG.md 9
-rw-r--r-- src/lib.rs 3
-rw-r--r-- src/reader.rs 135
3 files changed, 120 insertions, 27 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1fa3b76..d266c7f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,12 +2,21 @@
### [unreleased]
+#### Features
+
+* `BufReadReader` can now operate on any `std::io::Read` implementation
+ and no longer requires the reader to implement `std::io::BufRead`.
+
#### Breaking changes
* Added missing `R: Position<O>` bounds for `Tokenizer`/`NaiveParser` constructors.
(If you are able to construct a Tokenizer/NaiveParser,
you should be able to iterate over it.)
+#### Fixes
+
+* Fixed `BufReadReader` skipping the whole line if it contained invalid UTF-8.
+
### 0.5.0 - 2023-08-19
#### Features
diff --git a/src/lib.rs b/src/lib.rs
index 151bb98..9dd878c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,6 +1,7 @@
#![warn(missing_docs)]
// This is an HTML parser. HTML can be untrusted input from the internet.
-#![forbid(unsafe_code)]
+#![forbid(clippy::undocumented_unsafe_blocks)]
+#![forbid(clippy::multiple_unsafe_ops_per_block)]
#![doc = concat!("[changelog]: ", file_url!("CHANGELOG.md"))]
#![doc = concat!("[the LICENSE file]: ", file_url!("LICENSE"))]
#![doc = include_str!("../README.md")]
diff --git a/src/reader.rs b/src/reader.rs
index a987b28..15b9224 100644
--- a/src/reader.rs
+++ b/src/reader.rs
@@ -1,7 +1,8 @@
//! Provides the [`Reader`] trait (and implementations) used by the tokenizer.
+use std::collections::VecDeque;
use std::convert::Infallible;
-use std::io::{self, BufRead, BufReader, Read};
+use std::io::{self, BufReader, Read};
/// An object that provides characters to the tokenizer.
///
@@ -27,6 +28,7 @@ pub trait Reader {
///
/// If the next characters are equal to `s`, this function consumes the respective characters from
/// the input stream and returns `true`. If not, it does nothing and returns `false`.
+ // TODO: document a maximum s length that may be assumed (depends on longest named character reference ... which may change?)
fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result<bool, Self::Error>;
}
@@ -112,57 +114,139 @@ impl<'a> IntoReader<'a> for &'a String {
}
}
-/// A [`BufRead`]-based [`Reader`] implementation that attempts to read UTF-8.
-pub struct BufReadReader<R: BufRead> {
- line: String,
- line_pos: usize,
+/// Just the same as [`std::sys_common::io::DEFAULT_BUF_SIZE`] (which isn't public).
+const BUF_SIZE: usize = 8 * 1024;
+
+/// A [`Read`]-based buffered [`Reader`] implementation that attempts to read UTF-8.
+pub struct BufReadReader<R: Read> {
reader: R,
+ /// The buffer into which bytes will be read from the reader.
+ buffer: [u8; BUF_SIZE],
+ /// Number of bytes in the buffer that have been read.
+ read: usize,
+ /// Position in the buffer up to which the bytes have been parsed to chars.
+ pos: usize,
+ /// The characters parsed from the buffer in read order.
+ chars: VecDeque<char>,
+ /// An error that has occurred after reading the current content of chars.
+ error: Option<io::Error>,
+ /// Indicates if the end-of-file has been reached (we won't read anymore).
+ eof: bool,
}
-impl<R: BufRead> BufReadReader<R> {
- /// Construct a new `BufReadReader` from any type that implements `BufRead`.
+impl<R: Read> BufReadReader<R> {
+ /// Construct a new `BufReadReader` from any type that implements [`Read`].
pub fn new(reader: R) -> Self {
BufReadReader {
- line: String::new(),
- line_pos: 0,
reader,
+ buffer: [0; BUF_SIZE],
+ read: 0,
+ pos: 0,
+ chars: VecDeque::new(),
+ error: None,
+ eof: false,
}
}
#[inline]
- fn get_remaining_line(&mut self) -> Result<&str, io::Error> {
- if self.line_pos < self.line.len() {
- return Ok(&self.line[self.line_pos..]);
+ fn read(&mut self) -> Result<(), io::Error> {
+ debug_assert!(!self.eof);
+ debug_assert!(self.error.is_none());
+
+ if self.pos == self.read {
+ self.read = match self.reader.read(&mut self.buffer)? {
+ 0 => {
+ self.eof = true;
+ return Ok(());
+ }
+ n => n,
+ };
+ self.pos = 0;
+ }
+
+ let unprocessed = &self.buffer[self.pos..self.read];
+
+ let (valid_str, err) = match std::str::from_utf8(unprocessed) {
+ Ok(s) => (s, None),
+ Err(err) => (
+ // SAFETY: The UTF-8 checking has already been done by the previous from_utf8 call.
+ unsafe { std::str::from_utf8_unchecked(&unprocessed[..err.valid_up_to()]) },
+ Some(err),
+ ),
+ };
+ for c in valid_str.chars() {
+ self.chars.push_back(c);
}
+ self.pos += valid_str.len();
- self.line.clear();
- self.line_pos = 0;
- self.reader.read_line(&mut self.line)?;
- Ok(&self.line)
+ if let Some(err) = err {
+ self.error = Some(io::Error::new(io::ErrorKind::InvalidData, err));
+
+ match err.error_len() {
+ None => self.eof = true,
+ Some(error_len) => self.pos += error_len,
+ }
+ }
+ Ok(())
}
}
-impl<R: BufRead> Reader for BufReadReader<R> {
+impl<R: Read> Reader for BufReadReader<R> {
type Error = io::Error;
fn read_char(&mut self) -> Result<Option<char>, Self::Error> {
- let rv = self.get_remaining_line()?.chars().next();
- self.line_pos += rv.map(char::len_utf8).unwrap_or(1);
- Ok(rv)
+ if let Some(char) = self.chars.pop_front() {
+ return Ok(Some(char));
+ }
+ if let Some(error) = self.error.take() {
+ return Err(error);
+ }
+ if self.eof {
+ return Ok(None);
+ }
+
+ self.read()?;
+
+ if let Some(char) = self.chars.pop_front() {
+ return Ok(Some(char));
+ }
+ if let Some(error) = self.error.take() {
+ return Err(error);
+ }
+ debug_assert!(self.eof);
+ Ok(None)
}
fn try_read_string(&mut self, s1: &str, case_sensitive: bool) -> Result<bool, Self::Error> {
debug_assert!(!s1.contains('\r'));
debug_assert!(!s1.contains('\n'));
+ debug_assert!(s1.len() <= self.buffer.len());
- if let Some(s2) = self.get_remaining_line()?.get(..s1.len()) {
- if s1 == s2 || (!case_sensitive && s1.eq_ignore_ascii_case(s2)) {
- self.line_pos += s1.len();
- return Ok(true);
+ while self.chars.len() < s1.len() {
+ if self.error.is_some() {
+ return Ok(false);
}
+ if self.eof {
+ return Ok(false);
+ }
+ self.read()?;
}
- Ok(false)
+ for (c, expected) in std::iter::zip(self.chars.iter(), s1.chars()) {
+ if case_sensitive {
+ if *c != expected {
+ return Ok(false);
+ }
+ } else {
+ if !c.eq_ignore_ascii_case(&expected) {
+ return Ok(false);
+ }
+ }
+ }
+
+ self.chars.drain(..s1.len());
+
+ Ok(true)
}
}
@@ -182,7 +266,6 @@ mod tests {
use super::{IntoReader, Reader};
#[test]
- #[should_panic] // FIXME
fn buf_read_reader_invalid_utf8() {
let mut reader = BufReader::new(b" \xc3\x28" as &[u8]).into_reader();
assert_eq!(reader.read_char().unwrap(), Some(' '));