fix!: make PosTrackingReader encoding-independent

While much of the span logic currently assumes UTF-8, we also want to support other character encodings, such as UTF-16, where characters can take up more or fewer bytes than in UTF-8.
author: Martin Fischer <martin@push-f.com> 2023-09-01 14:59:08 +0200
committer: Martin Fischer <martin@push-f.com> 2023-09-03 23:00:05 +0200
commit: 153acbee5650b9b006c7de215fb1421a64516fb4 (patch)
tree: ed363c6e00035cb227b183170bbd42495c84c0ca /src/reader.rs
parent: eea8c60fba5962d5575c8515b837fb6304f73c9d (diff)
1 files changed, 11 insertions, 0 deletions
diff --git a/src/reader.rs b/src/reader.rs
index 15b9224..3c39b16 100644
--- a/src/reader.rs
+++ b/src/reader.rs
@@ -30,6 +30,9 @@ pub trait Reader {
     /// the input stream and returns `true`. If not, it does nothing and returns `false`.
     // TODO: document a maximum s length that may be assumed (depends on longest named character reference ... which may change?)
     fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result<bool, Self::Error>;
+
+    /// Returns the number of bytes that the given character takes up in the current character encoding.
+    fn len_of_char_in_current_encoding(&self, c: char) -> usize;
 }
 
 /// An object that can be converted into a [`Reader`].
@@ -96,6 +99,10 @@ impl<'a> Reader for StringReader<'a> {
 
         Ok(false)
     }
+
+    fn len_of_char_in_current_encoding(&self, c: char) -> usize {
+        c.len_utf8()
+    }
 }
 
 impl<'a> IntoReader<'a> for &'a str {
@@ -248,6 +255,10 @@ impl<R: Read> Reader for BufReadReader<R> {
 
         Ok(true)
     }
+
+    fn len_of_char_in_current_encoding(&self, c: char) -> usize {
+        c.len_utf8()
+    }
 }
 
 impl<'a, R: Read + 'a> IntoReader<'a> for BufReader<R> {
author	Martin Fischer <martin@push-f.com>	2023-09-01 14:59:08 +0200
committer	Martin Fischer <martin@push-f.com>	2023-09-03 23:00:05 +0200
commit	153acbee5650b9b006c7de215fb1421a64516fb4 (patch)
tree	ed363c6e00035cb227b183170bbd42495c84c0ca /src/reader.rs
parent	eea8c60fba5962d5575c8515b837fb6304f73c9d (diff)