summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Martin Fischer <martin@push-f.com> 2023-09-01 14:59:08 +0200
committer Martin Fischer <martin@push-f.com> 2023-09-03 23:00:05 +0200
commit 153acbee5650b9b006c7de215fb1421a64516fb4 (patch)
tree ed363c6e00035cb227b183170bbd42495c84c0ca
parent eea8c60fba5962d5575c8515b837fb6304f73c9d (diff)
fix!: make PosTrackingReader encoding-independent
While much of the span logic currently assumes UTF-8, we also want to support other character encodings, such as UTF-16, where characters can take up more or fewer bytes than in UTF-8.
-rw-r--r-- CHANGELOG.md 2
-rw-r--r-- src/offset.rs 10
-rw-r--r-- src/reader.rs 11
3 files changed, 21 insertions, 2 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 28431e4..5025516 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -29,6 +29,8 @@
* `AttributeOwned`: The `value_offset` field has been replaced with `value_span`.
+* Added required `len_of_char_in_current_encoding` method to `Reader` trait.
+
* Added missing `R: Position<O>` bounds for `Tokenizer`/`NaiveParser` constructors.
(If you are able to construct a Tokenizer/NaiveParser,
you should be able to iterate over it.)
diff --git a/src/offset.rs b/src/offset.rs
index 3152c78..bdb069a 100644
--- a/src/offset.rs
+++ b/src/offset.rs
@@ -98,7 +98,7 @@ impl<R: Reader> Reader for PosTrackingReader<R> {
fn read_char(&mut self) -> Result<Option<char>, Self::Error> {
match self.reader.read_char()? {
Some(char) => {
- self.position += char.len_utf8();
+ self.position += self.reader.len_of_char_in_current_encoding(char);
Ok(Some(char))
}
None => Ok(None),
@@ -108,10 +108,16 @@ impl<R: Reader> Reader for PosTrackingReader<R> {
fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result<bool, Self::Error> {
match self.reader.try_read_string(s, case_sensitive)? {
true => {
- self.position += s.len();
+ for c in s.chars() {
+ self.position += self.reader.len_of_char_in_current_encoding(c);
+ }
Ok(true)
}
false => Ok(false),
}
}
+
+ fn len_of_char_in_current_encoding(&self, c: char) -> usize {
+ self.reader.len_of_char_in_current_encoding(c)
+ }
}
diff --git a/src/reader.rs b/src/reader.rs
index 15b9224..3c39b16 100644
--- a/src/reader.rs
+++ b/src/reader.rs
@@ -30,6 +30,9 @@ pub trait Reader {
/// the input stream and returns `true`. If not, it does nothing and returns `false`.
// TODO: document a maximum s length that may be assumed (depends on longest named character reference ... which may change?)
fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result<bool, Self::Error>;
+
+ /// Returns the number of bytes that the given character takes up in the current character encoding.
+ fn len_of_char_in_current_encoding(&self, c: char) -> usize;
}
/// An object that can be converted into a [`Reader`].
@@ -96,6 +99,10 @@ impl<'a> Reader for StringReader<'a> {
Ok(false)
}
+
+ fn len_of_char_in_current_encoding(&self, c: char) -> usize {
+ c.len_utf8()
+ }
}
impl<'a> IntoReader<'a> for &'a str {
@@ -248,6 +255,10 @@ impl<R: Read> Reader for BufReadReader<R> {
Ok(true)
}
+
+ fn len_of_char_in_current_encoding(&self, c: char) -> usize {
+ c.len_utf8()
+ }
}
impl<'a, R: Read + 'a> IntoReader<'a> for BufReader<R> {