From 153acbee5650b9b006c7de215fb1421a64516fb4 Mon Sep 17 00:00:00 2001 From: Martin Fischer Date: Fri, 1 Sep 2023 14:59:08 +0200 Subject: fix!: make PosTrackingReader encoding-independent While much of the span logic currently assumes UTF-8, we also want to support other character encodings, such as UTF-16, where characters can take up more or fewer bytes than in UTF-8. --- CHANGELOG.md | 2 ++ src/offset.rs | 10 ++++++++-- src/reader.rs | 11 +++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 28431e4..5025516 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,8 @@ * `AttributeOwned`: The `value_offset` field has been replaced with `value_span`. +* Added required `len_of_char_in_current_encoding` method to `Reader` trait. + * Added missing `R: Position` bounds for `Tokenizer`/`NaiveParser` constructors. (If you are able to construct a Tokenizer/NaiveParser, you should be able to iterate over it.) diff --git a/src/offset.rs b/src/offset.rs index 3152c78..bdb069a 100644 --- a/src/offset.rs +++ b/src/offset.rs @@ -98,7 +98,7 @@ impl Reader for PosTrackingReader { fn read_char(&mut self) -> Result, Self::Error> { match self.reader.read_char()? { Some(char) => { - self.position += char.len_utf8(); + self.position += self.reader.len_of_char_in_current_encoding(char); Ok(Some(char)) } None => Ok(None), @@ -108,10 +108,16 @@ impl Reader for PosTrackingReader { fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result { match self.reader.try_read_string(s, case_sensitive)? 
{ true => { - self.position += s.len(); + for c in s.chars() { + self.position += self.reader.len_of_char_in_current_encoding(c); + } Ok(true) } false => Ok(false), } } + + fn len_of_char_in_current_encoding(&self, c: char) -> usize { + self.reader.len_of_char_in_current_encoding(c) + } } diff --git a/src/reader.rs b/src/reader.rs index 15b9224..3c39b16 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -30,6 +30,9 @@ pub trait Reader { /// the input stream and returns `true`. If not, it does nothing and returns `false`. // TODO: document a maximum s length that may be assumed (depends on longest named character reference ... which may change?) fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result; + + /// Returns the number of bytes that the given character takes up in the current character encoding. + fn len_of_char_in_current_encoding(&self, c: char) -> usize; } /// An object that can be converted into a [`Reader`]. @@ -96,6 +99,10 @@ impl<'a> Reader for StringReader<'a> { Ok(false) } + + fn len_of_char_in_current_encoding(&self, c: char) -> usize { + c.len_utf8() + } } impl<'a> IntoReader<'a> for &'a str { @@ -248,6 +255,10 @@ impl Reader for BufReadReader { Ok(true) } + + fn len_of_char_in_current_encoding(&self, c: char) -> usize { + c.len_utf8() + } } impl<'a, R: Read + 'a> IntoReader<'a> for BufReader { -- cgit v1.2.3