diff options
| author | Martin Fischer <martin@push-f.com> | 2023-09-01 14:59:08 +0200 | 
|---|---|---|
| committer | Martin Fischer <martin@push-f.com> | 2023-09-03 23:00:05 +0200 | 
| commit | 153acbee5650b9b006c7de215fb1421a64516fb4 (patch) | |
| tree | ed363c6e00035cb227b183170bbd42495c84c0ca /src | |
| parent | eea8c60fba5962d5575c8515b837fb6304f73c9d (diff) | |
fix!: make PosTrackingReader encoding-independent
While much of the span logic currently assumes UTF-8, we also
want to support other character encodings, such as UTF-16,
where characters can take up more or fewer bytes than in UTF-8.
Diffstat (limited to 'src')
| -rw-r--r-- | src/offset.rs | 10 | ||||
| -rw-r--r-- | src/reader.rs | 11 | 
2 files changed, 19 insertions, 2 deletions
| diff --git a/src/offset.rs b/src/offset.rs index 3152c78..bdb069a 100644 --- a/src/offset.rs +++ b/src/offset.rs @@ -98,7 +98,7 @@ impl<R: Reader> Reader for PosTrackingReader<R> {      fn read_char(&mut self) -> Result<Option<char>, Self::Error> {          match self.reader.read_char()? {              Some(char) => { -                self.position += char.len_utf8(); +                self.position += self.reader.len_of_char_in_current_encoding(char);                  Ok(Some(char))              }              None => Ok(None), @@ -108,10 +108,16 @@ impl<R: Reader> Reader for PosTrackingReader<R> {      fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result<bool, Self::Error> {          match self.reader.try_read_string(s, case_sensitive)? {              true => { -                self.position += s.len(); +                for c in s.chars() { +                    self.position += self.reader.len_of_char_in_current_encoding(c); +                }                  Ok(true)              }              false => Ok(false),          }      } + +    fn len_of_char_in_current_encoding(&self, c: char) -> usize { +        self.reader.len_of_char_in_current_encoding(c) +    }  } diff --git a/src/reader.rs b/src/reader.rs index 15b9224..3c39b16 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -30,6 +30,9 @@ pub trait Reader {      /// the input stream and returns `true`. If not, it does nothing and returns `false`.      // TODO: document a maximum s length that may be assumed (depends on longest named character reference ... which may change?)      fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result<bool, Self::Error>; + +    /// Returns the number of bytes that the given character takes up in the current character encoding. +    fn len_of_char_in_current_encoding(&self, c: char) -> usize;  }  /// An object that can be converted into a [`Reader`]. 
@@ -96,6 +99,10 @@ impl<'a> Reader for StringReader<'a> {          Ok(false)      } + +    fn len_of_char_in_current_encoding(&self, c: char) -> usize { +        c.len_utf8() +    }  }  impl<'a> IntoReader<'a> for &'a str { @@ -248,6 +255,10 @@ impl<R: Read> Reader for BufReadReader<R> {          Ok(true)      } + +    fn len_of_char_in_current_encoding(&self, c: char) -> usize { +        c.len_utf8() +    }  }  impl<'a, R: Read + 'a> IntoReader<'a> for BufReader<R> { | 
