diff options
| -rw-r--r-- | src/machine.rs | 47 | ||||
| -rw-r--r-- | src/tokenizer.rs | 5 | ||||
| -rw-r--r-- | tests/test_spans.rs | 18 | 
3 files changed, 43 insertions, 27 deletions
| diff --git a/src/machine.rs b/src/machine.rs index f7f5ac6..5b36eee 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -38,6 +38,8 @@ where          };      } +    slf.position_before_match = slf.reader.position(); +      match slf.state {          State::Data => match slf.read_char()? {              Some('&') => { @@ -46,7 +48,7 @@ where                  Ok(ControlToken::Continue)              }              Some('<') => { -                slf.some_offset = slf.reader.position() - 1; +                slf.some_offset = slf.position_before_match;                  slf.state = State::TagOpen;                  Ok(ControlToken::Continue)              } @@ -702,7 +704,7 @@ where                  Ok(ControlToken::Continue)              }              Some(x) => { -                slf.emitter.init_attribute_name(slf.reader.position() - 1); +                slf.emitter.init_attribute_name(slf.position_before_match);                  slf.state = State::AttributeName;                  slf.unread_char(Some(x));                  Ok(ControlToken::Continue) @@ -755,7 +757,7 @@ where                  Ok(ControlToken::Eof)              }              Some(x) => { -                slf.emitter.init_attribute_name(slf.reader.position() - 1); +                slf.emitter.init_attribute_name(slf.position_before_match);                  slf.state = State::AttributeName;                  slf.unread_char(Some(x));                  Ok(ControlToken::Continue) @@ -783,7 +785,7 @@ where              }              c => {                  slf.emitter -                    .init_attribute_value(AttrValueSyntax::Unquoted, slf.reader.position() - 1); +                    .init_attribute_value(AttrValueSyntax::Unquoted, slf.position_before_match);                  slf.state = State::AttributeValueUnquoted;                  slf.unread_char(c);                  Ok(ControlToken::Continue) @@ -791,8 +793,13 @@ where          },          State::AttributeValueDoubleQuoted => match slf.read_char()? {              Some('"') => { -                slf.emitter -                    .terminate_attribute_value(slf.reader.position() - 1); +                slf.emitter.terminate_attribute_value( +                    // We cannot simply pass slf.position_before_match because +                    // State::NamedCharacterReference calls Tokenizer::unread_char +                    // which Reader::position doesn't account for. +                    // TODO: pass slf.position_before_match once CharacterReference has been converted to a function call +                    slf.reader.position() - slf.reader.len_of_char_in_current_encoding('"'), +                );                  slf.state = State::AfterAttributeValueQuoted;                  Ok(ControlToken::Continue)              } @@ -817,8 +824,13 @@ where          },          State::AttributeValueSingleQuoted => match slf.read_char()? {              Some('\'') => { -                slf.emitter -                    .terminate_attribute_value(slf.reader.position() - 1); +                slf.emitter.terminate_attribute_value( +                    // We cannot simply pass slf.position_before_match because +                    // State::NamedCharacterReference calls Tokenizer::unread_char +                    // which Reader::position doesn't account for. +                    // TODO: pass slf.position_before_match once CharacterReference has been converted to a function call +                    slf.reader.position() - slf.reader.len_of_char_in_current_encoding('\''), +                );                  slf.state = State::AfterAttributeValueQuoted;                  Ok(ControlToken::Continue)              } @@ -843,8 +855,13 @@ where          },          State::AttributeValueUnquoted => match slf.read_char()? {              Some(whitespace_pat!()) => { -                slf.emitter -                    .terminate_attribute_value(slf.reader.position() - 1); +                slf.emitter.terminate_attribute_value( +                    // We cannot simply pass slf.position_before_match because +                    // State::NamedCharacterReference calls Tokenizer::unread_char +                    // which Reader::position doesn't account for. +                    // TODO: pass slf.position_before_match once CharacterReference has been converted to a function call +                    slf.reader.position() - slf.reader.len_of_char_in_current_encoding(' '), +                );                  slf.state = State::BeforeAttributeName;                  Ok(ControlToken::Continue)              } @@ -904,7 +921,9 @@ where          },          State::SelfClosingStartTag => match slf.read_char()? {              Some('>') => { -                slf.emitter.set_self_closing(slf.reader.position() - 2); +                slf.emitter.set_self_closing( +                    slf.position_before_match - slf.reader.len_of_char_in_current_encoding('/'), +                );                  slf.state = State::Data;                  slf.emit_current_tag();                  Ok(ControlToken::Continue) @@ -953,7 +972,7 @@ where              Some('[') if slf.try_read_string("CDATA[", true)? => Ok(ControlToken::CdataOpen),              c => {                  slf.emit_error(Error::IncorrectlyOpenedComment); -                slf.emitter.init_comment(slf.reader.position() - 1); +                slf.emitter.init_comment(slf.position_before_match);                  slf.state = State::BogusComment;                  slf.unread_char(c);                  Ok(ControlToken::Continue) @@ -1659,7 +1678,9 @@ where              }          },          State::CharacterReference => { -            slf.some_offset = slf.reader.position() - "&".len(); +            // TODO: we can avoid these Reader method calls by changing CharacterReference to be a function instead of a state +            slf.some_offset = +                slf.reader.position() - slf.reader.len_of_char_in_current_encoding('&');              slf.temporary_buffer.clear();              slf.temporary_buffer.push('&');              match slf.read_char()? { diff --git a/src/tokenizer.rs b/src/tokenizer.rs index d59710d..58f7b80 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -49,6 +49,8 @@ pub struct Tokenizer<R: Reader, O, E: Emitter<O>> {      current_tag_name: String,      last_start_tag_name: String,      is_start_tag: bool, +    /// The reader position before the match block in [`machine::consume`]. +    pub(crate) position_before_match: O,      /// * Set to the offset of `<` in [`InternalState::Data`].      /// * Set to the offset of `&` in [`InternalState::CharacterReference`].      pub(crate) some_offset: O, @@ -77,6 +79,7 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {              current_tag_name: String::new(),              last_start_tag_name: String::new(),              is_start_tag: false, +            position_before_match: O::default(),              some_offset: O::default(),              naively_switch_state: false,          } @@ -198,7 +201,7 @@ impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {              | Error::ControlCharacterReference              | Error::UnknownNamedCharacterReference => self.some_offset..self.reader.position(), -            _ => self.reader.position() - 1..self.reader.position(), +            _ => self.position_before_match..self.reader.position(),          };          self.emitter.emit_error(error, span);      } diff --git a/tests/test_spans.rs b/tests/test_spans.rs index c58616d..8190f01 100644 --- a/tests/test_spans.rs +++ b/tests/test_spans.rs @@ -73,7 +73,7 @@ fn start_tag_span() {          }          labels      }; -    assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME +    assert_char_encoding_independence(html, labeler);      assert_snapshot!(test_and_annotate(html, labeler), @r###"      <x> <xyz> <xyz  > <xyz/>      ^^^ ^^^^^ ^^^^^^^ ^^^^^^ @@ -92,7 +92,7 @@ fn end_tag_span() {          }          labels      }; -    assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME +    assert_char_encoding_independence(html, labeler);      assert_snapshot!(test_and_annotate(html, labeler), @r###"      </x> </xyz> </xyz  > </xyz/>      ^^^^ ^^^^^^ ^^^^^^^^ ^^^^^^^ @@ -170,7 +170,7 @@ fn attribute_value_span() {          }          labels      }; -    assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME +    assert_char_encoding_independence(html, labeler);      assert_snapshot!(test_and_annotate(html, labeler), @r###"      <test x=unquoted y = unquoted z='single-quoted' zz="double-quoted" empty=''>              ^^^^^^^^     ^^^^^^^^    ^^^^^^^^^^^^^      ^^^^^^^^^^^^^         ^ @@ -190,7 +190,7 @@ fn attribute_value_with_char_ref() {          }          labels      }; -    assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME +    assert_char_encoding_independence(html, labeler);      assert_snapshot!(test_and_annotate(html, labeler), @r###"      <test x=& y='&' z="&">              ^^^^^    ^^^^^     ^^^^^ @@ -261,7 +261,7 @@ fn doctype_span() {              };              vec![(doctype.span, "")]          }; -        assert_panics_but_should_not(|| assert_char_encoding_independence(case, labeler)); // FIXME +        assert_char_encoding_independence(case, labeler);          annotated.push_str(&test_and_annotate(case, labeler));      } @@ -341,17 +341,9 @@ fn annotate_errors(html: &'static str) -> String {              *doesnt_support_utf16.lock().unwrap() = matches!(                  error, -                | Error::AbsenceOfDigitsInNumericCharacterReference // FIXME -                | Error::CharacterReferenceOutsideUnicodeRange // FIXME -                | Error::ControlCharacterReference // FIXME                  | Error::DuplicateAttribute // FIXME                  | Error::EndTagWithAttributes // FIXME                  | Error::EndTagWithTrailingSolidus // FIXME -                | Error::InvalidFirstCharacterOfTagName // FIXME -                | Error::NoncharacterCharacterReference // FIXME -                | Error::NullCharacterReference // FIXME -                | Error::SurrogateCharacterReference // FIXME -                | Error::UnknownNamedCharacterReference // FIXME              );          }          labels | 
