diff options
| -rw-r--r-- | src/emitter.rs | 51 | ||||
| -rw-r--r-- | src/machine.rs | 119 | ||||
| -rw-r--r-- | src/tokenizer.rs | 8 | ||||
| -rw-r--r-- | tests/test_spans.rs | 31 | 
4 files changed, 134 insertions, 75 deletions
diff --git a/src/emitter.rs b/src/emitter.rs index f665f47..5b64acd 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -69,12 +69,12 @@ pub trait Emitter<O> {      /// Emit the _current token_, assuming it is a comment.      ///      /// If the current token is not a comment, this method may panic. -    fn emit_current_comment(&mut self); +    fn emit_current_comment(&mut self, offset: O);      /// Emit the _current token_, assuming it is a doctype.      ///      /// If the current token is not a doctype, this method may panic. -    fn emit_current_doctype(&mut self); +    fn emit_current_doctype(&mut self, offset: O);      /// Assuming the _current token_ is a start tag, set the self-closing flag.      /// @@ -140,12 +140,12 @@ pub trait Emitter<O> {      /// Assuming the _current token_ is a doctype, set its "public identifier" to the empty string.      ///      /// If the current token is not a doctype, this method may panic. -    fn init_doctype_public_id(&mut self); +    fn init_doctype_public_id(&mut self, offset: O);      /// Assuming the _current token_ is a doctype, set its "system identifier" to the empty string.      ///      /// If the current token is not a doctype, this method may panic. -    fn init_doctype_system_id(&mut self); +    fn init_doctype_system_id(&mut self, offset: O);      /// Assuming the _current token_ is a doctype, append a string to its "public identifier" to the given string.      
/// @@ -308,17 +308,18 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {          }          self.emit_token(token);      } -    fn emit_current_comment(&mut self) { +    fn emit_current_comment(&mut self, _offset: O) {          let comment = self.current_token.take().unwrap();          debug_assert!(matches!(comment, Token::Comment(_)));          self.emit_token(comment);      } -    fn emit_current_doctype(&mut self) { +    fn emit_current_doctype(&mut self, offset: O) {          let Some(Token::Doctype(mut doctype)) = self.current_token.take() else {              debug_assert!(false);              return;          }; +        doctype.span.end = offset;          self.emit_token(Token::Doctype(doctype));      } @@ -370,12 +371,15 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {              _ => debug_assert!(false),          }      } -    fn init_doctype(&mut self, _offset: O) { +    fn init_doctype(&mut self, offset: O) {          self.current_token = Some(Token::Doctype(Doctype {              name: String::new(),              force_quirks: false,              public_id: None,              system_id: None, +            span: offset..O::default(), +            public_id_offset: O::default(), +            system_id_offset: O::default(),          }));      } @@ -405,19 +409,21 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {          let current_attr = self.current_attribute.as_mut().unwrap();          current_attr.1.value.push_str(s);      } -    fn init_doctype_public_id(&mut self) { +    fn init_doctype_public_id(&mut self, offset: O) {          let Some(Token::Doctype(doctype)) = &mut self.current_token else {              debug_assert!(false);              return;          };          doctype.public_id = Some("".to_owned()); +        doctype.public_id_offset = offset;      } -    fn init_doctype_system_id(&mut self) { +    fn init_doctype_system_id(&mut self, offset: O) {          let Some(Token::Doctype(doctype)) = &mut self.current_token else { 
             debug_assert!(false);              return;          };          doctype.system_id = Some("".to_owned()); +        doctype.system_id_offset = offset;      }      fn push_doctype_public_id(&mut self, s: &str) {          if let Some(Token::Doctype(Doctype { @@ -512,7 +518,7 @@ impl<O: Offset> Comment<O> {  /// * `<!DOCTYPE {name} SYSTEM '{system_id}'>`  /// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>`  #[derive(Debug, Eq, PartialEq)] -pub struct Doctype { +pub struct Doctype<O> {      /// The ["force quirks"](https://html.spec.whatwg.org/#force-quirks-flag) flag.      pub force_quirks: bool, @@ -524,6 +530,29 @@ pub struct Doctype {      /// The doctype's system identifier.      pub system_id: Option<String>, + +    /// The source code span of the doctype. +    pub span: Range<O>, + +    /// The source offset of the public identifier. +    public_id_offset: O, + +    /// The source offset of the system identifier. +    system_id_offset: O, +} + +impl<O: Offset> Doctype<O> { +    /// Calculates the span of the public identifier and returns it. +    pub fn public_id_span(&self) -> Option<Range<O>> { +        let public_id = self.public_id.as_ref()?; +        Some(self.public_id_offset..self.public_id_offset + public_id.len()) +    } + +    /// Calculates the span of the system identifier and returns it. +    pub fn system_id_span(&self) -> Option<Range<O>> { +        let system_id = self.system_id.as_ref()?; +        Some(self.system_id_offset..self.system_id_offset + system_id.len()) +    }  }  /// The token type used by default. You can define your own token type by implementing the @@ -539,7 +568,7 @@ pub enum Token<O> {      /// A HTML comment.      Comment(Comment<O>),      /// A HTML doctype declaration. -    Doctype(Doctype), +    Doctype(Doctype<O>),      /// A HTML parsing error.      
///      /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with diff --git a/src/machine.rs b/src/machine.rs index 0755e20..0d99ab8 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -910,11 +910,11 @@ where          State::BogusComment => match slf.read_char()? {              Some('>') => {                  slf.state = State::Data; -                slf.emitter.emit_current_comment(); +                slf.emitter.emit_current_comment(slf.reader.position());                  Ok(ControlToken::Continue)              }              None => { -                slf.emitter.emit_current_comment(); +                slf.emitter.emit_current_comment(slf.reader.position());                  Ok(ControlToken::Eof)              }              Some('\0') => { @@ -935,6 +935,7 @@ where              }              Some('d' | 'D') if slf.try_read_string("octype", false)? => {                  slf.state = State::Doctype; +                slf.doctype_offset = slf.reader.position() - b"<!doctype".len();                  Ok(ControlToken::Continue)              }              Some('[') if slf.try_read_string("CDATA[", true)? 
=> { @@ -967,7 +968,7 @@ where              Some('>') => {                  slf.emit_error(Error::AbruptClosingOfEmptyComment);                  slf.state = State::Data; -                slf.emitter.emit_current_comment(); +                slf.emitter.emit_current_comment(slf.reader.position());                  Ok(ControlToken::Continue)              }              c => { @@ -984,12 +985,12 @@ where              Some('>') => {                  slf.emit_error(Error::AbruptClosingOfEmptyComment);                  slf.state = State::Data; -                slf.emitter.emit_current_comment(); +                slf.emitter.emit_current_comment(slf.reader.position());                  Ok(ControlToken::Continue)              }              None => {                  slf.emit_error(Error::EofInComment); -                slf.emitter.emit_current_comment(); +                slf.emitter.emit_current_comment(slf.reader.position());                  Ok(ControlToken::Eof)              }              c @ Some(_) => { @@ -1016,7 +1017,7 @@ where              }              None => {                  slf.emit_error(Error::EofInComment); -                slf.emitter.emit_current_comment(); +                slf.emitter.emit_current_comment(slf.reader.position());                  Ok(ControlToken::Eof)              }              Some(x) => { @@ -1082,7 +1083,7 @@ where              }              None => {                  slf.emit_error(Error::EofInComment); -                slf.emitter.emit_current_comment(); +                slf.emitter.emit_current_comment(slf.reader.position());                  Ok(ControlToken::Eof)              }              c => { @@ -1095,7 +1096,7 @@ where          State::CommentEnd => match slf.read_char()? 
{              Some('>') => {                  slf.state = State::Data; -                slf.emitter.emit_current_comment(); +                slf.emitter.emit_current_comment(slf.reader.position());                  Ok(ControlToken::Continue)              }              Some('!') => { @@ -1108,7 +1109,7 @@ where              }              None => {                  slf.emit_error(Error::EofInComment); -                slf.emitter.emit_current_comment(); +                slf.emitter.emit_current_comment(slf.reader.position());                  Ok(ControlToken::Eof)              }              c @ Some(_) => { @@ -1130,12 +1131,12 @@ where              Some('>') => {                  slf.emit_error(Error::IncorrectlyClosedComment);                  slf.state = State::Data; -                slf.emitter.emit_current_comment(); +                slf.emitter.emit_current_comment(slf.reader.position());                  Ok(ControlToken::Continue)              }              None => {                  slf.emit_error(Error::EofInComment); -                slf.emitter.emit_current_comment(); +                slf.emitter.emit_current_comment(slf.reader.position());                  Ok(ControlToken::Eof)              }              c @ Some(_) => { @@ -1159,9 +1160,9 @@ where              }              None => {                  slf.emit_error(Error::EofInDoctype); -                slf.emitter.init_doctype(slf.reader.position()); +                slf.emitter.init_doctype(slf.doctype_offset);                  slf.emitter.set_force_quirks(); -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Eof)              }              c @ Some(_) => { @@ -1175,28 +1176,28 @@ where              Some(whitespace_pat!()) => Ok(ControlToken::Continue),              Some('\0') => {                  slf.emit_error(Error::UnexpectedNullCharacter); -                
slf.emitter.init_doctype(slf.reader.position()); +                slf.emitter.init_doctype(slf.doctype_offset);                  slf.emitter.push_doctype_name("\u{fffd}");                  slf.state = State::DoctypeName;                  Ok(ControlToken::Continue)              }              Some('>') => {                  slf.emit_error(Error::MissingDoctypeName); -                slf.emitter.init_doctype(slf.reader.position()); +                slf.emitter.init_doctype(slf.doctype_offset);                  slf.emitter.set_force_quirks();                  slf.state = State::Data; -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Continue)              }              None => {                  slf.emit_error(Error::EofInDoctype); -                slf.emitter.init_doctype(slf.reader.position()); +                slf.emitter.init_doctype(slf.doctype_offset);                  slf.emitter.set_force_quirks(); -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Eof)              }              Some(x) => { -                slf.emitter.init_doctype(slf.reader.position()); +                slf.emitter.init_doctype(slf.doctype_offset);                  slf.emitter                      .push_doctype_name(ctostr!(x.to_ascii_lowercase()));                  slf.state = State::DoctypeName; @@ -1210,7 +1211,7 @@ where              }              Some('>') => {                  slf.state = State::Data; -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Continue)              }              Some('\0') => { @@ -1221,7 +1222,7 @@ where              None => {                  slf.emit_error(Error::EofInDoctype);                  slf.emitter.set_force_quirks(); 
-                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Eof)              }              Some(x) => { @@ -1234,13 +1235,13 @@ where              Some(whitespace_pat!()) => Ok(ControlToken::Continue),              Some('>') => {                  slf.state = State::Data; -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Continue)              }              None => {                  slf.emit_error(Error::EofInDoctype);                  slf.emitter.set_force_quirks(); -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Eof)              }              Some('p' | 'P') if slf.try_read_string("ublic", false)? => { @@ -1266,13 +1267,13 @@ where              }              Some('"') => {                  slf.emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword); -                slf.emitter.init_doctype_public_id(); +                slf.emitter.init_doctype_public_id(slf.reader.position());                  slf.state = State::DoctypePublicIdentifierDoubleQuoted;                  Ok(ControlToken::Continue)              }              Some('\'') => {                  slf.emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword); -                slf.emitter.init_doctype_public_id(); +                slf.emitter.init_doctype_public_id(slf.reader.position());                  slf.state = State::DoctypePublicIdentifierSingleQuoted;                  Ok(ControlToken::Continue)              } @@ -1280,13 +1281,13 @@ where                  slf.emit_error(Error::MissingDoctypePublicIdentifier);                  slf.emitter.set_force_quirks();                  slf.state = State::Data; -                slf.emitter.emit_current_doctype(); +               
 slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Continue)              }              None => {                  slf.emit_error(Error::EofInDoctype);                  slf.emitter.set_force_quirks(); -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Eof)              }              c @ Some(_) => { @@ -1300,12 +1301,12 @@ where          State::BeforeDoctypePublicIdentifier => match slf.read_char()? {              Some(whitespace_pat!()) => Ok(ControlToken::Continue),              Some('"') => { -                slf.emitter.init_doctype_public_id(); +                slf.emitter.init_doctype_public_id(slf.reader.position());                  slf.state = State::DoctypePublicIdentifierDoubleQuoted;                  Ok(ControlToken::Continue)              }              Some('\'') => { -                slf.emitter.init_doctype_public_id(); +                slf.emitter.init_doctype_public_id(slf.reader.position());                  slf.state = State::DoctypePublicIdentifierSingleQuoted;                  Ok(ControlToken::Continue)              } @@ -1313,13 +1314,13 @@ where                  slf.emit_error(Error::MissingDoctypePublicIdentifier);                  slf.emitter.set_force_quirks();                  slf.state = State::Data; -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Continue)              }              None => {                  slf.emit_error(Error::EofInDoctype);                  slf.emitter.set_force_quirks(); -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Eof)              }              c @ Some(_) => { @@ -1344,13 +1345,13 @@ where                  
slf.emit_error(Error::AbruptDoctypePublicIdentifier);                  slf.emitter.set_force_quirks();                  slf.state = State::Data; -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Continue)              }              None => {                  slf.emit_error(Error::EofInDoctype);                  slf.emitter.set_force_quirks(); -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Eof)              }              Some(x) => { @@ -1372,13 +1373,13 @@ where                  slf.emit_error(Error::AbruptDoctypePublicIdentifier);                  slf.emitter.set_force_quirks();                  slf.state = State::Data; -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Continue)              }              None => {                  slf.emit_error(Error::EofInDoctype);                  slf.emitter.set_force_quirks(); -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Eof)              }              Some(x) => { @@ -1393,25 +1394,25 @@ where              }              Some('>') => {                  slf.state = State::Data; -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Continue)              }              Some('"') => {                  slf.emit_error(Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers); -                slf.emitter.init_doctype_system_id(); +                slf.emitter.init_doctype_system_id(slf.reader.position());                  slf.state = 
State::DoctypeSystemIdentifierDoubleQuoted;                  Ok(ControlToken::Continue)              }              Some('\'') => {                  slf.emit_error(Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers); -                slf.emitter.init_doctype_system_id(); +                slf.emitter.init_doctype_system_id(slf.reader.position());                  slf.state = State::DoctypeSystemIdentifierSingleQuoted;                  Ok(ControlToken::Continue)              }              None => {                  slf.emit_error(Error::EofInDoctype);                  slf.emitter.set_force_quirks(); -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Eof)              }              c @ Some(_) => { @@ -1426,23 +1427,23 @@ where              Some(whitespace_pat!()) => Ok(ControlToken::Continue),              Some('>') => {                  slf.state = State::Data; -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Continue)              }              Some('"') => { -                slf.emitter.init_doctype_system_id(); +                slf.emitter.init_doctype_system_id(slf.reader.position());                  slf.state = State::DoctypeSystemIdentifierDoubleQuoted;                  Ok(ControlToken::Continue)              }              Some('\'') => { -                slf.emitter.init_doctype_system_id(); +                slf.emitter.init_doctype_system_id(slf.reader.position());                  slf.state = State::DoctypeSystemIdentifierSingleQuoted;                  Ok(ControlToken::Continue)              }              None => {                  slf.emit_error(Error::EofInDoctype);                  slf.emitter.set_force_quirks(); -                slf.emitter.emit_current_doctype(); +                
slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Eof)              }              c @ Some(_) => { @@ -1460,13 +1461,13 @@ where              }              Some('"') => {                  slf.emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword); -                slf.emitter.init_doctype_system_id(); +                slf.emitter.init_doctype_system_id(slf.reader.position());                  slf.state = State::DoctypeSystemIdentifierDoubleQuoted;                  Ok(ControlToken::Continue)              }              Some('\'') => {                  slf.emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword); -                slf.emitter.init_doctype_system_id(); +                slf.emitter.init_doctype_system_id(slf.reader.position());                  slf.state = State::DoctypeSystemIdentifierSingleQuoted;                  Ok(ControlToken::Continue)              } @@ -1474,13 +1475,13 @@ where                  slf.emit_error(Error::MissingDoctypeSystemIdentifier);                  slf.emitter.set_force_quirks();                  slf.state = State::Data; -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Continue)              }              None => {                  slf.emit_error(Error::EofInDoctype);                  slf.emitter.set_force_quirks(); -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Eof)              }              c @ Some(_) => { @@ -1494,12 +1495,12 @@ where          State::BeforeDoctypeSystemIdentifier => match slf.read_char()? 
{              Some(whitespace_pat!()) => Ok(ControlToken::Continue),              Some('"') => { -                slf.emitter.init_doctype_system_id(); +                slf.emitter.init_doctype_system_id(slf.reader.position());                  slf.state = State::DoctypeSystemIdentifierDoubleQuoted;                  Ok(ControlToken::Continue)              }              Some('\'') => { -                slf.emitter.init_doctype_system_id(); +                slf.emitter.init_doctype_system_id(slf.reader.position());                  slf.state = State::DoctypeSystemIdentifierSingleQuoted;                  Ok(ControlToken::Continue)              } @@ -1507,13 +1508,13 @@ where                  slf.emit_error(Error::MissingDoctypeSystemIdentifier);                  slf.emitter.set_force_quirks();                  slf.state = State::Data; -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Continue)              }              None => {                  slf.emit_error(Error::EofInDoctype);                  slf.emitter.set_force_quirks(); -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Eof)              }              c @ Some(_) => { @@ -1538,13 +1539,13 @@ where                  slf.emit_error(Error::AbruptDoctypeSystemIdentifier);                  slf.emitter.set_force_quirks();                  slf.state = State::Data; -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Continue)              }              None => {                  slf.emit_error(Error::EofInDoctype);                  slf.emitter.set_force_quirks(); -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position()); 
                 Ok(ControlToken::Eof)              }              Some(x) => { @@ -1566,13 +1567,13 @@ where                  slf.emit_error(Error::AbruptDoctypeSystemIdentifier);                  slf.emitter.set_force_quirks();                  slf.state = State::Data; -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Continue)              }              None => {                  slf.emit_error(Error::EofInDoctype);                  slf.emitter.set_force_quirks(); -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Eof)              }              Some(x) => { @@ -1584,13 +1585,13 @@ where              Some(whitespace_pat!()) => Ok(ControlToken::Continue),              Some('>') => {                  slf.state = State::Data; -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Continue)              }              None => {                  slf.emit_error(Error::EofInDoctype);                  slf.emitter.set_force_quirks(); -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Eof)              }              c @ Some(_) => { @@ -1603,7 +1604,7 @@ where          State::BogusDoctype => match slf.read_char()? 
{              Some('>') => {                  slf.state = State::Data; -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Continue)              }              Some('\0') => { @@ -1611,7 +1612,7 @@ where                  Ok(ControlToken::Continue)              }              None => { -                slf.emitter.emit_current_doctype(); +                slf.emitter.emit_current_doctype(slf.reader.position());                  Ok(ControlToken::Eof)              }              Some(_) => Ok(ControlToken::Continue), diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 1b80ec3..d272b14 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1,5 +1,3 @@ -use std::marker::PhantomData; -  use crate::machine;  use crate::offset::{NoopOffset, Offset, Position};  use crate::reader::{IntoReader, Reader}; @@ -41,16 +39,16 @@ pub struct Tokenizer<R: Reader, O = NoopOffset, E: Emitter<O> = DefaultEmitter<O      pub(crate) emitter: E,      pub(crate) temporary_buffer: String,      pub(crate) reader: R, -    _offset: PhantomData<O>,      to_reconsume: Stack2<Option<char>>,      pub(crate) character_reference_code: u32,      pub(crate) return_state: Option<InternalState>,      current_tag_name: String,      last_start_tag_name: String,      is_start_tag: bool, +    pub(crate) doctype_offset: O,  } -impl<R: Reader, O, E: Emitter<O>> Tokenizer<R, O, E> { +impl<R: Reader, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {      /// Creates a new tokenizer from some input and an emitter.      
///      /// TODO: add warning about you needing to do the state switching @@ -58,7 +56,6 @@ impl<R: Reader, O, E: Emitter<O>> Tokenizer<R, O, E> {          Tokenizer {              reader: reader.into_reader(),              emitter, -            _offset: PhantomData,              state: InternalState::Data,              to_reconsume: Stack2::default(),              return_state: None, @@ -68,6 +65,7 @@ impl<R: Reader, O, E: Emitter<O>> Tokenizer<R, O, E> {              current_tag_name: String::new(),              last_start_tag_name: String::new(),              is_start_tag: false, +            doctype_offset: O::default(),          }      }  } diff --git a/tests/test_spans.rs b/tests/test_spans.rs index 6bd9378..70bcf6e 100644 --- a/tests/test_spans.rs +++ b/tests/test_spans.rs @@ -167,6 +167,37 @@ fn comment_bogus_data_span() {      "###);  } +#[test] +fn doctype_span() { +    let html = r#"<!DOCTYPE       HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"     >"#; +    let Token::Doctype(doctype) = tokenizer(html).next().unwrap() else { +        panic!("expected doctype"); +    }; +    let labels = vec![(doctype.span, "")]; +    assert_snapshot!(annotate(html, labels), @r###" +    <!DOCTYPE       HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"     > +    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +    "###); +} + +#[test] +fn doctype_id_spans() { +    let html = r#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">"#; +    let Token::Doctype(doctype) = tokenizer(html).next().unwrap() else { +        panic!("expected doctype"); +    }; +    let labels = vec![ +        (doctype.public_id_span().unwrap(), "public id"), +        (doctype.system_id_span().unwrap(), "system id"), +    ]; +    assert_snapshot!(annotate(html, labels), @r###" +    <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" 
"http://www.w3.org/TR/html4/strict.dtd"> +                           ^^^^^^^^^^^^^^^^^^^^^^^^^   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ system id +                           │ +                           public id +    "###); +} +  fn annotate_errors(html: &'static str) -> String {      let mut labels = Vec::new();      for token in tokenizer(html) {  | 
