aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/emitter.rs51
-rw-r--r--src/machine.rs119
-rw-r--r--src/tokenizer.rs8
-rw-r--r--tests/test_spans.rs31
4 files changed, 134 insertions, 75 deletions
diff --git a/src/emitter.rs b/src/emitter.rs
index f665f47..5b64acd 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -69,12 +69,12 @@ pub trait Emitter<O> {
/// Emit the _current token_, assuming it is a comment.
///
/// If the current token is not a comment, this method may panic.
- fn emit_current_comment(&mut self);
+ fn emit_current_comment(&mut self, offset: O);
/// Emit the _current token_, assuming it is a doctype.
///
/// If the current token is not a doctype, this method may panic.
- fn emit_current_doctype(&mut self);
+ fn emit_current_doctype(&mut self, offset: O);
/// Assuming the _current token_ is a start tag, set the self-closing flag.
///
@@ -140,12 +140,12 @@ pub trait Emitter<O> {
/// Assuming the _current token_ is a doctype, set its "public identifier" to the empty string.
///
/// If the current token is not a doctype, this method may panic.
- fn init_doctype_public_id(&mut self);
+ fn init_doctype_public_id(&mut self, offset: O);
/// Assuming the _current token_ is a doctype, set its "system identifier" to the empty string.
///
/// If the current token is not a doctype, this method may panic.
- fn init_doctype_system_id(&mut self);
+ fn init_doctype_system_id(&mut self, offset: O);
/// Assuming the _current token_ is a doctype, append a string to its "public identifier" to the given string.
///
@@ -308,17 +308,18 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
}
self.emit_token(token);
}
- fn emit_current_comment(&mut self) {
+ fn emit_current_comment(&mut self, _offset: O) {
let comment = self.current_token.take().unwrap();
debug_assert!(matches!(comment, Token::Comment(_)));
self.emit_token(comment);
}
- fn emit_current_doctype(&mut self) {
+ fn emit_current_doctype(&mut self, offset: O) {
let Some(Token::Doctype(mut doctype)) = self.current_token.take() else {
debug_assert!(false);
return;
};
+ doctype.span.end = offset;
self.emit_token(Token::Doctype(doctype));
}
@@ -370,12 +371,15 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
_ => debug_assert!(false),
}
}
- fn init_doctype(&mut self, _offset: O) {
+ fn init_doctype(&mut self, offset: O) {
self.current_token = Some(Token::Doctype(Doctype {
name: String::new(),
force_quirks: false,
public_id: None,
system_id: None,
+ span: offset..O::default(),
+ public_id_offset: O::default(),
+ system_id_offset: O::default(),
}));
}
@@ -405,19 +409,21 @@ impl<O: Offset> Emitter<O> for DefaultEmitter<O> {
let current_attr = self.current_attribute.as_mut().unwrap();
current_attr.1.value.push_str(s);
}
- fn init_doctype_public_id(&mut self) {
+ fn init_doctype_public_id(&mut self, offset: O) {
let Some(Token::Doctype(doctype)) = &mut self.current_token else {
debug_assert!(false);
return;
};
doctype.public_id = Some("".to_owned());
+ doctype.public_id_offset = offset;
}
- fn init_doctype_system_id(&mut self) {
+ fn init_doctype_system_id(&mut self, offset: O) {
let Some(Token::Doctype(doctype)) = &mut self.current_token else {
debug_assert!(false);
return;
};
doctype.system_id = Some("".to_owned());
+ doctype.system_id_offset = offset;
}
fn push_doctype_public_id(&mut self, s: &str) {
if let Some(Token::Doctype(Doctype {
@@ -512,7 +518,7 @@ impl<O: Offset> Comment<O> {
/// * `<!DOCTYPE {name} SYSTEM '{system_id}'>`
/// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>`
#[derive(Debug, Eq, PartialEq)]
-pub struct Doctype {
+pub struct Doctype<O> {
/// The ["force quirks"](https://html.spec.whatwg.org/#force-quirks-flag) flag.
pub force_quirks: bool,
@@ -524,6 +530,29 @@ pub struct Doctype {
/// The doctype's system identifier.
pub system_id: Option<String>,
+
+ /// The source code span of the doctype.
+ pub span: Range<O>,
+
+ /// The source offset of the public identifier.
+ public_id_offset: O,
+
+ /// The source offset of the system identifier.
+ system_id_offset: O,
+}
+
+impl<O: Offset> Doctype<O> {
+ /// Returns the source span of the public identifier, or `None` if the doctype has no public identifier.
+ pub fn public_id_span(&self) -> Option<Range<O>> {
+ let public_id = self.public_id.as_ref()?;
+ Some(self.public_id_offset..self.public_id_offset + public_id.len())
+ }
+
+ /// Returns the source span of the system identifier, or `None` if the doctype has no system identifier.
+ pub fn system_id_span(&self) -> Option<Range<O>> {
+ let system_id = self.system_id.as_ref()?;
+ Some(self.system_id_offset..self.system_id_offset + system_id.len())
+ }
}
/// The token type used by default. You can define your own token type by implementing the
@@ -539,7 +568,7 @@ pub enum Token<O> {
/// A HTML comment.
Comment(Comment<O>),
/// A HTML doctype declaration.
- Doctype(Doctype),
+ Doctype(Doctype<O>),
/// A HTML parsing error.
///
/// Can be skipped over, the tokenizer is supposed to recover from the error and continues with
diff --git a/src/machine.rs b/src/machine.rs
index 0755e20..0d99ab8 100644
--- a/src/machine.rs
+++ b/src/machine.rs
@@ -910,11 +910,11 @@ where
State::BogusComment => match slf.read_char()? {
Some('>') => {
slf.state = State::Data;
- slf.emitter.emit_current_comment();
+ slf.emitter.emit_current_comment(slf.reader.position());
Ok(ControlToken::Continue)
}
None => {
- slf.emitter.emit_current_comment();
+ slf.emitter.emit_current_comment(slf.reader.position());
Ok(ControlToken::Eof)
}
Some('\0') => {
@@ -935,6 +935,7 @@ where
}
Some('d' | 'D') if slf.try_read_string("octype", false)? => {
slf.state = State::Doctype;
+ slf.doctype_offset = slf.reader.position() - b"<!doctype".len();
Ok(ControlToken::Continue)
}
Some('[') if slf.try_read_string("CDATA[", true)? => {
@@ -967,7 +968,7 @@ where
Some('>') => {
slf.emit_error(Error::AbruptClosingOfEmptyComment);
slf.state = State::Data;
- slf.emitter.emit_current_comment();
+ slf.emitter.emit_current_comment(slf.reader.position());
Ok(ControlToken::Continue)
}
c => {
@@ -984,12 +985,12 @@ where
Some('>') => {
slf.emit_error(Error::AbruptClosingOfEmptyComment);
slf.state = State::Data;
- slf.emitter.emit_current_comment();
+ slf.emitter.emit_current_comment(slf.reader.position());
Ok(ControlToken::Continue)
}
None => {
slf.emit_error(Error::EofInComment);
- slf.emitter.emit_current_comment();
+ slf.emitter.emit_current_comment(slf.reader.position());
Ok(ControlToken::Eof)
}
c @ Some(_) => {
@@ -1016,7 +1017,7 @@ where
}
None => {
slf.emit_error(Error::EofInComment);
- slf.emitter.emit_current_comment();
+ slf.emitter.emit_current_comment(slf.reader.position());
Ok(ControlToken::Eof)
}
Some(x) => {
@@ -1082,7 +1083,7 @@ where
}
None => {
slf.emit_error(Error::EofInComment);
- slf.emitter.emit_current_comment();
+ slf.emitter.emit_current_comment(slf.reader.position());
Ok(ControlToken::Eof)
}
c => {
@@ -1095,7 +1096,7 @@ where
State::CommentEnd => match slf.read_char()? {
Some('>') => {
slf.state = State::Data;
- slf.emitter.emit_current_comment();
+ slf.emitter.emit_current_comment(slf.reader.position());
Ok(ControlToken::Continue)
}
Some('!') => {
@@ -1108,7 +1109,7 @@ where
}
None => {
slf.emit_error(Error::EofInComment);
- slf.emitter.emit_current_comment();
+ slf.emitter.emit_current_comment(slf.reader.position());
Ok(ControlToken::Eof)
}
c @ Some(_) => {
@@ -1130,12 +1131,12 @@ where
Some('>') => {
slf.emit_error(Error::IncorrectlyClosedComment);
slf.state = State::Data;
- slf.emitter.emit_current_comment();
+ slf.emitter.emit_current_comment(slf.reader.position());
Ok(ControlToken::Continue)
}
None => {
slf.emit_error(Error::EofInComment);
- slf.emitter.emit_current_comment();
+ slf.emitter.emit_current_comment(slf.reader.position());
Ok(ControlToken::Eof)
}
c @ Some(_) => {
@@ -1159,9 +1160,9 @@ where
}
None => {
slf.emit_error(Error::EofInDoctype);
- slf.emitter.init_doctype(slf.reader.position());
+ slf.emitter.init_doctype(slf.doctype_offset);
slf.emitter.set_force_quirks();
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Eof)
}
c @ Some(_) => {
@@ -1175,28 +1176,28 @@ where
Some(whitespace_pat!()) => Ok(ControlToken::Continue),
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emitter.init_doctype(slf.reader.position());
+ slf.emitter.init_doctype(slf.doctype_offset);
slf.emitter.push_doctype_name("\u{fffd}");
slf.state = State::DoctypeName;
Ok(ControlToken::Continue)
}
Some('>') => {
slf.emit_error(Error::MissingDoctypeName);
- slf.emitter.init_doctype(slf.reader.position());
+ slf.emitter.init_doctype(slf.doctype_offset);
slf.emitter.set_force_quirks();
slf.state = State::Data;
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Continue)
}
None => {
slf.emit_error(Error::EofInDoctype);
- slf.emitter.init_doctype(slf.reader.position());
+ slf.emitter.init_doctype(slf.doctype_offset);
slf.emitter.set_force_quirks();
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Eof)
}
Some(x) => {
- slf.emitter.init_doctype(slf.reader.position());
+ slf.emitter.init_doctype(slf.doctype_offset);
slf.emitter
.push_doctype_name(ctostr!(x.to_ascii_lowercase()));
slf.state = State::DoctypeName;
@@ -1210,7 +1211,7 @@ where
}
Some('>') => {
slf.state = State::Data;
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Continue)
}
Some('\0') => {
@@ -1221,7 +1222,7 @@ where
None => {
slf.emit_error(Error::EofInDoctype);
slf.emitter.set_force_quirks();
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Eof)
}
Some(x) => {
@@ -1234,13 +1235,13 @@ where
Some(whitespace_pat!()) => Ok(ControlToken::Continue),
Some('>') => {
slf.state = State::Data;
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Continue)
}
None => {
slf.emit_error(Error::EofInDoctype);
slf.emitter.set_force_quirks();
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Eof)
}
Some('p' | 'P') if slf.try_read_string("ublic", false)? => {
@@ -1266,13 +1267,13 @@ where
}
Some('"') => {
slf.emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword);
- slf.emitter.init_doctype_public_id();
+ slf.emitter.init_doctype_public_id(slf.reader.position());
slf.state = State::DoctypePublicIdentifierDoubleQuoted;
Ok(ControlToken::Continue)
}
Some('\'') => {
slf.emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword);
- slf.emitter.init_doctype_public_id();
+ slf.emitter.init_doctype_public_id(slf.reader.position());
slf.state = State::DoctypePublicIdentifierSingleQuoted;
Ok(ControlToken::Continue)
}
@@ -1280,13 +1281,13 @@ where
slf.emit_error(Error::MissingDoctypePublicIdentifier);
slf.emitter.set_force_quirks();
slf.state = State::Data;
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Continue)
}
None => {
slf.emit_error(Error::EofInDoctype);
slf.emitter.set_force_quirks();
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Eof)
}
c @ Some(_) => {
@@ -1300,12 +1301,12 @@ where
State::BeforeDoctypePublicIdentifier => match slf.read_char()? {
Some(whitespace_pat!()) => Ok(ControlToken::Continue),
Some('"') => {
- slf.emitter.init_doctype_public_id();
+ slf.emitter.init_doctype_public_id(slf.reader.position());
slf.state = State::DoctypePublicIdentifierDoubleQuoted;
Ok(ControlToken::Continue)
}
Some('\'') => {
- slf.emitter.init_doctype_public_id();
+ slf.emitter.init_doctype_public_id(slf.reader.position());
slf.state = State::DoctypePublicIdentifierSingleQuoted;
Ok(ControlToken::Continue)
}
@@ -1313,13 +1314,13 @@ where
slf.emit_error(Error::MissingDoctypePublicIdentifier);
slf.emitter.set_force_quirks();
slf.state = State::Data;
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Continue)
}
None => {
slf.emit_error(Error::EofInDoctype);
slf.emitter.set_force_quirks();
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Eof)
}
c @ Some(_) => {
@@ -1344,13 +1345,13 @@ where
slf.emit_error(Error::AbruptDoctypePublicIdentifier);
slf.emitter.set_force_quirks();
slf.state = State::Data;
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Continue)
}
None => {
slf.emit_error(Error::EofInDoctype);
slf.emitter.set_force_quirks();
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Eof)
}
Some(x) => {
@@ -1372,13 +1373,13 @@ where
slf.emit_error(Error::AbruptDoctypePublicIdentifier);
slf.emitter.set_force_quirks();
slf.state = State::Data;
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Continue)
}
None => {
slf.emit_error(Error::EofInDoctype);
slf.emitter.set_force_quirks();
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Eof)
}
Some(x) => {
@@ -1393,25 +1394,25 @@ where
}
Some('>') => {
slf.state = State::Data;
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Continue)
}
Some('"') => {
slf.emit_error(Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers);
- slf.emitter.init_doctype_system_id();
+ slf.emitter.init_doctype_system_id(slf.reader.position());
slf.state = State::DoctypeSystemIdentifierDoubleQuoted;
Ok(ControlToken::Continue)
}
Some('\'') => {
slf.emit_error(Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers);
- slf.emitter.init_doctype_system_id();
+ slf.emitter.init_doctype_system_id(slf.reader.position());
slf.state = State::DoctypeSystemIdentifierSingleQuoted;
Ok(ControlToken::Continue)
}
None => {
slf.emit_error(Error::EofInDoctype);
slf.emitter.set_force_quirks();
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Eof)
}
c @ Some(_) => {
@@ -1426,23 +1427,23 @@ where
Some(whitespace_pat!()) => Ok(ControlToken::Continue),
Some('>') => {
slf.state = State::Data;
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Continue)
}
Some('"') => {
- slf.emitter.init_doctype_system_id();
+ slf.emitter.init_doctype_system_id(slf.reader.position());
slf.state = State::DoctypeSystemIdentifierDoubleQuoted;
Ok(ControlToken::Continue)
}
Some('\'') => {
- slf.emitter.init_doctype_system_id();
+ slf.emitter.init_doctype_system_id(slf.reader.position());
slf.state = State::DoctypeSystemIdentifierSingleQuoted;
Ok(ControlToken::Continue)
}
None => {
slf.emit_error(Error::EofInDoctype);
slf.emitter.set_force_quirks();
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Eof)
}
c @ Some(_) => {
@@ -1460,13 +1461,13 @@ where
}
Some('"') => {
slf.emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword);
- slf.emitter.init_doctype_system_id();
+ slf.emitter.init_doctype_system_id(slf.reader.position());
slf.state = State::DoctypeSystemIdentifierDoubleQuoted;
Ok(ControlToken::Continue)
}
Some('\'') => {
slf.emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword);
- slf.emitter.init_doctype_system_id();
+ slf.emitter.init_doctype_system_id(slf.reader.position());
slf.state = State::DoctypeSystemIdentifierSingleQuoted;
Ok(ControlToken::Continue)
}
@@ -1474,13 +1475,13 @@ where
slf.emit_error(Error::MissingDoctypeSystemIdentifier);
slf.emitter.set_force_quirks();
slf.state = State::Data;
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Continue)
}
None => {
slf.emit_error(Error::EofInDoctype);
slf.emitter.set_force_quirks();
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Eof)
}
c @ Some(_) => {
@@ -1494,12 +1495,12 @@ where
State::BeforeDoctypeSystemIdentifier => match slf.read_char()? {
Some(whitespace_pat!()) => Ok(ControlToken::Continue),
Some('"') => {
- slf.emitter.init_doctype_system_id();
+ slf.emitter.init_doctype_system_id(slf.reader.position());
slf.state = State::DoctypeSystemIdentifierDoubleQuoted;
Ok(ControlToken::Continue)
}
Some('\'') => {
- slf.emitter.init_doctype_system_id();
+ slf.emitter.init_doctype_system_id(slf.reader.position());
slf.state = State::DoctypeSystemIdentifierSingleQuoted;
Ok(ControlToken::Continue)
}
@@ -1507,13 +1508,13 @@ where
slf.emit_error(Error::MissingDoctypeSystemIdentifier);
slf.emitter.set_force_quirks();
slf.state = State::Data;
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Continue)
}
None => {
slf.emit_error(Error::EofInDoctype);
slf.emitter.set_force_quirks();
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Eof)
}
c @ Some(_) => {
@@ -1538,13 +1539,13 @@ where
slf.emit_error(Error::AbruptDoctypeSystemIdentifier);
slf.emitter.set_force_quirks();
slf.state = State::Data;
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Continue)
}
None => {
slf.emit_error(Error::EofInDoctype);
slf.emitter.set_force_quirks();
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Eof)
}
Some(x) => {
@@ -1566,13 +1567,13 @@ where
slf.emit_error(Error::AbruptDoctypeSystemIdentifier);
slf.emitter.set_force_quirks();
slf.state = State::Data;
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Continue)
}
None => {
slf.emit_error(Error::EofInDoctype);
slf.emitter.set_force_quirks();
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Eof)
}
Some(x) => {
@@ -1584,13 +1585,13 @@ where
Some(whitespace_pat!()) => Ok(ControlToken::Continue),
Some('>') => {
slf.state = State::Data;
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Continue)
}
None => {
slf.emit_error(Error::EofInDoctype);
slf.emitter.set_force_quirks();
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Eof)
}
c @ Some(_) => {
@@ -1603,7 +1604,7 @@ where
State::BogusDoctype => match slf.read_char()? {
Some('>') => {
slf.state = State::Data;
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Continue)
}
Some('\0') => {
@@ -1611,7 +1612,7 @@ where
Ok(ControlToken::Continue)
}
None => {
- slf.emitter.emit_current_doctype();
+ slf.emitter.emit_current_doctype(slf.reader.position());
Ok(ControlToken::Eof)
}
Some(_) => Ok(ControlToken::Continue),
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 1b80ec3..d272b14 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1,5 +1,3 @@
-use std::marker::PhantomData;
-
use crate::machine;
use crate::offset::{NoopOffset, Offset, Position};
use crate::reader::{IntoReader, Reader};
@@ -41,16 +39,16 @@ pub struct Tokenizer<R: Reader, O = NoopOffset, E: Emitter<O> = DefaultEmitter<O
pub(crate) emitter: E,
pub(crate) temporary_buffer: String,
pub(crate) reader: R,
- _offset: PhantomData<O>,
to_reconsume: Stack2<Option<char>>,
pub(crate) character_reference_code: u32,
pub(crate) return_state: Option<InternalState>,
current_tag_name: String,
last_start_tag_name: String,
is_start_tag: bool,
+ pub(crate) doctype_offset: O,
}
-impl<R: Reader, O, E: Emitter<O>> Tokenizer<R, O, E> {
+impl<R: Reader, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
/// Creates a new tokenizer from some input and an emitter.
///
/// TODO: add warning about you needing to do the state switching
@@ -58,7 +56,6 @@ impl<R: Reader, O, E: Emitter<O>> Tokenizer<R, O, E> {
Tokenizer {
reader: reader.into_reader(),
emitter,
- _offset: PhantomData,
state: InternalState::Data,
to_reconsume: Stack2::default(),
return_state: None,
@@ -68,6 +65,7 @@ impl<R: Reader, O, E: Emitter<O>> Tokenizer<R, O, E> {
current_tag_name: String::new(),
last_start_tag_name: String::new(),
is_start_tag: false,
+ doctype_offset: O::default(),
}
}
}
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
index 6bd9378..70bcf6e 100644
--- a/tests/test_spans.rs
+++ b/tests/test_spans.rs
@@ -167,6 +167,37 @@ fn comment_bogus_data_span() {
"###);
}
+#[test]
+fn doctype_span() {
+ let html = r#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd" >"#;
+ let Token::Doctype(doctype) = tokenizer(html).next().unwrap() else {
+ panic!("expected doctype");
+ };
+ let labels = vec![(doctype.span, "")];
+ assert_snapshot!(annotate(html, labels), @r###"
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd" >
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ "###);
+}
+
+#[test]
+fn doctype_id_spans() {
+ let html = r#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">"#;
+ let Token::Doctype(doctype) = tokenizer(html).next().unwrap() else {
+ panic!("expected doctype");
+ };
+ let labels = vec![
+ (doctype.public_id_span().unwrap(), "public id"),
+ (doctype.system_id_span().unwrap(), "system id"),
+ ];
+ assert_snapshot!(annotate(html, labels), @r###"
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+ ^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ system id
+ │
+ public id
+ "###);
+}
+
fn annotate_errors(html: &'static str) -> String {
let mut labels = Vec::new();
for token in tokenizer(html) {