diff options
-rw-r--r-- | src/emitter.rs | 26 | ||||
-rw-r--r-- | src/machine.rs | 266 | ||||
-rw-r--r-- | src/spans.rs | 34 | ||||
-rw-r--r-- | src/tokenizer.rs | 13 | ||||
-rw-r--r-- | tests/span-tests/demo.html | 10 | ||||
-rw-r--r-- | tests/span-tests/demo.out | 46 | ||||
-rw-r--r-- | tests/test_html5lib.rs | 4 | ||||
-rw-r--r-- | tests/test_spans.rs | 4 |
8 files changed, 205 insertions, 198 deletions
diff --git a/src/emitter.rs b/src/emitter.rs index 48ac391..b47dc20 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -45,7 +45,7 @@ pub trait Emitter<R> { fn emit_eof(&mut self); /// A (probably recoverable) parsing error has occured. - fn emit_error(&mut self, error: Error); + fn emit_error(&mut self, error: Error, reader: &R); /// After every state change, the tokenizer calls this method to retrieve a new token that can /// be returned via the tokenizer's iterator interface. @@ -89,7 +89,7 @@ pub trait Emitter<R> { /// /// If the current token is an end tag, the emitter should emit the /// [`crate::Error::EndTagWithTrailingSolidus`] error. - fn set_self_closing(&mut self); + fn set_self_closing(&mut self, reader: &R); /// Assuming the _current token_ is a doctype, set its "force quirks" flag to true. /// @@ -250,6 +250,13 @@ impl<R> DefaultEmitter<R, ()> { let s = mem::take(&mut self.current_characters); self.emit_token(Token::String(s)); } + + fn emit_error(&mut self, error: Error) { + // bypass character flushing in self.emit_token: we don't need the error location to be + // that exact + self.emitted_tokens + .push_front(Token::Error { error, span: () }); + } } impl<R> Emitter<R> for DefaultEmitter<R, ()> { @@ -265,10 +272,8 @@ impl<R> Emitter<R> for DefaultEmitter<R, ()> { self.flush_current_characters(); } - fn emit_error(&mut self, error: Error) { - // bypass character flushing in self.emit_token: we don't need the error location to be - // that exact - self.emitted_tokens.push_front(Token::Error(error)); + fn emit_error(&mut self, error: Error, _reader: &R) { + self.emit_error(error); } fn pop_token(&mut self) -> Option<Self::Token> { @@ -319,7 +324,7 @@ impl<R> Emitter<R> for DefaultEmitter<R, ()> { self.emit_token(doctype); } - fn set_self_closing(&mut self) { + fn set_self_closing(&mut self, _reader: &R) { let tag = self.current_token.as_mut().unwrap(); match tag { Token::StartTag(StartTag { @@ -540,5 +545,10 @@ pub enum Token<S> { /// /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with /// more tokens afterward. - Error(Error), + Error { + /// What kind of error occured. + error: Error, + /// The source code span of the error. + span: S, + }, } diff --git a/src/machine.rs b/src/machine.rs index 4300f45..8c062ec 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -39,7 +39,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.emitter.emit_string("\0"); Ok(ControlToken::Continue) } @@ -60,7 +60,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.emitter.emit_string("\u{fffd}"); Ok(ControlToken::Continue) } @@ -76,7 +76,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.emitter.emit_string("\u{fffd}"); Ok(ControlToken::Continue) } @@ -92,7 +92,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.emitter.emit_string("\u{fffd}"); Ok(ControlToken::Continue) } @@ -104,7 +104,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( }, State::PlainText => match slf.read_char()? { Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.emitter.emit_string("\u{fffd}"); Ok(ControlToken::Continue) } @@ -130,21 +130,19 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } c @ Some('?') => { - slf.emitter - .emit_error(Error::UnexpectedQuestionMarkInsteadOfTagName); + slf.emit_error(Error::UnexpectedQuestionMarkInsteadOfTagName); slf.emitter.init_comment(&slf.reader); slf.state = State::BogusComment; slf.unread_char(c); Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofBeforeTagName); + slf.emit_error(Error::EofBeforeTagName); slf.emitter.emit_string("<"); Ok(ControlToken::Eof) } c @ Some(_) => { - slf.emitter - .emit_error(Error::InvalidFirstCharacterOfTagName); + slf.emit_error(Error::InvalidFirstCharacterOfTagName); slf.state = State::Data; slf.emitter.emit_string("<"); slf.unread_char(c); @@ -159,18 +157,17 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('>') => { - slf.emitter.emit_error(Error::MissingEndTagName); + slf.emit_error(Error::MissingEndTagName); slf.state = State::Data; Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofBeforeTagName); + slf.emit_error(Error::EofBeforeTagName); slf.emitter.emit_string("</"); Ok(ControlToken::Eof) } Some(x) => { - slf.emitter - .emit_error(Error::InvalidFirstCharacterOfTagName); + slf.emit_error(Error::InvalidFirstCharacterOfTagName); slf.emitter.init_comment(&slf.reader); slf.state = State::BogusComment; slf.unread_char(Some(x)); @@ -192,7 +189,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.emitter.push_tag_name("\u{fffd}"); Ok(ControlToken::Continue) } @@ -201,7 +198,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInTag); + slf.emit_error(Error::EofInTag); Ok(ControlToken::Eof) } }, @@ -409,13 +406,12 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.emitter.emit_string("\u{fffd}"); Ok(ControlToken::Continue) } None => { - slf.emitter - .emit_error(Error::EofInScriptHtmlCommentLikeText); + slf.emit_error(Error::EofInScriptHtmlCommentLikeText); Ok(ControlToken::Eof) } Some(x) => { @@ -434,14 +430,13 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.state = State::ScriptDataEscaped; slf.emitter.emit_string("\u{fffd}"); Ok(ControlToken::Continue) } None => { - slf.emitter - .emit_error(Error::EofInScriptHtmlCommentLikeText); + slf.emit_error(Error::EofInScriptHtmlCommentLikeText); Ok(ControlToken::Eof) } Some(x) => { @@ -465,14 +460,13 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.state = State::ScriptDataEscaped; slf.emitter.emit_string("\u{fffd}"); Ok(ControlToken::Continue) } None => { - slf.emitter - .emit_error(Error::EofInScriptHtmlCommentLikeText); + slf.emit_error(Error::EofInScriptHtmlCommentLikeText); Ok(ControlToken::Eof) } Some(x) => { @@ -575,13 +569,12 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.emitter.emit_string("\u{fffd}"); Ok(ControlToken::Continue) } None => { - slf.emitter - .emit_error(Error::EofInScriptHtmlCommentLikeText); + slf.emit_error(Error::EofInScriptHtmlCommentLikeText); Ok(ControlToken::Eof) } Some(x) => { @@ -601,14 +594,13 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.state = State::ScriptDataDoubleEscaped; slf.emitter.emit_string("\u{fffd}"); Ok(ControlToken::Continue) } None => { - slf.emitter - .emit_error(Error::EofInScriptHtmlCommentLikeText); + slf.emit_error(Error::EofInScriptHtmlCommentLikeText); Ok(ControlToken::Eof) } Some(x) => { @@ -633,14 +625,13 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.state = State::ScriptDataDoubleEscaped; slf.emitter.emit_string("\u{fffd}"); Ok(ControlToken::Continue) } None => { - slf.emitter - .emit_error(Error::EofInScriptHtmlCommentLikeText); + slf.emit_error(Error::EofInScriptHtmlCommentLikeText); Ok(ControlToken::Eof) } Some(x) => { @@ -692,8 +683,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('=') => { - slf.emitter - .emit_error(Error::UnexpectedEqualsSignBeforeAttributeName); + slf.emit_error(Error::UnexpectedEqualsSignBeforeAttributeName); slf.emitter.init_attribute_name(&slf.reader); slf.emitter.push_attribute_name("="); slf.state = State::AttributeName; @@ -717,13 +707,12 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.emitter.push_attribute_name("\u{fffd}"); Ok(ControlToken::Continue) } Some(x @ '"' | x @ '\'' | x @ '<') => { - slf.emitter - .emit_error(Error::UnexpectedCharacterInAttributeName); + slf.emit_error(Error::UnexpectedCharacterInAttributeName); slf.emitter .push_attribute_name(ctostr!(x.to_ascii_lowercase())); Ok(ControlToken::Continue) @@ -750,7 +739,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInTag); + slf.emit_error(Error::EofInTag); Ok(ControlToken::Eof) } Some(x) => { @@ -773,7 +762,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('>') => { - slf.emitter.emit_error(Error::MissingAttributeValue); + slf.emit_error(Error::MissingAttributeValue); slf.state = State::Data; slf.emitter.emit_current_tag(); Ok(ControlToken::Continue) @@ -796,12 +785,12 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.emitter.push_attribute_value("\u{fffd}"); Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInTag); + slf.emit_error(Error::EofInTag); Ok(ControlToken::Eof) } Some(x) => { @@ -820,12 +809,12 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.emitter.push_attribute_value("\u{fffd}"); Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInTag); + slf.emit_error(Error::EofInTag); Ok(ControlToken::Eof) } Some(x) => { @@ -849,18 +838,17 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.emitter.push_attribute_value("\u{fffd}"); Ok(ControlToken::Continue) } Some(x @ '"' | x @ '\'' | x @ '<' | x @ '=' | x @ '\u{60}') => { - slf.emitter - .emit_error(Error::UnexpectedCharacterInUnquotedAttributeValue); + slf.emit_error(Error::UnexpectedCharacterInUnquotedAttributeValue); slf.emitter.push_attribute_value(ctostr!(x)); Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInTag); + slf.emit_error(Error::EofInTag); Ok(ControlToken::Eof) } Some(x) => { @@ -883,12 +871,11 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInTag); + slf.emit_error(Error::EofInTag); Ok(ControlToken::Eof) } Some(x) => { - slf.emitter - .emit_error(Error::MissingWhitespaceBetweenAttributes); + slf.emit_error(Error::MissingWhitespaceBetweenAttributes); slf.state = State::BeforeAttributeName; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -896,17 +883,17 @@ pub fn consume<R: Reader, E: Emitter<R>>( }, State::SelfClosingStartTag => match slf.read_char()? { Some('>') => { - slf.emitter.set_self_closing(); + slf.emitter.set_self_closing(&slf.reader); slf.state = State::Data; slf.emitter.emit_current_tag(); Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInTag); + slf.emit_error(Error::EofInTag); Ok(ControlToken::Eof) } Some(x) => { - slf.emitter.emit_error(Error::UnexpectedSolidusInTag); + slf.emit_error(Error::UnexpectedSolidusInTag); slf.state = State::BeforeAttributeName; slf.unread_char(Some(x)); Ok(ControlToken::Continue) @@ -923,7 +910,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Eof) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.emitter.push_comment("\u{fffd}"); Ok(ControlToken::Continue) } @@ -949,7 +936,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( // missing: cdata transition // // let's hope that bogus comment can just sort of skip over cdata - slf.emitter.emit_error(Error::CdataInHtmlContent); + slf.emit_error(Error::CdataInHtmlContent); slf.emitter.init_comment(&slf.reader); slf.emitter.push_comment("[CDATA["); @@ -957,7 +944,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } c => { - slf.emitter.emit_error(Error::IncorrectlyOpenedComment); + slf.emit_error(Error::IncorrectlyOpenedComment); slf.emitter.init_comment(&slf.reader); slf.state = State::BogusComment; slf.unread_char(c); @@ -970,7 +957,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('>') => { - slf.emitter.emit_error(Error::AbruptClosingOfEmptyComment); + slf.emit_error(Error::AbruptClosingOfEmptyComment); slf.state = State::Data; slf.emitter.emit_current_comment(); Ok(ControlToken::Continue) @@ -987,13 +974,13 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('>') => { - slf.emitter.emit_error(Error::AbruptClosingOfEmptyComment); + slf.emit_error(Error::AbruptClosingOfEmptyComment); slf.state = State::Data; slf.emitter.emit_current_comment(); Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInComment); + slf.emit_error(Error::EofInComment); slf.emitter.emit_current_comment(); Ok(ControlToken::Eof) } @@ -1015,12 +1002,12 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.emitter.push_comment("\u{fffd}"); Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInComment); + slf.emit_error(Error::EofInComment); slf.emitter.emit_current_comment(); Ok(ControlToken::Eof) } @@ -1074,7 +1061,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } c => { - slf.emitter.emit_error(Error::NestedComment); + slf.emit_error(Error::NestedComment); slf.unread_char(c); slf.state = State::CommentEnd; Ok(ControlToken::Continue) @@ -1086,7 +1073,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInComment); + slf.emit_error(Error::EofInComment); slf.emitter.emit_current_comment(); Ok(ControlToken::Eof) } @@ -1112,7 +1099,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInComment); + slf.emit_error(Error::EofInComment); slf.emitter.emit_current_comment(); Ok(ControlToken::Eof) } @@ -1133,13 +1120,13 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('>') => { - slf.emitter.emit_error(Error::IncorrectlyClosedComment); + slf.emit_error(Error::IncorrectlyClosedComment); slf.state = State::Data; slf.emitter.emit_current_comment(); Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInComment); + slf.emit_error(Error::EofInComment); slf.emitter.emit_current_comment(); Ok(ControlToken::Eof) } @@ -1163,15 +1150,14 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInDoctype); + slf.emit_error(Error::EofInDoctype); slf.emitter.init_doctype(&slf.reader); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); Ok(ControlToken::Eof) } c @ Some(_) => { - slf.emitter - .emit_error(Error::MissingWhitespaceBeforeDoctypeName); + slf.emit_error(Error::MissingWhitespaceBeforeDoctypeName); slf.unread_char(c); slf.state = State::BeforeDoctypeName; Ok(ControlToken::Continue) @@ -1180,14 +1166,14 @@ pub fn consume<R: Reader, E: Emitter<R>>( State::BeforeDoctypeName => match slf.read_char()? { Some(whitespace_pat!()) => Ok(ControlToken::Continue), Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.emitter.init_doctype(&slf.reader); slf.emitter.push_doctype_name("\u{fffd}"); slf.state = State::DoctypeName; Ok(ControlToken::Continue) } Some('>') => { - slf.emitter.emit_error(Error::MissingDoctypeName); + slf.emit_error(Error::MissingDoctypeName); slf.emitter.init_doctype(&slf.reader); slf.emitter.set_force_quirks(); slf.state = State::Data; @@ -1195,7 +1181,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInDoctype); + slf.emit_error(Error::EofInDoctype); slf.emitter.init_doctype(&slf.reader); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); @@ -1220,12 +1206,12 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.emitter.push_doctype_name("\u{fffd}"); Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInDoctype); + slf.emit_error(Error::EofInDoctype); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); Ok(ControlToken::Eof) @@ -1244,7 +1230,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInDoctype); + slf.emit_error(Error::EofInDoctype); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); Ok(ControlToken::Eof) @@ -1258,8 +1244,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } c @ Some(_) => { - slf.emitter - .emit_error(Error::InvalidCharacterSequenceAfterDoctypeName); + slf.emit_error(Error::InvalidCharacterSequenceAfterDoctypeName); slf.emitter.set_force_quirks(); slf.unread_char(c); slf.state = State::BogusDoctype; @@ -1272,36 +1257,32 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('"') => { - slf.emitter - .emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword); + slf.emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword); slf.emitter.set_doctype_public_identifier(""); slf.state = State::DoctypePublicIdentifierDoubleQuoted; Ok(ControlToken::Continue) } Some('\'') => { - slf.emitter - .emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword); + slf.emit_error(Error::MissingWhitespaceAfterDoctypePublicKeyword); slf.emitter.set_doctype_public_identifier(""); slf.state = State::DoctypePublicIdentifierSingleQuoted; Ok(ControlToken::Continue) } Some('>') => { - slf.emitter - .emit_error(Error::MissingDoctypePublicIdentifier); + slf.emit_error(Error::MissingDoctypePublicIdentifier); slf.emitter.set_force_quirks(); slf.state = State::Data; slf.emitter.emit_current_doctype(); Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInDoctype); + slf.emit_error(Error::EofInDoctype); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); Ok(ControlToken::Eof) } c @ Some(_) => { - slf.emitter - .emit_error(Error::MissingQuoteBeforeDoctypePublicIdentifier); + slf.emit_error(Error::MissingQuoteBeforeDoctypePublicIdentifier); slf.emitter.set_force_quirks(); slf.unread_char(c); slf.state = State::BogusDoctype; @@ -1321,22 +1302,20 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('>') => { - slf.emitter - .emit_error(Error::MissingDoctypePublicIdentifier); + slf.emit_error(Error::MissingDoctypePublicIdentifier); slf.emitter.set_force_quirks(); slf.state = State::Data; slf.emitter.emit_current_doctype(); Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInDoctype); + slf.emit_error(Error::EofInDoctype); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); Ok(ControlToken::Eof) } c @ Some(_) => { - slf.emitter - .emit_error(Error::MissingQuoteBeforeDoctypePublicIdentifier); + slf.emit_error(Error::MissingQuoteBeforeDoctypePublicIdentifier); slf.emitter.set_force_quirks(); slf.unread_char(c); slf.state = State::BogusDoctype; @@ -1349,19 +1328,19 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.emitter.push_doctype_public_identifier("\u{fffd}"); Ok(ControlToken::Continue) } Some('>') => { - slf.emitter.emit_error(Error::AbruptDoctypePublicIdentifier); + slf.emit_error(Error::AbruptDoctypePublicIdentifier); slf.emitter.set_force_quirks(); slf.state = State::Data; slf.emitter.emit_current_doctype(); Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInDoctype); + slf.emit_error(Error::EofInDoctype); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); Ok(ControlToken::Eof) @@ -1377,19 +1356,19 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.emitter.push_doctype_public_identifier("\u{fffd}"); Ok(ControlToken::Continue) } Some('>') => { - slf.emitter.emit_error(Error::AbruptDoctypePublicIdentifier); + slf.emit_error(Error::AbruptDoctypePublicIdentifier); slf.emitter.set_force_quirks(); slf.state = State::Data; slf.emitter.emit_current_doctype(); Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInDoctype); + slf.emit_error(Error::EofInDoctype); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); Ok(ControlToken::Eof) @@ -1410,28 +1389,25 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('"') => { - slf.emitter - .emit_error(Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers); + slf.emit_error(Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers); slf.emitter.set_doctype_system_identifier(""); slf.state = State::DoctypeSystemIdentifierDoubleQuoted; Ok(ControlToken::Continue) } Some('\'') => { - slf.emitter - .emit_error(Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers); + slf.emit_error(Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers); slf.emitter.set_doctype_system_identifier(""); slf.state = State::DoctypeSystemIdentifierSingleQuoted; Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInDoctype); + slf.emit_error(Error::EofInDoctype); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); Ok(ControlToken::Eof) } c @ Some(_) => { - slf.emitter - .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); + slf.emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); slf.emitter.set_force_quirks(); slf.unread_char(c); slf.state = State::BogusDoctype; @@ -1456,14 +1432,13 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInDoctype); + slf.emit_error(Error::EofInDoctype); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); Ok(ControlToken::Eof) } c @ Some(_) => { - slf.emitter - .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); + slf.emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); slf.emitter.set_force_quirks(); slf.state = State::BogusDoctype; slf.unread_char(c); @@ -1476,36 +1451,32 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('"') => { - slf.emitter - .emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword); + slf.emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword); slf.emitter.set_doctype_system_identifier(""); slf.state = State::DoctypeSystemIdentifierDoubleQuoted; Ok(ControlToken::Continue) } Some('\'') => { - slf.emitter - .emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword); + slf.emit_error(Error::MissingWhitespaceAfterDoctypeSystemKeyword); slf.emitter.set_doctype_system_identifier(""); slf.state = State::DoctypeSystemIdentifierSingleQuoted; Ok(ControlToken::Continue) } Some('>') => { - slf.emitter - .emit_error(Error::MissingDoctypeSystemIdentifier); + slf.emit_error(Error::MissingDoctypeSystemIdentifier); slf.emitter.set_force_quirks(); slf.state = State::Data; slf.emitter.emit_current_doctype(); Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInDoctype); + slf.emit_error(Error::EofInDoctype); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); Ok(ControlToken::Eof) } c @ Some(_) => { - slf.emitter - .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); + slf.emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); slf.emitter.set_force_quirks(); slf.state = State::BogusDoctype; slf.unread_char(c); @@ -1525,22 +1496,20 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('>') => { - slf.emitter - .emit_error(Error::MissingDoctypeSystemIdentifier); + slf.emit_error(Error::MissingDoctypeSystemIdentifier); slf.emitter.set_force_quirks(); slf.state = State::Data; slf.emitter.emit_current_doctype(); Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInDoctype); + slf.emit_error(Error::EofInDoctype); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); Ok(ControlToken::Eof) } c @ Some(_) => { - slf.emitter - .emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); + slf.emit_error(Error::MissingQuoteBeforeDoctypeSystemIdentifier); slf.emitter.set_force_quirks(); slf.state = State::BogusDoctype; slf.unread_char(c); @@ -1553,19 +1522,19 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.emitter.push_doctype_system_identifier("\u{fffd}"); Ok(ControlToken::Continue) } Some('>') => { - slf.emitter.emit_error(Error::AbruptDoctypeSystemIdentifier); + slf.emit_error(Error::AbruptDoctypeSystemIdentifier); slf.emitter.set_force_quirks(); slf.state = State::Data; slf.emitter.emit_current_doctype(); Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInDoctype); + slf.emit_error(Error::EofInDoctype); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); Ok(ControlToken::Eof) @@ -1581,19 +1550,19 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); slf.emitter.push_doctype_system_identifier("\u{fffd}"); Ok(ControlToken::Continue) } Some('>') => { - slf.emitter.emit_error(Error::AbruptDoctypeSystemIdentifier); + slf.emit_error(Error::AbruptDoctypeSystemIdentifier); slf.emitter.set_force_quirks(); slf.state = State::Data; slf.emitter.emit_current_doctype(); Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInDoctype); + slf.emit_error(Error::EofInDoctype); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); Ok(ControlToken::Eof) @@ -1611,14 +1580,13 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInDoctype); + slf.emit_error(Error::EofInDoctype); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); Ok(ControlToken::Eof) } c @ Some(_) => { - slf.emitter - .emit_error(Error::UnexpectedCharacterAfterDoctypeSystemIdentifier); + slf.emit_error(Error::UnexpectedCharacterAfterDoctypeSystemIdentifier); slf.unread_char(c); slf.state = State::BogusDoctype; Ok(ControlToken::Continue) @@ -1631,7 +1599,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } Some('\0') => { - slf.emitter.emit_error(Error::UnexpectedNullCharacter); + slf.emit_error(Error::UnexpectedNullCharacter); Ok(ControlToken::Continue) } None => { @@ -1646,7 +1614,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } None => { - slf.emitter.emit_error(Error::EofInCdata); + slf.emit_error(Error::EofInCdata); Ok(ControlToken::Eof) } Some(x) => { @@ -1728,8 +1696,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } else { if char_ref_name_last_character != Some(';') { - slf.emitter - .emit_error(Error::MissingSemicolonAfterCharacterReference); + slf.emit_error(Error::MissingSemicolonAfterCharacterReference); } slf.temporary_buffer.clear(); @@ -1756,8 +1723,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } c @ Some(';') => { - slf.emitter - .emit_error(Error::UnknownNamedCharacterReference); + slf.emit_error(Error::UnknownNamedCharacterReference); slf.unread_char(c); slf.state = slf.return_state.take().unwrap(); Ok(ControlToken::Continue) @@ -1790,8 +1756,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } c => { - slf.emitter - .emit_error(Error::AbsenceOfDigitsInNumericCharacterReference); + slf.emit_error(Error::AbsenceOfDigitsInNumericCharacterReference); slf.flush_code_points_consumed_as_character_reference(); slf.unread_char(c); slf.state = slf.return_state.take().unwrap(); @@ -1805,8 +1770,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } c => { - slf.emitter - .emit_error(Error::AbsenceOfDigitsInNumericCharacterReference); + slf.emit_error(Error::AbsenceOfDigitsInNumericCharacterReference); slf.flush_code_points_consumed_as_character_reference(); slf.unread_char(c); slf.state = slf.return_state.take().unwrap(); @@ -1831,8 +1795,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } c => { - slf.emitter - .emit_error(Error::MissingSemicolonAfterCharacterReference); + slf.emit_error(Error::MissingSemicolonAfterCharacterReference); slf.unread_char(c); slf.state = State::NumericCharacterReferenceEnd; Ok(ControlToken::Continue) @@ -1848,8 +1811,7 @@ pub fn consume<R: Reader, E: Emitter<R>>( Ok(ControlToken::Continue) } c => { - slf.emitter - .emit_error(Error::MissingSemicolonAfterCharacterReference); + slf.emit_error(Error::MissingSemicolonAfterCharacterReference); slf.unread_char(c); slf.state = State::NumericCharacterReferenceEnd; Ok(ControlToken::Continue) @@ -1858,28 +1820,26 @@ pub fn consume<R: Reader, E: Emitter<R>>( State::NumericCharacterReferenceEnd => { match slf.character_reference_code { 0x00 => { - slf.emitter.emit_error(Error::NullCharacterReference); + slf.emit_error(Error::NullCharacterReference); slf.character_reference_code = 0xfffd; } 0x110000.. => { - slf.emitter - .emit_error(Error::CharacterReferenceOutsideUnicodeRange); + slf.emit_error(Error::CharacterReferenceOutsideUnicodeRange); slf.character_reference_code = 0xfffd; } surrogate_pat!() => { - slf.emitter.emit_error(Error::SurrogateCharacterReference); + slf.emit_error(Error::SurrogateCharacterReference); slf.character_reference_code = 0xfffd; } // noncharacter noncharacter_pat!() => { - slf.emitter - .emit_error(Error::NoncharacterCharacterReference); + slf.emit_error(Error::NoncharacterCharacterReference); } // 0x000d, or a control that is not whitespace x @ 0x000d | x @ control_pat!() if !matches!(x, 0x0009 | 0x000a | 0x000c | 0x0020) => { - slf.emitter.emit_error(Error::ControlCharacterReference); + slf.emit_error(Error::ControlCharacterReference); slf.character_reference_code = match x { 0x80 => 0x20AC, // EURO SIGN (€) 0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK (‚) diff --git a/src/spans.rs b/src/spans.rs index 6d7c18e..85a64a9 100644 --- a/src/spans.rs +++ b/src/spans.rs @@ -62,6 +62,7 @@ pub struct SpanEmitter<R> { seen_attributes: BTreeSet<String>, emitted_tokens: VecDeque<Token<Span>>, reader: PhantomData<R>, + attr_in_end_tag_span: Span, } impl<R> Default for SpanEmitter<R> { @@ -74,6 +75,7 @@ impl<R> Default for SpanEmitter<R> { seen_attributes: BTreeSet::new(), emitted_tokens: VecDeque::new(), reader: PhantomData::default(), + attr_in_end_tag_span: Span::default(), } } } @@ -91,18 +93,19 @@ impl<R: GetPos> SpanEmitter<R> { let mut error = None; tag.attributes .entry(k) - .and_modify(|_| { - error = Some(Error::DuplicateAttribute); + .and_modify(|a| { + error = Some((Error::DuplicateAttribute, a.name_span.clone())); }) .or_insert(v); - if let Some(e) = error { - self.emit_error(e); + if let Some((e, span)) = error { + self.emit_error_span(e, span); } } Some(Token::EndTag(_)) => { + self.attr_in_end_tag_span = v.name_span.clone(); if !self.seen_attributes.insert(k) { - self.emit_error(Error::DuplicateAttribute); + self.emit_error_span(Error::DuplicateAttribute, v.name_span); } } _ => { @@ -120,6 +123,12 @@ impl<R: GetPos> SpanEmitter<R> { let s = mem::take(&mut self.current_characters); self.emit_token(Token::String(s)); } + + fn emit_error_span(&mut self, error: Error, span: Span) { + // bypass character flushing in self.emit_token: we don't need the error location to be + // that exact + self.emitted_tokens.push_front(Token::Error { error, span }); + } } impl<R: GetPos> Emitter<R> for SpanEmitter<R> { @@ -135,10 +144,8 @@ impl<R: GetPos> Emitter<R> for SpanEmitter<R> { self.flush_current_characters(); } - fn emit_error(&mut self, error: Error) { - // bypass character flushing in self.emit_token: we don't need the error location to be - // that exact - self.emitted_tokens.push_front(Token::Error(error)); + fn emit_error(&mut self, error: Error, reader: &R) { + self.emit_error_span(error, reader.get_pos() - 1..reader.get_pos() - 1) } fn pop_token(&mut self) -> Option<Self::Token> { @@ -172,7 +179,10 @@ impl<R: GetPos> Emitter<R> for SpanEmitter<R> { match token { Token::EndTag(_) => { if !self.seen_attributes.is_empty() { - self.emit_error(Error::EndTagWithAttributes); + self.emit_error_span( + Error::EndTagWithAttributes, + self.attr_in_end_tag_span.clone(), + ); } self.seen_attributes.clear(); } @@ -195,7 +205,7 @@ impl<R: GetPos> Emitter<R> for SpanEmitter<R> { self.emit_token(doctype); } - fn set_self_closing(&mut self) { + fn set_self_closing(&mut self, reader: &R) { let tag = self.current_token.as_mut().unwrap(); match tag { Token::StartTag(StartTag { @@ -205,7 +215,7 @@ impl<R: GetPos> Emitter<R> for SpanEmitter<R> { *self_closing = true; } Token::EndTag(_) => { - self.emit_error(Error::EndTagWithTrailingSolidus); + self.emit_error(Error::EndTagWithTrailingSolidus, reader); } _ => { debug_assert!(false); diff --git a/src/tokenizer.rs b/src/tokenizer.rs index efaa870..6e928e9 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -122,6 +122,12 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { self.state = state.into(); } + /// Just a helper method for the machine. + #[inline] + pub(crate) fn emit_error(&mut self, error: Error) { + self.emitter.emit_error(error, &self.reader); + } + /// Test-internal function to override internal state. /// /// Only available with the `integration-tests` feature which is not public API. @@ -139,17 +145,16 @@ impl<R: Reader, E: Emitter<R>> Tokenizer<R, E> { fn validate_char(&mut self, c: char) { match c as u32 { surrogate_pat!() => { - self.emitter.emit_error(Error::SurrogateInInputStream); + self.emit_error(Error::SurrogateInInputStream); } noncharacter_pat!() => { - self.emitter.emit_error(Error::NoncharacterInInputStream); + self.emit_error(Error::NoncharacterInInputStream); } // control without whitespace or nul x @ control_pat!() if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) => { - self.emitter - .emit_error(Error::ControlCharacterInInputStream); + self.emit_error(Error::ControlCharacterInInputStream); } _ => (), } diff --git a/tests/span-tests/demo.html b/tests/span-tests/demo.html index c635846..07c305b 100644 --- a/tests/span-tests/demo.html +++ b/tests/span-tests/demo.html @@ -3,3 +3,13 @@ this is a tag: <h1>test</h1> tags can have attributes: <div id = foobar> Attribute values can be quoted: <input name = 'age' type = "number"> + +But you cannot put attributes everywhere: </nope data=foobar> + +Please mind the gap: < test + +The pirate says &arrrrr; + +Does this open two pages? <a href=foo.html href=bar.html>click me</a> + +Do you start or do you end? </yes/> diff --git a/tests/span-tests/demo.out b/tests/span-tests/demo.out index ad9cfb8..37ab8be 100644 --- a/tests/span-tests/demo.out +++ b/tests/span-tests/demo.out @@ -1,19 +1,29 @@ note: - ┌─ test.html:1:1 - │ -1 │ this is a tag: <h1>test</h1> - │ ^^ ^^ end tag - │ │ - │ start tag -2 │ -3 │ tags can have attributes: <div id = foobar> - │ ^^ ^^^^^^ attr value - │ │ - │ attr name -4 │ -5 │ Attribute values can be quoted: <input name = 'age' type = "number"> - │ ^^^^ ^^^ ^^^^ ^^^^^^ attr value - │ │ │ │ - │ │ │ attr name - │ │ attr value - │ attr name + ┌─ test.html:1:17 + │ + 1 │ this is a tag: <h1>test</h1> + │ ^^ ^^ end tag + │ │ + │ start tag + · + 5 │ Attribute values can be quoted: <input name = 'age' type = "number"> + │ ^^^^ ^^^ ^^^^ ^^^^^^ attr value + │ │ │ │ + │ │ │ attr name + │ │ attr value + │ attr name + 6 │ + 7 │ But you cannot put attributes everywhere: </nope data=foobar> + │ ^^^^ end-tag-with-attributes + 8 │ + 9 │ Please mind the gap: < test + │ ^ invalid-first-character-of-tag-name +10 │ +11 │ The pirate says &arrrrr; + │ ^ unknown-named-character-reference +12 │ +13 │ Does this open two pages? <a href=foo.html href=bar.html>click me</a> + │ ^^^^ duplicate-attribute +14 │ +15 │ Do you start or do you end? </yes/> + │ ^ end-tag-with-trailing-solidus diff --git a/tests/test_html5lib.rs b/tests/test_html5lib.rs index 662f3c5..cd3785f 100644 --- a/tests/test_html5lib.rs +++ b/tests/test_html5lib.rs @@ -296,9 +296,9 @@ fn run_test_inner<R: Reader>( for token in tokenizer { let token = token.unwrap(); - if let Token::Error(e) = token { + if let Token::Error { error, .. } = token { actual_errors.push(ParseError { - code: ParseErrorInner(e), + code: ParseErrorInner(error), }); } else { actual_tokens.push(token); diff --git a/tests/test_spans.rs b/tests/test_spans.rs index 9cc745c..aeb4a94 100644 --- a/tests/test_spans.rs +++ b/tests/test_spans.rs @@ -31,7 +31,7 @@ fn test() { if let Token::StartTag(tag) = token { if tag.name == "h1" { labels.push(Label::primary(file_id, tag.name_span).with_message("start tag")); - } else { + } else if tag.name == "input" { for attr in tag.attributes.values() { labels.push( Label::primary(file_id, attr.name_span.clone()).with_message("attr name"), @@ -45,6 +45,8 @@ fn test() { if tag.name == "h1" { labels.push(Label::primary(file_id, tag.name_span).with_message("end tag")); } + } else if let Token::Error { error, span } = token { + labels.push(Label::primary(file_id, span).with_message(error.to_string())); } } |