-rw-r--r--   CHANGELOG.md                    |  2
-rw-r--r--   README.md                       |  2
-rw-r--r--   src/basic_emitter.rs            |  2
-rw-r--r--   src/emitter.rs                  |  2
-rw-r--r--   src/tokenizer/machine.rs        | 68
-rw-r--r--   src/tokenizer/machine/utils.rs  | 30
-rw-r--r--   src/trace.rs                    |  2
-rw-r--r--   src/tracing_emitter.rs          |  4
-rw-r--r--   tests/test_spans.rs             | 20
9 files changed, 101 insertions, 31 deletions
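
Before the diff itself, one detail is worth spelling out: character tokens that are not present literally in the input (the U+FFFD replacement emitted for a NUL byte, and the characters produced by decoding character references) are given the span of the source text that produced them. Below is a minimal sketch of that arithmetic, assuming a UTF-8 reader so that the reader's `len_of_char_in_current_encoding` reduces to `char::len_utf8`; the free function is a hypothetical stand-in for the new `Machine::emit_char_for_source_char` helper, not part of the crate.

use std::ops::Range;

/// Illustrative only: mirrors the span arithmetic added in
/// src/tokenizer/machine/utils.rs, assuming a UTF-8 reader so that
/// len_of_char_in_current_encoding is just char::len_utf8.
/// Hypothetical stand-in for Machine::emit_char_for_source_char.
fn span_for_source_char(position_after_char: usize, source_char: char) -> Range<usize> {
    position_after_char - source_char.len_utf8()..position_after_char
}

fn main() {
    // A NUL byte read at offset 5 is emitted as U+FFFD, but its span still
    // points at the one-byte '\0' in the source, not at the three-byte
    // replacement character.
    assert_eq!(span_for_source_char(6, '\0'), 5..6);
}
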
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5c6cc58..de57890 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,8 @@
 
 #### Features
 
+* Added spans for character tokens.
+
 * Added offsets for end-of-file tokens.
 
 * Added a blanket implementation to implement `Reader` for boxed readers.
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -56,8 +56,6 @@ note:
 
 * This crate does not yet implement [character encoding detection].
 
-* This crate does not yet implement spans for character tokens.
-
 ## Compliance & testing
 
 The tokenizer passes the [html5lib tokenizer test suite].
diff --git a/src/basic_emitter.rs b/src/basic_emitter.rs
index 0d37810..440b817 100644
--- a/src/basic_emitter.rs
+++ b/src/basic_emitter.rs
@@ -56,7 +56,7 @@ impl<O: Offset> Emitter<O> for BasicEmitter<O> {
         self.errors.push_back((error, span));
     }
 
-    fn emit_char(&mut self, c: char) {
+    fn emit_char(&mut self, c: char, span: Range<O>) {
         self.emit_token(Token::Char(c));
     }
 
diff --git a/src/emitter.rs b/src/emitter.rs
index 264d2f1..5d2dd4d 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -28,7 +28,7 @@ pub trait Emitter<O> {
     fn report_error(&mut self, error: Error, span: Range<O>);
 
     /// Emits the given character as a character token.
-    fn emit_char(&mut self, c: char);
+    fn emit_char(&mut self, char: char, span: Range<O>);
 
     /// The state machine has reached the end of the file.
     fn emit_eof(&mut self, offset: O);
diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs
index ff72a7c..d5a1f87 100644
--- a/src/tokenizer/machine.rs
+++ b/src/tokenizer/machine.rs
@@ -26,6 +26,7 @@ pub(super) struct Machine<R, O, E> {
     position_before_match: O,
     /// * Set to the offset of `<` in [`State::Data`].
     /// * Set to the offset of `-` in [`State::Comment`].
+    /// * Set to the offset of `[` in [`State::CdataSectionBracket`].
     /// * Set to the offset of `&` in [`State::CharacterReference`].
     some_offset: O,
     /// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`]
@@ -126,7 +127,7 @@ where
                 }
                 Some('\0') => {
                     slf.emit_error(Error::UnexpectedNullCharacter);
-                    slf.emit_char('\u{fffd}');
+                    slf.emit_char_for_source_char('\u{fffd}', '\0');
                     Ok(ControlToken::Continue)
                 }
                 Some(x) => {
@@ -142,7 +143,7 @@ where
                 }
                 Some('\0') => {
                     slf.emit_error(Error::UnexpectedNullCharacter);
-                    slf.emit_char('\u{fffd}');
+                    slf.emit_char_for_source_char('\u{fffd}', '\0');
                     Ok(ControlToken::Continue)
                 }
                 Some(x) => {
@@ -158,7 +159,7 @@ where
                 }
                 Some('\0') => {
                     slf.emit_error(Error::UnexpectedNullCharacter);
-                    slf.emit_char('\u{fffd}');
+                    slf.emit_char_for_source_char('\u{fffd}', '\0');
                     Ok(ControlToken::Continue)
                 }
                 Some(x) => {
@@ -170,7 +171,7 @@ where
             State::PlainText => match slf.read_char()? {
                 Some('\0') => {
                     slf.emit_error(Error::UnexpectedNullCharacter);
-                    slf.emit_char('\u{fffd}');
+                    slf.emit_char_for_source_char('\u{fffd}', '\0');
                     Ok(ControlToken::Continue)
                 }
                 Some(x) => {
@@ -475,7 +476,7 @@ where
                 }
                 Some('\0') => {
                     slf.emit_error(Error::UnexpectedNullCharacter);
-                    slf.emit_char('\u{fffd}');
+                    slf.emit_char_for_source_char('\u{fffd}', '\0');
                     Ok(ControlToken::Continue)
                 }
                 None => {
@@ -500,7 +501,7 @@ where
                 Some('\0') => {
                     slf.emit_error(Error::UnexpectedNullCharacter);
                     slf.state = State::ScriptDataEscaped;
-                    slf.emit_char('\u{fffd}');
+                    slf.emit_char_for_source_char('\u{fffd}', '\0');
                     Ok(ControlToken::Continue)
                 }
                 None => {
@@ -530,7 +531,7 @@ where
                 Some('\0') => {
                     slf.emit_error(Error::UnexpectedNullCharacter);
                     slf.state = State::ScriptDataEscaped;
-                    slf.emit_char('\u{fffd}');
+                    slf.emit_char_for_source_char('\u{fffd}', '\0');
                     Ok(ControlToken::Continue)
                 }
                 None => {
@@ -638,7 +639,7 @@ where
                 }
                 Some('\0') => {
                     slf.emit_error(Error::UnexpectedNullCharacter);
-                    slf.emit_char('\u{fffd}');
+                    slf.emit_char_for_source_char('\u{fffd}', '\0');
                     Ok(ControlToken::Continue)
                 }
                 None => {
@@ -664,7 +665,7 @@ where
                 Some('\0') => {
                     slf.emit_error(Error::UnexpectedNullCharacter);
                     slf.state = State::ScriptDataDoubleEscaped;
-                    slf.emit_char('\u{fffd}');
+                    slf.emit_char_for_source_char('\u{fffd}', '\0');
                     Ok(ControlToken::Continue)
                 }
                 None => {
@@ -695,7 +696,7 @@ where
                 Some('\0') => {
                     slf.emit_error(Error::UnexpectedNullCharacter);
                     slf.state = State::ScriptDataDoubleEscaped;
-                    slf.emit_char('\u{fffd}');
+                    slf.emit_char_for_source_char('\u{fffd}', '\0');
                     Ok(ControlToken::Continue)
                 }
                 None => {
@@ -1748,6 +1749,7 @@ where
             State::CdataSectionBracket => match slf.read_char()? {
                 Some(']') => {
                     slf.state = State::CdataSectionEnd;
+                    slf.some_offset = slf.position_before_match;
                     Ok(ControlToken::Continue)
                 }
                 c => {
@@ -1805,7 +1807,20 @@ where
                     try_read_character_reference(first_char, |x| slf.try_read_string(x, true))?
                 else {
                     slf.unread_char(Some(first_char));
-                    slf.flush_code_points_consumed_as_character_reference();
+
+                    debug_assert_eq!(slf.temporary_buffer, "&");
+                    slf.temporary_buffer.clear();
+
+                    if slf.is_consumed_as_part_of_an_attribute() {
+                        slf.emitter.push_attribute_value("&");
+                    } else {
+                        slf.emitter.emit_char(
+                            '&',
+                            slf.some_offset
+                                ..slf.some_offset + slf.reader.len_of_char_in_current_encoding('&'),
+                        );
+                    }
+
                     slf.state = State::AmbiguousAmpersand;
                     return Ok(ControlToken::Continue);
                 };
@@ -1829,9 +1844,20 @@ where
                     slf.emit_error(Error::MissingSemicolonAfterCharacterReference);
                 }
 
-                slf.temporary_buffer.clear();
-                slf.temporary_buffer.push_str(char_ref.characters);
-                slf.flush_code_points_consumed_as_character_reference();
+                if slf.is_consumed_as_part_of_an_attribute() {
+                    slf.temporary_buffer.clear();
+                    slf.temporary_buffer.push_str(char_ref.characters);
+                    slf.emitter.push_attribute_value(&slf.temporary_buffer);
+                } else {
+                    for c in char_ref.characters.chars() {
+                        slf.emitter.emit_char(
+                            c,
+                            slf.some_offset
+                                ..slf.reader.position()
+                                    - slf.reader.len_of_char_in_current_encoding(c),
+                        );
+                    }
+                }
                 slf.state = slf.return_state.take().unwrap();
                 Ok(ControlToken::Continue)
             }
@@ -1998,10 +2024,16 @@ where
                     _ => (),
                 }
 
-                slf.temporary_buffer.clear();
-                slf.temporary_buffer
-                    .push(std::char::from_u32(slf.character_reference_code).unwrap());
-                slf.flush_code_points_consumed_as_character_reference();
+                let char = std::char::from_u32(slf.character_reference_code).unwrap();
+
+                if slf.is_consumed_as_part_of_an_attribute() {
+                    slf.temporary_buffer.clear();
+                    slf.temporary_buffer.push(char);
+                    slf.emitter.push_attribute_value(&slf.temporary_buffer);
+                } else {
+                    slf.emitter
+                        .emit_char(char, slf.some_offset..slf.reader.position());
+                }
                 slf.state = slf.return_state.take().unwrap();
                 Ok(ControlToken::Continue)
             }
diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs
index 9752746..4d59282 100644
--- a/src/tokenizer/machine/utils.rs
+++ b/src/tokenizer/machine/utils.rs
@@ -17,20 +17,40 @@ where
         self.reader.position()
     }
 
+    /// Emits the given character as a character token, with its span set according to the given source character.
+    ///
+    /// This method should only be used if `c != source_char`, otherwise [`Machine::emit_char`] should be used instead.
+    #[inline]
+    pub(super) fn emit_char_for_source_char(&mut self, c: char, source_char: char) {
+        let pos = self.reader.position();
+        self.emitter.emit_char(
+            c,
+            pos - self.reader.len_of_char_in_current_encoding(source_char)..pos,
+        );
+    }
+
     /// Emits the given character as a character token.
+    ///
+    /// The character MUST have been present literally in the read input.
     #[inline]
     pub(super) fn emit_char(&mut self, c: char) {
-        self.emitter.emit_char(c);
+        self.emit_char_for_source_char(c, c);
     }
 
     /// Emits every byte of the given byte slice as a character token.
     ///
+    /// Every byte MUST have been literally present as a character in the read input.
+    ///
     /// (We're operating on bytes to enable compiler optimization,
     /// since [`str::chars`] isn't `const`.)
     #[inline]
     pub(super) fn emit_chars(&mut self, s: &[u8]) {
+        let mut start = self.some_offset;
+
         for c in s {
-            self.emit_char(*c as char);
+            let end = start + self.reader.len_of_char_in_current_encoding(*c as char);
+            self.emitter.emit_char(*c as char, start..end);
+            start = end;
         }
     }
 
@@ -207,10 +227,8 @@ where
     }
 
     pub(super) fn flush_buffer_characters(&mut self) {
-        for c in self.temporary_buffer.chars() {
-            self.emitter.emit_char(c);
-        }
-        self.temporary_buffer.clear();
+        let temporary_buffer = std::mem::take(&mut self.temporary_buffer);
+        self.emit_chars(temporary_buffer.as_bytes());
     }
 }
 
diff --git a/src/trace.rs b/src/trace.rs
index 620d4f3..fdf9212 100644
--- a/src/trace.rs
+++ b/src/trace.rs
@@ -14,7 +14,7 @@ use crate::token::AttributeTraceIdx;
 #[allow(missing_docs)]
 #[derive(Eq, PartialEq, Debug)]
 pub enum Trace {
-    Char,
+    Char(Range<usize>),
     StartTag(StartTagTrace),
     EndTag(EndTagTrace),
     Comment(CommentTrace),
diff --git a/src/tracing_emitter.rs b/src/tracing_emitter.rs
index 819f909..21f40f7 100644
--- a/src/tracing_emitter.rs
+++ b/src/tracing_emitter.rs
@@ -62,8 +62,8 @@ impl Emitter<usize> for TracingEmitter {
         self.errors.push_back((error, span));
     }
 
-    fn emit_char(&mut self, c: char) {
-        self.emit_token(Token::Char(c), Trace::Char);
+    fn emit_char(&mut self, c: char, span: Range<usize>) {
+        self.emit_token(Token::Char(c), Trace::Char(span));
     }
 
     fn emit_eof(&mut self, offset: usize) {
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
index d19d6aa..b10808c 100644
--- a/tests/test_spans.rs
+++ b/tests/test_spans.rs
@@ -73,6 +73,26 @@ fn annotate(html: &str, labels: Vec<(Range<usize>, impl AsRef<str>)>) -> String
 }
 
 #[test]
+fn char_span() {
+    let html = "X &amp; &doesntexist; &#x463; </";
+    let labeler = |parser: Parser| {
+        let mut labels = Vec::new();
+        for token_trace in parser.flatten() {
+            if let (Token::Char(c), Trace::Char(span)) = token_trace {
+                if c != ' ' {
+                    labels.push((span, ""));
+                }
+            }
+        }
+        labels
+    };
+    assert_snapshot!(test_and_annotate(html, labeler), @r###"
+    X &amp; &doesntexist; &#x463; </
+    ^ ^^^^^ ^^^^^^^^^^^^^ ^^^^^^^ ^^
+    "###);
+}
+
+#[test]
 fn start_tag_span() {
     let html = "<x> <xyz> <xyz > <xyz/>";
     let labeler = |parser: Parser| {
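
For emitter implementations outside this crate, the visible change is that `Emitter::emit_char` now receives the source span alongside the character, mirroring `report_error`. The sketch below is a self-contained illustration of that shape, not the crate's actual API: it stubs out a one-method trait (the real `Emitter<O>` has further required methods), and `CollectingEmitter` is a hypothetical consumer in the spirit of `TracingEmitter` storing `Trace::Char(span)`.

use std::ops::Range;

/// Simplified stand-in for the crate's Emitter<O> trait; the real trait has
/// more required methods (tags, comments, doctypes, errors, EOF, ...).
trait CharEmitter {
    /// Character tokens now carry the span of the source text they decode from.
    fn emit_char(&mut self, c: char, span: Range<usize>);
}

/// Hypothetical emitter that records each character together with its span.
#[derive(Default)]
struct CollectingEmitter {
    chars: Vec<(char, Range<usize>)>,
}

impl CharEmitter for CollectingEmitter {
    fn emit_char(&mut self, c: char, span: Range<usize>) {
        self.chars.push((c, span));
    }
}

fn main() {
    let html = "X &amp;";
    let mut emitter = CollectingEmitter::default();
    // In real use the tokenizer drives these calls; note that "&amp;" decodes
    // to a single '&' character token whose span covers the whole reference.
    emitter.emit_char('X', 0..1);
    emitter.emit_char('&', 2..7);
    for (c, span) in &emitter.chars {
        println!("{c:?} spans {span:?} = {:?}", &html[span.clone()]);
    }
}

With the bundled `TracingEmitter`, the same information comes back as `Trace::Char(span)`, which is exactly what the new `char_span` test asserts against.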