-rw-r--r--  CHANGELOG.md                    |  2
-rw-r--r--  README.md                       |  2
-rw-r--r--  src/basic_emitter.rs            |  2
-rw-r--r--  src/emitter.rs                  |  2
-rw-r--r--  src/tokenizer/machine.rs        | 68
-rw-r--r--  src/tokenizer/machine/utils.rs  | 30
-rw-r--r--  src/trace.rs                    |  2
-rw-r--r--  src/tracing_emitter.rs          |  4
-rw-r--r--  tests/test_spans.rs             | 20

9 files changed, 101 insertions, 31 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5c6cc58..de57890 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,8 @@
#### Features
+* Added spans for character tokens.
+
* Added offsets for end-of-file tokens.
* Added a blanket implementation to implement `Reader` for boxed readers.
diff --git a/README.md b/README.md
index 6513b61..abccaeb 100644
--- a/README.md
+++ b/README.md
@@ -56,8 +56,6 @@ note:
* This crate does not yet implement [character encoding detection].
-* This crate does not yet implement spans for character tokens.
-
## Compliance & testing
The tokenizer passes the [html5lib tokenizer test suite].
diff --git a/src/basic_emitter.rs b/src/basic_emitter.rs
index 0d37810..440b817 100644
--- a/src/basic_emitter.rs
+++ b/src/basic_emitter.rs
@@ -56,7 +56,7 @@ impl<O: Offset> Emitter<O> for BasicEmitter<O> {
self.errors.push_back((error, span));
}
- fn emit_char(&mut self, c: char) {
+ fn emit_char(&mut self, c: char, span: Range<O>) {
self.emit_token(Token::Char(c));
}
diff --git a/src/emitter.rs b/src/emitter.rs
index 264d2f1..5d2dd4d 100644
--- a/src/emitter.rs
+++ b/src/emitter.rs
@@ -28,7 +28,7 @@ pub trait Emitter<O> {
fn report_error(&mut self, error: Error, span: Range<O>);
/// Emits the given character as a character token.
- fn emit_char(&mut self, c: char);
+ fn emit_char(&mut self, char: char, span: Range<O>);
/// The state machine has reached the end of the file.
fn emit_eof(&mut self, offset: O);
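
For implementors of `Emitter`, the new `span` parameter carries the byte range of the input that produced the character, so diagnostics can point back at the source text. A minimal standalone sketch of the pattern, assuming `usize` byte offsets; `CharCollector` is an illustrative struct, not an implementation of the crate's full trait:

```rust
use std::ops::Range;

/// Collects character tokens together with the source range they came from.
#[derive(Default)]
struct CharCollector {
    chars: Vec<(char, Range<usize>)>,
}

impl CharCollector {
    // Mirrors the shape of `Emitter::emit_char` after this change.
    fn emit_char(&mut self, c: char, span: Range<usize>) {
        self.chars.push((c, span));
    }
}

fn main() {
    let source = "a&amp;b";
    let mut collector = CharCollector::default();

    // What a tokenizer would feed us for this input: `&amp;` decodes to
    // a single '&' whose span covers the whole reference.
    collector.emit_char('a', 0..1);
    collector.emit_char('&', 1..6);
    collector.emit_char('b', 6..7);

    for (c, span) in &collector.chars {
        println!("{c:?} came from {:?} ({:?})", span, &source[span.clone()]);
    }
}
```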
diff --git a/src/tokenizer/machine.rs b/src/tokenizer/machine.rs
index ff72a7c..d5a1f87 100644
--- a/src/tokenizer/machine.rs
+++ b/src/tokenizer/machine.rs
@@ -26,6 +26,7 @@ pub(super) struct Machine<R, O, E> {
position_before_match: O,
/// * Set to the offset of `<` in [`State::Data`].
/// * Set to the offset of `-` in [`State::Comment`].
+ /// * Set to the offset of `[` in [`State::CdataSectionBracket`].
/// * Set to the offset of `&` in [`State::CharacterReference`].
some_offset: O,
/// This boolean flag exists so that the [`NaiveParser`](crate::NaiveParser) can work with any [`Emitter`]
@@ -126,7 +127,7 @@ where
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
Some(x) => {
@@ -142,7 +143,7 @@ where
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
Some(x) => {
@@ -158,7 +159,7 @@ where
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
Some(x) => {
@@ -170,7 +171,7 @@ where
State::PlainText => match slf.read_char()? {
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
Some(x) => {
@@ -475,7 +476,7 @@ where
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
None => {
@@ -500,7 +501,7 @@ where
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
slf.state = State::ScriptDataEscaped;
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
None => {
@@ -530,7 +531,7 @@ where
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
slf.state = State::ScriptDataEscaped;
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
None => {
@@ -638,7 +639,7 @@ where
}
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
None => {
@@ -664,7 +665,7 @@ where
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
slf.state = State::ScriptDataDoubleEscaped;
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
None => {
@@ -695,7 +696,7 @@ where
Some('\0') => {
slf.emit_error(Error::UnexpectedNullCharacter);
slf.state = State::ScriptDataDoubleEscaped;
- slf.emit_char('\u{fffd}');
+ slf.emit_char_for_source_char('\u{fffd}', '\0');
Ok(ControlToken::Continue)
}
None => {
@@ -1748,6 +1749,7 @@ where
State::CdataSectionBracket => match slf.read_char()? {
Some(']') => {
slf.state = State::CdataSectionEnd;
+ slf.some_offset = slf.position_before_match;
Ok(ControlToken::Continue)
}
c => {
@@ -1805,7 +1807,20 @@ where
try_read_character_reference(first_char, |x| slf.try_read_string(x, true))?
else {
slf.unread_char(Some(first_char));
- slf.flush_code_points_consumed_as_character_reference();
+
+ debug_assert_eq!(slf.temporary_buffer, "&");
+ slf.temporary_buffer.clear();
+
+ if slf.is_consumed_as_part_of_an_attribute() {
+ slf.emitter.push_attribute_value("&");
+ } else {
+ slf.emitter.emit_char(
+ '&',
+ slf.some_offset
+ ..slf.some_offset + slf.reader.len_of_char_in_current_encoding('&'),
+ );
+ }
+
slf.state = State::AmbiguousAmpersand;
return Ok(ControlToken::Continue);
};
@@ -1829,9 +1844,20 @@ where
slf.emit_error(Error::MissingSemicolonAfterCharacterReference);
}
- slf.temporary_buffer.clear();
- slf.temporary_buffer.push_str(char_ref.characters);
- slf.flush_code_points_consumed_as_character_reference();
+ if slf.is_consumed_as_part_of_an_attribute() {
+ slf.temporary_buffer.clear();
+ slf.temporary_buffer.push_str(char_ref.characters);
+ slf.emitter.push_attribute_value(&slf.temporary_buffer);
+ } else {
+ for c in char_ref.characters.chars() {
+ slf.emitter.emit_char(
+ c,
+ slf.some_offset
+ ..slf.reader.position()
+ - slf.reader.len_of_char_in_current_encoding(c),
+ );
+ }
+ }
slf.state = slf.return_state.take().unwrap();
Ok(ControlToken::Continue)
}
@@ -1998,10 +2024,16 @@ where
_ => (),
}
- slf.temporary_buffer.clear();
- slf.temporary_buffer
- .push(std::char::from_u32(slf.character_reference_code).unwrap());
- slf.flush_code_points_consumed_as_character_reference();
+ let char = std::char::from_u32(slf.character_reference_code).unwrap();
+
+ if slf.is_consumed_as_part_of_an_attribute() {
+ slf.temporary_buffer.clear();
+ slf.temporary_buffer.push(char);
+ slf.emitter.push_attribute_value(&slf.temporary_buffer);
+ } else {
+ slf.emitter
+ .emit_char(char, slf.some_offset..slf.reader.position());
+ }
slf.state = slf.return_state.take().unwrap();
Ok(ControlToken::Continue)
}
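
In both the named and numeric branches above, the span starts at the `&` remembered in `some_offset` and ends at the reader's position once the reference has been consumed. A standalone worked example for the numeric case, using plain byte offsets and `str::find` in place of the crate's reader bookkeeping:

```rust
fn main() {
    let html = "X &amp; &doesntexist; &#1123; </";

    // `some_offset` is set to the position of `&` when the character
    // reference begins; `find` stands in for that here.
    let start = html.find("&#").unwrap(); // 22
    // After the tokenizer consumes the terminating `;`, the reader
    // position is one past it, which becomes the end of the span.
    let end = start + html[start..].find(';').unwrap() + 1; // 29

    // `&#1123;` decodes to U+0463, emitted with the span of the whole
    // reference in the source.
    assert_eq!(&html[start..end], "&#1123;");
    assert_eq!(char::from_u32(1123), Some('\u{463}'));
    println!("char U+0463 gets span {start}..{end}");
}
```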
diff --git a/src/tokenizer/machine/utils.rs b/src/tokenizer/machine/utils.rs
index 9752746..4d59282 100644
--- a/src/tokenizer/machine/utils.rs
+++ b/src/tokenizer/machine/utils.rs
@@ -17,20 +17,40 @@ where
self.reader.position()
}
+ /// Emits the given character as a character token, with its span set according to the given source character.
+ ///
+ /// This method should only be used if `c != source_char`, otherwise [`Machine::emit_char`] should be used instead.
+ #[inline]
+ pub(super) fn emit_char_for_source_char(&mut self, c: char, source_char: char) {
+ let pos = self.reader.position();
+ self.emitter.emit_char(
+ c,
+ pos - self.reader.len_of_char_in_current_encoding(source_char)..pos,
+ );
+ }
+
/// Emits the given character as a character token.
+ ///
+ /// The character MUST have been present literally in the read input.
#[inline]
pub(super) fn emit_char(&mut self, c: char) {
- self.emitter.emit_char(c);
+ self.emit_char_for_source_char(c, c);
}
/// Emits every byte of the given byte slice as a character token.
///
+ /// Every byte MUST have been literally present as a character in the read input.
+ ///
/// (We're operating on bytes to enable compiler optimization,
/// since [`str::chars`] isn't `const`.)
#[inline]
pub(super) fn emit_chars(&mut self, s: &[u8]) {
+ let mut start = self.some_offset;
+
for c in s {
- self.emit_char(*c as char);
+ let end = start + self.reader.len_of_char_in_current_encoding(*c as char);
+ self.emitter.emit_char(*c as char, start..end);
+ start = end;
}
}
@@ -207,10 +227,8 @@ where
}
pub(super) fn flush_buffer_characters(&mut self) {
- for c in self.temporary_buffer.chars() {
- self.emitter.emit_char(c);
- }
- self.temporary_buffer.clear();
+ let temporary_buffer = std::mem::take(&mut self.temporary_buffer);
+ self.emit_chars(temporary_buffer.as_bytes());
}
}
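
The helpers above derive each span backwards from the reader's position using the encoded length of the character that was actually read. A standalone sketch of that arithmetic, assuming a UTF-8 reader where `len_of_char_in_current_encoding(c)` equals `c.len_utf8()` (the function name below is illustrative, not the crate's API):

```rust
use std::ops::Range;

/// Span of the source character that produced an emitted character,
/// given the reader position *after* it was read (UTF-8 assumed).
fn span_for_source_char(pos_after: usize, source_char: char) -> Range<usize> {
    pos_after - source_char.len_utf8()..pos_after
}

fn main() {
    // A NUL at byte 3 is emitted as U+FFFD, but the span still covers
    // the original `\0` in the input.
    assert_eq!(span_for_source_char(4, '\0'), 3..4);

    // A two-byte character whose read ends at byte 10 spans 8..10.
    assert_eq!(span_for_source_char(10, 'é'), 8..10);

    println!("span arithmetic holds");
}
```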
diff --git a/src/trace.rs b/src/trace.rs
index 620d4f3..fdf9212 100644
--- a/src/trace.rs
+++ b/src/trace.rs
@@ -14,7 +14,7 @@ use crate::token::AttributeTraceIdx;
#[allow(missing_docs)]
#[derive(Eq, PartialEq, Debug)]
pub enum Trace {
- Char,
+ Char(Range<usize>),
StartTag(StartTagTrace),
EndTag(EndTagTrace),
Comment(CommentTrace),
diff --git a/src/tracing_emitter.rs b/src/tracing_emitter.rs
index 819f909..21f40f7 100644
--- a/src/tracing_emitter.rs
+++ b/src/tracing_emitter.rs
@@ -62,8 +62,8 @@ impl Emitter<usize> for TracingEmitter {
self.errors.push_back((error, span));
}
- fn emit_char(&mut self, c: char) {
- self.emit_token(Token::Char(c), Trace::Char);
+ fn emit_char(&mut self, c: char, span: Range<usize>) {
+ self.emit_token(Token::Char(c), Trace::Char(span));
}
fn emit_eof(&mut self, offset: usize) {
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
index d19d6aa..b10808c 100644
--- a/tests/test_spans.rs
+++ b/tests/test_spans.rs
@@ -73,6 +73,26 @@ fn annotate(html: &str, labels: Vec<(Range<usize>, impl AsRef<str>)>) -> String
}
#[test]
+fn char_span() {
+ let html = "X &amp; &doesntexist; &#1123; </";
+ let labeler = |parser: Parser| {
+ let mut labels = Vec::new();
+ for token_trace in parser.flatten() {
+ if let (Token::Char(c), Trace::Char(span)) = token_trace {
+ if c != ' ' {
+ labels.push((span, ""));
+ }
+ }
+ }
+ labels
+ };
+ assert_snapshot!(test_and_annotate(html, labeler), @r###"
+ X &amp; &doesntexist; &#1123; </
+ ^ ^^^^^ ^^^^^^^^^^^^^ ^^^^^^^ ^^
+ "###);
+}
+
+#[test]
fn start_tag_span() {
let html = "<x> <xyz> <xyz > <xyz/>";
let labeler = |parser: Parser| {