diff options
| -rw-r--r-- | tests/test_spans.rs | 85 | 
1 files changed, 85 insertions, 0 deletions
diff --git a/tests/test_spans.rs b/tests/test_spans.rs
index db17328..c58616d 100644
--- a/tests/test_spans.rs
+++ b/tests/test_spans.rs
@@ -54,6 +54,13 @@ fn annotate(html: &str, labels: Vec<(Range<usize>, impl AsRef<str>)>) -> String
         .join("")
 }
+fn assert_panics_but_should_not(f: impl FnOnce() + std::panic::UnwindSafe) {
+    assert!(
+        std::panic::catch_unwind(f).is_err(),
+        "congrats! you made some span test support UTF-16, please stop calling assert_panics_but_should_not for this test"
+    );
+}
+
 #[test]
 fn start_tag_span() {
     let html = "<x> <xyz> <xyz  > <xyz/>";
@@ -66,6 +73,7 @@ fn start_tag_span() {
         }
         labels
     };
+    assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
     assert_snapshot!(test_and_annotate(html, labeler), @r###"
     <x> <xyz> <xyz  > <xyz/>
     ^^^ ^^^^^ ^^^^^^^ ^^^^^^
@@ -84,6 +92,7 @@ fn end_tag_span() {
         }
         labels
     };
+    assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
     assert_snapshot!(test_and_annotate(html, labeler), @r###"
     </x> </xyz> </xyz  > </xyz/>
     ^^^^ ^^^^^^ ^^^^^^^^ ^^^^^^^
@@ -102,6 +111,7 @@ fn start_tag_name_span() {
         }
         labels
     };
+    assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
     assert_snapshot!(test_and_annotate(html, labeler), @r###"
     <x> <xyz> <xyz  > <xyz/>
      ^   ^^^   ^^^     ^^^
@@ -120,6 +130,7 @@ fn end_tag_name_span() {
         }
         labels
     };
+    assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
     assert_snapshot!(test_and_annotate(html, labeler), @r###"
     </x> </xyz> </xyz  > </xyz/>
       ^    ^^^    ^^^      ^^^
@@ -139,6 +150,7 @@ fn attribute_name_span() {
         }
         labels
     };
+    assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
     assert_snapshot!(test_and_annotate(html, labeler), @r###"
     <test x xyz y=VAL xy=VAL z = VAL yzx = VAL>
           ^ ^^^ ^     ^^     ^       ^^^
@@ -158,6 +170,7 @@ fn attribute_value_span() {
         }
         labels
     };
+    assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
     assert_snapshot!(test_and_annotate(html, labeler), @r###"
     <test x=unquoted y = unquoted z='single-quoted' zz="double-quoted" empty=''>
             ^^^^^^^^     ^^^^^^^^    ^^^^^^^^^^^^^      ^^^^^^^^^^^^^         ^
@@ -177,6 +190,7 @@ fn attribute_value_with_char_ref() {
         }
         labels
     };
+    assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler)); // FIXME
     assert_snapshot!(test_and_annotate(html, labeler), @r###"
     <test x=&amp; y='&amp;' z="&amp;">
             ^^^^^    ^^^^^     ^^^^^
@@ -205,6 +219,7 @@ fn comment_data_span() {
             };
             vec![(comment.data_span(), "")]
         };
+        assert_panics_but_should_not(|| assert_char_encoding_independence(case, labeler)); // FIXME
         annotated.push_str(&test_and_annotate(case, labeler));
     }
@@ -246,6 +261,7 @@ fn doctype_span() {
             };
             vec![(doctype.span, "")]
         };
+        assert_panics_but_should_not(|| assert_char_encoding_independence(case, labeler)); // FIXME
         annotated.push_str(&test_and_annotate(case, labeler));
     }
@@ -283,6 +299,7 @@ fn doctype_id_spans() {
             labels
         };
+        assert_panics_but_should_not(|| assert_char_encoding_independence(case, labeler)); // FIXME
         annotated.push_str(&test_and_annotate(case, labeler));
     }
@@ -309,6 +326,8 @@ fn annotate_errors(html: &'static str) -> String {
         }
     }
+    let doesnt_support_utf16 = std::sync::Mutex::new(false);
+
     let labeler = |tokens| {
         let mut labels = Vec::new();
         for token in tokens {
@@ -317,10 +336,36 @@ fn annotate_errors(html: &'static str) -> String {
             };
             labels.push((span, error.code()));
+
+            use html5tokenizer::Error;
+
+            *doesnt_support_utf16.lock().unwrap() = matches!(
+                error,
+                | Error::AbsenceOfDigitsInNumericCharacterReference // FIXME
+                | Error::CharacterReferenceOutsideUnicodeRange // FIXME
+                | Error::ControlCharacterReference // FIXME
+                | Error::DuplicateAttribute // FIXME
+                | Error::EndTagWithAttributes // FIXME
+                | Error::EndTagWithTrailingSolidus // FIXME
+                | Error::InvalidFirstCharacterOfTagName // FIXME
+                | Error::NoncharacterCharacterReference // FIXME
+                | Error::NullCharacterReference // FIXME
+                | Error::SurrogateCharacterReference // FIXME
+                | Error::UnknownNamedCharacterReference // FIXME
+            );
         }
         labels
     };
+    // This will be removed once all tested errors support UTF-16.
+    let _ = labeler(Box::new(tokenizer(html)) as TokenIter);
+    if *doesnt_support_utf16.lock().unwrap() {
+        assert_panics_but_should_not(|| assert_char_encoding_independence(html, labeler));
+    } else {
+        // TODO: Move this assertion into test_and_annotate once all tests support it.
+        assert_char_encoding_independence(html, labeler);
+    }
+
     test_and_annotate(html, labeler)
 }
@@ -492,3 +537,43 @@ fn error_invalid_first_character_of_tag_name() {
                           ^ invalid-first-character-of-tag-name
     "###);
 }
+
+fn assert_char_encoding_independence<S: AsRef<str> + Clone>(
+    html: &'static str,
+    labeler: impl Fn(TokenIter) -> Vec<(Range<usize>, S)>,
+) {
+    let utf8_tokens = NaiveParser::new(PosTrackingReader::new(html)).flatten();
+    let string_reader = html5tokenizer::reader::IntoReader::into_reader(html);
+    let utf16_tokens =
+        NaiveParser::new(PosTrackingReader::new(Utf16Reader(string_reader))).flatten();
+    let utf8_labels = labeler(Box::new(utf8_tokens));
+
+    for (idx, (span, _)) in labeler(Box::new(utf16_tokens)).into_iter().enumerate() {
+        let expected_utf16_span = html[..utf8_labels[idx].0.start].encode_utf16().count() * 2
+            ..html[..utf8_labels[idx].0.end].encode_utf16().count() * 2;
+        assert_eq!(
+            span,
+            expected_utf16_span,
+            "UTF-16 span didn't match the UTF-8 span, which looks like:\n{}",
+            annotate(html, vec![utf8_labels[idx].clone()])
+        );
+    }
+}
+
+struct Utf16Reader<'a>(html5tokenizer::reader::StringReader<'a>);
+
+impl html5tokenizer::reader::Reader for Utf16Reader<'_> {
+    type Error = std::convert::Infallible;
+
+    fn read_char(&mut self) -> Result<Option<char>, Self::Error> {
+        self.0.read_char()
+    }
+
+    fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> Result<bool, Self::Error> {
+        self.0.try_read_string(s, case_sensitive)
+    }
+
+    fn len_of_char_in_current_encoding(&self, c: char) -> usize {
+        c.len_utf16() * 2
+    }
+}
