author    | Markus Unterwaditzer <markus-honeypot@unterwaditzer.net> | 2021-11-28 00:05:21 +0100
committer | Markus Unterwaditzer <markus-honeypot@unterwaditzer.net> | 2021-11-28 00:05:21 +0100
commit    | e14abf483b238da4d5b69dbc425b2ab80d1c3e98
tree      | 09801b839a98441793dafa8bd326d5df3f38d201
parent    | 95afc5359e940398498310d46e81352f04b43a49
clarify what html5gum isn't, fix #5
-rw-r--r-- | README.md              | 16
-rw-r--r-- | src/tokenizer.rs       |  3
-rw-r--r-- | tests/test_html5lib.rs |  2
3 files changed, 11 insertions, 10 deletions
@@ -30,16 +30,16 @@ for token in Tokenizer::new(html).infallible() {
 assert_eq!(new_html, "<title>hello world</title>");
 ```
 
-It fully implements [13.2 of the WHATWG HTML
-spec](https://html.spec.whatwg.org/#parsing) and passes [html5lib's tokenizer
-test suite](https://github.com/html5lib/html5lib-tests/tree/master/tokenizer),
-except that:
+It fully implements [13.2.5 of the WHATWG HTML
+spec](https://html.spec.whatwg.org/#tokenization), i.e. is able to tokenize HTML documents and passes [html5lib's tokenizer
+test suite](https://github.com/html5lib/html5lib-tests/tree/master/tokenizer). Most importantly it does not:
 
-* this implementation requires all input to be Rust strings and therefore valid
-  UTF-8. There is no charset detection or handling of invalid surrogates, and
-  the relevant html5lib tests are skipped in CI.
+* [Implement charset detection.](https://html.spec.whatwg.org/#determining-the-character-encoding) This implementation requires all input to be
+  Rust strings and therefore valid UTF-8.
 
-* there's some remaining testcases to be decided on at [issue 5](https://github.com/untitaker/html5gum/issues/5).
+* [Correct mis-nested tags](https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser)
+
+* Generally qualify as a complete HTML *parser* as per the WHATWG spec (yet).
 
 A distinguishing feature of `html5gum` is that you can bring your own token
 datastructure and hook into token creation by implementing the `Emitter` trait.
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 7430cbc..ec3ae44 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -158,7 +158,8 @@ impl<R: Reader, E: Emitter> Tokenizer<R, E> {
         let mut chars = s.chars();
         while let Some(c) = self.to_reconsume.pop() {
             if let (Some(x), Some(x2)) = (c, chars.next()) {
-                if x == x2 || (!case_sensitive && x.to_ascii_lowercase() == x2.to_ascii_lowercase()) {
+                if x == x2 || (!case_sensitive && x.to_ascii_lowercase() == x2.to_ascii_lowercase())
+                {
                     s = &s[x.len_utf8()..];
                     continue;
                 }
diff --git a/tests/test_html5lib.rs b/tests/test_html5lib.rs
index a29cdc4..540388f 100644
--- a/tests/test_html5lib.rs
+++ b/tests/test_html5lib.rs
@@ -184,7 +184,7 @@ fn test_tokenizer_file(resource_name: &str) {
 
         if matches!(
             fname,
-            // We don't have the test harness for this test, TODO
+            // We don't implement "Coercing an HTML DOM into an infoset" section
            "xmlViolation.test" |
            // Our parser does not operate on bytes, the input isn't valid Rust &str
            "unicodeCharsProblematic.test" |
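
The `src/tokenizer.rs` hunk above only splits an over-long `if` condition across two lines; the logic is unchanged. As a standalone illustration of the comparison it performs (a minimal sketch with a hypothetical helper name, not html5gum's actual API):

```rust
/// Hypothetical helper mirroring the condition reformatted above: two
/// characters match either exactly, or, when `case_sensitive` is false,
/// by comparing their ASCII-lowercased forms.
fn chars_equal(x: char, x2: char, case_sensitive: bool) -> bool {
    x == x2 || (!case_sensitive && x.to_ascii_lowercase() == x2.to_ascii_lowercase())
}

fn main() {
    // Reading "doctype" case-insensitively matches "DOCTYPE" ...
    assert!(chars_equal('d', 'D', false));
    // ... but not when case-sensitive matching is requested.
    assert!(!chars_equal('d', 'D', true));
    // Non-ASCII characters are left untouched by to_ascii_lowercase
    // and therefore still compare by identity.
    assert!(chars_equal('ß', 'ß', false));
}
```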