Check for illegal characters in query and fragment elements

author: Richard Walters <rwalters@digitalstirling.com> 2018-07-01 15:20:30 -0700
committer: Richard Walters <rwalters@digitalstirling.com> 2018-07-01 15:20:30 -0700
commit: d3a446cd9c3846735f4ea9e5270352633c597071 (patch)
tree: 4dd760e75c295b340db489df35b5275e1fba9352
parent: 4eb4f0c150642cf2fa92f75000ab5108d1908e48 (diff)
2 files changed, 267 insertions, 76 deletions
diff --git a/src/Uri.cpp b/src/Uri.cpp
index 2b9b93a..44ce0b0 100644
--- a/src/Uri.cpp
+++ b/src/Uri.cpp
@@ -129,6 +129,161 @@ namespace {
         };
     }
 
+    /**
+     * This method checks and decodes the given path segment.
+     *
+     * Each character must be unreserved, a sub-delim, ':', '@', or part of
+     * a "%XX" triplet, where X is a hex digit in either upper or lower case.
+     *
+     * @param[in,out] segment
+     *     On input, this is the path segment to check and decode.
+     *     On output, this is the decoded path segment (partial on failure).
+     *
+     * @return
+     *     An indication of whether or not the path segment
+     *     passed all checks and was decoded successfully is returned.
+     */
+    bool DecodePathSegment(std::string& segment) {
+        // This yields the value of a hex digit, or -1 for anything else.
+        const auto hexDigitValue = [](char c) -> int {
+            if (IsCharacterInSet(c, {'0','9'})) {
+                return c - '0';
+            } else if (IsCharacterInSet(c, {'A','F'})) {
+                return (c - 'A') + 10;
+            } else if (IsCharacterInSet(c, {'a','f'})) {
+                return (c - 'a') + 10;
+            }
+            return -1;
+        };
+        const auto originalSegment = std::move(segment);
+        segment.clear();
+        size_t decoderState = 0;
+        int decodedCharacter = 0;
+        for (const auto c: originalSegment) {
+            if (decoderState == 0) { // default
+                if (c == '%') {
+                    // Start each triplet from zero so the value stays small.
+                    decodedCharacter = 0;
+                    decoderState = 1;
+                } else if (
+                    IsCharacterInSet(
+                        c,
+                        {
+                            // unreserved
+                            'a','z', 'A','Z', // ALPHA
+                            '0','9', // DIGIT
+                            '-','-', '.','.', '_','_', '~','~',
+
+                            // sub-delims
+                            '!','!', '$','$', '&','&', '\'','\'', '(','(', ')',')',
+                            '*','*', '+','+', ',',',', ';',';', '=','=',
+
+                            // (also allowed in segment or pchar)
+                            ':',':', '@','@'
+                        }
+                    )
+                ) {
+                    segment.push_back(c);
+                } else {
+                    return false;
+                }
+            } else { // % ... or %[0-9A-Fa-f] ...
+                const int digit = hexDigitValue(c);
+                if (digit < 0) {
+                    return false;
+                }
+                decodedCharacter = (decodedCharacter << 4) + digit;
+                if (decoderState == 1) {
+                    decoderState = 2;
+                } else {
+                    segment.push_back(static_cast<char>(decodedCharacter));
+                    decoderState = 0;
+                }
+            }
+        }
+        // A segment which ends in the middle of a triplet is malformed.
+        return (decoderState == 0);
+    }
+
+    /**
+     * This method checks and decodes the given query or fragment.
+     *
+     * Each character must be unreserved, a sub-delim, ':', '@', '/', '?', or
+     * part of a "%XX" triplet, where X is a hex digit in either letter case.
+     *
+     * @param[in,out] queryOrFragment
+     *     On input, this is the query or fragment to check and decode.
+     *     On output, this is the decoded element (partial on failure).
+     *
+     * @return
+     *     An indication of whether or not the query or fragment
+     *     passed all checks and was decoded successfully is returned.
+     */
+    bool DecodeQueryOrFragment(std::string& queryOrFragment) {
+        // This yields the value of a hex digit, or -1 for anything else.
+        const auto hexDigitValue = [](char c) -> int {
+            if (IsCharacterInSet(c, {'0','9'})) {
+                return c - '0';
+            } else if (IsCharacterInSet(c, {'A','F'})) {
+                return (c - 'A') + 10;
+            } else if (IsCharacterInSet(c, {'a','f'})) {
+                return (c - 'a') + 10;
+            }
+            return -1;
+        };
+        const auto originalQueryOrFragment = std::move(queryOrFragment);
+        queryOrFragment.clear();
+        size_t decoderState = 0;
+        int decodedCharacter = 0;
+        for (const auto c: originalQueryOrFragment) {
+            if (decoderState == 0) { // default
+                if (c == '%') {
+                    // Start each triplet from zero so the value stays small.
+                    decodedCharacter = 0;
+                    decoderState = 1;
+                } else if (
+                    IsCharacterInSet(
+                        c,
+                        {
+                            // unreserved
+                            'a','z', 'A','Z', // ALPHA
+                            '0','9', // DIGIT
+                            '-','-', '.','.', '_','_', '~','~',
+
+                            // sub-delims
+                            '!','!', '$','$', '&','&', '\'','\'', '(','(', ')',')',
+                            '*','*', '+','+', ',',',', ';',';', '=','=',
+
+                            // (also allowed in pchar)
+                            ':',':', '@','@',
+
+                            // (also allowed in query or fragment)
+                            '/','/', '?','?'
+                        }
+                    )
+                ) {
+                    queryOrFragment.push_back(c);
+                } else {
+                    return false;
+                }
+            } else { // % ... or %[0-9A-Fa-f] ...
+                const int digit = hexDigitValue(c);
+                if (digit < 0) {
+                    return false;
+                }
+                decodedCharacter = (decodedCharacter << 4) + digit;
+                if (decoderState == 1) {
+                    decoderState = 2;
+                } else {
+                    queryOrFragment.push_back(static_cast<char>(decodedCharacter));
+                    decoderState = 0;
+                }
+            }
+        }
+        // An element which ends in the middle of a triplet is malformed.
+        return (decoderState == 0);
+    }
+
 }
 
 namespace Uri {
@@ -185,82 +340,6 @@ namespace Uri {
         // Methods
 
         /**
-         * This method checks and decodes the given path segment.
-         *
-         * @param[in,out] segment
-         *     On input, this is the path segment to check and decode.
-         *     On output, this is the decoded path segment.
-         *
-         * @return
-         *     An indication of whether or not the path segment
-         *     passed all checks and was decoded successfully is returned.
-         */
-        bool DecodePathSegment(std::string& segment) {
-            const auto originalSegment = std::move(segment);
-            segment.clear();
-            size_t decoderState = 0;
-            int decodedCharacter = 0;
-            for (const auto c: originalSegment) {
-                switch(decoderState) {
-                    case 0: { // default
-                        if (c == '%') {
-                            decoderState = 1;
-                        } else {
-                            if (
-                                IsCharacterInSet(
-                                    c,
-                                    {
-                                        // unreserved
-                                        'a','z', 'A','Z', // ALPHA
-                                        '0','9', // DIGIT
-                                        '-','-', '.','.', '_','_', '~','~',
-
-                                        // sub-delims
-                                        '!','!', '$','$', '&','&', '\'','\'', '(','(', ')',')',
-                                        '*','*', '+','+', ',',',', ';',';', '=','=',
-
-                                        // (also allowed in segment or pchar)
-                                        ':',':', '@','@'
-                                    }
-                                )
-                            ) {
-                                segment.push_back(c);
-                            } else {
-                                return false;
-                            }
-                        }
-                    } break;
-
-                    case 1: { // % ...
-                        decoderState = 2;
-                        decodedCharacter <<= 4;
-                        if (IsCharacterInSet(c, {'0','9'})) {
-                            decodedCharacter += (int)(c - '0');
-                        } else if (IsCharacterInSet(c, {'A','F'})) {
-                            decodedCharacter += (int)(c - 'A') + 10;
-                        } else {
-                            return false;
-                        }
-                    } break;
-
-                    case 2: { // %[0-9A-F] ...
-                        decoderState = 0;
-                        decodedCharacter <<= 4;
-                        if (IsCharacterInSet(c, {'0','9'})) {
-                            decodedCharacter += (int)(c - '0');
-                        } else if (IsCharacterInSet(c, {'A','F'})) {
-                            decodedCharacter += (int)(c - 'A') + 10;
-                        } else {
-                            return false;
-                        }
-                        segment.push_back((char)decodedCharacter);
-                    } break;
-                }
-            }
-            return true;
-        }
-
-        /**
          * This method builds the internal path element sequence
          * by parsing it from the given path string.
          *
@@ -616,6 +695,9 @@ namespace Uri {
             impl_->fragment = queryAndOrFragment.substr(fragmentDelimiter + 1);
             rest = queryAndOrFragment.substr(0, fragmentDelimiter);
         }
+        if (!DecodeQueryOrFragment(impl_->fragment)) {
+            return false;
+        }
 
         // Finally, if anything is left, it's the query.
         if (rest.empty()) {
@@ -623,6 +705,9 @@ namespace Uri {
         } else {
             impl_->query = rest.substr(1);
         }
+        if (!DecodeQueryOrFragment(impl_->query)) {
+            return false;
+        }
         return true;
     }
 
diff --git a/test/src/UriTests.cpp b/test/src/UriTests.cpp
index d5ab920..6d59697 100644
--- a/test/src/UriTests.cpp
+++ b/test/src/UriTests.cpp
@@ -418,3 +418,109 @@ TEST(UriTests, ParseFromStringPathBarelyLegal) {
         ++index;
     }
 }
+
+TEST(UriTests, ParseFromStringQueryIllegalCharacters) {
+    // None of these may parse: each one has a '[' or a ']' in its query,
+    // whether given as an absolute URI or as a bare relative reference.
+    const std::vector< std::string > illegalUris{
+        "http://www.example.com/?foo[bar",
+        "http://www.example.com/?]bar",
+        "http://www.example.com/?foo]",
+        "http://www.example.com/?[",
+        "http://www.example.com/?abc/foo]",
+        "http://www.example.com/?abc/[",
+        "http://www.example.com/?foo]/abc",
+        "http://www.example.com/?[/abc",
+        "http://www.example.com/?foo]/",
+        "http://www.example.com/?[/",
+        "?foo[bar",
+        "?]bar",
+        "?foo]",
+        "?[",
+        "?abc/foo]",
+        "?abc/[",
+        "?foo]/abc",
+        "?[/abc",
+        "?foo]/",
+        "?[/",
+    };
+    for (size_t index = 0; index < illegalUris.size(); ++index) {
+        Uri::Uri uri;
+        ASSERT_FALSE(uri.ParseFromString(illegalUris[index])) << index;
+    }
+}
+
+TEST(UriTests, ParseFromStringQueryBarelyLegal) {
+    struct TestVector {
+        std::string uriString;
+        std::string query;
+    };
+    // Each URI must parse, and yield the (decoded) query paired with it.
+    const std::vector< TestVector > testVectors{
+        {"/?:/foo", ":/foo"},
+        {"?bob@/foo", "bob@/foo"},
+        {"?hello!", "hello!"},
+        {"urn:?hello,%20w%6Frld", "hello, world"},
+        {"//example.com/foo?(bar)/", "(bar)/"},
+        {"http://www.example.com/?foo?bar", "foo?bar" },
+    };
+    for (size_t index = 0; index < testVectors.size(); ++index) {
+        const auto& testVector = testVectors[index];
+        Uri::Uri uri;
+        ASSERT_TRUE(uri.ParseFromString(testVector.uriString)) << index;
+        ASSERT_EQ(testVector.query, uri.GetQuery()) << index;
+    }
+}
+
+TEST(UriTests, ParseFromStringFragmentIllegalCharacters) {
+    // None of these may parse: each one has a '[' or a ']' in its fragment,
+    // whether given as an absolute URI or as a bare relative reference.
+    const std::vector< std::string > illegalUris{
+        "http://www.example.com/#foo[bar",
+        "http://www.example.com/#]bar",
+        "http://www.example.com/#foo]",
+        "http://www.example.com/#[",
+        "http://www.example.com/#abc/foo]",
+        "http://www.example.com/#abc/[",
+        "http://www.example.com/#foo]/abc",
+        "http://www.example.com/#[/abc",
+        "http://www.example.com/#foo]/",
+        "http://www.example.com/#[/",
+        "#foo[bar",
+        "#]bar",
+        "#foo]",
+        "#[",
+        "#abc/foo]",
+        "#abc/[",
+        "#foo]/abc",
+        "#[/abc",
+        "#foo]/",
+        "#[/",
+    };
+    for (size_t index = 0; index < illegalUris.size(); ++index) {
+        Uri::Uri uri;
+        ASSERT_FALSE(uri.ParseFromString(illegalUris[index])) << index;
+    }
+}
+
+TEST(UriTests, ParseFromStringFragmentBarelyLegal) {
+    struct TestVector {
+        std::string uriString;
+        std::string fragment;
+    };
+    // Each URI must parse, and yield the (decoded) fragment paired with it.
+    const std::vector< TestVector > testVectors{
+        {"/#:/foo", ":/foo"},
+        {"#bob@/foo", "bob@/foo"},
+        {"#hello!", "hello!"},
+        {"urn:#hello,%20w%6Frld", "hello, world"},
+        {"//example.com/foo#(bar)/", "(bar)/"},
+        {"http://www.example.com/#foo?bar", "foo?bar" },
+    };
+    for (size_t index = 0; index < testVectors.size(); ++index) {
+        const auto& testVector = testVectors[index];
+        Uri::Uri uri;
+        ASSERT_TRUE(uri.ParseFromString(testVector.uriString)) << index;
+        ASSERT_EQ(testVector.fragment, uri.GetFragment()) << index;
+    }
+}
author	Richard Walters <rwalters@digitalstirling.com>	2018-07-01 15:20:30 -0700
committer	Richard Walters <rwalters@digitalstirling.com>	2018-07-01 15:20:30 -0700
commit	d3a446cd9c3846735f4ea9e5270352633c597071 (patch)
tree	4dd760e75c295b340db489df35b5275e1fba9352
parent	4eb4f0c150642cf2fa92f75000ab5108d1908e48 (diff)