Refactoring

Added CharacterSet as a class to represent character sets, allowing us to build singletons and composite character sets more concisely.
author: Richard Walters <rwalters@digitalstirling.com> 2018-07-01 16:58:37 -0700
committer: Richard Walters <rwalters@digitalstirling.com> 2018-07-01 16:58:37 -0700
commit: 16b5c56c4ecbbb5c6153f9e16228a8d8cf95c50d (patch)
tree: 9bf5689b45b08c4d6a9319651a6ca80e4f7830c6
parent: cdc3f449812d0d45a3ea271636d669eb05ba3751 (diff)
4 files changed, 287 insertions, 117 deletions
diff --git a/src/IsCharacterInSet.cpp b/src/IsCharacterInSet.cpp
index 82625e9..1b2882f 100644
--- a/src/IsCharacterInSet.cpp
+++ b/src/IsCharacterInSet.cpp
@@ -9,27 +9,79 @@
 
 #include "IsCharacterInSet.hpp"
 
+#include <set>
+
 namespace Uri {
 
-    bool IsCharacterInSet(
-        char c,
-        std::initializer_list< char > characterSet
-    ) {
+    /**
+     * This contains the private properties of the CharacterSet class.
+     */
+    struct CharacterSet::Impl {
+        /**
+         * This holds the characters in the set.
+         */
+        std::set< char > charactersInSet;
+    };
+
+    CharacterSet::~CharacterSet() = default;
+    CharacterSet::CharacterSet(const CharacterSet& other)
+        : impl_(new Impl(*other.impl_))
+    {
+    }
+    CharacterSet::CharacterSet(CharacterSet&& other) = default;
+    CharacterSet& CharacterSet::operator=(const CharacterSet& other) {
+        if (this != &other) { // self-assignment leaves the set untouched
+            impl_.reset(new Impl(*other.impl_)); // rebuild: impl_ is null once this object has been moved from
+        }
+        return *this;
+    }
+    CharacterSet& CharacterSet::operator=(CharacterSet&& other) = default;
+
+    CharacterSet::CharacterSet()
+        : impl_(new Impl)
+    {
+    }
+
+    CharacterSet::CharacterSet(char c)
+        : impl_(new Impl)
+    {
+        (void)impl_->charactersInSet.insert(c);
+    }
+
+    CharacterSet::CharacterSet(char first, char last)
+        : impl_(new Impl)
+    {
+        for (int c = first; c <= last; ++c) { // int counter: a char counter would wrap and never terminate when last is CHAR_MAX
+            (void)impl_->charactersInSet.insert(static_cast< char >(c));
+        }
+    }
+
+    CharacterSet::CharacterSet(
+        std::initializer_list< const CharacterSet > characterSets
+    )
+        : impl_(new Impl)
+    {
         for (
-            auto charInSet = characterSet.begin();
-            charInSet != characterSet.end();
-            ++charInSet
+            auto characterSet = characterSets.begin();
+            characterSet != characterSets.end();
+            ++characterSet
         ) {
-            const auto first = *charInSet++;
-            const auto last = *charInSet;
-            if (
-                (c >= first)
-                && (c <= last)
-            ) {
-                return true;
-            }
+            impl_->charactersInSet.insert(
+                characterSet->impl_->charactersInSet.begin(),
+                characterSet->impl_->charactersInSet.end()
+            );
         }
-        return false;
+    }
+
+    bool CharacterSet::Contains(char c) const {
+        return impl_->charactersInSet.find(c) != impl_->charactersInSet.end();
+    }
+
+    bool IsCharacterInSet(
+        char c,
+        const CharacterSet& characterSet
+    ) {
+        return characterSet.Contains(c);
     }
 
 }
diff --git a/src/IsCharacterInSet.hpp b/src/IsCharacterInSet.hpp
index f17460c..93d8fa9 100644
--- a/src/IsCharacterInSet.hpp
+++ b/src/IsCharacterInSet.hpp
@@ -10,10 +10,94 @@
  */
 
 #include <initializer_list>
+#include <memory>
 
 namespace Uri {
 
     /**
+     * This represents a set of characters which can be queried
+     * to find out if a character is in the set or not.
+     */
+    class CharacterSet {
+        // Lifecycle management
+    public:
+        ~CharacterSet();
+        CharacterSet(const CharacterSet&);
+        CharacterSet(CharacterSet&&);
+        CharacterSet& operator=(const CharacterSet&);
+        CharacterSet& operator=(CharacterSet&&);
+
+        // Methods
+    public:
+        /**
+         * This is the default constructor.  It constructs an empty character set.
+         */
+        CharacterSet();
+
+        /**
+         * This constructs a character set that contains
+         * just the given character.
+         *
+         * @param[in] c
+         *     This is the only character to put in the set.
+         */
+        CharacterSet(char c);
+
+        /**
+         * This constructs a character set that contains all the
+         * characters between the given "first" and "last"
+         * characters, inclusive.
+         *
+         * @param[in] first
+         *     This is the first of the range of characters
+         *     to put in the set.
+         *
+         * @param[in] last
+         *     This is the last of the range of characters
+         *     to put in the set.
+         */
+        CharacterSet(char first, char last);
+
+        /**
+         * This constructs a character set that contains all the
+         * characters in all the other given character sets.
+         *
+         * @param[in] characterSets
+         *     These are the character sets to include.
+         */
+        CharacterSet(
+            std::initializer_list< const CharacterSet > characterSets
+        );
+
+        /**
+         * This method checks to see if the given character
+         * is in the character set.
+         *
+         * @param[in] c
+         *     This is the character to check.
+         *
+         * @return
+         *     An indication of whether or not the given character
+         *     is in the character set is returned.
+         */
+        bool Contains(char c) const;
+
+        // Private Properties
+    private:
+        /**
+         * This is the type of structure that contains the private
+         * properties of the instance.  It is defined in the implementation
+         * and declared here to ensure that it is scoped inside the class.
+         */
+        struct Impl;
+
+        /**
+         * This contains the private properties of the instance.
+         */
+        std::unique_ptr< struct Impl > impl_;
+    };
+
+    /**
      * This function determines whether or not the given character
      * is in the given character set.
      *
@@ -29,7 +113,7 @@ namespace Uri {
      */
     bool IsCharacterInSet(
         char c,
-        std::initializer_list< char > characterSet
+        const CharacterSet& characterSet
     );
 
 }
diff --git a/src/PercentEncodedCharacterDecoder.cpp b/src/PercentEncodedCharacterDecoder.cpp
index d3bc0d9..890d392 100644
--- a/src/PercentEncodedCharacterDecoder.cpp
+++ b/src/PercentEncodedCharacterDecoder.cpp
@@ -10,6 +10,21 @@
 #include "IsCharacterInSet.hpp"
 #include "PercentEncodedCharacterDecoder.hpp"
 
+namespace {
+
+    /**
+     * This is the character set containing just numbers.
+     */
+    const Uri::CharacterSet DIGIT('0', '9');
+
+    /**
+     * This is the character set containing just the upper-case
+     * letters 'A' through 'F', used in upper-case hexadecimal.
+     */
+    const Uri::CharacterSet HEX('A', 'F');
+
+}
+
 namespace Uri {
 
     struct PercentEncodedCharacterDecoder::Impl {
@@ -41,9 +56,9 @@ namespace Uri {
             case 0: { // % ...
                 impl_->decoderState = 1;
                 impl_->decodedCharacter <<= 4;
-                if (IsCharacterInSet(c, {'0','9'})) {
+                if (IsCharacterInSet(c, DIGIT)) {
                     impl_->decodedCharacter += (int)(c - '0');
-                } else if (IsCharacterInSet(c, {'A','F'})) {
+                } else if (IsCharacterInSet(c, HEX)) {
                     impl_->decodedCharacter += (int)(c - 'A') + 10;
                 } else {
                     return false;
@@ -53,9 +68,9 @@ namespace Uri {
             case 1: { // %[0-9A-F] ...
                 impl_->decoderState = 2;
                 impl_->decodedCharacter <<= 4;
-                if (IsCharacterInSet(c, {'0','9'})) {
+                if (IsCharacterInSet(c, DIGIT)) {
                     impl_->decodedCharacter += (int)(c - '0');
-                } else if (IsCharacterInSet(c, {'A','F'})) {
+                } else if (IsCharacterInSet(c, HEX)) {
                     impl_->decodedCharacter += (int)(c - 'A') + 10;
                 } else {
                     return false;
diff --git a/src/Uri.cpp b/src/Uri.cpp
index 208297d..3bb3a59 100644
--- a/src/Uri.cpp
+++ b/src/Uri.cpp
@@ -19,6 +19,113 @@
 namespace {
 
     /**
+     * This is the character set containing just the alphabetic characters
+     * from the ASCII character set.
+     */
+    const Uri::CharacterSet ALPHA{
+        Uri::CharacterSet('a', 'z'),
+        Uri::CharacterSet('A', 'Z')
+    };
+
+    /**
+     * This is the character set containing just numbers.
+     */
+    const Uri::CharacterSet DIGIT('0', '9');
+
+    /**
+     * This is the character set containing just the characters allowed
+     * in a hexadecimal digit.
+     */
+    const Uri::CharacterSet HEXDIG{
+        Uri::CharacterSet('0', '9'),
+        Uri::CharacterSet('A', 'F')
+    };
+
+    /**
+     * This is the character set that corresponds to the "unreserved" syntax
+     * specified in RFC 3986 (https://tools.ietf.org/html/rfc3986).
+     */
+    const Uri::CharacterSet UNRESERVED{
+        ALPHA,
+        DIGIT,
+        '-', '.', '_', '~'
+    };
+
+    /**
+     * This is the character set that corresponds to the "sub-delims" syntax
+     * specified in RFC 3986 (https://tools.ietf.org/html/rfc3986).
+     */
+    const Uri::CharacterSet SUB_DELIMS{
+        '!', '$', '&', '\'', '(', ')',
+        '*', '+', ',', ';', '='
+    };
+
+    /**
+     * This is the character set that corresponds to the second part
+     * of the "scheme" syntax
+     * specified in RFC 3986 (https://tools.ietf.org/html/rfc3986).
+     */
+    const Uri::CharacterSet SCHEME_NOT_FIRST{
+        ALPHA,
+        DIGIT,
+        '+', '-', '.',
+    };
+
+    /**
+     * This is the character set that corresponds to the "pchar" syntax
+     * specified in RFC 3986 (https://tools.ietf.org/html/rfc3986),
+     * leaving out "pct-encoded".
+     */
+    const Uri::CharacterSet PCHAR_NOT_PCT_ENCODED{
+        UNRESERVED,
+        SUB_DELIMS,
+        ':', '@'
+    };
+
+    /**
+     * This is the character set that corresponds to the "query" syntax
+     * and the "fragment" syntax
+     * specified in RFC 3986 (https://tools.ietf.org/html/rfc3986),
+     * leaving out "pct-encoded".
+     */
+    const Uri::CharacterSet QUERY_OR_FRAGMENT_NOT_PCT_ENCODED{
+        PCHAR_NOT_PCT_ENCODED,
+        '/', '?'
+    };
+
+    /**
+     * This is the character set that corresponds to the "userinfo" syntax
+     * specified in RFC 3986 (https://tools.ietf.org/html/rfc3986),
+     * leaving out "pct-encoded".
+     */
+    const Uri::CharacterSet USER_INFO_NOT_PCT_ENCODED{
+        UNRESERVED,
+        SUB_DELIMS,
+        ':',
+    };
+
+    /**
+     * This is the character set that corresponds to the "reg-name" syntax
+     * specified in RFC 3986 (https://tools.ietf.org/html/rfc3986),
+     * leaving out "pct-encoded".
+     */
+    const Uri::CharacterSet REG_NAME_NOT_PCT_ENCODED{
+        UNRESERVED,
+        SUB_DELIMS
+    };
+
+    /**
+     * This is the character set that corresponds to the last part of
+     * the "IPvFuture" syntax
+     * specified in RFC 3986 (https://tools.ietf.org/html/rfc3986).
+     */
+    const Uri::CharacterSet IPV_FUTURE_LAST_PART{
+        UNRESERVED,
+        SUB_DELIMS,
+        ':'
+    };
+
+    /**
      * This function parses the given string as an unsigned 16-bit
      * integer, detecting invalid characters, overflow, etc.
      *
@@ -101,9 +208,9 @@ namespace {
             } else {
                 bool check;
                 if (*isFirstCharacter) {
-                    check = Uri::IsCharacterInSet(c, { 'a','z', 'A','Z' });
+                    check = Uri::IsCharacterInSet(c, ALPHA);
                 } else {
-                    check = Uri::IsCharacterInSet(c, { 'a','z', 'A','Z', '0','9', '+','+', '-','-', '.','.' });
+                    check = Uri::IsCharacterInSet(c, SCHEME_NOT_FIRST);
                 }
                 *isFirstCharacter = false;
                 return check;
@@ -135,24 +242,7 @@ namespace {
                         pecDecoder = Uri::PercentEncodedCharacterDecoder();
                         decoderState = 1;
                     } else {
-                        if (
-                            Uri::IsCharacterInSet(
-                                c,
-                                {
-                                    // unreserved
-                                    'a','z', 'A','Z', // ALPHA
-                                    '0','9', // DIGIT
-                                    '-','-', '.','.', '_','_', '~','~',
-
-                                    // sub-delims
-                                    '!','!', '$','$', '&','&', '\'','\'', '(','(', ')',')',
-                                    '*','*', '+','+', ',',',', ';',';', '=','=',
-
-                                    // (also allowed in segment or pchar)
-                                    ':',':', '@','@'
-                                }
-                            )
-                        ) {
+                        if (Uri::IsCharacterInSet(c, PCHAR_NOT_PCT_ENCODED)) {
                             segment.push_back(c);
                         } else {
                             return false;
@@ -198,27 +288,7 @@ namespace {
                         pecDecoder = Uri::PercentEncodedCharacterDecoder();
                         decoderState = 1;
                     } else {
-                        if (
-                            Uri::IsCharacterInSet(
-                                c,
-                                {
-                                    // unreserved
-                                    'a','z', 'A','Z', // ALPHA
-                                    '0','9', // DIGIT
-                                    '-','-', '.','.', '_','_', '~','~',
-
-                                    // sub-delims
-                                    '!','!', '$','$', '&','&', '\'','\'', '(','(', ')',')',
-                                    '*','*', '+','+', ',',',', ';',';', '=','=',
-
-                                    // (also allowed in pchar)
-                                    ':',':', '@','@',
-
-                                    // (also allowed in query or fragment)
-                                    '/','/', '?','?'
-                                }
-                            )
-                        ) {
+                        if (Uri::IsCharacterInSet(c, QUERY_OR_FRAGMENT_NOT_PCT_ENCODED)) {
                             queryOrFragment.push_back(c);
                         } else {
                             return false;
@@ -368,24 +438,7 @@ namespace Uri {
                                 pecDecoder = PercentEncodedCharacterDecoder();
                                 decoderState = 1;
                             } else {
-                                if (
-                                    IsCharacterInSet(
-                                        c,
-                                        {
-                                            // unreserved
-                                            'a','z', 'A','Z', // ALPHA
-                                            '0','9', // DIGIT
-                                            '-','-', '.','.', '_','_', '~','~',
-
-                                            // sub-delims
-                                            '!','!', '$','$', '&','&', '\'','\'', '(','(', ')',')',
-                                            '*','*', '+','+', ',',',', ';',';', '=','=',
-
-                                            // (also allowed in userinfo)
-                                            ':',':',
-                                        }
-                                    )
-                                ) {
+                                if (IsCharacterInSet(c, USER_INFO_NOT_PCT_ENCODED)) {
                                     userInfo.push_back(c);
                                 } else {
                                     return false;
@@ -432,24 +485,7 @@ namespace Uri {
                         } else if (c == ':') {
                             decoderState = 8;
                         } else {
-                            if (
-                                IsCharacterInSet(
-                                    c,
-                                    {
-                                        // unreserved
-                                        'a','z', 'A','Z', // ALPHA
-                                        '0','9', // DIGIT
-                                        '-','-', '.','.', '_','_', '~','~',
-
-                                        // sub-delims
-                                        '!','!', '$','$', '&','&', '\'','\'', '(','(', ')',')',
-                                        '*','*', '+','+', ',',',', ';',';', '=','=',
-
-                                        // (also allowed in reg-name)
-                                        ':',':',
-                                    }
-                                )
-                            ) {
+                            if (IsCharacterInSet(c, REG_NAME_NOT_PCT_ENCODED)) {
                                 host.push_back(c);
                             } else {
                                 return false;
@@ -489,7 +525,7 @@ namespace Uri {
                     case 5: { // IPvFuture: v ...
                         if (c == '.') {
                             decoderState = 6;
-                        } else if (!IsCharacterInSet(c, {'0','9', 'A','F'})) {
+                        } else if (!IsCharacterInSet(c, HEXDIG)) {
                             return false;
                         }
                         host.push_back(c);
@@ -499,24 +535,7 @@ namespace Uri {
                         host.push_back(c);
                         if (c == ']') {
                             decoderState = 7;
-                        } else if (
-                            !IsCharacterInSet(
-                                c,
-                                {
-                                    // unreserved
-                                    'a','z', 'A','Z', // ALPHA
-                                    '0','9', // DIGIT
-                                    '-','-', '.','.', '_','_', '~','~',
-
-                                    // sub-delims
-                                    '!','!', '$','$', '&','&', '\'','\'', '(','(', ')',')',
-                                    '*','*', '+','+', ',',',', ';',';', '=','=',
-
-                                    // (also allowed in IPvFuture)
-                                    ':',':',
-                                }
-                            )
-                        ) {
+                        } else if (!IsCharacterInSet(c, IPV_FUTURE_LAST_PART)) {
                             return false;
                         }
                     } break;
author	Richard Walters <rwalters@digitalstirling.com>	2018-07-01 16:58:37 -0700
committer	Richard Walters <rwalters@digitalstirling.com>	2018-07-01 16:58:37 -0700
commit	16b5c56c4ecbbb5c6153f9e16228a8d8cf95c50d (patch)
tree	9bf5689b45b08c4d6a9319651a6ca80e4f7830c6
parent	cdc3f449812d0d45a3ea271636d669eb05ba3751 (diff)