/** * @file Uri.cpp * * This module contains the implementation of the Uri::Uri class. * * © 2018 by Richard Walters */ #include "CharacterSet.hpp" #include "NormalizeCaseInsensitiveString.hpp" #include "PercentEncodedCharacterDecoder.hpp" #include #include #include #include #include #include #include namespace { /** * This is the character set containing just the alphabetic characters * from the ASCII character set. */ const Uri::CharacterSet ALPHA{ Uri::CharacterSet('a', 'z'), Uri::CharacterSet('A', 'Z') }; /** * This is the character set containing just numbers. */ const Uri::CharacterSet DIGIT('0', '9'); /** * This is the character set containing just the characters allowed * in a hexadecimal digit. */ const Uri::CharacterSet HEXDIG{ Uri::CharacterSet('0', '9'), Uri::CharacterSet('A', 'F'), Uri::CharacterSet('a', 'f') }; /** * This is the character set corresponds to the "unreserved" syntax * specified in RFC 3986 (https://tools.ietf.org/html/rfc3986). */ const Uri::CharacterSet UNRESERVED{ ALPHA, DIGIT, '-', '.', '_', '~' }; /** * This is the character set corresponds to the "sub-delims" syntax * specified in RFC 3986 (https://tools.ietf.org/html/rfc3986). */ const Uri::CharacterSet SUB_DELIMS{ '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=' }; /** * This is the character set corresponds to the second part * of the "scheme" syntax * specified in RFC 3986 (https://tools.ietf.org/html/rfc3986). */ const Uri::CharacterSet SCHEME_NOT_FIRST{ ALPHA, DIGIT, '+', '-', '.', }; /** * This is the character set corresponds to the "pchar" syntax * specified in RFC 3986 (https://tools.ietf.org/html/rfc3986), * leaving out "pct-encoded". */ const Uri::CharacterSet PCHAR_NOT_PCT_ENCODED{ UNRESERVED, SUB_DELIMS, ':', '@' }; /** * This is the character set corresponds to the "query" syntax * and the "fragment" syntax * specified in RFC 3986 (https://tools.ietf.org/html/rfc3986), * leaving out "pct-encoded". */ const Uri::CharacterSet QUERY_OR_FRAGMENT_NOT_PCT_ENCODED{ PCHAR_NOT_PCT_ENCODED, '/', '?' }; /** * This is the character set corresponds to the "userinfo" syntax * specified in RFC 3986 (https://tools.ietf.org/html/rfc3986), * leaving out "pct-encoded". */ const Uri::CharacterSet USER_INFO_NOT_PCT_ENCODED{ UNRESERVED, SUB_DELIMS, ':', }; /** * This is the character set corresponds to the "reg-name" syntax * specified in RFC 3986 (https://tools.ietf.org/html/rfc3986), * leaving out "pct-encoded". */ const Uri::CharacterSet REG_NAME_NOT_PCT_ENCODED{ UNRESERVED, SUB_DELIMS }; /** * This is the character set corresponds to the last part of * the "IPvFuture" syntax * specified in RFC 3986 (https://tools.ietf.org/html/rfc3986). */ const Uri::CharacterSet IPV_FUTURE_LAST_PART{ UNRESERVED, SUB_DELIMS, ':' }; /** * This function parses the given string as an unsigned 16-bit * integer, detecting invalid characters, overflow, etc. * * @param[in] numberString * This is the string containing the number to parse. * * @param[out] number * This is where to store the number parsed. * * @return * An indication of whether or not the number was parsed * successfully is returned. */ bool ParseUint16( const std::string& numberString, uint16_t& number ) { uint32_t numberIn32Bits = 0; for (auto c: numberString) { if ( (c < '0') || (c > '9') ) { return false; } numberIn32Bits *= 10; numberIn32Bits += (uint16_t)(c - '0'); if ( (numberIn32Bits & ~((1 << 16) - 1)) != 0 ) { return false; } } number = (uint16_t)numberIn32Bits; return true; } /** * This function takes a given "stillPassing" strategy * and invokes it on the sequence of characters in the given * string, to check if the string passes or not. * * @param[in] candidate * This is the string to test. * * @param[in] stillPassing * This is the strategy to invoke in order to test the string. * * @return * An indication of whether or not the given candidate string * passes the test is returned. */ bool FailsMatch( const std::string& candidate, std::function< bool(char, bool) > stillPassing ) { for (const auto c: candidate) { if (!stillPassing(c, false)) { return true; } } return !stillPassing(' ', true); } /** * This function returns a strategy function that * may be used with the FailsMatch function to test a scheme * to make sure it is legal according to the standard. * * @return * A strategy function that may be used with the * FailsMatch function to test a scheme to make sure * it is legal according to the standard is returned. */ std::function< bool(char, bool) > LegalSchemeCheckStrategy() { auto isFirstCharacter = std::make_shared< bool >(true); return [isFirstCharacter](char c, bool end){ if (end) { return !*isFirstCharacter; } else { bool check; if (*isFirstCharacter) { check = ALPHA.Contains(c); } else { check = SCHEME_NOT_FIRST.Contains(c); } *isFirstCharacter = false; return check; } }; } /** * This method checks and decodes the given URI element. * What we are calling a "URI element" is any part of the URI * which is a sequence of characters that: * - may be percent-encoded * - if not percent-encoded, are in a restricted set of characters * * @param[in,out] element * On input, this is the element to check and decode. * On output, this is the decoded element. * * @param[in] allowedCharacters * This is the set of characters that do not need to * be percent-encoded. * * @return * An indication of whether or not the element * passed all checks and was decoded successfully is returned. */ bool DecodeElement( std::string& element, const Uri::CharacterSet& allowedCharacters ) { const auto originalSegment = std::move(element); element.clear(); bool decodingPec = false; Uri::PercentEncodedCharacterDecoder pecDecoder; for (const auto c: originalSegment) { if (decodingPec) { if (!pecDecoder.NextEncodedCharacter(c)) { return false; } if (pecDecoder.Done()) { decodingPec = false; element.push_back((char)pecDecoder.GetDecodedCharacter()); } } else if (c == '%') { decodingPec = true; pecDecoder = Uri::PercentEncodedCharacterDecoder(); } else { if (allowedCharacters.Contains(c)) { element.push_back(c); } else { return false; } } } return true; } /** * This method checks and decodes the given query or fragment. * * @param[in,out] queryOrFragment * On input, this is the query or fragment to check and decode. * On output, this is the decoded query or fragment. * * @return * An indication of whether or not the query or fragment * passed all checks and was decoded successfully is returned. */ bool DecodeQueryOrFragment(std::string& queryOrFragment) { return DecodeElement( queryOrFragment, QUERY_OR_FRAGMENT_NOT_PCT_ENCODED ); } } namespace Uri { /** * This contains the private properties of a Uri instance. */ struct Uri::Impl { // Properties /** * This is the "scheme" element of the URI. */ std::string scheme; /** * This is the "UserInfo" element of the URI. */ std::string userInfo; /** * This is the "host" element of the URI. */ std::string host; /** * This flag indicates whether or not the * URI includes a port number. */ bool hasPort = false; /** * This is the port number element of the URI. */ uint16_t port = 0; /** * This is the "path" element of the URI, * as a sequence of segments. */ std::vector< std::string > path; /** * This is the "query" element of the URI, * if it has one. */ std::string query; /** * This is the "fragment" element of the URI, * if it has one. */ std::string fragment; // Methods /** * This method builds the internal path element sequence * by parsing it from the given path string. * * @param[in] pathString * This is the string containing the whole path of the URI. * * @return * An indication if the path was parsed correctly or not * is returned. */ bool ParsePath(std::string pathString) { path.clear(); if (pathString == "/") { // Special case of a path that is empty but needs a single // empty-string element to indicate that it is absolute. path.push_back(""); pathString.clear(); } else if (!pathString.empty()) { for(;;) { auto pathDelimiter = pathString.find('/'); if (pathDelimiter == std::string::npos) { path.push_back(pathString); pathString.clear(); break; } else { path.emplace_back( pathString.begin(), pathString.begin() + pathDelimiter ); pathString = pathString.substr(pathDelimiter + 1); } } } for (auto& segment: path) { if (!DecodeElement(segment, PCHAR_NOT_PCT_ENCODED)) { return false; } } return true; } /** * This method parses the elements that make up the authority * composite part of the URI, by parsing it from the given string. * * @param[in] authorityString * This is the string containing the whole authority part * of the URI. * * @return * An indication if the path was parsed correctly or not * is returned. */ bool ParseAuthority(const std::string& authorityString) { // Next, check if there is a UserInfo, and if so, extract it. const auto userInfoDelimiter = authorityString.find('@'); std::string hostPortString; userInfo.clear(); if (userInfoDelimiter == std::string::npos) { hostPortString = authorityString; } else { userInfo = authorityString.substr(0, userInfoDelimiter); if (!DecodeElement(userInfo, USER_INFO_NOT_PCT_ENCODED)) { return false; } hostPortString = authorityString.substr(userInfoDelimiter + 1); } // Next, parsing host and port from authority and path. std::string portString; size_t decoderState = 0; int decodedCharacter = 0; host.clear(); PercentEncodedCharacterDecoder pecDecoder; bool hostIsRegName = false; for (const auto c: hostPortString) { switch(decoderState) { case 0: { // first character if (c == '[') { host.push_back(c); decoderState = 3; break; } else { decoderState = 1; hostIsRegName = true; } } case 1: { // reg-name or IPv4Address if (c == '%') { pecDecoder = PercentEncodedCharacterDecoder(); decoderState = 2; } else if (c == ':') { decoderState = 8; } else { if (REG_NAME_NOT_PCT_ENCODED.Contains(c)) { host.push_back(c); } else { return false; } } } break; case 2: { // % ... if (!pecDecoder.NextEncodedCharacter(c)) { return false; } if (pecDecoder.Done()) { decoderState = 1; host.push_back((char)pecDecoder.GetDecodedCharacter()); } } break; case 3: { // IP-literal if (c == 'v') { host.push_back(c); decoderState = 5; break; } else { decoderState = 4; } } case 4: { // IPv6Address // TODO: research this offline first // before attempting to code it host.push_back(c); if (c == ']') { decoderState = 7; } } break; case 5: { // IPvFuture: v ... if (c == '.') { decoderState = 6; } else if (!HEXDIG.Contains(c)) { return false; } host.push_back(c); } break; case 6: { // IPvFuture v 1*HEXDIG . ... host.push_back(c); if (c == ']') { decoderState = 7; } else if (!IPV_FUTURE_LAST_PART.Contains(c)) { return false; } } break; case 7: { // illegal to have anything else, unless it's a colon, // in which case it's a port delimiter if (c == ':') { decoderState = 8; } else { return false; } } break; case 8: { // port portString.push_back(c); } break; } } if (hostIsRegName) { host = NormalizeCaseInsensitiveString(host); } if (portString.empty()) { hasPort = false; } else { if (!ParseUint16(portString, port)) { return false; } hasPort = true; } return true; } /** * This method takes the part of a URI string that has just * the query element with its delimiter, and breaks off * and decodes the query. * * @param[in] queryWithDelimiter * This is the part of a URI string that has just * the query element with its delimiter. * * @return * An indication of whether or not the method succeeded * is returned. */ bool ParseQuery(const std::string& queryWithDelimiter) { if (queryWithDelimiter.empty()) { query.clear(); } else { query = queryWithDelimiter.substr(1); } return DecodeQueryOrFragment(query); } /** * This method takes the part of a URI string that has just * the query and/or fragment elements, and breaks off * and decodes the fragment part, returning the rest, * which will be either empty or have the query with the * query delimiter still attached. * * @param[in] queryAndOrFragment * This is the part of a URI string that has just * the query and/or fragment elements. * * @param[out] rest * This is where to store the rest of the input string * after removing any fragment and fragment delimiter. * * @return * An indication of whether or not the method succeeded * is returned. */ bool ParseFragment( const std::string& queryAndOrFragment, std::string& rest ) { const auto fragmentDelimiter = queryAndOrFragment.find('#'); if (fragmentDelimiter == std::string::npos) { fragment.clear(); rest = queryAndOrFragment; } else { fragment = queryAndOrFragment.substr(fragmentDelimiter + 1); rest = queryAndOrFragment.substr(0, fragmentDelimiter); } return DecodeQueryOrFragment(fragment); } }; Uri::~Uri() = default; Uri::Uri(Uri&&) = default; Uri& Uri::operator=(Uri&&) = default; Uri::Uri() : impl_(new Impl) { } bool Uri::operator==(const Uri& other) const { return ( (impl_->scheme == other.impl_->scheme) && (impl_->userInfo == other.impl_->userInfo) && (impl_->host == other.impl_->host) && ( (!impl_->hasPort && !other.impl_->hasPort) || ( (impl_->hasPort && other.impl_->hasPort) && (impl_->port == other.impl_->port) ) ) && (impl_->path == other.impl_->path) && (impl_->query == other.impl_->query) && (impl_->fragment == other.impl_->fragment) ); } bool Uri::operator!=(const Uri& other) const { return !(*this == other); } bool Uri::ParseFromString(const std::string& uriString) { // First parse the scheme. // Limit our search so we don't scan into the authority // or path elements, because these may have the colon // character as well, which we might misinterpret // as the scheme delimiter. auto authorityOrPathDelimiterStart = uriString.find('/'); if (authorityOrPathDelimiterStart == std::string::npos) { authorityOrPathDelimiterStart = uriString.length(); } const auto schemeEnd = uriString.substr(0, authorityOrPathDelimiterStart).find(':'); std::string rest; if (schemeEnd == std::string::npos) { impl_->scheme.clear(); rest = uriString; } else { impl_->scheme = uriString.substr(0, schemeEnd); bool isFirstCharacter = true; if ( FailsMatch( impl_->scheme, LegalSchemeCheckStrategy() ) ) { return false; } impl_->scheme = NormalizeCaseInsensitiveString(impl_->scheme); rest = uriString.substr(schemeEnd + 1); } // Next parse the authority. const auto pathEnd = rest.find_first_of("?#"); auto authorityAndPathString = rest.substr(0, pathEnd); const auto queryAndOrFragment = rest.substr(authorityAndPathString.length()); std::string pathString; if (authorityAndPathString.substr(0, 2) == "//") { // Strip off authority marker. authorityAndPathString = authorityAndPathString.substr(2); // First separate the authority from the path. auto authorityEnd = authorityAndPathString.find('/'); if (authorityEnd == std::string::npos) { authorityEnd = authorityAndPathString.length(); } pathString = authorityAndPathString.substr(authorityEnd); auto authorityString = authorityAndPathString.substr(0, authorityEnd); // Parse the elements inside the authority string. if (!impl_->ParseAuthority(authorityString)) { return false; } } else { impl_->userInfo.clear(); impl_->host.clear(); impl_->hasPort = false; pathString = authorityAndPathString; } // Next, parse the path. if (!impl_->ParsePath(pathString)) { return false; } // Handle special case of URI with authority and empty // path -- treat the same as "/" path. if ( !impl_->host.empty() && impl_->path.empty() ) { impl_->path.push_back(""); } // Next, parse the fragment if there is one. if (!impl_->ParseFragment(queryAndOrFragment, rest)) { return false; } // Finally, if anything is left, it's the query. return impl_->ParseQuery(rest); } std::string Uri::GetScheme() const { return impl_->scheme; } std::string Uri::GetUserInfo() const { return impl_->userInfo; } std::string Uri::GetHost() const { return impl_->host; } std::vector< std::string > Uri::GetPath() const { return impl_->path; } bool Uri::HasPort() const { return impl_->hasPort; } uint16_t Uri::GetPort() const { return impl_->port; } bool Uri::IsRelativeReference() const { return impl_->scheme.empty(); } bool Uri::ContainsRelativePath() const { if (impl_->path.empty()) { return true; } else { return !impl_->path[0].empty(); } } std::string Uri::GetQuery() const { return impl_->query; } std::string Uri::GetFragment() const { return impl_->fragment; } void Uri::NormalizePath() { /* * This is a straight-up implementation of the * algorithm from section 5.2.4 of * RFC 3986 (https://tools.ietf.org/html/rfc3986). */ // Step 1 auto oldPath = std::move(impl_->path); impl_->path.clear(); // Step 2 while (!oldPath.empty()) { // Step 2A if ( (oldPath[0] == ".") || (oldPath[0] == "..") ) { oldPath.erase(oldPath.begin()); } else // Step 2B if ( (oldPath.size() >= 2) && (oldPath[0] == "") && (oldPath[1] == ".") ) { oldPath.erase(oldPath.begin() + 1); } else // Step 2C if ( (oldPath.size() >= 2) && (oldPath[0] == "") && (oldPath[1] == "..") ) { oldPath.erase(oldPath.begin() + 1); if (!impl_->path.empty()) { impl_->path.pop_back(); } } else // Step 2D if ( (oldPath.size() == 1) && ( (oldPath[0] == ".") || (oldPath[0] == "..") ) ) { oldPath.erase(oldPath.begin()); } else // Step 2E { if (oldPath[0] == "") { impl_->path.push_back(""); oldPath.erase(oldPath.begin()); } if (!oldPath.empty()) { impl_->path.push_back(oldPath[0]); if (oldPath.size() > 1) { oldPath[0] = ""; } else { oldPath.erase(oldPath.begin()); } } } } } Uri Uri::Resolve(const Uri& relativeReference) const { // Resolve the reference by following the algorithm // from section 5.2.2 in // RFC 3986 (https://tools.ietf.org/html/rfc3986). Uri target; if (!relativeReference.impl_->scheme.empty()) { target.impl_->scheme = relativeReference.impl_->scheme; target.impl_->host = relativeReference.impl_->host; target.impl_->userInfo = relativeReference.impl_->userInfo; target.impl_->hasPort = relativeReference.impl_->hasPort; target.impl_->port = relativeReference.impl_->port; target.impl_->path = relativeReference.impl_->path; target.NormalizePath(); target.impl_->query = relativeReference.impl_->query; } else { if (!relativeReference.impl_->host.empty()) { target.impl_->host = relativeReference.impl_->host; target.impl_->userInfo = relativeReference.impl_->userInfo; target.impl_->hasPort = relativeReference.impl_->hasPort; target.impl_->port = relativeReference.impl_->port; target.impl_->path = relativeReference.impl_->path; target.NormalizePath(); target.impl_->query = relativeReference.impl_->query; } else { if (relativeReference.impl_->path.empty()) { target.impl_->path = impl_->path; if (!relativeReference.impl_->query.empty()) { target.impl_->query = relativeReference.impl_->query; } else { target.impl_->query = impl_->query; } } else { if ( !relativeReference.impl_->path.empty() && (relativeReference.impl_->path[0] == "") ) { target.impl_->path = relativeReference.impl_->path; target.NormalizePath(); } else { target.impl_->path = impl_->path; if (target.impl_->path.size() > 1) { target.impl_->path.pop_back(); } std::copy( relativeReference.impl_->path.begin(), relativeReference.impl_->path.end(), std::back_inserter(target.impl_->path) ); target.NormalizePath(); } target.impl_->query = relativeReference.impl_->query; } target.impl_->host = impl_->host; target.impl_->userInfo = impl_->userInfo; target.impl_->hasPort = impl_->hasPort; target.impl_->port = impl_->port; } target.impl_->scheme = impl_->scheme; } target.impl_->fragment = relativeReference.impl_->fragment; return target; } }