diff options
author | Richard Walters <rwalters@digitalstirling.com> | 2018-07-01 14:48:14 -0700 |
---|---|---|
committer | Richard Walters <rwalters@digitalstirling.com> | 2018-07-01 14:48:14 -0700 |
commit | 9f9ee6af4299dbc95f5d7b814679714ba0ab5051 (patch) | |
tree | f69cafb9f60470d762b73388fbd86775459df200 /src/Uri.cpp | |
parent | 6b9217cc7eeb72363f33a0b1330dcdca52d25f8e (diff) |
Handle bad host names
* Detect bad characters in host names.
* Incorporate splitting host and port into the state
machine that is parsing/decoding the host.
NOTE:
IPv6address is not checked for bad characters yet.
More research is needed to learn exactly what the various
ways to write an IPv6 address are.
Diffstat (limited to 'src/Uri.cpp')
-rw-r--r-- | src/Uri.cpp | 146 |
1 files changed, 141 insertions, 5 deletions
diff --git a/src/Uri.cpp b/src/Uri.cpp index e218ecd..ba04298 100644 --- a/src/Uri.cpp +++ b/src/Uri.cpp @@ -305,13 +305,149 @@ namespace Uri { } // Next, parsing host and port from authority and path. - const auto portDelimiter = hostPortString.find(':'); - if (portDelimiter == std::string::npos) { - host = hostPortString; + std::string portString; + size_t decoderState = 0; + int decodedCharacter = 0; + host.clear(); + for (const auto c: hostPortString) { + switch(decoderState) { + case 0: { // first character + if (c == '[') { + host.push_back(c); + decoderState = 4; + break; + } else { + decoderState = 1; + } + } + + case 1: { // reg-name or IPv4Address + if (c == '%') { + decoderState = 2; + } else if (c == ':') { + decoderState = 9; + } else { + if ( + IsCharacterInSet( + c, + { + // unreserved + 'a','z', 'A','Z', // ALPHA + '0','9', // DIGIT + '-','-', '.','.', '_','_', '~','~', + + // sub-delims + '!','!', '$','$', '&','&', '\'','\'', '(','(', ')',')', + '*','*', '+','+', ',',',', ';',';', '=','=', + + // (also allowed in reg-name) + ':',':', + } + ) + ) { + host.push_back(c); + } else { + return false; + } + } + } break; + + case 2: { // % ... + decoderState = 3; + decodedCharacter <<= 4; + if (IsCharacterInSet(c, {'0','9'})) { + decodedCharacter += (int)(c - '0'); + } else if (IsCharacterInSet(c, {'A','F'})) { + decodedCharacter += (int)(c - 'A') + 10; + } else { + return false; + } + } break; + + case 3: { // %[0-9A-F] ... 
+ decoderState = 1; + decodedCharacter <<= 4; + if (IsCharacterInSet(c, {'0','9'})) { + decodedCharacter += (int)(c - '0'); + } else if (IsCharacterInSet(c, {'A','F'})) { + decodedCharacter += (int)(c - 'A') + 10; + } else { + return false; + } + host.push_back((char)decodedCharacter); + } break; + + case 4: { // IP-literal + if (c == 'v') { + host.push_back(c); + decoderState = 6; + break; + } else { + decoderState = 5; + } + } + + case 5: { // IPv6Address + // TODO: research this offline first + // before attempting to code it + host.push_back(c); + if (c == ']') { + decoderState = 8; + } + } break; + + case 6: { // IPvFuture: v ... + if (c == '.') { + decoderState = 7; + } else if (!IsCharacterInSet(c, {'0','9', 'A','F'})) { + return false; + } + host.push_back(c); + } break; + + case 7: { // IPvFuture v 1*HEXDIG . ... + host.push_back(c); + if (c == ']') { + decoderState = 8; + } else if ( + !IsCharacterInSet( + c, + { + // unreserved + 'a','z', 'A','Z', // ALPHA + '0','9', // DIGIT + '-','-', '.','.', '_','_', '~','~', + + // sub-delims + '!','!', '$','$', '&','&', '\'','\'', '(','(', ')',')', + '*','*', '+','+', ',',',', ';',';', '=','=', + + // (also allowed in IPvFuture) + ':',':', + } + ) + ) { + return false; + } + } break; + + case 8: { // illegal to have anything else, unless it's a colon, + // in which case it's a port delimiter + if (c == ':') { + decoderState = 9; + } else { + return false; + } + } break; + + case 9: { // port + portString.push_back(c); + } break; + } + } + if (portString.empty()) { hasPort = false; } else { - host = hostPortString.substr(0, portDelimiter); - const auto portString = hostPortString.substr(portDelimiter + 1); if (!ParseUint16(portString, port)) { return false; } |