From 9f9ee6af4299dbc95f5d7b814679714ba0ab5051 Mon Sep 17 00:00:00 2001 From: Richard Walters Date: Sun, 1 Jul 2018 14:48:14 -0700 Subject: Handle bad host names * Detect bad characters in host names. * Incorporate splitting host and port into the state machine that is parsing/decoding the host. NOTE: IPv6address is not checked for bad characters yet. More research is needed to learn exactly what are the various ways to write an IPv6 address. --- src/Uri.cpp | 146 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 141 insertions(+), 5 deletions(-) (limited to 'src') diff --git a/src/Uri.cpp b/src/Uri.cpp index e218ecd..ba04298 100644 --- a/src/Uri.cpp +++ b/src/Uri.cpp @@ -305,13 +305,149 @@ namespace Uri { } // Next, parsing host and port from authority and path. - const auto portDelimiter = hostPortString.find(':'); - if (portDelimiter == std::string::npos) { - host = hostPortString; + std::string portString; + size_t decoderState = 0; + int decodedCharacter = 0; + host.clear(); + for (const auto c: hostPortString) { + switch(decoderState) { + case 0: { // first character + if (c == '[') { + host.push_back(c); + decoderState = 4; + break; + } else { + decoderState = 1; + } + } + + case 1: { // reg-name or IPv4Address + if (c == '%') { + decoderState = 2; + } else if (c == ':') { + decoderState = 9; + } else { + if ( + IsCharacterInSet( + c, + { + // unreserved + 'a','z', 'A','Z', // ALPHA + '0','9', // DIGIT + '-','-', '.','.', '_','_', '~','~', + + // sub-delims + '!','!', '$','$', '&','&', '\'','\'', '(','(', ')',')', + '*','*', '+','+', ',',',', ';',';', '=','=', + + // (also allowed in reg-name) + ':',':', + } + ) + ) { + host.push_back(c); + } else { + return false; + } + } + } break; + + case 2: { // % ... + decoderState = 3; + decodedCharacter <<= 4; + if (IsCharacterInSet(c, {'0','9'})) { + decodedCharacter += (int)(c - '0'); + } else if (IsCharacterInSet(c, {'A','F'})) { + decodedCharacter += (int)(c - 'A') + 10; + } else { + return false; + } + } break; + + case 3: { // %[0-9A-F] ... + decoderState = 1; + decodedCharacter <<= 4; + if (IsCharacterInSet(c, {'0','9'})) { + decodedCharacter += (int)(c - '0'); + } else if (IsCharacterInSet(c, {'A','F'})) { + decodedCharacter += (int)(c - 'A') + 10; + } else { + return false; + } + host.push_back((char)decodedCharacter); + } break; + + case 4: { // IP-literal + if (c == 'v') { + host.push_back(c); + decoderState = 6; + break; + } else { + decoderState = 5; + } + } + + case 5: { // IPv6Address + // TODO: research this offline first + // before attempting to code it + host.push_back(c); + if (c == ']') { + decoderState = 8; + } + } break; + + case 6: { // IPvFuture: v ... + if (c == '.') { + decoderState = 7; + } else if (!IsCharacterInSet(c, {'0','9', 'A','F'})) { + return false; + } + host.push_back(c); + } break; + + case 7: { // IPvFuture v 1*HEXDIG . ... + host.push_back(c); + if (c == ']') { + decoderState = 8; + } else if ( + !IsCharacterInSet( + c, + { + // unreserved + 'a','z', 'A','Z', // ALPHA + '0','9', // DIGIT + '-','-', '.','.', '_','_', '~','~', + + // sub-delims + '!','!', '$','$', '&','&', '\'','\'', '(','(', ')',')', + '*','*', '+','+', ',',',', ';',';', '=','=', + + // (also allowed in IPvFuture) + ':',':', + } + ) + ) { + return false; + } + } break; + + case 8: { // illegal to have anything else, unless it's a colon, + // in which case it's a port delimiter + if (c == ':') { + decoderState = 9; + } else { + return false; + } + } break; + + case 9: { // port + portString.push_back(c); + } break; + } + } + if (portString.empty()) { hasPort = false; } else { - host = hostPortString.substr(0, portDelimiter); - const auto portString = hostPortString.substr(portDelimiter + 1); if (!ParseUint16(portString, port)) { return false; } -- cgit v1.2.3