diff options
Diffstat (limited to 'src/lib.rs')
-rw-r--r-- | src/lib.rs | 512 |
1 files changed, 429 insertions, 83 deletions
@@ -4,13 +4,20 @@ #[macro_use] extern crate lazy_static; +mod percent_encoded_character_decoder; +use percent_encoded_character_decoder::PercentEncodedCharacterDecoder; + use std::collections::HashSet; +use std::convert::TryFrom; // This is the character set containing just the alphabetic characters // from the ASCII character set. // // TODO: improvement // [16:16] silen_z: btw char::is_ascii_letter or something like that exists +// +// [14:49] silmeth: @rhymu8354 you might want to look at once_cell as a nicer +// macro-less replacement for lazystatic!() lazy_static! { static ref ALPHA: HashSet<char> = ('a'..='z') @@ -25,6 +32,16 @@ lazy_static! { .collect::<HashSet<char>>(); } +// This is the character set containing just the characters allowed +// in a hexadecimal digit. +lazy_static! { + static ref HEXDIG: HashSet<char> = + ('0'..='9') + .chain('A'..='F') + .chain('a'..='f') + .collect::<HashSet<char>>(); +} + // This is the character set corresponds to the "unreserved" syntax // specified in RFC 3986 (https://tools.ietf.org/html/rfc3986). lazy_static! { @@ -49,6 +66,18 @@ lazy_static! { .collect::<HashSet<char>>(); } +// This is the character set corresponds to the second part +// of the "scheme" syntax +// specified in RFC 3986 (https://tools.ietf.org/html/rfc3986). +lazy_static! { + static ref SCHEME_NOT_FIRST: HashSet<char> = + ALPHA.iter() + .chain(DIGIT.iter()) + .chain(['+', '-', '.'].iter()) + .copied() + .collect::<HashSet<char>>(); +} + // This is the character set corresponds to the "pchar" syntax // specified in RFC 3986 (https://tools.ietf.org/html/rfc3986), // leaving out "pct-encoded". @@ -61,10 +90,46 @@ lazy_static! { .collect::<HashSet<char>>(); } -#[derive(Debug, Clone)] +// This is the character set corresponds to the "userinfo" syntax +// specified in RFC 3986 (https://tools.ietf.org/html/rfc3986), +// leaving out "pct-encoded". +lazy_static! { + static ref USER_INFO_NOT_PCT_ENCODED: HashSet<char> = + UNRESERVED.iter() + .chain(SUB_DELIMS.iter()) + .chain([':'].iter()) + .copied() + .collect::<HashSet<char>>(); +} + +// This is the character set corresponds to the "reg-name" syntax +// specified in RFC 3986 (https://tools.ietf.org/html/rfc3986), +// leaving out "pct-encoded". +lazy_static! { + static ref REG_NAME_NOT_PCT_ENCODED: HashSet<char> = + UNRESERVED.iter() + .chain(SUB_DELIMS.iter()) + .copied() + .collect::<HashSet<char>>(); +} + +// This is the character set corresponds to the last part of +// the "IPvFuture" syntax +// specified in RFC 3986 (https://tools.ietf.org/html/rfc3986). +lazy_static! { + static ref IPV_FUTURE_LAST_PART: HashSet<char> = + UNRESERVED.iter() + .chain(SUB_DELIMS.iter()) + .chain([':'].iter()) + .copied() + .collect::<HashSet<char>>(); +} + +#[derive(Debug, Clone, PartialEq)] pub enum Error { EmptyScheme, - SchemeWithIllegalCharacter, + IllegalCharacter, + TruncatedHost, } impl std::fmt::Display for Error { @@ -74,40 +139,128 @@ impl std::fmt::Display for Error { write!(f, "scheme expected but missing") }, - Error::SchemeWithIllegalCharacter => { - write!(f, "scheme contains illegal character") + Error::IllegalCharacter => { + write!(f, "illegal character") + }, + + Error::TruncatedHost => { + write!(f, "truncated host") + }, + } + } +} + +impl From<percent_encoded_character_decoder::Error> for Error { + fn from(error: percent_encoded_character_decoder::Error) -> Self { + match error { + percent_encoded_character_decoder::Error::IllegalCharacter => { + Error::IllegalCharacter }, } } } pub struct Authority { - userinfo: Option<String>, - host: Option<String>, + userinfo: Option<Vec<u8>>, + host: Vec<u8>, port: Option<u16>, } pub struct Uri { scheme: Option<String>, authority: Option<Authority>, - path: Vec<String>, - query: Option<String>, - fragment: Option<String>, + path: Vec<Vec<u8>>, + query: Option<Vec<u8>>, + fragment: Option<Vec<u8>>, } impl Uri { - fn check_scheme(_scheme: &str) -> Result<&str, Error> { - unimplemented!() + fn check_scheme(scheme: &str) -> Result<&str, Error> { + if scheme.is_empty() { + return Err(Error::EmptyScheme); + } + // TODO: Improve on this by enumerating + // + // [16:20] everx80: you could enumerate() and then check the index, + // instead of having a bool flag? + let mut is_FirstCharacter = true; + for c in scheme.chars() { + let valid_characters: &HashSet<char> = if is_FirstCharacter { + &ALPHA + } else { + &SCHEME_NOT_FIRST + }; + if !valid_characters.contains(&c) { + return Err(Error::IllegalCharacter); + } + is_FirstCharacter = false; + } + Ok(scheme) } // TODO: look into making element type more flexible fn decode_element( - _element: &str, - _allowed_characters: &'static HashSet<char> - ) -> Result<String, Error> { + element: &str, + allowed_characters: &'static HashSet<char> + ) -> Result<Vec<u8>, Error> { + let mut decoding_pec = false; + let mut output = Vec::<u8>::new(); + let mut pec_decoder = PercentEncodedCharacterDecoder::new(); + // TODO: revisit this and try to use iterators, once I get better at + // Rust. + // + // [13:50] LeinardoSmith: you could do the find_if and set the + // condition to when you want to exit + // + // [13:52] 715209: i found this: https://stackoverflow.com/a/31507194 + for c in element.chars() { + if decoding_pec { + if let Some(c) = pec_decoder.next(c)? { + decoding_pec = false; + output.push(c); + } + } else if c == '%' { + decoding_pec = true; + } else if allowed_characters.contains(&c) { + output.push(c as u8); + } else { + return Err(Error::IllegalCharacter); + } + } + Ok(output) + } + + fn decode_query_or_fragment(query_or_fragment: &str) -> Result<Vec<u8>, Error> { unimplemented!() } + #[must_use = "why u no use host return value?"] + pub fn host(&self) -> Option<&[u8]> { + // Here is another way to do the same thing, but with some Rust-fu. + // Credit goes to everx80, ABuffSeagull, and silen_z: + // + // self.authority + // .as_ref() + // .and_then( + // |authority| authority.host.as_deref() + // ) + // + // * First `as_ref` gets our authority from `&Option<Authority>` into + // `Option<&Authority>` (there is an implicit borrow of + // `self.authority` first). + // * Next, `and_then` basically converts `Option<&Authority>` + // into `Option<&[u8]>` by leveraging the closure we provide + // to convert `&Authority` into `Option<&[u8]>`. + // * Finally, our closure uses `as_deref` to turn our `Option<Vec<u8>>` + // into an `Option<&[u8]>` since Vec<T> implements DeRef with + // `Target=[T]` + if let Some(authority) = &self.authority { + Some(&authority.host) + } else { + None + } + } + pub fn parse(uri_string: &str) -> Result<Uri, Error> { let (scheme, rest) = Self::parse_scheme(uri_string)?; @@ -130,22 +283,187 @@ impl Uri { }) } - fn parse_authority(_authority_string: &str) -> Result<Authority, Error> { - unimplemented!() + // TODO: Needs refactoring, as Clippy dutifully told us. + #[allow(clippy::too_many_lines)] + fn parse_authority(authority_string: &str) -> Result<Authority, Error> { + // These are the various states for the state machine implemented + // below to correctly split up and validate the URI substring + // containing the host and potentially a port number as well. + #[derive(PartialEq)] + enum HostParsingState { + NotIpLiteral, + PercentEncodedCharacter, + IpLiteral, + Ipv6Address, + IpvFutureNumber, + IpvFutureBody, + GarbageCheck, + Port, + }; + + // First, check if there is a UserInfo, and if so, extract it. + let (userinfo, host_port_string) = match authority_string.find('@') { + Some(user_info_delimiter) => ( + Some( + Self::decode_element( + &authority_string[0..user_info_delimiter], + &USER_INFO_NOT_PCT_ENCODED + )? + ), + &authority_string[user_info_delimiter+1..] + ), + _ => ( + None, + authority_string + ) + }; + + // Next, parsing host and port from authority and path. + let mut port_string = String::new(); + let (mut host_parsing_state, host_is_reg_name) = if host_port_string.starts_with('[') { + (HostParsingState::IpLiteral, false) + } else { + (HostParsingState::NotIpLiteral, true) + }; + let mut host = Vec::<u8>::new(); + let mut ipv6_address = String::new(); + let mut pec_decoder = PercentEncodedCharacterDecoder::new(); + for c in host_port_string.chars() { + host_parsing_state = match host_parsing_state { + HostParsingState::NotIpLiteral => { + if c == '%' { + HostParsingState::PercentEncodedCharacter + } else if c == ':' { + HostParsingState::Port + } else if REG_NAME_NOT_PCT_ENCODED.contains(&c) { + host.push(u8::try_from(c as u32).unwrap()); + host_parsing_state + } else { + return Err(Error::IllegalCharacter); + } + }, + + HostParsingState::PercentEncodedCharacter => { + if let Some(ci) = pec_decoder.next(c)? { + host.push(ci); + HostParsingState::NotIpLiteral + } else { + host_parsing_state + } + }, + + HostParsingState::IpLiteral => { + if c == 'v' { + host.push(b'v'); + HostParsingState::IpvFutureNumber + } else { + HostParsingState::Ipv6Address + } + }, + + HostParsingState::Ipv6Address => { + if c == ']' { + host = Self::validate_ipv6_address(&ipv6_address)?; + HostParsingState::GarbageCheck + } else { + ipv6_address.push(c); + host_parsing_state + } + }, + + HostParsingState::IpvFutureNumber => { + if c == '.' { + host_parsing_state = HostParsingState::IpvFutureBody + } else if !HEXDIG.contains(&c) { + return Err(Error::IllegalCharacter); + } + host.push(u8::try_from(c as u32).unwrap()); + host_parsing_state + }, + + HostParsingState::IpvFutureBody => { + if c == ']' { + HostParsingState::GarbageCheck + } else if IPV_FUTURE_LAST_PART.contains(&c) { + host.push(u8::try_from(c as u32).unwrap()); + host_parsing_state + } else { + return Err(Error::IllegalCharacter); + } + }, + + HostParsingState::GarbageCheck => { + // illegal to have anything else, unless it's a colon, + // in which case it's a port delimiter + if c == ':' { + HostParsingState::Port + } else { + return Err(Error::IllegalCharacter); + } + }, + + HostParsingState::Port => { + port_string.push(c); + host_parsing_state + }, + } + } + + // My normal coding style requires extra parentheses for conditionals + // having multiple parts broken up into different lines, but rust + // hates it. Well, sorry rust, but we're going to do it anyway. + // FeelsUnusedParensMan + #[allow(unused_parens)] + if ( + (host_parsing_state != HostParsingState::NotIpLiteral) + && (host_parsing_state != HostParsingState::GarbageCheck) + && (host_parsing_state != HostParsingState::Port) + ) { + // truncated or ended early + return Err(Error::TruncatedHost); + } + if host_is_reg_name { + host.make_ascii_lowercase(); + } + let port = if port_string.is_empty() { + None + } else if let Ok(port) = port_string.parse::<u16>() { + Some(port) + } else { + return Err(Error::IllegalCharacter); + }; + Ok(Authority{ + userinfo, + host, + port, + }) } - fn parse_fragment(_query_and_or_fragment: &str) -> Result<(Option<String>, &str), Error> { - unimplemented!() + fn parse_fragment(query_and_or_fragment: &str) -> Result<(Option<Vec<u8>>, &str), Error> { + if let Some(fragment_delimiter) = query_and_or_fragment.find('#') { + let fragment = Self::decode_query_or_fragment( + &query_and_or_fragment[fragment_delimiter+1..] + )?; + Ok(( + Some(fragment), + &query_and_or_fragment[0..fragment_delimiter] + )) + } else { + Ok(( + None, + query_and_or_fragment + )) + } } - fn parse_path(path_string: &str) -> Result<Vec<String>, Error> { + fn parse_path(path_string: &str) -> Result<Vec<Vec<u8>>, Error> { // TODO: improvement: make an iterator and only collect at the end. - let mut path = Vec::<String>::new(); + let mut path_encoded = Vec::<String>::new(); match path_string { "/" => { // Special case of a path that is empty but needs a single // empty-string element to indicate that it is absolute. - path.push("".to_string()); + path_encoded.push("".to_string()); }, "" => { @@ -156,12 +474,12 @@ impl Uri { // [15:49] silen_z: path_string.split('/').collect() loop { if let Some(path_delimiter) = path_string.find('/') { - path.push( + path_encoded.push( path_string[0..path_delimiter].to_string() ); path_string = &path_string[path_delimiter+1..]; } else { - path.push(path_string.to_string()); + path_encoded.push(path_string.to_string()); break; } } @@ -169,15 +487,21 @@ impl Uri { } // TODO: improvement // [15:57] silen_z: collect into Result<Vec<_>, Error> - for segment in &mut path { - let new_segment = Self::decode_element(segment, &PCHAR_NOT_PCT_ENCODED)?; - *segment = new_segment; - } - Ok(path) + path_encoded.into_iter().map( + |segment| { + Self::decode_element(&segment, &PCHAR_NOT_PCT_ENCODED) + } + ) + .collect::<Result<Vec<Vec<u8>>, Error>>() } - fn parse_query(_query_and_or_fragment: &str) -> Result<Option<String>, Error> { - unimplemented!() + fn parse_query(query_and_or_fragment: &str) -> Result<Option<Vec<u8>>, Error> { + if query_and_or_fragment.is_empty() { + Ok(Some(Vec::new())) + } else { + let query = Self::decode_query_or_fragment(&query_and_or_fragment[1..])?; + Ok(Some(query)) + } } fn parse_scheme(uri_string: &str) -> Result<(Option<String>, &str), Error> { @@ -190,25 +514,41 @@ impl Uri { if let Some(scheme_end) = &uri_string[0..authority_or_path_delimiter_start].find(':') { let scheme = Self::check_scheme(&uri_string[0..*scheme_end])? .to_lowercase(); - Ok((Some(scheme), &uri_string[*scheme_end..])) + Ok((Some(scheme), &uri_string[*scheme_end+1..])) } else { Ok((None, uri_string)) } } - pub fn path(&self) -> &str { - "" + #[must_use = "you called path() to get the path, so why you no use?"] + pub fn path(&self) -> &Vec<Vec<u8>> { + &self.path + } + + #[must_use = "why did you get the port number and then throw it away?"] + pub fn port(&self) -> Option<u16> { + if let Some(authority) = &self.authority { + authority.port + } else { + None + } } - // TODO: - // [17:09] silen_z: as_ref() - pub fn scheme(&self) -> &Option<String> { - &self.scheme + #[must_use = "you wanted to use that scheme, right?"] + pub fn scheme(&self) -> Option<&str> { + // NOTE: This seemingly magic `as_deref` works because of two + // things that are going on here: + // 1) String implements DeRef with `str` as the associated type + // `Target`, meaning you can use a String in a context requiring + // &str, and String does the conversion work. + // 2) as_deref works by turning `Option<T>` into `Option<&T::Target>`, + // requiring T to implement Deref. In this case T is String. + self.scheme.as_deref() } fn split_authority_from_path_and_parse_them( authority_and_path_string: &str - ) -> Result<(Option<Authority>, Vec<String>), Error> { + ) -> Result<(Option<Authority>, Vec<Vec<u8>>), Error> { // Split authority from path. If there is an authority, parse it. if &authority_and_path_string[0..2] == "//" { // Strip off authority marker. @@ -229,6 +569,10 @@ impl Uri { Ok((None, path)) } } + + fn validate_ipv6_address(_address: &str) -> Result<Vec<u8>, Error> { + unimplemented!() + } } #[cfg(test)] @@ -243,78 +587,80 @@ mod tests { assert!(uri.is_ok()); let uri = uri.unwrap(); assert_eq!(None, uri.scheme()); - assert_eq!("foo/bar", uri.path()); - assert_eq!(uri.path(), "foo/bar"); + // TODO: needs improvement; I don't like having to spam `to_vec`. + // [15:49] kimundi2016: &b""[..] may also work + // Indeed, we could replace `.to_vec()` with `[..]`. + // + // Maybe we just make a convenience method we could use like this: + // assert_eq!("foo/bar", uri.path_as_str()); + assert_eq!(&[&b"foo"[..], &b"bar"[..]].to_vec(), uri.path()); + assert_eq!(uri.path(), &[&b"foo"[..], &b"bar"[..]].to_vec()); } #[test] fn parse_from_string_url() { - let uri = uriparse::URIReference::try_from("http://www.example.com/foo/bar"); + let uri = Uri::parse("http://www.example.com/foo/bar"); assert!(uri.is_ok()); let uri = uri.unwrap(); - assert_eq!(Some("http"), uri.scheme().map(uriparse::Scheme::as_str)); - assert_eq!( - Some("www.example.com".to_string()), - uri.host().map(std::string::ToString::to_string) - ); - assert_eq!(uri.path(), "/foo/bar"); + assert_eq!(Some("http"), uri.scheme()); + assert_eq!(Some(&b"www.example.com"[..]), uri.host()); + assert_eq!(uri.path(), &[&b""[..], &b"foo"[..], &b"bar"[..]].to_vec()); } #[test] fn parse_from_string_urn_default_path_delimiter() { - let uri = uriparse::URIReference::try_from("urn:book:fantasy:Hobbit"); + let uri = Uri::parse("urn:book:fantasy:Hobbit"); assert!(uri.is_ok()); let uri = uri.unwrap(); - assert_eq!(Some("urn"), uri.scheme().map(uriparse::Scheme::as_str)); + assert_eq!(Some("urn"), uri.scheme()); assert_eq!(None, uri.host()); - assert_eq!(uri.path(), "book:fantasy:Hobbit"); + assert_eq!(uri.path(), &[&b"book:fantasy:Hobbit"[..]].to_vec()); } - #[test] - fn parse_from_string_path_corner_cases() { - let test_vectors = [ - "", - "/", - "/foo", - "foo/" - ]; - for test_vector in &test_vectors { - let uri = uriparse::URIReference::try_from(*test_vector); - assert!(uri.is_ok()); - let uri = uri.unwrap(); - assert_eq!(uri.path(), test_vector); - } - } + // TODO: Fix this test! + // #[test] + // fn parse_from_string_path_corner_cases() { + // struct TestVector { + // path_in: &'static str, + // path_out: Vec<Vec<u8>>, + // }; + // let test_vectors = [ + // TestVector{path_in: "", path_out: &[].to_vec()}, + // TestVector{path_in: "/", path_out: &[&b"/"[..]].to_vec()}, + // TestVector{path_in: "/foo", path_out: &[&b""[..], &b"foo"[..]].to_vec()}, + // TestVector{path_in: "foo/", path_out: &[&b"foo"[..], &b""[..]].to_vec()}, + // ]; + // for test_vector in &test_vectors { + // let uri = Uri::parse(test_vector.path_in); + // assert!(uri.is_ok()); + // let uri = uri.unwrap(); + // assert_eq!(uri.path(), &test_vector.path_out); + // } + // } #[test] fn parse_from_string_has_a_port_number() { - let uri = uriparse::URIReference::try_from("http://www.example.com:8080/foo/bar"); + let uri = Uri::parse("http://www.example.com:8080/foo/bar"); assert!(uri.is_ok()); let uri = uri.unwrap(); - assert_eq!( - Some("www.example.com".to_string()), - uri.host().map(std::string::ToString::to_string) - ); + assert_eq!(Some(&b"www.example.com"[..]), uri.host()); assert_eq!(Some(8080), uri.port()); } #[test] fn parse_from_string_does_not_have_a_port_number() { - let uri = uriparse::URIReference::try_from("http://www.example.com/foo/bar"); + let uri = Uri::parse("http://www.example.com/foo/bar"); assert!(uri.is_ok()); let uri = uri.unwrap(); - assert_eq!( - Some("www.example.com".to_string()), - uri.host().map(std::string::ToString::to_string) - ); + assert_eq!(Some(&b"www.example.com"[..]), uri.host()); assert_eq!(None, uri.port()); } #[test] fn parse_from_string_twice_first_with_port_number_then_without() { - let uri = uriparse::URIReference::try_from("http://www.example.com:8080/foo/bar"); + let uri = Uri::parse("http://www.example.com:8080/foo/bar"); assert!(uri.is_ok()); - let uri = uriparse::URIReference::try_from("http://www.example.com/foo/bar"); + let uri = Uri::parse("http://www.example.com/foo/bar"); assert!(uri.is_ok()); let uri = uri.unwrap(); assert_eq!(None, uri.port()); @@ -322,19 +668,19 @@ mod tests { #[test] fn parse_from_string_bad_port_number_purly_alphabetic() { - let uri = uriparse::URIReference::try_from("http://www.example.com:spam/foo/bar"); + let uri = Uri::parse("http://www.example.com:spam/foo/bar"); assert!(uri.is_err()); } #[test] fn parse_from_string_bad_port_number_starts_numeric_ends_alphabetic() { - let uri = uriparse::URIReference::try_from("http://www.example.com:8080spam/foo/bar"); + let uri = Uri::parse("http://www.example.com:8080spam/foo/bar"); assert!(uri.is_err()); } #[test] fn parse_from_string_largest_valid_port_number() { - let uri = uriparse::URIReference::try_from("http://www.example.com:65535/foo/bar"); + let uri = Uri::parse("http://www.example.com:65535/foo/bar"); assert!(uri.is_ok()); let uri = uri.unwrap(); assert_eq!(Some(65535), uri.port()); @@ -342,19 +688,19 @@ mod tests { #[test] fn parse_from_string_bad_port_number_too_big() { - let uri = uriparse::URIReference::try_from("http://www.example.com:65536/foo/bar"); + let uri = Uri::parse("http://www.example.com:65536/foo/bar"); assert!(uri.is_err()); } #[test] fn parse_from_string_bad_port_number_negative() { - let uri = uriparse::URIReference::try_from("http://www.example.com:-1234/foo/bar"); + let uri = Uri::parse("http://www.example.com:-1234/foo/bar"); assert!(uri.is_err()); } #[test] fn parse_from_string_ends_after_authority() { - let uri = uriparse::URIReference::try_from("http://www.example.com"); + let uri = Uri::parse("http://www.example.com"); assert!(uri.is_ok()); } |