From 58b228cbf8c3013bedb48e65c6dc0199076b592a Mon Sep 17 00:00:00 2001 From: Richard Walters Date: Tue, 6 Oct 2020 12:53:09 -0700 Subject: Work in progress porting implementation to Rust --- Cargo.toml | 3 + src/lib.rs | 246 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 237 insertions(+), 12 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f59789b..e9f72ce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,5 +6,8 @@ edition = "2018" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[dependencies] +lazy_static = "1.4" + [dev-dependencies] uriparse = "0.6" diff --git a/src/lib.rs b/src/lib.rs index 11b702e..11e02a3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,27 +1,249 @@ #![warn(clippy::pedantic)] +#![allow(clippy::missing_errors_doc)] + +#[macro_use] +extern crate lazy_static; + +use std::collections::HashSet; + +// This is the character set containing just the alphabetic characters +// from the ASCII character set. +// +// TODO: improvement +// [16:16] silen_z: btw char::is_ascii_letter or something like that exists +lazy_static! { + static ref ALPHA: HashSet = + ('a'..='z') + .chain('A'..='Z') + .collect::>(); +} + +// This is the character set containing just numbers. +lazy_static! { + static ref DIGIT: HashSet = + ('0'..='9') + .collect::>(); +} + +// This is the character set corresponds to the "unreserved" syntax +// specified in RFC 3986 (https://tools.ietf.org/html/rfc3986). +lazy_static! { + static ref UNRESERVED: HashSet = + ALPHA.iter() + .chain(DIGIT.iter()) + .chain(['-', '.', '_', '~'].iter()) + .copied() + .collect::>(); +} + +// This is the character set corresponds to the "sub-delims" syntax +// specified in RFC 3986 (https://tools.ietf.org/html/rfc3986). +lazy_static! { + static ref SUB_DELIMS: HashSet = + [ + '!', '$', '&', '\'', '(', ')', + '*', '+', ',', ';', '=' + ] + .iter() + .copied() + .collect::>(); +} + +// This is the character set corresponds to the "pchar" syntax +// specified in RFC 3986 (https://tools.ietf.org/html/rfc3986), +// leaving out "pct-encoded". +lazy_static! { + static ref PCHAR_NOT_PCT_ENCODED: HashSet = + UNRESERVED.iter() + .chain(SUB_DELIMS.iter()) + .chain([':', '@'].iter()) + .copied() + .collect::>(); +} + +#[derive(Debug, Clone)] +pub enum Error { + EmptyScheme, + SchemeWithIllegalCharacter, +} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Error::EmptyScheme => { + write!(f, "scheme expected but missing") + }, + + Error::SchemeWithIllegalCharacter => { + write!(f, "scheme contains illegal character") + }, + } + } +} + +pub struct Authority { + userinfo: Option, + host: Option, + port: Option, +} + +pub struct Uri { + scheme: Option, + authority: Option, + path: Vec, + query: Option, + fragment: Option, +} + +impl Uri { + fn check_scheme(_scheme: &str) -> Result<&str, Error> { + unimplemented!() + } + + // TODO: look into making element type more flexible + fn decode_element( + _element: &str, + _allowed_characters: &'static HashSet + ) -> Result { + unimplemented!() + } + + pub fn parse(uri_string: &str) -> Result { + let (scheme, rest) = Self::parse_scheme(uri_string)?; + + let path_end = rest.find(&['?', '#'][..]) + .unwrap_or_else(|| rest.len()); +// let path_end = rest.find(|c| "?#".find(c).is_some()) +// .unwrap_or(rest.len()); + + let authority_and_path_string = &rest[0..path_end]; + let query_and_or_fragment = &rest[path_end..]; + let (authority, path) = Self::split_authority_from_path_and_parse_them(authority_and_path_string)?; + let (fragment, possible_query) = Self::parse_fragment(query_and_or_fragment)?; + let query = Self::parse_query(possible_query)?; + Ok(Uri{ + scheme, + authority, + path, + query, + fragment + }) + } + + fn parse_authority(_authority_string: &str) -> Result { + unimplemented!() + } + + fn parse_fragment(_query_and_or_fragment: &str) -> Result<(Option, &str), Error> { + unimplemented!() + } + + fn parse_path(path_string: &str) -> Result, Error> { + // TODO: improvement: make an iterator and only collect at the end. + let mut path = Vec::::new(); + match path_string { + "/" => { + // Special case of a path that is empty but needs a single + // empty-string element to indicate that it is absolute. + path.push("".to_string()); + }, + + "" => { + }, + + mut path_string => { + // TODO: Try out this improvement: + // [15:49] silen_z: path_string.split('/').collect() + loop { + if let Some(path_delimiter) = path_string.find('/') { + path.push( + path_string[0..path_delimiter].to_string() + ); + path_string = &path_string[path_delimiter+1..]; + } else { + path.push(path_string.to_string()); + break; + } + } + } + } + // TODO: improvement + // [15:57] silen_z: collect into Result, Error> + for segment in &mut path { + let new_segment = Self::decode_element(segment, &PCHAR_NOT_PCT_ENCODED)?; + *segment = new_segment; + } + Ok(path) + } + + fn parse_query(_query_and_or_fragment: &str) -> Result, Error> { + unimplemented!() + } + + fn parse_scheme(uri_string: &str) -> Result<(Option, &str), Error> { + // Limit our search so we don't scan into the authority + // or path elements, because these may have the colon + // character as well, which we might misinterpret + // as the scheme delimiter. + let authority_or_path_delimiter_start = uri_string.find('/') + .unwrap_or_else(|| uri_string.len()); + if let Some(scheme_end) = &uri_string[0..authority_or_path_delimiter_start].find(':') { + let scheme = Self::check_scheme(&uri_string[0..*scheme_end])? + .to_lowercase(); + Ok((Some(scheme), &uri_string[*scheme_end..])) + } else { + Ok((None, uri_string)) + } + } + + pub fn path(&self) -> &str { + "" + } + + // TODO: + // [17:09] silen_z: as_ref() + pub fn scheme(&self) -> &Option { + &self.scheme + } + + fn split_authority_from_path_and_parse_them( + authority_and_path_string: &str + ) -> Result<(Option, Vec), Error> { + // Split authority from path. If there is an authority, parse it. + if &authority_and_path_string[0..2] == "//" { + // Strip off authority marker. + let authority_and_path_string = &authority_and_path_string[2..]; + + // First separate the authority from the path. + let authority_end = authority_and_path_string.find('/') + .unwrap_or_else(|| authority_and_path_string.len()); + let authority_string = &authority_and_path_string[0..authority_end]; + let path_string = &authority_and_path_string[authority_end..]; + + // Parse the elements inside the authority string. + let authority = Self::parse_authority(authority_string)?; + let path = Self::parse_path(path_string)?; + Ok((Some(authority), path)) + } else { + let path = Self::parse_path(authority_and_path_string)?; + Ok((None, path)) + } + } +} #[cfg(test)] mod tests { + use super::*; use std::convert::TryFrom; #[test] fn parse_from_string_no_scheme() { - let uri = uriparse::URIReference::try_from("foo/bar"); + let uri = Uri::parse("foo/bar"); assert!(uri.is_ok()); let uri = uri.unwrap(); assert_eq!(None, uri.scheme()); - - // FIXME: Due to what looks like a bug in either uriparse - // or the Rust core (so probably uriparse after all), I can't - // order the arguments as I would like: - // - //assert_eq!("foo/bar", uri.path()); - // - // This causes a stack overflow. - // https://github.com/sgodwincs/uriparse-rs/issues/14 - // - // So for now, we work around the issue by swapping the two arguments: + assert_eq!("foo/bar", uri.path()); assert_eq!(uri.path(), "foo/bar"); } -- cgit v1.2.3