diff options
-rw-r--r-- | src/authority.rs | 51 | ||||
-rw-r--r-- | src/context.rs | 28 | ||||
-rw-r--r-- | src/error.rs | 24 | ||||
-rw-r--r-- | src/lib.rs | 55 | ||||
-rw-r--r-- | src/uri.rs | 212 |
5 files changed, 363 insertions, 7 deletions
diff --git a/src/authority.rs b/src/authority.rs index d0b49d7..2a2801c 100644 --- a/src/authority.rs +++ b/src/authority.rs @@ -8,6 +8,42 @@ use super::error::Error; use super::parse_host_port::parse_host_port; use super::validate_ipv6_address::validate_ipv6_address; +/// This is the optional part of a URI which governs the URI's namespace. It +/// typically contains a host name or IP address, and may also include a port +/// number and/or userinfo component. +/// +/// # Examples +/// +/// ## Parsing an Authority into its components +/// +/// ```rust +/// # extern crate uri; +/// use uri::Authority; +/// +/// # fn test() -> Result<(), uri::Error> { +/// let authority = Authority::parse("nobody@www.example.com:8080")?; +/// assert_eq!(Some("nobody".as_bytes()), authority.userinfo()); +/// assert_eq!("www.example.com".as_bytes(), authority.host()); +/// assert_eq!(Some(8080), authority.port()); +/// # Ok(()) +/// # } +/// ``` +/// +/// ## Generating an Authority from its components +/// +/// ```rust +/// # extern crate uri; +/// use uri::Authority; +/// +/// # fn test() -> Result<(), uri::Error> { +/// let mut authority = Authority::default(); +/// authority.set_userinfo(Some("nobody").map(Into::into)); +/// authority.set_host("www.example.com"); +/// authority.set_port(Some(8080)); +/// assert_eq!("nobody@www.example.com:8080", authority.to_string()); +/// # Ok(()) +/// # } +/// ``` #[derive(Clone, Debug, Default, PartialEq)] pub struct Authority { userinfo: Option<Vec<u8>>, @@ -16,37 +52,52 @@ pub struct Authority { } impl Authority { + /// Borrow the host name part of the Authority. #[must_use = "why u no use host return value?"] pub fn host(&self) -> &[u8] { &self.host } + /// Borrow the port number part of the Authority. #[must_use = "why did you get the port number and then throw it away?"] pub fn port(&self) -> Option<u16> { self.port } + /// Change the userinfo part of the Authority. 
pub fn set_userinfo<T>(&mut self, userinfo: T) where T: Into<Option<Vec<u8>>> { self.userinfo = userinfo.into(); } + /// Change the host name part of the Authority. pub fn set_host<T>(&mut self, host: T) where T: Into<Vec<u8>> { self.host = host.into(); } + /// Change the port number part of the Authority. pub fn set_port(&mut self, port: Option<u16>) { self.port = port; } + /// Borrow the userinfo part of the Authority. #[must_use = "security breach... security breach... userinfo not used"] pub fn userinfo(&self) -> Option<&[u8]> { self.userinfo.as_deref() } + /// Interpret the given string as the Authority component of a URI, + /// separating its various subcomponents, returning an `Authority` value + /// containing them. + /// + /// # Errors + /// + /// There are many ways to screw up the Authority part of a URI string, and + /// this function will let you know what's up by returning a variant of the + /// [`Error`](enum.Error.html) type. #[must_use = "you parsed it; don't you want the results?"] pub fn parse<T>(authority_string: T) -> Result<Self, Error> where T: AsRef<str> diff --git a/src/context.rs b/src/context.rs index bbc6613..bb6667e 100644 --- a/src/context.rs +++ b/src/context.rs @@ -1,13 +1,41 @@ +/// This enumerates the various places where an error might occur parsing a +/// URI. #[derive(Debug, Clone, Copy, PartialEq)] pub enum Context { + /// This is the fragment of the URI, such as `#baz` in + /// `http://www.example.com/foo?bar#baz`. Fragment, + + /// This is the host name of the URI, such as `www.example.com` in + /// `http://www.example.com/foo?bar#baz`. Host, + + /// This is the IPv4 portion of the IPv6 host name in the URI, such as + /// `1.2.3.4` in `http://[::ffff:1.2.3.4]/foo?bar#baz`. Ipv4Address, + + /// This is the IPv6 host name in the URI, such as + /// `::ffff:1.2.3.4` in `http://[::ffff:1.2.3.4]/foo?bar#baz`. Ipv6Address, + + /// This is the `IPvFuture` host name in the URI, such as + /// `v7.aB` in `http://[v7.aB]/foo?bar#baz`. 
IpvFuture, + + /// This is the path of the URI, such as `/foo` in + /// `http://www.example.com/foo?bar#baz`. Path, + + /// This is the query of the URI, such as `?bar` in + /// `http://www.example.com/foo?bar#baz`. Query, + + /// This is the scheme of the URI, such as `http` in + /// `http://www.example.com/foo?bar#baz`. Scheme, + + /// This is the userinfo of the URI, such as `nobody` in + /// `http://nobody@www.example.com/foo?bar#baz`. Userinfo, } diff --git a/src/error.rs b/src/error.rs index 9f710fa..44bd8b6 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,37 +1,61 @@ use super::context::Context; +/// This is the enumeration of all the different kinds of errors which this +/// crate generates. #[derive(Debug, Clone, thiserror::Error, PartialEq)] pub enum Error { + /// One or more components of the URI contains a non-UTF8 sequence, + /// and so cannot be converted to a Rust string. #[error("URI contains non-UTF8 sequences")] CannotExpressAsUtf8(#[from] std::string::FromUtf8Error), + /// URI begins with an empty scheme, such as `://www.example.com` #[error("scheme expected but missing")] EmptyScheme, + /// URI contains a character which is not permitted in the context where it + /// was encountered; for example, a caret (`^`) in a query: + /// `http://www.example.com?eat_my_^` #[error("illegal character in {0}")] IllegalCharacter(Context), + /// URI contains an incorrect percent encoding, such as + /// `http://www.example.com?foo=%GG` #[error("illegal percent encoding")] IllegalPercentEncoding, + /// URI contains an invalid port number, such as + /// `http://www.example.com:99999` or `http://www.example.com:foo` #[error("illegal port number")] IllegalPortNumber(#[source] std::num::ParseIntError), + /// URI contains an IPv4 address with one or more bad parts, such as + /// `http://[::ffff:1.2.3.256]/` #[error("octet group expected")] InvalidDecimalOctet, + /// URI contains an IP address with missing parts, such as + /// `http://[::ffff:1.2.3]/` #[error("too few 
address parts")] TooFewAddressParts, + /// URI contains an IP address with too many parts, such as + /// `http://[::ffff:1.2.3.4.8]/` #[error("too many address parts")] TooManyAddressParts, + /// URI contains an IPv6 address with too many digits, such as + /// `http://[20001:db8:85a3::1]/` #[error("too many digits in IPv6 address part")] TooManyDigits, + /// URI contains an IPv6 address with more than one double-colon, such as + /// `http://[2001:db8:85a3::8a2e::]/` #[error("too many double-colons in IPv6 address")] TooManyDoubleColons, + /// URI contains an IPv6 address that is truncated, such as + /// `http://[2001:db8:85a3::8a2e:0:]/` #[error("truncated host")] TruncatedHost, } @@ -1,3 +1,58 @@ +//! This crate implements [IETF RFC 3986](https://tools.ietf.org/html/rfc3986), +//! "Uniform Resource Identifier (URI): Generic Syntax". The [`Uri`] type +//! can be used to parse and generate RFC-conformant URI strings to and from +//! their various components. +//! +//! A Uniform Resource Identifier (URI) is a compact sequence of characters +//! that identifies an abstract or physical resource. One common form of URI +//! is the Uniform Resource Locator (URL), used to reference web resources: +//! +//! ```text +//! http://www.example.com/foo?bar#baz +//! ``` +//! +//! Another kind of URI is the path reference: +//! +//! ```text +//! /usr/bin/zip +//! ``` +//! +//! # Examples +//! +//! ## Parsing a URI into its components +//! +//! ```rust +//! # extern crate uri; +//! use uri::Uri; +//! +//! let uri = Uri::parse("http://www.example.com/foo?bar#baz").unwrap(); +//! let authority = uri.authority().unwrap(); +//! assert_eq!("www.example.com".as_bytes(), authority.host()); +//! assert_eq!(Some("www.example.com"), uri.host_as_string().unwrap().as_deref()); +//! assert_eq!("/foo", uri.path_as_string().unwrap()); +//! assert_eq!(Some("bar"), uri.query_as_string().unwrap().as_deref()); +//! assert_eq!(Some("baz"), uri.fragment_as_string().unwrap().as_deref()); +//! ``` +//! 
+//! ## Generating a URI from its components +//! +//! ```rust +//! # extern crate uri; +//! use uri::{Authority, Uri}; +//! +//! let mut uri = Uri::default(); +//! assert!(uri.set_scheme(String::from("http")).is_ok()); +//! let mut authority = Authority::default(); +//! authority.set_host("www.example.com"); +//! uri.set_authority(Some(authority)); +//! uri.set_path_from_str("/foo"); +//! uri.set_query(Some("bar".into())); +//! uri.set_fragment(Some("baz".into())); +//! assert_eq!("http://www.example.com/foo?bar#baz", uri.to_string()); +//! ``` +//! +//! [`Uri`]: struct.Uri.html + #![warn(clippy::pedantic)] #![allow(clippy::non_ascii_literal)] #![warn(missing_docs)] @@ -12,6 +12,73 @@ use super::character_classes::{ QUERY_NOT_PCT_ENCODED_WITHOUT_PLUS, }; +/// This type is used to parse and generate URI strings to and from their +/// various components. Components are percent-encoded as necessary during +/// generation, and percent encodings are decoded during parsing. +/// +/// Since most URI components, once decoded, may include non-UTF8 byte +/// sequences (which are always percent-encoded), getter methods such as +/// [`path`] and [`query`] return byte array [slice] references (`&[u8]`) +/// rather than string or string slice references. Fallible convenience +/// methods ending in `_as_string`, such as [`path_as_string`] and +/// [`query_as_string`], are provided to convert these to strings. +/// +/// The "Authority" part of the Uri is represented by the [`Authority` type]. +/// Although the `Uri` type provides [`userinfo`], [`host`], and [`port`] +/// methods for convenience, `Uri` holds these components through the +/// [`Authority` type], which can be accessed via [`authority`] and +/// [`set_authority`]. To set or change the userinfo, host, or port of a +/// `Uri`, construct a new `Authority` value and set it in the `Uri` with +/// [`set_authority`]. 
+/// +/// # Examples +/// +/// ## Parsing a URI into its components +/// +/// ```rust +/// # extern crate uri; +/// use uri::Uri; +/// +/// # fn test() -> Result<(), uri::Error> { +/// let uri = Uri::parse("http://www.example.com/foo?bar#baz")?; +/// let authority = uri.authority().unwrap(); +/// assert_eq!("www.example.com".as_bytes(), authority.host()); +/// assert_eq!(Some("www.example.com"), uri.host_as_string()?.as_deref()); +/// assert_eq!("/foo", uri.path_as_string()?); +/// assert_eq!(Some("bar"), uri.query_as_string()?.as_deref()); +/// assert_eq!(Some("baz"), uri.fragment_as_string()?.as_deref()); +/// # Ok(()) +/// # } +/// ``` +/// +/// ## Generating a URI from its components +/// +/// ```rust +/// # extern crate uri; +/// use uri::{Authority, Uri}; +/// +/// let mut uri = Uri::default(); +/// assert!(uri.set_scheme(String::from("http")).is_ok()); +/// let mut authority = Authority::default(); +/// authority.set_host("www.example.com"); +/// uri.set_authority(Some(authority)); +/// uri.set_path_from_str("/foo"); +/// uri.set_query(Some("bar".into())); +/// uri.set_fragment(Some("baz".into())); +/// assert_eq!("http://www.example.com/foo?bar#baz", uri.to_string()); +/// ``` +/// +/// [`authority`]: #method.authority +/// [`Authority` type]: struct.Authority.html +/// [`host`]: #method.host +/// [`path`]: #method.path +/// [`path_as_string`]: #method.path_as_string +/// [`port`]: #method.port +/// [`query`]: #method.query +/// [`query_as_string`]: #method.query_as_string +/// [`set_authority`]: #method.set_authority +/// [`userinfo`]: #method.userinfo +/// [slice]: https://doc.rust-lang.org/std/primitive.slice.html #[derive(Clone, Debug, Default, PartialEq)] pub struct Uri { scheme: Option<String>, @@ -22,6 +89,7 @@ pub struct Uri { } impl Uri { + /// Borrow the authority (if any) of the URI. 
#[must_use = "respect mah authoritah"] pub fn authority(&self) -> Option<&Authority> { self.authority.as_ref() @@ -67,6 +135,8 @@ impl Uri { Ok(scheme) } + /// Determines if the URI contains a relative path rather than an absolute + /// path. #[must_use = "please use the return value kthxbye"] pub fn contains_relative_path(&self) -> bool { !Self::is_path_absolute(&self.path) @@ -85,11 +155,18 @@ impl Uri { ) } + /// Borrow the fragment (if any) of the URI. #[must_use = "A query and a fragment walked into a bar. Too bad you're ignoring the fragment because it's actually a funny joke."] pub fn fragment(&self) -> Option<&[u8]> { self.fragment.as_deref() } + /// Convert the fragment (if any) into a string. + /// + /// # Errors + /// + /// Since fragments may contain non-UTF8 byte sequences, this function may + /// return [`Error::CannotExpressAsUtf8`](enum.Error.html#variant.CannotExpressAsUtf8). #[must_use = "use the fragment return value silly programmer"] pub fn fragment_as_string(&self) -> Result<Option<String>, Error> { self.fragment() @@ -100,6 +177,7 @@ impl Uri { .transpose() } + /// Borrow the host portion of the Authority (if any) of the URI. #[must_use = "why u no use host return value?"] pub fn host(&self) -> Option<&[u8]> { self.authority @@ -107,6 +185,13 @@ impl Uri { .map(Authority::host) } + /// Convert the host portion of the Authority (if any) into a string. + /// + /// # Errors + /// + /// Since host names may contain non-UTF8 byte sequences, this function may + /// return + /// [`Error::CannotExpressAsUtf8`](enum.Error.html#variant.CannotExpressAsUtf8). #[must_use = "I made that host field into a string for you; don't you want it?"] pub fn host_as_string(&self) -> Result<Option<String>, Error> { self.host() @@ -126,19 +211,38 @@ impl Uri { } } + /// Determines if the URI is a `relative-ref` (relative reference), as + /// defined in [RFC 3986 section + /// 4.2](https://tools.ietf.org/html/rfc3986#section-4.2). 
A relative + /// reference has no scheme, but may still have an authority. #[must_use = "why would you call an accessor method and not use the return value, silly human"] pub fn is_relative_reference(&self) -> bool { self.scheme.is_none() } + /// Apply the `remove_dot_segments` routine talked about + /// in [RFC 3986 section + /// 5.2](https://tools.ietf.org/html/rfc3986#section-5.2) to the path + /// segments of the URI, in order to normalize the path (apply and remove + /// "." and ".." segments). + /// + /// # Examples + /// + /// ```rust + /// # extern crate uri; + /// use uri::Uri; + /// + /// # fn test() -> Result<(), uri::Error> { + /// let mut uri = Uri::parse("/a/b/c/./../../g")?; + /// uri.normalize(); + /// assert_eq!("/a/g", uri.path_as_string()?); + /// # Ok(()) + /// # } + /// ``` pub fn normalize(&mut self) { self.path = Self::normalize_path(&self.path); } - // This method applies the "remove_dot_segments" routine talked about - // in RFC 3986 (https://tools.ietf.org/html/rfc3986) to the path - // segments of the URI, in order to normalize the path - // (apply and remove "." and ".." segments). fn normalize_path<T>(original_path: T) -> Vec<Vec<u8>> where T: AsRef<[Vec<u8>]> { @@ -190,6 +294,14 @@ impl Uri { normalized_path } + /// Interpret the given string as a URI, separating its various components, + /// returning a `Uri` value containing them. + /// + /// # Errors + /// + /// There are many ways to screw up a URI string, and this function will + /// let you know what's up by returning a variant of the + /// [`Error`](enum.Error.html) type. pub fn parse<T>(uri_string: T) -> Result<Self, Error> where T: AsRef<str> { @@ -292,11 +404,43 @@ impl Uri { } } + /// Borrow the path component of the URI. 
+ /// + /// The path is represented as a two-dimensional vector: + /// * the "segments" or pieces of the path between the slashes + /// * the bytes that make up each segment + /// + /// Byte vectors are used instead of strings because segments may contain + /// non-UTF8 sequences. + /// + /// Leading and trailing slashes in the path are special cases represented + /// by extra empty segments at the beginning and/or end of the path. + /// + /// # Examples + /// + /// (Note: the examples below show strings, not byte vectors, simply to be + /// more readable.) + /// + /// ```text + /// "foo/bar" -> ["foo", "bar"] + /// "/foo/bar" -> ["", "foo", "bar"] + /// "foo/bar/" -> ["foo", "bar", ""] + /// "/foo/bar/" -> ["", "foo", "bar", ""] + /// "/" -> [""] + /// "" -> [] + /// ``` #[must_use = "you called path() to get the path, so why you no use?"] pub fn path(&self) -> &Vec<Vec<u8>> { &self.path } + /// Convert the path portion of the URI into a string. + /// + /// # Errors + /// + /// Since path segments may contain non-UTF8 byte sequences, this function + /// may return + /// [`Error::CannotExpressAsUtf8`](enum.Error.html#variant.CannotExpressAsUtf8). #[must_use = "we went through all that trouble to put the path into a string, and you don't want it?"] pub fn path_as_string(&self) -> Result<String, Error> { match &*self.path { @@ -310,6 +454,7 @@ impl Uri { } } + /// Return a copy of the port (if any) contained in the URI. #[must_use = "why did you get the port number and then throw it away?"] pub fn port(&self) -> Option<u16> { if let Some(authority) = &self.authority { @@ -319,11 +464,18 @@ impl Uri { } } + /// Borrow the query (if any) of the URI. #[must_use = "don't you want to know what that query was?"] pub fn query(&self) -> Option<&[u8]> { self.query.as_deref() } + /// Convert the query (if any) into a string. 
+ /// + /// # Errors + /// + /// Since queries may contain non-UTF8 byte sequences, this function may + /// return [`Error::CannotExpressAsUtf8`](enum.Error.html#variant.CannotExpressAsUtf8). #[must_use = "use the query return value silly programmer"] pub fn query_as_string(&self) -> Result<Option<String>, Error> { self.query() @@ -334,11 +486,26 @@ impl Uri { .transpose() } + /// Return a new URI which is the result of applying the given relative + /// reference to the URI, following the algorithm from [RFC 3986 section + /// 5.2.2](https://tools.ietf.org/html/rfc3986#section-5.2.2). + /// + /// # Examples + /// + /// ```rust + /// # extern crate uri; + /// use uri::Uri; + /// + /// # fn test() -> Result<(), uri::Error> { + /// let base = Uri::parse("http://a/b/c/d;p?q")?; + /// let relative_reference = Uri::parse("g;x?y#s")?; + /// let resolved = base.resolve(&relative_reference); + /// assert_eq!("http://a/b/c/g;x?y#s", resolved.to_string()); + /// # Ok(()) + /// # } + /// ``` #[must_use = "why go through all that effort to resolve the URI, when you're not going to use it?!"] pub fn resolve(&self, relative_reference: &Self) -> Self { - // Resolve the reference by following the algorithm - // from section 5.2.2 in - // RFC 3986 (https://tools.ietf.org/html/rfc3986). let (scheme, authority, path, query) = if relative_reference.scheme.is_some() { ( relative_reference.scheme.clone(), @@ -409,6 +576,7 @@ impl Uri { } } + /// Borrow the scheme (if any) component of the URI. #[must_use = "you wanted to use that scheme, right?"] pub fn scheme(&self) -> Option<&str> { // NOTE: This seemingly magic `as_deref` works because of two @@ -421,24 +589,35 @@ impl Uri { self.scheme.as_deref() } + /// Change the authority of the URI. pub fn set_authority<T>(&mut self, authority: T) where T: Into<Option<Authority>> { self.authority = authority.into(); } + /// Change the fragment of the URI. 
pub fn set_fragment<T>(&mut self, fragment: T) where T: Into<Option<Vec<u8>>> { self.fragment = fragment.into(); } + /// Change the path of the URI. + /// + /// Note: See [`path`](#method.path) for special notes about what the + /// segments of the path mean. pub fn set_path<T>(&mut self, path: T) where T: Into<Vec<Vec<u8>>> { self.path = path.into(); } + /// Change the path of the URI using a string which is split by its slash + /// (`/`) characters to determine the path segments. + /// + /// Note: See [`path`](#method.path) for special notes about what the + /// segments of the path mean. pub fn set_path_from_str<T>(&mut self, path: T) where T: AsRef<str> { @@ -453,12 +632,20 @@ impl Uri { } } + /// Change the query of the URI. pub fn set_query<T>(&mut self, query: T) where T: Into<Option<Vec<u8>>> { self.query = query.into(); } + /// Change the scheme of the URI. + /// + /// # Errors + /// + /// The set of characters allowed in the scheme of a URI is limited. + /// [`Error::IllegalCharacter`](enum.Error.html#variant.IllegalCharacter) + /// is returned if you try to use a character that isn't allowed. pub fn set_scheme<T>(&mut self, scheme: T) -> Result<(), Error> where T: Into<Option<String>> { @@ -503,6 +690,11 @@ impl Uri { } } + /// Borrow the userinfo portion (if any) of the Authority (if any) of the + /// URI. + /// + /// Note that you can get `None` if there is either no Authority in the URI + /// or there is an Authority in the URI but it has no userinfo in it. #[must_use = "security breach... security breach... userinfo not used"] pub fn userinfo(&self) -> Option<&[u8]> { if let Some(authority) = &self.authority { @@ -512,6 +704,12 @@ impl Uri { } } + /// Convert the userinfo portion (if any) of the Authority (if any) into a + /// string. + /// + /// # Errors + /// + /// Since userinfo may contain non-UTF8 byte sequences, this function may + /// return [`Error::CannotExpressAsUtf8`](enum.Error.html#variant.CannotExpressAsUtf8). 
#[must_use = "come on, you intended to use that userinfo return value, didn't you?"] pub fn userinfo_as_string(&self) -> Result<Option<String>, Error> { self.userinfo() |