diff --git a/examples/builder/builder.rs b/examples/builder/builder.rs index 85367bf4a3..f6794d37fc 100644 --- a/examples/builder/builder.rs +++ b/examples/builder/builder.rs @@ -17,10 +17,7 @@ async fn main() -> Result<()> { let mut headers = HeaderMap::new(); headers.insert(header::ACCEPT, "text/html".parse().unwrap()); - let accepted = Some(HashSet::from_iter(vec![ - StatusCode::OK, - StatusCode::NO_CONTENT, - ])); + let accepted = HashSet::from_iter(vec![StatusCode::OK, StatusCode::NO_CONTENT]); let client = ClientBuilder::builder() .excludes(excludes) diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index 2dca723aea..e5592e1fc0 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -17,6 +17,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - let remaps = parse_remaps(&cfg.remap)?; let includes = RegexSet::new(&cfg.include)?; let excludes = RegexSet::new(&cfg.exclude)?; + let accepted: HashSet = cfg.accept.clone().try_into()?; // Offline mode overrides the scheme let schemes = if cfg.offline { @@ -25,14 +26,6 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - cfg.scheme.clone() }; - let accepted = cfg - .accept - .clone() - .into_set() - .iter() - .map(|value| StatusCode::from_u16(*value)) - .collect::, _>>()?; - let headers = HeaderMap::from_header_pairs(&cfg.header)?; ClientBuilder::builder() diff --git a/lychee-bin/src/commands/check.rs b/lychee-bin/src/commands/check.rs index b622f29f2f..d661657e2c 100644 --- a/lychee-bin/src/commands/check.rs +++ b/lychee-bin/src/commands/check.rs @@ -49,7 +49,7 @@ where let client = params.client; let cache = params.cache; let cache_exclude_status = params.cfg.cache_exclude_status.into_set(); - let accept = params.cfg.accept.into_set(); + let accept = params.cfg.accept.into(); let pb = if params.cfg.no_progress || params.cfg.verbose.log_level() >= log::Level::Info { None diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index e08f766154..1119a600df 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -562,7 +562,7 @@ mod cli { .failure() .code(2) .stdout(contains( - "[404] https://github.com/mre/idiomatic-rust-doesnt-exist-man | Network error: Not Found" + r#"[404] https://github.com/mre/idiomatic-rust-doesnt-exist-man | Rejected status code (this depends on your "accept" configuration): Not Found"# )) .stderr(contains( "There were issues with GitHub URLs. You could try setting a GitHub token and running lychee again.", @@ -1118,7 +1118,7 @@ mod cli { mock_server_no_content.uri() ))) .stderr(contains(format!( - "[429] {}/ | Network error: Too Many Requests\n", + "[429] {}/ | Rejected status code (this depends on your \"accept\" configuration): Too Many Requests\n", mock_server_too_many_requests.uri() ))); @@ -1176,11 +1176,11 @@ mod cli { .failure() .code(2) .stdout(contains(format!( - "[418] {}/ | Network error: I\'m a teapot", + r#"[418] {}/ | Rejected status code (this depends on your "accept" configuration): I'm a teapot"#, mock_server_teapot.uri() ))) .stdout(contains(format!( - "[500] {}/ | Network error: Internal Server Error", + r#"[500] {}/ | Rejected status code (this depends on your "accept" configuration): Internal Server Error"#, mock_server_server_error.uri() ))); @@ -1213,6 +1213,26 @@ mod cli { Ok(()) } + #[tokio::test] + async fn test_accept_overrides_defaults_not_additive() -> Result<()> { + let mock_server_200 = mock_server!(StatusCode::OK); + + let mut cmd = main_command(); + cmd.arg("--accept") + .arg("404") // ONLY accept 404 - should reject 200 as we overwrite the default + .arg("-") + .write_stdin(mock_server_200.uri()) + .assert() + .failure() + .code(2) + .stdout(contains(format!( + r#"[200] {}/ | Rejected status code (this depends on your "accept" configuration): OK"#, + mock_server_200.uri() + ))); + + Ok(()) + } + #[tokio::test] async fn test_skip_cache_unsupported() -> Result<()> { let base_path = fixtures_path().join("cache"); diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs index 60a92f056f..de118bee28 100644 --- a/lychee-lib/src/checker/website.rs +++ b/lychee-lib/src/checker/website.rs @@ -36,7 +36,7 @@ pub(crate) struct WebsiteChecker { /// Set of accepted return codes / status codes. /// /// Unmatched return codes/ status codes are deemed as errors. - accepted: Option>, + accepted: HashSet, /// Requires using HTTPS when it's available. /// @@ -59,7 +59,7 @@ impl WebsiteChecker { retry_wait_time: Duration, max_retries: u64, reqwest_client: reqwest::Client, - accepted: Option>, + accepted: HashSet, github_client: Option, require_https: bool, plugin_request_chain: RequestChain, @@ -102,11 +102,12 @@ impl WebsiteChecker { let method = request.method().clone(); match self.reqwest_client.execute(request).await { Ok(response) => { - let mut status = Status::new(&response, self.accepted.clone()); + let status = Status::new(&response, &self.accepted); if self.include_fragments && status.is_success() && method == Method::GET { - status = self.check_html_fragment(status, response).await; + self.check_html_fragment(status, response).await + } else { + status } - status } Err(e) => e.into(), } @@ -128,7 +129,7 @@ impl WebsiteChecker { .await { Ok(true) => status, - Ok(false) => Status::Error(ErrorKind::InvalidFragment(url.clone().into())), + Ok(false) => Status::Error(ErrorKind::InvalidFragment(url.into())), Err(e) => Status::Error(e), } } diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 4c9100b735..b7858e97a0 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -33,6 +33,7 @@ use crate::{ checker::{file::FileChecker, mail::MailChecker, website::WebsiteChecker}, filter::{Excludes, Filter, Includes}, remap::Remaps, + types::DEFAULT_ACCEPTED_STATUS_CODES, }; /// Default number of redirects before a request is deemed as failed, 5. @@ -238,7 +239,8 @@ pub struct ClientBuilder { /// Set of accepted return codes / status codes. /// /// Unmatched return codes/ status codes are deemed as errors. - accepted: Option>, + #[builder(default = DEFAULT_ACCEPTED_STATUS_CODES.clone())] + accepted: HashSet, /// Response timeout per request in seconds. timeout: Option, @@ -361,7 +363,7 @@ impl ClientBuilder { None => builder, } .build() - .map_err(ErrorKind::NetworkRequest)?; + .map_err(ErrorKind::BuildRequestClient)?; let github_client = match self.github_token.as_ref().map(ExposeSecret::expose_secret) { Some(token) if !token.is_empty() => Some( diff --git a/lychee-lib/src/retry.rs b/lychee-lib/src/retry.rs index f6333c142f..41baf925b5 100644 --- a/lychee-lib/src/retry.rs +++ b/lychee-lib/src/retry.rs @@ -105,6 +105,10 @@ impl RetryExt for ErrorKind { }) = self.github_error() { source.should_retry() + } else if let Self::RejectedStatusCode(StatusCode::TOO_MANY_REQUESTS) = self { + // We encountered `StatusCode::TOO_MANY_REQUESTS` and the user configured to reject this code. + // In this case we want to retry after some time because rate limiting is only temporary. + return true; } else { false } diff --git a/lychee-lib/src/types/cache.rs b/lychee-lib/src/types/cache.rs index 03b289d8c4..aa42a50a9d 100644 --- a/lychee-lib/src/types/cache.rs +++ b/lychee-lib/src/types/cache.rs @@ -76,12 +76,13 @@ impl From<&Status> for CacheStatus { Status::Redirected(code) => Self::Error(Some(code.as_u16())), Status::Timeout(code) => Self::Error(code.map(|code| code.as_u16())), Status::Error(e) => match e { - ErrorKind::NetworkRequest(e) - | ErrorKind::ReadResponseBody(e) - | ErrorKind::BuildRequestClient(e) => match e.status() { - Some(code) => Self::Error(Some(code.as_u16())), - None => Self::Error(None), - }, + ErrorKind::RejectedStatusCode(code) => Self::Error(Some(code.as_u16())), + ErrorKind::ReadResponseBody(e) | ErrorKind::BuildRequestClient(e) => { + match e.status() { + Some(code) => Self::Error(Some(code.as_u16())), + None => Self::Error(None), + } + } _ => Self::Error(None), }, } diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 9dbd28c267..be84c20c4e 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -1,3 +1,4 @@ +use http::StatusCode; use serde::{Serialize, Serializer}; use std::error::Error; use std::hash::Hash; @@ -14,7 +15,8 @@ use crate::{Uri, basic_auth::BasicAuthExtractorError, utils}; #[derive(Error, Debug)] #[non_exhaustive] pub enum ErrorKind { - /// Network error while handling request + /// Network error while handling request. + /// This does not include erroneous status codes, `RejectedStatusCode` will be used in that case. #[error("Network error")] NetworkRequest(#[source] reqwest::Error), /// Cannot read the body of the received response @@ -144,6 +146,10 @@ pub enum ErrorKind { #[error("Invalid status code: {0}")] InvalidStatusCode(u16), + /// The given status code was not accepted (this depends on the `accept` configuration) + #[error(r#"Rejected status code (this depends on your "accept" configuration)"#)] + RejectedStatusCode(StatusCode), + /// Regex error #[error("Error when using regex engine: {0}")] Regex(#[from] regex::Error), @@ -175,25 +181,22 @@ impl ErrorKind { pub fn details(&self) -> Option { match self { ErrorKind::NetworkRequest(e) => { - if let Some(status) = e.status() { - Some( - status - .canonical_reason() - .unwrap_or("Unknown status code") - .to_string(), - ) + // Get the relevant details from the specific reqwest error + let details = utils::reqwest::trim_error_output(e); + + // Provide support for common error types + if e.is_connect() { + Some(format!("{details} Maybe a certificate error?")) } else { - // Get the relevant details from the specific reqwest error - let details = utils::reqwest::trim_error_output(e); - - // Provide support for common error types - if e.is_connect() { - Some(format!("{details} Maybe a certificate error?")) - } else { - Some(details) - } + Some(details) } } + ErrorKind::RejectedStatusCode(status) => Some( + status + .canonical_reason() + .unwrap_or("Unknown status code") + .to_string(), + ), ErrorKind::GithubRequest(e) => { if let octocrab::Error::GitHub { source, .. } = &**e { Some(source.message.clone()) @@ -322,6 +325,7 @@ impl Hash for ErrorKind { Self::InvalidHeader(e) => e.to_string().hash(state), Self::InvalidGlobPattern(e) => e.to_string().hash(state), Self::InvalidStatusCode(c) => c.hash(state), + Self::RejectedStatusCode(c) => c.hash(state), Self::Channel(e) => e.to_string().hash(state), Self::MissingGitHubToken | Self::InvalidUrlHost => { std::mem::discriminant(self).hash(state); diff --git a/lychee-lib/src/types/status.rs b/lychee-lib/src/types/status.rs index 38535b64b3..8674750329 100644 --- a/lychee-lib/src/types/status.rs +++ b/lychee-lib/src/types/status.rs @@ -83,18 +83,13 @@ impl Serialize for Status { impl Status { #[must_use] /// Create a status object from a response and the set of accepted status codes - pub fn new(response: &Response, accepted: Option>) -> Self { + pub fn new(response: &Response, accepted: &HashSet) -> Self { let code = response.status(); - if let Some(true) = accepted.map(|a| a.contains(&code)) { + if accepted.contains(&code) { Self::Ok(code) } else { - match response.error_for_status_ref() { - Ok(_) if code.is_success() => Self::Ok(code), - Ok(_) if code.is_redirection() => Self::Redirected(code), - Ok(_) => Self::UnknownStatusCode(code), - Err(e) => e.into(), - } + Self::Error(ErrorKind::RejectedStatusCode(code)) } } @@ -217,13 +212,13 @@ impl Status { | Status::Redirected(code) | Status::UnknownStatusCode(code) | Status::Timeout(Some(code)) => Some(*code), - Status::Error(kind) | Status::Unsupported(kind) => { - if let Some(error) = kind.reqwest_error() { - error.status() - } else { - None - } - } + Status::Error(kind) | Status::Unsupported(kind) => match kind { + ErrorKind::RejectedStatusCode(status_code) => Some(*status_code), + _ => match kind.reqwest_error() { + Some(error) => error.status(), + None => None, + }, + }, Status::Cached(CacheStatus::Ok(code) | CacheStatus::Error(Some(code))) => { StatusCode::from_u16(*code).ok() } @@ -240,12 +235,13 @@ impl Status { } Status::Excluded => "EXCLUDED".to_string(), Status::Error(e) => match e { - ErrorKind::NetworkRequest(e) - | ErrorKind::ReadResponseBody(e) - | ErrorKind::BuildRequestClient(e) => match e.status() { - Some(code) => code.as_str().to_string(), - None => "ERROR".to_string(), - }, + ErrorKind::RejectedStatusCode(code) => code.as_str().to_string(), + ErrorKind::ReadResponseBody(e) | ErrorKind::BuildRequestClient(e) => { + match e.status() { + Some(code) => code.as_str().to_string(), + None => "ERROR".to_string(), + } + } _ => "ERROR".to_string(), }, Status::Timeout(code) => match code { diff --git a/lychee-lib/src/types/status_code/selector.rs b/lychee-lib/src/types/status_code/selector.rs index ea7637a30e..17324be3d6 100644 --- a/lychee-lib/src/types/status_code/selector.rs +++ b/lychee-lib/src/types/status_code/selector.rs @@ -1,10 +1,16 @@ -use std::{collections::HashSet, fmt::Display, str::FromStr}; +use std::{collections::HashSet, fmt::Display, hash::BuildHasher, str::FromStr, sync::LazyLock}; +use http::StatusCode; use serde::{Deserialize, de::Visitor}; use thiserror::Error; use crate::{AcceptRangeError, types::accept::AcceptRange}; +/// These values are the default status codes which are accepted by lychee. +/// SAFETY: This does not panic as all provided status codes are valid. +pub static DEFAULT_ACCEPTED_STATUS_CODES: LazyLock> = + LazyLock::new(|| >::try_from(StatusCodeSelector::default()).unwrap()); + #[derive(Debug, Error, PartialEq)] pub enum StatusCodeSelectorError { #[error("invalid/empty input")] @@ -44,6 +50,7 @@ impl FromStr for StatusCodeSelector { } } +/// These values are the default status codes which are accepted by lychee. impl Default for StatusCodeSelector { fn default() -> Self { Self::new_from(vec![AcceptRange::new(100, 103), AcceptRange::new(200, 299)]) @@ -98,27 +105,33 @@ impl StatusCodeSelector { self.ranges.iter().any(|range| range.contains(value)) } - /// Consumes self and creates a [`HashSet`] which contains all - /// accepted status codes. - #[must_use] - pub fn into_set(self) -> HashSet { - let mut set = HashSet::new(); - - for range in self.ranges { - for value in range.inner() { - set.insert(value); - } - } - - set - } - #[cfg(test)] pub(crate) fn len(&self) -> usize { self.ranges.len() } } +impl From for HashSet { + fn from(value: StatusCodeSelector) -> Self { + value + .ranges + .into_iter() + .flat_map(|range| range.inner().collect::>()) + .collect() + } +} + +impl TryFrom for HashSet { + type Error = http::status::InvalidStatusCode; + + fn try_from(value: StatusCodeSelector) -> Result { + >::from(value) + .into_iter() + .map(StatusCode::from_u16) + .collect() + } +} + struct StatusCodeSelectorVisitor; impl<'de> Visitor<'de> for StatusCodeSelectorVisitor { @@ -246,4 +259,17 @@ mod test { let selector = StatusCodeSelector::from_str(input).unwrap(); assert_eq!(selector.to_string(), display); } + + #[rstest] + #[case("100..=102,200..202", HashSet::from([100, 101, 102, 200, 201]))] + fn test_into_u16_set(#[case] input: &str, #[case] expected: HashSet) { + let actual: HashSet = StatusCodeSelector::from_str(input).unwrap().into(); + assert_eq!(actual, expected); + } + + #[test] + fn test_default_accepted_values() { + // assert that accessing the value does not panic as described in the SAFETY note. + let _ = &*DEFAULT_ACCEPTED_STATUS_CODES; + } }