From 6c736b6cfae8123487b4dc787973f1ab778d9b36 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <grenker@aurora.tech>
Date: Wed, 2 Aug 2023 09:07:57 -0400
Subject: [PATCH] [aws-cpp-sdk-core]: increase STS reliability and retries

This fixes issues we have repeatedly experienced when using STS for authentication
in a large Kubernetes cluster, with heavy load on STS:
1. The default connect timeout of 1s is too low. Connections can be
   slow to establish, for example when kube DNS is under very high
   load. A value of 30 seconds has proven to be robust.
2. The existing retry settings (3 retries) give up too quickly:
   authentication would frequently fail whenever STS was under higher
   load. The new settings (9 retries, scale factor 588) have worked in
   production for about 2 years.
---
 .../source/auth/STSCredentialsProvider.cpp               | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/aws-cpp-sdk-core/source/auth/STSCredentialsProvider.cpp b/src/aws-cpp-sdk-core/source/auth/STSCredentialsProvider.cpp
index 7747d86951c..f30eb561d5d 100644
--- a/src/aws-cpp-sdk-core/source/auth/STSCredentialsProvider.cpp
+++ b/src/aws-cpp-sdk-core/source/auth/STSCredentialsProvider.cpp
@@ -100,12 +100,19 @@ STSAssumeRoleWebIdentityCredentialsProvider::STSAssumeRoleWebIdentityCredentials
     Aws::Client::ClientConfiguration config;
     config.scheme = Aws::Http::Scheme::HTTPS;
     config.region = tmpRegion;
+    // Set the Connect Timeout to 30s. Default of 1s causes a timeout when STS is under load.
+    config.connectTimeoutMs = 30000;
 
     Aws::Vector<Aws::String> retryableErrors;
     retryableErrors.push_back("IDPCommunicationError");
     retryableErrors.push_back("InvalidIdentityToken");
 
-    config.retryStrategy = Aws::MakeShared<SpecifiedRetryableErrorsRetryStrategy>(STS_ASSUME_ROLE_WEB_IDENTITY_LOG_TAG, retryableErrors, 3/*maxRetries*/);
+    // Use more retries and an explicit scale factor so that authentication still succeeds while STS is under heavy load in production.
+    config.retryStrategy = Aws::MakeShared<SpecifiedRetryableErrorsRetryStrategy>(
+        STS_ASSUME_ROLE_WEB_IDENTITY_LOG_TAG,
+        retryableErrors,
+        9, /*maxRetries*/
+        588 /*scaleFactor*/);
 
     m_client = Aws::MakeUnique<Aws::Internal::STSCredentialsClient>(STS_ASSUME_ROLE_WEB_IDENTITY_LOG_TAG, config);
     m_initialized = true;