Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
7e9b945
Add header for 404/1002 retry requests.
aavasthy Oct 14, 2025
7786e45
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Oct 14, 2025
d85ed0a
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Oct 15, 2025
163be31
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Nov 4, 2025
e523341
Update direct package and retry header code
aavasthy Nov 4, 2025
822b8e6
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Nov 4, 2025
1ad3fef
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Nov 5, 2025
1f63898
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Nov 6, 2025
2d90cb1
Resolve merge conflicts.
aavasthy Nov 18, 2025
b1fa2b2
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Dec 13, 2025
35e0eeb
Code clean up
aavasthy Dec 16, 2025
d51b105
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Dec 18, 2025
f622ebd
Add not to be used internally check for hubregion header.
aavasthy Dec 18, 2025
a15e97a
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Dec 18, 2025
961288f
Add not to be used internally check for hubregion header.
aavasthy Dec 19, 2025
b8facfd
Merge with master
aavasthy Dec 19, 2025
1fbd6c0
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Dec 19, 2025
7668f2a
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Dec 30, 2025
692484c
Made property volatile
aavasthy Jan 6, 2026
9dd4ec0
Merge branch 'master' into users/aavasthy/404_1002
aavasthy Jan 6, 2026
66d481b
Update retry logic and add header for all subsequent request.
aavasthy Jan 9, 2026
5b541e1
Merge with master
aavasthy Jan 9, 2026
7936ada
Correct formatting
aavasthy Jan 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 13 additions & 4 deletions Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ internal sealed class ClientRetryPolicy : IDocumentClientRetryPolicy
{
private const int RetryIntervalInMS = 1000; // Once we detect failover wait for 1 second before retrying request.
private const int MaxRetryCount = 120;
private const int MaxServiceUnavailableRetryCount = 1;
private const int MaxServiceUnavailableRetryCount = 1;
private const string HubRegionHeader = "x-ms-cosmos-hub-region-processing-only";
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have any central place to host these types of headers in the code base?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes we are actually using the central file HttpConstants.cs. I removed this line as part of code clean up.


private readonly IDocumentClientRetryPolicy throttlingRetry;
private readonly GlobalEndpointManager globalEndpointManager;
Expand All @@ -38,7 +39,8 @@ internal sealed class ClientRetryPolicy : IDocumentClientRetryPolicy
private bool isMultiMasterWriteRequest;
private Uri locationEndpoint;
private RetryContext retryContext;
private DocumentServiceRequest documentServiceRequest;
private DocumentServiceRequest documentServiceRequest;
private bool addHubRegionProcessingOnlyHeader;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should volatile be used for cross-thread visibility?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I dont the volatile keyword is required here. The code follows a sequential pattern where ShouldRetryInternalAsync sets the flag, then OnBeforeSendRequest reads it - with an async/await in between which provides memory barrier. Also see other fields with same pattern sessionTokenRetryCount, serviceUnavailableRetryCount etc. Can add volatile keyword for defensive coding.


public ClientRetryPolicy(
GlobalEndpointManager globalEndpointManager,
Expand Down Expand Up @@ -222,6 +224,12 @@ public void OnBeforeSendRequest(DocumentServiceRequest request)
// set location-based routing directive based on request retry context
request.RequestContext.RouteToLocation(this.retryContext.RetryLocationIndex, this.retryContext.RetryRequestOnPreferredLocations);
}
}
// If previous attempt failed with 404/1002, add the hub-region-processing-only header
if (this.addHubRegionProcessingOnlyHeader)
{
request.Headers[HubRegionHeader] = bool.TrueString;
this.addHubRegionProcessingOnlyHeader = false; // reset after applying
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what would be the errors returned if SDK try to read from non-hub region?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also falling back to new hub for that partition.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SDK ends up getting 403/3(WriteForbidden) if it tries to read from a non-hub region. Then SDK has internal logic where every 403/3 response leads to new endpoint discovery. The SDK then retries the request to the hub region.

}

// Resolve the endpoint for the request and pin the resolution to the resolved endpoint
Expand Down Expand Up @@ -322,7 +330,8 @@ private async Task<ShouldRetryResult> ShouldRetryInternalAsync(

if (statusCode == HttpStatusCode.NotFound
&& subStatusCode == SubStatusCodes.ReadSessionNotAvailable)
Copy link
Member

@xinlian12 xinlian12 Oct 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also double check: does this change also targeted for MM as well? For MM, writes can happen in any region, also enable this for MM might cause regression

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

confirmed this with backend team and this change is not intended to be used in multi-master.

{
{
this.addHubRegionProcessingOnlyHeader = true;
return this.ShouldRetryOnSessionNotAvailable(this.documentServiceRequest);
}

Expand All @@ -338,7 +347,7 @@ private async Task<ShouldRetryResult> ShouldRetryInternalAsync(
|| (statusCode == HttpStatusCode.Gone && subStatusCode == SubStatusCodes.LeaseNotFound))
{
return this.ShouldRetryOnUnavailableEndpointStatusCodes();
}
}

return null;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ namespace Microsoft.Azure.Cosmos.SDK.EmulatorTests
using System.Threading.Tasks;
using Microsoft.Azure.Cosmos;
using Microsoft.Azure.Cosmos.Diagnostics;
using Microsoft.Azure.Cosmos.Handlers;
using Microsoft.Azure.Cosmos.Json;
using Microsoft.Azure.Cosmos.Query.Core.ExecutionContext;
using Microsoft.Azure.Cosmos.Query.Core.QueryClient;
Expand All @@ -39,7 +40,8 @@ public class CosmosItemTests : BaseCosmosClientHelper
{
private Container Container = null;
private ContainerProperties containerSettings = null;


private const string HubRegionHeader = "x-ms-cosmos-hub-region-processing-only";
private static readonly string nonPartitionItemId = "fixed-Container-Item";
private static readonly string undefinedPartitionItemId = "undefined-partition-Item";

Expand Down Expand Up @@ -4315,7 +4317,115 @@ private static async Task GivenItemAsyncWhenMissingMemberHandlingIsErrorThenExpe

JsonConvert.DefaultSettings = () => default;
}
}
}

[TestMethod]
[Owner("aavasthy")]
[Description("Forces a single 404/1002 from the gateway and verifies ClientRetryPolicy adds x-ms-cosmos-hub-region-processing-only on the retry request.")]
public async Task ReadItemAsync_ShouldAddHubHeader_OnRetryAfter_404_1002()
{
bool headerObservedOnRetry = false;
int requestCount = 0;
bool shouldReturn404 = true;

// Created HTTP handler to intercept requests
HttpClientHandlerHelper httpHandler = new HttpClientHandlerHelper
{
RequestCallBack = (request, cancellationToken) =>
{
// Track all document read requests
if (request.Method == HttpMethod.Get &&
request.RequestUri != null &&
request.RequestUri.AbsolutePath.Contains("/docs/"))
{
requestCount++;

// Check for hub header on retry (2nd+ request)
if (requestCount > 1 &&
request.Headers.TryGetValues(HubRegionHeader, out IEnumerable<string> values) &&
values.Any(v => v.Equals(bool.TrueString, StringComparison.OrdinalIgnoreCase)))
{
headerObservedOnRetry = true;
}
}

return Task.FromResult<HttpResponseMessage>(null);
},

ResponseIntercepter = (response, request) =>
{
if (shouldReturn404 &&
request.Method == HttpMethod.Get &&
request.RequestUri != null &&
request.RequestUri.AbsolutePath.Contains("/docs/"))
{
shouldReturn404 = false; // Only return 404 once

var errorResponse = new
{
code = "NotFound",
message = "Message: {\"Errors\":[\"Resource Not Found. Learn more: https://aka.ms/cosmosdb-tsg-not-found\"]}\r\nActivityId: " + Guid.NewGuid() + ", Request URI: " + request.RequestUri,
additionalErrorInfo = ""
};

HttpResponseMessage notFoundResponse = new HttpResponseMessage(HttpStatusCode.NotFound)
{
Content = new StringContent(
JsonConvert.SerializeObject(errorResponse),
Encoding.UTF8,
"application/json"
)
};

// Add the substatus header for ReadSessionNotAvailable
notFoundResponse.Headers.Add("x-ms-substatus", "1002");
notFoundResponse.Headers.Add("x-ms-activity-id", Guid.NewGuid().ToString());
notFoundResponse.Headers.Add("x-ms-request-charge", "1.0");

return Task.FromResult(notFoundResponse);
}

return Task.FromResult(response);
}
};

CosmosClientOptions clientOptions = new CosmosClientOptions
{
ConnectionMode = ConnectionMode.Gateway,
ConsistencyLevel = Cosmos.ConsistencyLevel.Session,
HttpClientFactory = () => new HttpClient(httpHandler),
MaxRetryAttemptsOnRateLimitedRequests = 9,
MaxRetryWaitTimeOnRateLimitedRequests = TimeSpan.FromSeconds(30)
};

using CosmosClient customClient = TestCommon.CreateCosmosClient(clientOptions);

Container customContainer = customClient.GetContainer(this.database.Id, this.Container.Id);

// Create a test item first
ToDoActivity testItem = ToDoActivity.CreateRandomToDoActivity();
await this.Container.CreateItemAsync(testItem, new Cosmos.PartitionKey(testItem.pk));

try
{
// This should trigger 404/1002 on first attempt, then retry with hub header
ItemResponse<ToDoActivity> response = await customContainer.ReadItemAsync<ToDoActivity>(
testItem.id,
new Cosmos.PartitionKey(testItem.pk));

Assert.IsNotNull(response);
Assert.IsNotNull(response.Resource);
}
catch (CosmosException)
{
// It's possible the retry also fails, but should still have seen the retry attempt
}

// Verifying retry happened
Assert.IsTrue(requestCount >= 2, $"Expected at least 2 requests (original + retry), but got {requestCount}");
Assert.IsTrue(headerObservedOnRetry, $"Expected retry request to include '{HubRegionHeader}: true'");
}


private async Task<T> AutoGenerateIdPatternTest<T>(Cosmos.PartitionKey pk, T itemWithoutId)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ public sealed class ClientRetryPolicyTests
{
private static Uri Location1Endpoint = new Uri("https://location1.documents.azure.com");
private static Uri Location2Endpoint = new Uri("https://location2.documents.azure.com");


private const string HubRegionHeader = "x-ms-cosmos-hub-region-processing-only";
private ReadOnlyCollection<string> preferredLocations;
private AccountProperties databaseAccount;
private GlobalPartitionEndpointManager partitionKeyRangeLocationCache;
Expand Down Expand Up @@ -400,6 +401,51 @@ public async Task ClientRetryPolicy_NoRetry_MultiMaster_Write_NoPreferredLocatio
{
await this.ValidateConnectTimeoutTriggersClientRetryPolicyAsync(isReadRequest: false, useMultipleWriteLocations: true, usesPreferredLocations: false, true);
}

[TestMethod]
public async Task ClientRetryPolicy_AddsHubRegionProcessingOnlyHeader_On404_1002()
{
// Arrange
const bool enableEndpointDiscovery = true;

using GlobalEndpointManager endpointManager = this.Initialize(
useMultipleWriteLocations: true,
enableEndpointDiscovery: enableEndpointDiscovery,
isPreferredLocationsListEmpty: false);

ClientRetryPolicy retryPolicy = new ClientRetryPolicy(
endpointManager,
this.partitionKeyRangeLocationCache,
new RetryOptions(),
enableEndpointDiscovery,
isThinClientEnabled: false);

DocumentServiceRequest request1 = this.CreateRequest(isReadRequest: true, isMasterResourceType: false);

Assert.IsNull(request1.Headers.GetValues(HubRegionHeader), "Header should not exist before any retry.");

DocumentClientException simulatedException = new DocumentClientException(
message: "Simulated 404/1002 ReadSessionNotAvailable",
innerException: null,
statusCode: HttpStatusCode.NotFound,
substatusCode: SubStatusCodes.ReadSessionNotAvailable,
requestUri: request1.RequestContext.LocationEndpointToRoute,
responseHeaders: new DictionaryNameValueCollection());

// Act: policy detects error and sets flag
ShouldRetryResult shouldRetry = await retryPolicy.ShouldRetryAsync(simulatedException, CancellationToken.None);

retryPolicy.OnBeforeSendRequest(request1);
string[] headerValues = request1.Headers.GetValues(HubRegionHeader);
Assert.IsNotNull(headerValues, "Expected header to be added after 404/1002 retry signal.");
Assert.AreEqual(1, headerValues.Length, "Header should have exactly one value.");
Assert.AreEqual(bool.TrueString, headerValues[0], "Header value should be 'True'.");

// Header not applied to a new request
DocumentServiceRequest request2 = this.CreateRequest(isReadRequest: true, isMasterResourceType: false);
retryPolicy.OnBeforeSendRequest(request2);
Assert.IsNull(request2.Headers.GetValues(HubRegionHeader), "Header should not be set on a new request after flag is reset.");
}

private async Task ValidateConnectTimeoutTriggersClientRetryPolicyAsync(
bool isReadRequest,
Expand Down
Loading