Skip to content

Commit 8e27fd3

Browse files
badrishcCopilot
andcommitted
Fix GetNext hang: use cached SafeTailAddress with fallback to scan
GetNext was using only the cached SafeTailAddress, which could remain stale when no consumer-side RefreshSafeTailAddress had run yet. This caused the iterator to return false prematurely, and since no scan happened in GetNext, replication consumers would hang waiting for records that were already safe to read. Fix: check the cached value first (O(1)). Only call RefreshSafeTailAddress when the current address has caught up to the cached boundary. This preserves the fast path (no scan while the cache is ahead) while ensuring correctness when the cache is stale. Also restore RefreshSafeTailAddress in WaitAsync's fast path for the same reason — the cache-only check could miss records and enter SlowWait unnecessarily. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 2ed4dce commit 8e27fd3

1 file changed

Lines changed: 14 additions & 10 deletions

File tree

libs/storage/Tsavorite/cs/src/core/TsavoriteLog/TsavoriteLogScanIterator.cs

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -144,10 +144,7 @@ public ValueTask<bool> WaitAsync(CancellationToken token = default)
144144
return SlowWaitAsync(this, token);
145145
}
146146

147-
// Check the cached SafeTailAddress only (O(1)). The more expensive
148-
// RefreshSafeTailAddress scan is deferred to SlowWaitUncommittedAsync where
149-
// per-iterator backoff logic can throttle scan frequency.
150-
if (NextAddress < tsavoriteLog.SafeTailAddress)
147+
if (NextAddress < tsavoriteLog.SafeTailAddress || NextAddress < tsavoriteLog.RefreshSafeTailAddress())
151148
return new ValueTask<bool>(true);
152149
return SlowWaitUncommittedAsync(token);
153150
}
@@ -752,9 +749,13 @@ private unsafe bool GetNextInternal(out long physicalAddress, out int entryLengt
752749
if (disposed)
753750
return false;
754751

755-
// Use the cached SafeTailAddress for the hot path. Refresh only happens when the
756-
// iterator catches up to the cache, via WaitAsync / SlowWaitUncommittedAsync.
757-
if ((currentAddress >= endAddress) || (currentAddress >= (scanUncommitted ? tsavoriteLog.SafeTailAddress : tsavoriteLog.CommittedUntilAddress)))
752+
// For scanUncommitted: check cached SafeTailAddress first (O(1)). Only scan the
753+
// epoch table via RefreshSafeTailAddress when caught up to the cache, avoiding
754+
// the O(kTableSize) scan on every record.
755+
var boundary = scanUncommitted
756+
? (currentAddress < tsavoriteLog.SafeTailAddress ? tsavoriteLog.SafeTailAddress : tsavoriteLog.RefreshSafeTailAddress())
757+
: tsavoriteLog.CommittedUntilAddress;
758+
if (currentAddress >= endAddress || currentAddress >= boundary)
758759
return false;
759760

760761
if (currentAddress < _headAddress)
@@ -872,9 +873,12 @@ private unsafe bool ExpandGetNextInternal(long startPhysicalAddress, ref int tot
872873
if (disposed)
873874
return false;
874875

875-
// Use the cached SafeTailAddress for the hot path. Refresh only happens when the
876-
// iterator catches up to the cache, via WaitAsync / SlowWaitUncommittedAsync.
877-
if ((currentAddress >= endAddress) || (currentAddress >= (scanUncommitted ? tsavoriteLog.SafeTailAddress : tsavoriteLog.CommittedUntilAddress)))
876+
// For scanUncommitted: check cached SafeTailAddress first (O(1)). Only scan the
877+
// epoch table via RefreshSafeTailAddress when caught up to the cache.
878+
var boundary2 = scanUncommitted
879+
? (currentAddress < tsavoriteLog.SafeTailAddress ? tsavoriteLog.SafeTailAddress : tsavoriteLog.RefreshSafeTailAddress())
880+
: tsavoriteLog.CommittedUntilAddress;
881+
if (currentAddress >= endAddress || currentAddress >= boundary2)
878882
return false;
879883

880884
if (currentAddress < _headAddress)

0 commit comments

Comments
 (0)