Skip to content

Commit ef893e7

Browse files
committed
fix race in moveRegularItemWith sync where insertOrReplace can cause move to fail
1 parent f306fd0 commit ef893e7

File tree

2 files changed

+51
-15
lines changed

2 files changed

+51
-15
lines changed

cachelib/allocator/CacheAllocator-inl.h

+50-14
Original file line numberDiff line numberDiff line change
@@ -1294,8 +1294,21 @@ size_t CacheAllocator<CacheTrait>::wakeUpWaitersLocked(folly::StringPiece key,
12941294
}
12951295

12961296
template <typename CacheTrait>
1297-
void CacheAllocator<CacheTrait>::moveRegularItemWithSync(
1297+
bool CacheAllocator<CacheTrait>::moveRegularItemWithSync(
12981298
Item& oldItem, WriteHandle& newItemHdl) {
1299+
//on function exit - the new item handle is no longer moving
1300+
//and other threads may access it - but in case where
1301+
//we failed to replace in access container we can give the
1302+
//new item back to the allocator
1303+
auto guard = folly::makeGuard([&]() {
1304+
auto ref = newItemHdl->unmarkMoving();
1305+
if (UNLIKELY(ref == 0)) {
1306+
const auto res =
1307+
releaseBackToAllocator(*newItemHdl, RemoveContext::kNormal, false);
1308+
XDCHECK(res == ReleaseRes::kReleased);
1309+
}
1310+
});
1311+
12991312
XDCHECK(oldItem.isMoving());
13001313
XDCHECK(!oldItem.isExpired());
13011314
// TODO: should we introduce new latency tracker. E.g. evictRegularLatency_
@@ -1326,6 +1339,22 @@ void CacheAllocator<CacheTrait>::moveRegularItemWithSync(
13261339

13271340
auto replaced = accessContainer_->replaceIf(oldItem, *newItemHdl,
13281341
predicate);
1342+
// another thread may have called insertOrReplace which could have
1343+
// marked this item as unaccessible causing the replaceIf
1344+
// in the access container to fail - in this case we want
1345+
// to abort the move since the item is no longer valid
1346+
if (!replaced) {
1347+
return false;
1348+
}
1349+
// what if another thread calls insertOrReplace now when
1350+
// the item is moving and already replaced in the hash table?
1351+
// 1. it succeeds in updating the hash table - so there is
1352+
// no guarentee that isAccessible() is true
1353+
// 2. it will then try to remove from MM container
1354+
// - this operation will wait for newItemHdl to
1355+
// be unmarkedMoving via the waitContext
1356+
// 3. replaced handle is returned and eventually drops
1357+
// ref to 0 and the item is recycled back to allocator.
13291358

13301359
if (config_.moveCb) {
13311360
// Execute the move callback. We cannot make any guarantees about the
@@ -1367,14 +1396,7 @@ void CacheAllocator<CacheTrait>::moveRegularItemWithSync(
13671396
XDCHECK(newItemHdl->hasChainedItem());
13681397
}
13691398
newItemHdl.unmarkNascent();
1370-
auto ref = newItemHdl->unmarkMoving();
1371-
//remove because there is a chance the new item was not
1372-
//added to the access container
1373-
if (UNLIKELY(ref == 0)) {
1374-
const auto res =
1375-
releaseBackToAllocator(*newItemHdl, RemoveContext::kNormal, false);
1376-
XDCHECK(res == ReleaseRes::kReleased);
1377-
}
1399+
return true;
13781400
}
13791401

13801402
template <typename CacheTrait>
@@ -1624,8 +1646,13 @@ CacheAllocator<CacheTrait>::findEviction(TierId tid, PoolId pid, ClassId cid) {
16241646
auto evictedToNext = lastTier ? nullptr
16251647
: tryEvictToNextMemoryTier(*candidate, false);
16261648
if (!evictedToNext) {
1627-
if (!token.isValid()) {
1649+
if (!token.isValid() && candidate->isAccessible()) {
16281650
token = createPutToken(*candidate);
1651+
} else if (!candidate->isAccessible()) {
1652+
if (UNLIKELY(nvmCache_ != nullptr)) {
1653+
HashedKey hk{candidate->getKey()};
1654+
nvmCache_->remove(hk, nvmCache_->createDeleteTombStone(hk));
1655+
}
16291656
}
16301657
// tryEvictToNextMemoryTier should only fail if allocation of the new item fails
16311658
// in that case, it should be still possible to mark item as exclusive.
@@ -1756,7 +1783,10 @@ CacheAllocator<CacheTrait>::tryEvictToNextMemoryTier(
17561783

17571784
if (newItemHdl) {
17581785
XDCHECK_EQ(newItemHdl->getSize(), item.getSize());
1759-
moveRegularItemWithSync(item, newItemHdl);
1786+
if (!moveRegularItemWithSync(item, newItemHdl)) {
1787+
return WriteHandle{};
1788+
}
1789+
XDCHECK_EQ(newItemHdl->getKey(),item.getKey());
17601790
item.unmarkMoving();
17611791
return newItemHdl;
17621792
} else {
@@ -1795,7 +1825,9 @@ CacheAllocator<CacheTrait>::tryPromoteToNextMemoryTier(
17951825

17961826
if (newItemHdl) {
17971827
XDCHECK_EQ(newItemHdl->getSize(), item.getSize());
1798-
moveRegularItemWithSync(item, newItemHdl);
1828+
if (!moveRegularItemWithSync(item, newItemHdl)) {
1829+
return WriteHandle{};
1830+
}
17991831
item.unmarkMoving();
18001832
return newItemHdl;
18011833
} else {
@@ -3148,9 +3180,13 @@ bool CacheAllocator<CacheTrait>::tryMovingForSlabRelease(
31483180
// TODO: add support for chained items
31493181
return false;
31503182
} else {
3151-
moveRegularItemWithSync(oldItem, newItemHdl);
3183+
//move can fail if another thread calls insertOrReplace
3184+
//in this case oldItem is no longer valid (not accessible,
3185+
//it gets removed from MMContainer and evictForSlabRelease
3186+
//will send it back to the allocator
3187+
bool ret = moveRegularItemWithSync(oldItem, newItemHdl);
31523188
removeFromMMContainer(oldItem);
3153-
return true;
3189+
return ret;
31543190
}
31553191
}
31563192
}

cachelib/allocator/CacheAllocator.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -1617,7 +1617,7 @@ class CacheAllocator : public CacheBase {
16171617
//
16181618
// @return true If the move was completed, and the containers were updated
16191619
// successfully.
1620-
void moveRegularItemWithSync(Item& oldItem, WriteHandle& newItemHdl);
1620+
bool moveRegularItemWithSync(Item& oldItem, WriteHandle& newItemHdl);
16211621

16221622
// Moves a regular item to a different slab. This should only be used during
16231623
// slab release after the item's exclusive bit has been set. The user supplied

0 commit comments

Comments
 (0)