diff --git a/.github/workflows/build-cachelib-centos-8-5.yml b/.github/workflows/build-cachelib-centos-8-5.yml
index 14ab8cfa74..30247b2e94 100644
--- a/.github/workflows/build-cachelib-centos-8-5.yml
+++ b/.github/workflows/build-cachelib-centos-8-5.yml
@@ -13,8 +13,6 @@
 # limitations under the License.
 name: build-cachelib-centos-8.5
 on:
-# push:
-  pull_request:
   schedule:
     - cron: '0 9 * * *'
 jobs:
diff --git a/.github/workflows/build-cachelib-centos-long.yml b/.github/workflows/build-cachelib-centos-long.yml
new file mode 100644
index 0000000000..92165f603b
--- /dev/null
+++ b/.github/workflows/build-cachelib-centos-long.yml
@@ -0,0 +1,39 @@
+name: build-cachelib-centos-latest
+on:
+  schedule:
+    - cron: '0 7 * * *'
+
+jobs:
+  build-cachelib-centos8-latest:
+    name: "CentOS/latest - Build CacheLib with all dependencies"
+    runs-on: ubuntu-latest
+    # Docker container image name
+    container: "centos:latest"
+    steps:
+      - name: "update packages"
+        run: dnf upgrade -y
+      - name: "install sudo,git"
+        run: dnf install -y sudo git cmake gcc
+      - name: "System Information"
+        run: |
+          echo === uname ===
+          uname -a
+          echo === /etc/os-release ===
+          cat /etc/os-release
+          echo === df -hl ===
+          df -hl
+          echo === free -h ===
+          free -h
+          echo === top ===
+          top -b -n1 -1 -Eg || timeout 1 top -b -n1
+          echo === env ===
+          env
+          echo === gcc -v ===
+          gcc -v
+      - name: "checkout sources"
+        uses: actions/checkout@v2
+      - name: "build CacheLib using build script"
+        run: ./contrib/build.sh -j -v -T
+      - name: "run tests"
+        timeout-minutes: 60
+        run: cd opt/cachelib/tests && ../../../run_tests.sh long
diff --git a/.github/workflows/build-cachelib-debian.yml b/.github/workflows/build-cachelib-debian.yml
new file mode 100644
index 0000000000..5bc3ad3c70
--- /dev/null
+++ b/.github/workflows/build-cachelib-debian.yml
@@ -0,0 +1,43 @@
+name: build-cachelib-debian-10
+on:
+  schedule:
+    - cron: '30 5 * * 0,3'
+
+jobs:
+  build-cachelib-debian-10:
+    name: "Debian/Buster - Build CacheLib with all dependencies"
+    runs-on: ubuntu-latest
+    # Docker container image name
+    container: "debian:buster-slim"
+    steps:
+      - name: "update packages"
+        run: apt-get update
+      - name: "upgrade packages"
+        run: apt-get -y upgrade
+      - name: "install sudo,git"
+        run: apt-get install -y sudo git procps
+      - name: "System Information"
+        run: |
+          echo === uname ===
+          uname -a
+          echo === /etc/os-release ===
+          cat /etc/os-release
+          echo === df -hl ===
+          df -hl
+          echo === free -h ===
+          free -h
+          echo === top ===
+          top -b -n1 -1 -Eg || timeout 1 top -b -n1 ; true
+          echo === env ===
+          env
+          echo === cc -v ===
+          cc -v || true
+          echo === g++ -v ===
+          g++ -v || true
+      - name: "checkout sources"
+        uses: actions/checkout@v2
+      - name: "build CacheLib using build script"
+        run: ./contrib/build.sh -j -v -T
+      - name: "run tests"
+        timeout-minutes: 60
+        run: cd opt/cachelib/tests && ../../../run_tests.sh
diff --git a/.github/workflows/build-cachelib-docker.yml b/.github/workflows/build-cachelib-docker.yml
new file mode 100644
index 0000000000..be28bc233c
--- /dev/null
+++ b/.github/workflows/build-cachelib-docker.yml
@@ -0,0 +1,49 @@
+name: build-cachelib-docker
+on:
+  push:
+  pull_request:
+
+jobs:
+  build-cachelib-docker:
+    name: "CentOS/latest - Build CacheLib with all dependencies"
+    runs-on: ubuntu-latest
+    env:
+      REPO: cachelib
+      GITHUB_REPO: intel/CacheLib
+      CONTAINER_REG: ghcr.io/pmem/cachelib
+      CONTAINER_REG_USER: ${{ secrets.GH_CR_USER }}
+      CONTAINER_REG_PASS: ${{ secrets.GH_CR_PAT }}
+      FORCE_IMAGE_ACTION: ${{ secrets.FORCE_IMAGE_ACTION }}
+      HOST_WORKDIR: ${{ github.workspace }}
+      WORKDIR: docker
+      IMG_VER: devel
+    strategy:
+      matrix:
+        CONFIG: ["OS=centos OS_VER=8streams PUSH_IMAGE=1"]
+    steps:
+      - name: "System Information"
+        run: |
+          echo === uname ===
+          uname -a
+          echo === /etc/os-release ===
+          cat /etc/os-release
+          echo === df -hl ===
+          df -hl
+          echo === free -h ===
+          free -h
+          echo === top ===
+          top -b -n1 -1 -Eg || timeout 1 top -b -n1
+          echo === env ===
+          env
+          echo === gcc -v ===
+          gcc -v
+      - name: "checkout sources"
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Pull the image or rebuild and push it
+        run: cd $WORKDIR && ${{ matrix.CONFIG }} ./pull-or-rebuild-image.sh $FORCE_IMAGE_ACTION
+
+      - name: Run the build
+        run: cd $WORKDIR && ${{ matrix.CONFIG }} ./build.sh
diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml
index 26d942d182..54045f0a36 100644
--- a/.github/workflows/clang-format-check.yml
+++ b/.github/workflows/clang-format-check.yml
@@ -1,6 +1,6 @@
 # From: https://github.com/marketplace/actions/clang-format-check#multiple-paths
 name: clang-format Check
-on: [pull_request]
+on: []
 jobs:
   formatting-check:
     name: Formatting Check
diff --git a/MultiTierDataMovement.md b/MultiTierDataMovement.md
new file mode 100644
index 0000000000..cccc14b947
--- /dev/null
+++ b/MultiTierDataMovement.md
@@ -0,0 +1,90 @@
+# Background Data Movement
+
+In order to reduce the number of online evictions and support asynchronous
+promotion, we have added two periodic workers to handle eviction and promotion.
+
+The diagram below shows a simplified version of how the background evictor
+thread (green) is integrated into the CacheLib architecture.
+

+*(figure: the BackgroundEvictor thread in the CacheLib architecture)*

+
+## Background Evictors
+
+The background evictors scan each class to see if there are objects to move to the
+next (lower) tier using a given strategy. Here we document the general parameters
+and the parameters for the different strategies.
+
+- `backgroundEvictorIntervalMilSec`: The interval at which this thread runs. By
+default, the background evictor threads will wake up every 10 ms to scan the
+AllocationClasses. The background evictor thread is also woken up when a request
+handling thread fails an allocation, or when the percentage of free memory for the
+AllocationClass drops below `lowEvictionAcWatermark`. This can make the interval
+parameter less important when many allocations occur from request handling threads.
+
+- `evictorThreads`: The number of background evictors to run. Each thread is
+assigned a set of AllocationClasses to scan and evict objects from. Currently, each
+thread gets an equal number of classes to scan, but since the object size
+distribution may be unequal, future versions will attempt to balance the classes
+among threads. The range is 1 to the number of AllocationClasses. The default is 1.
+
+- `maxEvictionBatch`: The number of objects to remove in a given eviction call. The
+default is 40; the lower bound is 10 and the upper bound is 1000. Too low and we
+might not remove objects at a reasonable rate; too high and it might increase
+contention with user threads.
+
+- `minEvictionBatch`: Minimum number of items to evict at any time (if there are
+any candidates).
+
+- `maxEvictionPromotionHotness`: Maximum number of candidates to consider for
+eviction. This is similar to `maxEvictionBatch`, but it specifies how many
+candidates are taken into consideration, not the actual number of items to evict.
+This option can be used to configure the duration of the critical section on the
+LRU lock.
+
+### FreeThresholdStrategy (default)
+
+- `lowEvictionAcWatermark`: Triggers the background eviction thread to run when the
+free portion of an AllocationClass falls below this percentage. The default is
+`2.0`; to avoid wasting capacity we don't set this above `10.0`.
+
+- `highEvictionAcWatermark`: Evictions from an AllocationClass stop once this
+percentage of it is free. The default is `5.0`; to avoid wasting capacity we don't
+set this above `10.0`.
+
+## Background Promoters
+
+The background promoters scan each class to see if there are objects to move to the
+next (upper) tier using a given strategy. Here we document the general parameters
+and the parameters for the different strategies.
+
+- `backgroundPromoterIntervalMilSec`: The interval at which this thread runs. By
+default, the background promoter threads will wake up every 10 ms to scan the
+AllocationClasses for objects to promote.
+
+- `promoterThreads`: The number of background promoters to run. Each thread is
+assigned a set of AllocationClasses to scan and promote objects from. Currently,
+each thread gets an equal number of classes to scan, but since the object size
+distribution may be unequal, future versions will attempt to balance the classes
+among threads. The range is `1` to the number of AllocationClasses. The default
+is `1`.
+
+- `maxPromotionBatch`: The number of objects to promote in a given promotion call.
+The default is 40; the lower bound is 10 and the upper bound is 1000. Too low and
+we might not promote objects at a reasonable rate; too high and it might increase
+contention with user threads.
+
+- `minPromotionBatch`: Minimum number of items to promote at any time (if there are
+any candidates).
+
+- `numDuplicateElements`: This allows us to promote items that have existing
+(read-only) handles, since we won't need to modify the data when a user is done
+with it. Therefore, for a short time the data could reside in both tiers until it
+is evicted from its current tier. The default is to not allow this (0). Setting the
+value to 100 will enable duplicate elements in tiers.
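+
+For illustration, the sketch below shows how these knobs might be wired together
+when building a cache. The setter and strategy constructor names here are
+illustrative assumptions and may not match the actual CacheLib API:
+
+```cpp
+// Illustrative configuration sketch; names below are assumptions, not the
+// verbatim API.
+Cache::Config config;
+config.setCacheSize(4ULL * 1024 * 1024 * 1024); // 4 GB
+
+// Background eviction: one worker waking every 10 ms, trying to keep each
+// AllocationClass between 2% and 5% free.
+config.enableBackgroundEvictor(
+    std::make_shared<FreeThresholdStrategy>(
+        2.0,  // lowEvictionAcWatermark
+        5.0,  // highEvictionAcWatermark
+        40,   // maxEvictionBatch
+        10),  // minEvictionBatch
+    std::chrono::milliseconds{10}, // backgroundEvictorIntervalMilSec
+    1);                            // evictorThreads
+
+// Background promotion: move hot items up a tier when the upper tier has room.
+config.enableBackgroundPromoter(
+    std::make_shared<PromotionStrategy>(
+        4.0,  // promotionAcWatermark
+        40,   // maxPromotionBatch
+        10),  // minPromotionBatch
+    std::chrono::milliseconds{10}, // backgroundPromoterIntervalMilSec
+    1);                            // promoterThreads
+```
+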
+### Background Promotion Strategy (only one currently)
+
+- `promotionAcWatermark`: Promote items only if at least this percentage of the
+target tier's AllocationClass is free. The promotion thread will attempt to move
+up to `maxPromotionBatch` objects to that tier. The objects are chosen from the
+head of the LRU. The default is `4.0`. This value should correlate with
+`lowEvictionAcWatermark`, `highEvictionAcWatermark`, `minAcAllocationWatermark`,
+and `maxAcAllocationWatermark`.
+- `maxPromotionBatch`: The number of objects to promote in a batch during
+background promotion. Analogous to `maxEvictionBatch`. Its value should be lower
+to decrease contention on hot items.
diff --git a/cachelib/CMakeLists.txt b/cachelib/CMakeLists.txt
index 6be819974e..407342b581 100644
--- a/cachelib/CMakeLists.txt
+++ b/cachelib/CMakeLists.txt
@@ -85,6 +85,11 @@ set(CMAKE_MODULE_PATH
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
 
+if(COVERAGE_ENABLED)
+  # Add code coverage
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --coverage -fprofile-arcs -ftest-coverage")
+endif()
+
 # include(fb_cxx_flags)
 message(STATUS "Update CXXFLAGS: ${CMAKE_CXX_FLAGS}")
 
diff --git a/cachelib/allocator/BackgroundMover-inl.h b/cachelib/allocator/BackgroundMover-inl.h
new file mode 100644
index 0000000000..b77436635f
--- /dev/null
+++ b/cachelib/allocator/BackgroundMover-inl.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) Intel and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace facebook {
+namespace cachelib {
+
+template <typename CacheT>
+BackgroundMover<CacheT>::BackgroundMover(
+    Cache& cache,
+    std::shared_ptr<BackgroundMoverStrategy> strategy,
+    MoverDir direction)
+    : cache_(cache), strategy_(strategy), direction_(direction) {
+  if (direction_ == MoverDir::Evict) {
+    moverFunc = BackgroundMoverAPIWrapper<CacheT>::traverseAndEvictItems;
+  } else if (direction_ == MoverDir::Promote) {
+    moverFunc = BackgroundMoverAPIWrapper<CacheT>::traverseAndPromoteItems;
+  }
+}
+
+template <typename CacheT>
+BackgroundMover<CacheT>::~BackgroundMover() {
+  stop(std::chrono::seconds(0));
+}
+
+template <typename CacheT>
+void BackgroundMover<CacheT>::work() {
+  try {
+    checkAndRun();
+  } catch (const std::exception& ex) {
+    XLOGF(ERR, "BackgroundMover interrupted due to exception: {}", ex.what());
+  }
+}
+
+template <typename CacheT>
+void BackgroundMover<CacheT>::setAssignedMemory(
+    std::vector<MemoryDescriptorType>&& assignedMemory) {
+  XLOG(INFO, "Class assigned to background worker:");
+  for (auto [tid, pid, cid] : assignedMemory) {
+    XLOGF(INFO, "Tid: {}, Pid: {}, Cid: {}", tid, pid, cid);
+  }
+
+  mutex.lock_combine([this, &assignedMemory] {
+    this->assignedMemory_ = std::move(assignedMemory);
+  });
+}
+
+// Look for classes that exceed the target memory capacity
+// and return those for eviction
+template <typename CacheT>
+void BackgroundMover<CacheT>::checkAndRun() {
+  auto assignedMemory = mutex.lock_combine([this] { return assignedMemory_; });
+
+  unsigned int moves = 0;
+  std::set<ClassId> classes{};
+  auto batches = strategy_->calculateBatchSizes(cache_, assignedMemory);
+
+  for (size_t i = 0; i < batches.size(); i++) {
+    const auto [tid, pid, cid] = assignedMemory[i];
+    const auto batch = batches[i];
+
+    classes.insert(cid);
+    const auto& mpStats = cache_.getPoolByTid(pid, tid).getStats();
+
+    if (!batch) {
+      continue;
+    }
+
+    // try moving BATCH items from the class in order to reach free target
+    auto moved = moverFunc(cache_, tid, pid, cid, batch);
+    moves += moved;
+    moves_per_class_[tid][pid][cid] += moved;
+    totalBytesMoved.add(moved * mpStats.acStats.at(cid).allocSize);
+  }
+
+  numTraversals.inc();
+  numMovedItems.add(moves);
+  totalClasses.add(classes.size());
+}
+
+template <typename CacheT>
+BackgroundMoverStats BackgroundMover<CacheT>::getStats() const noexcept {
+  BackgroundMoverStats stats;
+  stats.numMovedItems = numMovedItems.get();
+  stats.runCount = numTraversals.get();
+  stats.totalBytesMoved = totalBytesMoved.get();
+  stats.totalClasses = totalClasses.get();
+
+  return stats;
+}
+
+template <typename CacheT>
+std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>>
+BackgroundMover<CacheT>::getClassStats() const noexcept {
+  return moves_per_class_;
+}
+
+} // namespace cachelib
+} // namespace facebook
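A note on the contract above: `checkAndRun()` expects `calculateBatchSizes()` to
return exactly one batch size per descriptor in `assignedMemory`, in the same
order, with `0` meaning "skip this class on this pass". As a sketch only (not
part of this patch; the patch's actual default is `FreeThresholdStrategy`), a
minimal conforming strategy could look like:

```cpp
#include <cstddef>
#include <vector>

#include "cachelib/allocator/BackgroundMoverStrategy.h"

namespace facebook {
namespace cachelib {

// Illustrative only: a trivial strategy that requests the same batch size
// for every (tier, pool, class) the worker has been assigned.
class FixedBatchStrategy : public BackgroundMoverStrategy {
 public:
  explicit FixedBatchStrategy(size_t batchSize) : batchSize_(batchSize) {}

  std::vector<size_t> calculateBatchSizes(
      const CacheBase& /* cache */,
      std::vector<MemoryDescriptorType> acVec) override {
    // One entry per assigned class, in the order checkAndRun() iterates.
    return std::vector<size_t>(acVec.size(), batchSize_);
  }

 private:
  const size_t batchSize_;
};

} // namespace cachelib
} // namespace facebook
```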
diff --git a/cachelib/allocator/BackgroundMover.h b/cachelib/allocator/BackgroundMover.h
new file mode 100644
index 0000000000..1246676d6e
--- /dev/null
+++ b/cachelib/allocator/BackgroundMover.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) Intel and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cachelib/allocator/BackgroundMoverStrategy.h"
+#include "cachelib/allocator/CacheStats.h"
+#include "cachelib/common/AtomicCounter.h"
+#include "cachelib/common/PeriodicWorker.h"
+
+namespace facebook {
+namespace cachelib {
+
+// wrapper that exposes the private APIs of CacheType that are specifically
+// needed for the cache api
+template <typename C>
+struct BackgroundMoverAPIWrapper {
+  static size_t traverseAndEvictItems(C& cache,
+                                      unsigned int tid,
+                                      unsigned int pid,
+                                      unsigned int cid,
+                                      size_t batch) {
+    return cache.traverseAndEvictItems(tid, pid, cid, batch);
+  }
+
+  static size_t traverseAndPromoteItems(C& cache,
+                                        unsigned int tid,
+                                        unsigned int pid,
+                                        unsigned int cid,
+                                        size_t batch) {
+    return cache.traverseAndPromoteItems(tid, pid, cid, batch);
+  }
+};
+
+enum class MoverDir { Evict = 0, Promote };
+
+// Periodic worker that evicts items from tiers in batches.
+// The primary aim is to reduce insertion times for new items in the
+// cache.
+template <typename CacheT>
+class BackgroundMover : public PeriodicWorker {
+ public:
+  using Cache = CacheT;
+  // @param cache     the cache interface
+  // @param strategy  the strategy class that defines how objects are
+  //                  moved (promoted vs. evicted, and how many)
+  BackgroundMover(Cache& cache,
+                  std::shared_ptr<BackgroundMoverStrategy> strategy,
+                  MoverDir direction);
+
+  ~BackgroundMover() override;
+
+  BackgroundMoverStats getStats() const noexcept;
+  std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>>
+  getClassStats() const noexcept;
+
+  void setAssignedMemory(
+      std::vector<MemoryDescriptorType>&& assignedMemory);
+
+ private:
+  std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>>
+      moves_per_class_;
+  // cache allocator's interface for evicting
+  using Item = typename Cache::Item;
+
+  Cache& cache_;
+  std::shared_ptr<BackgroundMoverStrategy> strategy_;
+  MoverDir direction_;
+
+  std::function<size_t(
+      Cache&, unsigned int, unsigned int, unsigned int, size_t)>
+      moverFunc;
+
+  // implements the actual logic of running the background evictor
+  void work() override final;
+  void checkAndRun();
+
+  AtomicCounter numMovedItems{0};
+  AtomicCounter numTraversals{0};
+  AtomicCounter totalClasses{0};
+  AtomicCounter totalBytesMoved{0};
+
+  std::vector<MemoryDescriptorType> assignedMemory_;
+  folly::DistributedMutex mutex;
+};
+} // namespace cachelib
+} // namespace facebook
+
+#include "cachelib/allocator/BackgroundMover-inl.h"
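As a usage sketch only (not something this patch adds): constructing and starting
a background evictor might look like the following. The `LruAllocator` cache type,
the `FreeThresholdStrategy` constructor arguments, and the `start()` call are
assumptions based on the surrounding code and docs:

```cpp
#include <chrono>
#include <memory>

#include "cachelib/allocator/BackgroundMover.h"
#include "cachelib/allocator/FreeThresholdStrategy.h"

using Cache = facebook::cachelib::LruAllocator; // assumed cache type

std::unique_ptr<facebook::cachelib::BackgroundMover<Cache>>
startBackgroundEvictor(Cache& cache) {
  // Watermarks and batch bounds as documented in MultiTierDataMovement.md
  // (constructor signature assumed).
  auto strategy = std::make_shared<facebook::cachelib::FreeThresholdStrategy>(
      2.0, 5.0, 40, 10);

  auto evictor = std::make_unique<facebook::cachelib::BackgroundMover<Cache>>(
      cache, strategy, facebook::cachelib::MoverDir::Evict);

  // Hand the worker the (tier, pool, class) triples it is responsible for.
  evictor->setAssignedMemory({{0 /* tid */, 0 /* pid */, 0 /* cid */}});

  // PeriodicWorker interface: wake up every 10 ms (signature assumed).
  evictor->start(std::chrono::milliseconds{10}, "bg-evictor-0");
  return evictor;
}
```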
diff --git a/cachelib/allocator/BackgroundMoverStrategy.h b/cachelib/allocator/BackgroundMoverStrategy.h
new file mode 100644
index 0000000000..7706a625a5
--- /dev/null
+++ b/cachelib/allocator/BackgroundMoverStrategy.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cachelib/allocator/Cache.h"
+
+namespace facebook {
+namespace cachelib {
+
+struct MemoryDescriptorType {
+  MemoryDescriptorType(TierId tid, PoolId pid, ClassId cid)
+      : tid_(tid), pid_(pid), cid_(cid) {}
+  TierId tid_;
+  PoolId pid_;
+  ClassId cid_;
+};
+
+// Base class for background data movement strategies (used by both the
+// background evictor and the background promoter).
+class BackgroundMoverStrategy {
+ public:
+  virtual std::vector<size_t> calculateBatchSizes(
+      const CacheBase& cache,
+      std::vector<MemoryDescriptorType> acVec) = 0;
+};
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/CMakeLists.txt b/cachelib/allocator/CMakeLists.txt
index f94c8c90c7..0f96a0cd7f 100644
--- a/cachelib/allocator/CMakeLists.txt
+++ b/cachelib/allocator/CMakeLists.txt
@@ -35,6 +35,7 @@ add_library (cachelib_allocator
   CCacheManager.cpp
   ContainerTypes.cpp
   FreeMemStrategy.cpp
+  FreeThresholdStrategy.cpp
   HitsPerSlabStrategy.cpp
   LruTailAgeStrategy.cpp
   MarginalHitsOptimizeStrategy.cpp
@@ -54,6 +55,7 @@ add_library (cachelib_allocator
   PoolOptimizeStrategy.cpp
   PoolRebalancer.cpp
   PoolResizer.cpp
+  PrivateMemoryManager.cpp
   RebalanceStrategy.cpp
   SlabReleaseStats.cpp
   TempShmMapping.cpp
diff --git a/cachelib/allocator/Cache.cpp b/cachelib/allocator/Cache.cpp
index 058eb84501..37457cc3e9 100644
--- a/cachelib/allocator/Cache.cpp
+++ b/cachelib/allocator/Cache.cpp
@@ -245,6 +245,7 @@ void CacheBase::updateGlobalCacheStats(const std::string& statPrefix) const {
       statPrefix + "cache.size.configured",
       memStats.configuredRamCacheSize + memStats.nvmCacheSize);
 
+  // TODO: add specific per-tier counters
   const auto stats = getGlobalCacheStats();
 
   // Eviction Stats
@@ -254,7 +255,8 @@
   // from both ram and nvm, this is counted as a single eviction from cache.
   // Ram Evictions: item evicted from ram but it can be inserted into nvm
   const std::string ramEvictionKey = statPrefix + "ram.evictions";
-  counters_.updateDelta(ramEvictionKey, stats.numEvictions);
+  counters_.updateDelta(ramEvictionKey,
+      std::accumulate(stats.numEvictions.begin(), stats.numEvictions.end(), 0));
   // Nvm Evictions: item evicted from nvm but it can be still in ram
   const std::string nvmEvictionKey = statPrefix + "nvm.evictions";
   counters_.updateDelta(nvmEvictionKey, stats.numNvmEvictions);
@@ -296,11 +298,11 @@
   }
 
   counters_.updateDelta(statPrefix + "cache.alloc_attempts",
-                        stats.allocAttempts);
+      std::accumulate(stats.allocAttempts.begin(), stats.allocAttempts.end(), 0));
   counters_.updateDelta(statPrefix + "cache.eviction_attempts",
-                        stats.evictionAttempts);
+      std::accumulate(stats.evictionAttempts.begin(), stats.evictionAttempts.end(), 0));
   counters_.updateDelta(statPrefix + "cache.alloc_failures",
-                        stats.allocFailures);
+      std::accumulate(stats.allocFailures.begin(), stats.allocFailures.end(), 0));
   counters_.updateDelta(statPrefix + "cache.invalid_allocs",
                         stats.invalidAllocs);
 
@@ -476,6 +478,10 @@
   visitEstimates(uploadStatsNanoToMicro, stats.allocateLatencyNs,
                  statPrefix + "allocate.latency_us");
+  visitEstimates(uploadStatsNanoToMicro, stats.bgEvictLatencyNs,
+                 statPrefix + "background.eviction.latency_us");
+  visitEstimates(uploadStatsNanoToMicro, stats.bgPromoteLatencyNs,
+                 statPrefix + "background.promotion.latency_us");
   visitEstimates(uploadStatsNanoToMicro, stats.moveChainedLatencyNs,
                  statPrefix + "move.chained.latency_us");
   visitEstimates(uploadStatsNanoToMicro, stats.moveRegularLatencyNs,
diff --git a/cachelib/allocator/Cache.h b/cachelib/allocator/Cache.h
index e225ba8a01..c871358189 100644
--- a/cachelib/allocator/Cache.h
+++ b/cachelib/allocator/Cache.h
@@ -85,6 +85,9 @@ class CacheBase {
   CacheBase(CacheBase&&) = default;
   CacheBase& operator=(CacheBase&&) = default;
 
+  // TODO: come up
with some reasonable number + static constexpr unsigned kMaxTiers = 2; + // Get a string referring to the cache name for this cache virtual const std::string getCacheName() const = 0; @@ -95,6 +98,12 @@ class CacheBase { // // @param poolId The pool id to query virtual const MemoryPool& getPool(PoolId poolId) const = 0; + + // Get the reference to a memory pool using a tier id, for stats purposes + // + // @param poolId The pool id to query + // @param tierId The tier of the pool id + virtual const MemoryPool& getPoolByTid(PoolId poolId, TierId tid) const = 0; // Get Pool specific stats (regular pools). This includes stats from the // Memory Pool and also the cache. @@ -102,6 +111,12 @@ class CacheBase { // @param poolId the pool id virtual PoolStats getPoolStats(PoolId poolId) const = 0; + // Get Allocation Class specific stats. + // + // @param poolId the pool id + // @param classId the class id + virtual ACStats getACStats(TierId tid, PoolId poolId, ClassId classId) const = 0; + // @param poolId the pool id virtual AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId poolId) const = 0; diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h index 92a04807a7..11e9058a34 100644 --- a/cachelib/allocator/CacheAllocator-inl.h +++ b/cachelib/allocator/CacheAllocator-inl.h @@ -16,6 +16,8 @@ #pragma once +#include + namespace facebook { namespace cachelib { @@ -35,6 +37,7 @@ CacheAllocator::CacheAllocator(SharedMemNewT, Config config) template CacheAllocator::CacheAllocator(SharedMemAttachT, Config config) : CacheAllocator(InitMemType::kMemAttach, config) { + /* TODO - per tier? */ for (auto pid : *metadata_.compactCachePools()) { isCompactCachePool_[pid] = true; } @@ -56,6 +59,9 @@ CacheAllocator::CacheAllocator( tempShm_(type == InitMemType::kNone && isOnShm_ ? std::make_unique(config_.getCacheSize()) : nullptr), + privMemManager_(type == InitMemType::kNone && !isOnShm_ + ? std::make_unique() + : nullptr), shmManager_(type != InitMemType::kNone ? std::make_unique(config_.cacheDir, config_.isUsingPosixShm()) @@ -67,12 +73,12 @@ CacheAllocator::CacheAllocator( : serialization::CacheAllocatorMetadata{}}, allocator_(initAllocator(type)), compactCacheManager_(type != InitMemType::kMemAttach - ? std::make_unique(*allocator_) - : restoreCCacheManager()), + ? std::make_unique(*allocator_[0] /* TODO: per tier */) + : restoreCCacheManager(0/* TODO: per tier */)), compressor_(createPtrCompressor()), mmContainers_(type == InitMemType::kMemAttach ? deserializeMMContainers(*deserializer_, compressor_) - : MMContainers{}), + : MMContainers{getNumTiers()}), accessContainer_(initAccessContainer( type, detail::kShmHashTableName, config.accessConfig)), chainedItemAccessContainer_( @@ -81,6 +87,8 @@ CacheAllocator::CacheAllocator( config.chainedItemAccessConfig)), chainedItemLocks_(config_.chainedItemsLockPower, std::make_shared()), + movesMap_(kShards), + moveLock_(kShards), cacheCreationTime_{ type != InitMemType::kMemAttach ? 
util::getCurrentTimeSec() @@ -105,48 +113,115 @@ CacheAllocator::~CacheAllocator() { } template -ShmSegmentOpts CacheAllocator::createShmCacheOpts() { +ShmSegmentOpts CacheAllocator::createShmCacheOpts(TierId tid) { ShmSegmentOpts opts; opts.alignment = sizeof(Slab); // TODO: we support single tier so far - if (config_.memoryTierConfigs.size() > 1) { - throw std::invalid_argument("CacheLib only supports a single memory tier"); + if (config_.memoryTierConfigs.size() > 2) { + throw std::invalid_argument("CacheLib only supports two memory tiers"); } - opts.memBindNumaNodes = config_.memoryTierConfigs[0].getMemBind(); + opts.memBindNumaNodes = config_.memoryTierConfigs[tid].getMemBind(); + return opts; +} + +template +PrivateSegmentOpts CacheAllocator::createPrivateSegmentOpts(TierId tid) { + PrivateSegmentOpts opts; + opts.alignment = sizeof(Slab); + auto memoryTierConfigs = config_.getMemoryTierConfigs(); + opts.memBindNumaNodes = memoryTierConfigs[tid].getMemBind(); + return opts; } +template +size_t CacheAllocator::memoryTierSize(TierId tid) const { + auto& memoryTierConfigs = config_.memoryTierConfigs; + auto partitions = std::accumulate(memoryTierConfigs.begin(), memoryTierConfigs.end(), 0UL, + [](const size_t i, const MemoryTierCacheConfig& config){ + return i + config.getRatio(); + }); + + return memoryTierConfigs[tid].calculateTierSize(config_.getCacheSize(), partitions); +} + +template +std::unique_ptr +CacheAllocator::createPrivateAllocator(TierId tid) { + if (isOnShm_) + return std::make_unique( + getAllocatorConfig(config_), + tempShm_->getAddr(), + memoryTierSize(tid)); + else + return std::make_unique( + getAllocatorConfig(config_), + privMemManager_->createMapping(config_.size, createPrivateSegmentOpts(tid)), + memoryTierSize(tid)); +} + template std::unique_ptr -CacheAllocator::createNewMemoryAllocator() { +CacheAllocator::createNewMemoryAllocator(TierId tid) { + size_t tierSize = memoryTierSize(tid); return std::make_unique( getAllocatorConfig(config_), shmManager_ - ->createShm(detail::kShmCacheName, config_.getCacheSize(), - config_.slabMemoryBaseAddr, createShmCacheOpts()) + ->createShm(detail::kShmCacheName + std::to_string(tid), + tierSize, config_.slabMemoryBaseAddr, + createShmCacheOpts(tid)) .addr, - config_.getCacheSize()); + tierSize); } template std::unique_ptr -CacheAllocator::restoreMemoryAllocator() { +CacheAllocator::restoreMemoryAllocator(TierId tid) { return std::make_unique( deserializer_->deserialize(), shmManager_ - ->attachShm(detail::kShmCacheName, config_.slabMemoryBaseAddr, - createShmCacheOpts()) - .addr, - config_.getCacheSize(), + ->attachShm(detail::kShmCacheName + std::to_string(tid), + config_.slabMemoryBaseAddr, createShmCacheOpts(tid)).addr, + memoryTierSize(tid), config_.disableFullCoredump); } +template +std::vector> +CacheAllocator::createPrivateAllocators() { + std::vector> allocators; + for (int tid = 0; tid < getNumTiers(); tid++) { + allocators.emplace_back(createPrivateAllocator(tid)); + } + return allocators; +} + +template +std::vector> +CacheAllocator::createAllocators() { + std::vector> allocators; + for (int tid = 0; tid < getNumTiers(); tid++) { + allocators.emplace_back(createNewMemoryAllocator(tid)); + } + return allocators; +} + +template +std::vector> +CacheAllocator::restoreAllocators() { + std::vector> allocators; + for (int tid = 0; tid < getNumTiers(); tid++) { + allocators.emplace_back(restoreMemoryAllocator(tid)); + } + return allocators; +} + template std::unique_ptr -CacheAllocator::restoreCCacheManager() { 
+CacheAllocator::restoreCCacheManager(TierId tid) { return std::make_unique( deserializer_->deserialize(), - *allocator_); + *allocator_[tid]); } template @@ -235,24 +310,30 @@ void CacheAllocator::initWorkers() { config_.poolOptimizeStrategy, config_.ccacheOptimizeStepSizePercent); } + + if (config_.backgroundEvictorEnabled()) { + startNewBackgroundEvictor(config_.backgroundEvictorInterval, + config_.backgroundEvictorStrategy, + config_.backgroundEvictorThreads); + } + + if (config_.backgroundPromoterEnabled()) { + startNewBackgroundPromoter(config_.backgroundPromoterInterval, + config_.backgroundPromoterStrategy, + config_.backgroundPromoterThreads); + } } template -std::unique_ptr CacheAllocator::initAllocator( +std::vector> +CacheAllocator::initAllocator( InitMemType type) { if (type == InitMemType::kNone) { - if (isOnShm_ == true) { - return std::make_unique(getAllocatorConfig(config_), - tempShm_->getAddr(), - config_.getCacheSize()); - } else { - return std::make_unique(getAllocatorConfig(config_), - config_.getCacheSize()); - } + return createPrivateAllocators(); } else if (type == InitMemType::kMemNew) { - return createNewMemoryAllocator(); + return createAllocators(); } else if (type == InitMemType::kMemAttach) { - return restoreMemoryAllocator(); + return restoreAllocators(); } // Invalid type @@ -320,27 +401,54 @@ CacheAllocator::allocate(PoolId poolId, } template -typename CacheAllocator::WriteHandle -CacheAllocator::allocateInternal(PoolId pid, - typename Item::Key key, - uint32_t size, - uint32_t creationTime, - uint32_t expiryTime) { - util::LatencyTracker tracker{stats().allocateLatency_}; +bool CacheAllocator::shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid) { + // TODO: should we also work on lower tiers? should we have separate set of params? + if (tid == 1) return false; + return (1-getACStats(tid, pid, cid).usageFraction())*100 <= config_.lowEvictionAcWatermark; +} + +template +size_t CacheAllocator::backgroundWorkerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers) { + XDCHECK(numWorkers); + + // TODO: came up with some better sharding (use some hashing) + return (tid + pid + cid) % numWorkers; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::allocateInternalTier(TierId tid, + PoolId pid, + typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime, + bool fromBgThread, + bool evict) { + util::LatencyTracker tracker{stats().allocateLatency_, static_cast(!fromBgThread)}; SCOPE_FAIL { stats_.invalidAllocs.inc(); }; // number of bytes required for this item const auto requiredSize = Item::getRequiredSize(key, size); // the allocation class in our memory allocator. 
- const auto cid = allocator_->getAllocationClassId(pid, requiredSize); + const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); - (*stats_.allocAttempts)[pid][cid].inc(); + util::RollingLatencyTracker rollTracker{(*stats_.classAllocLatency)[tid][pid][cid]}; - void* memory = allocator_->allocate(pid, requiredSize); - if (memory == nullptr) { - memory = findEviction(pid, cid); + (*stats_.allocAttempts)[tid][pid][cid].inc(); + + void* memory = allocator_[tid]->allocate(pid, requiredSize); + + if (backgroundEvictor_.size() && !fromBgThread && (memory == nullptr || shouldWakeupBgEvictor(tid, pid, cid))) { + backgroundEvictor_[backgroundWorkerId(tid, pid, cid, backgroundEvictor_.size())]->wakeUp(); + } + + if (memory == nullptr && !evict) { + return {}; + } else if (memory == nullptr) { + memory = findEviction(tid, pid, cid); } WriteHandle handle; @@ -351,18 +459,18 @@ CacheAllocator::allocateInternal(PoolId pid, // for example. SCOPE_FAIL { // free back the memory to the allocator since we failed. - allocator_->free(memory); + allocator_[tid]->free(memory); }; handle = acquire(new (memory) Item(key, size, creationTime, expiryTime)); if (handle) { handle.markNascent(); - (*stats_.fragmentationSize)[pid][cid].add( + (*stats_.fragmentationSize)[tid][pid][cid].add( util::getFragmentation(*this, *handle)); } } else { // failed to allocate memory. - (*stats_.allocFailures)[pid][cid].inc(); + (*stats_.allocFailures)[tid][pid][cid].inc(); // wake up rebalancer if (!config_.poolRebalancerDisableForcedWakeUp && poolRebalancer_) { poolRebalancer_->wakeUp(); @@ -379,6 +487,23 @@ CacheAllocator::allocateInternal(PoolId pid, return handle; } +template +typename CacheAllocator::WriteHandle +CacheAllocator::allocateInternal(PoolId pid, + typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime, + bool fromBgThread) { + auto tid = 0; /* TODO: consult admission policy */ + for(TierId tid = 0; tid < getNumTiers(); ++tid) { + bool evict = !config_.insertToFirstFreeTier || tid == getNumTiers() - 1; + auto handle = allocateInternalTier(tid, pid, key, size, creationTime, expiryTime, fromBgThread, evict); + if (handle) return handle; + } + return {}; +} + template typename CacheAllocator::WriteHandle CacheAllocator::allocateChainedItem(const ReadHandle& parent, @@ -402,6 +527,18 @@ template typename CacheAllocator::WriteHandle CacheAllocator::allocateChainedItemInternal( const ReadHandle& parent, uint32_t size) { + auto tid = 0; /* TODO: consult admission policy */ + for(TierId tid = 0; tid < getNumTiers(); ++tid) { + auto handle = allocateChainedItemInternalTier(*parent, size, tid); + if (handle) return handle; + } + return {}; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::allocateChainedItemInternalTier( + const Item& parent, uint32_t size, TierId tid) { util::LatencyTracker tracker{stats().allocateLatency_}; SCOPE_FAIL { stats_.invalidAllocs.inc(); }; @@ -409,29 +546,33 @@ CacheAllocator::allocateChainedItemInternal( // number of bytes required for this item const auto requiredSize = ChainedItem::getRequiredSize(size); - const auto pid = allocator_->getAllocInfo(parent->getMemory()).poolId; - const auto cid = allocator_->getAllocationClassId(pid, requiredSize); + const auto ptid = getTierId(parent); //it is okay because pools/classes are duplicated among the tiers + const auto pid = allocator_[ptid]->getAllocInfo(parent.getMemory()).poolId; + const auto cid = allocator_[ptid]->getAllocationClassId(pid, requiredSize); - 
(*stats_.allocAttempts)[pid][cid].inc(); + util::RollingLatencyTracker rollTracker{ + (*stats_.classAllocLatency)[tid][pid][cid]}; + + (*stats_.allocAttempts)[tid][pid][cid].inc(); - void* memory = allocator_->allocate(pid, requiredSize); + void* memory = allocator_[tid]->allocate(pid, requiredSize); if (memory == nullptr) { - memory = findEviction(pid, cid); + memory = findEviction(tid, pid, cid); } if (memory == nullptr) { - (*stats_.allocFailures)[pid][cid].inc(); + (*stats_.allocFailures)[tid][pid][cid].inc(); return WriteHandle{}; } - SCOPE_FAIL { allocator_->free(memory); }; + SCOPE_FAIL { allocator_[tid]->free(memory); }; auto child = acquire( - new (memory) ChainedItem(compressor_.compress(parent.getInternal()), size, + new (memory) ChainedItem(compressor_.compress(&parent), size, util::getCurrentTimeSec())); if (child) { child.markNascent(); - (*stats_.fragmentationSize)[pid][cid].add( + (*stats_.fragmentationSize)[tid][pid][cid].add( util::getFragmentation(*this, *child)); } @@ -467,14 +608,15 @@ void CacheAllocator::addChainedItem(WriteHandle& parent, // Count a new child stats_.numChainedChildItems.inc(); - insertInMMContainer(*child); - // Increment refcount since this chained item is now owned by the parent // Parent will decrement the refcount upon release. Since this is an // internal refcount, we dont include it in active handle tracking. - child->incRef(); + auto ret = child->incRef(); + XDCHECK(ret == RefcountWithFlags::incResult::incOk); XDCHECK_EQ(2u, child->getRefCount()); + insertInMMContainer(*child); + invalidateNvm(*parent); if (auto eventTracker = getEventTracker()) { eventTracker->record(AllocatorApiEvent::ADD_CHAINED, parent->getKey(), @@ -538,22 +680,20 @@ CacheAllocator::getParentKey(const Item& chainedItem) { } template -void CacheAllocator::transferChainLocked(WriteHandle& parent, +void CacheAllocator::transferChainLocked(Item& parent, WriteHandle& newParent) { // parent must be in a state to not have concurrent readers. Eviction code - // paths rely on holding the last item handle. Since we hold on to an item - // handle here, the chain will not be touched by any eviction code path. - XDCHECK(parent); + // paths rely on holding the last item handle. XDCHECK(newParent); - XDCHECK_EQ(parent->getKey(), newParent->getKey()); - XDCHECK(parent->hasChainedItem()); + XDCHECK_EQ(parent.getKey(), newParent->getKey()); + XDCHECK(parent.hasChainedItem()); if (newParent->hasChainedItem()) { throw std::invalid_argument(folly::sformat( "New Parent {} has invalid state", newParent->toString())); } - auto headHandle = findChainedItem(*parent); + auto headHandle = findChainedItem(parent); XDCHECK(headHandle); // remove from the access container since we are changing the key @@ -565,6 +705,7 @@ void CacheAllocator::transferChainLocked(WriteHandle& parent, while (curr) { XDCHECK_EQ(curr == headHandle.get() ? 
2u : 1u, curr->getRefCount()); XDCHECK(curr->isInMMContainer()); + XDCHECK(!newParent->isMoving()); curr->changeKey(newParentPtr); curr = curr->getNext(compressor_); } @@ -576,7 +717,7 @@ void CacheAllocator::transferChainLocked(WriteHandle& parent, folly::sformat("Did not expect to find an existing chain for {}", newParent->toString(), oldHead->toString())); } - parent->unmarkHasChainedItem(); + parent.unmarkHasChainedItem(); } template @@ -587,7 +728,7 @@ void CacheAllocator::transferChainAndReplace( } { // scope for chained item lock auto l = chainedItemLocks_.lockExclusive(parent->getKey()); - transferChainLocked(parent, newParent); + transferChainLocked(*parent, newParent); } if (replaceIfAccessible(*parent, *newParent)) { @@ -654,33 +795,10 @@ CacheAllocator::replaceChainedItem(Item& oldItem, } template -typename CacheAllocator::WriteHandle -CacheAllocator::replaceChainedItemLocked(Item& oldItem, - WriteHandle newItemHdl, - const Item& parent) { - XDCHECK(newItemHdl != nullptr); - XDCHECK_GE(1u, oldItem.getRefCount()); - - // grab the handle to the old item so that we can return this. Also, we need - // to drop the refcount the parent holds on oldItem by manually calling - // decRef. To do that safely we need to have a proper outstanding handle. - auto oldItemHdl = acquire(&oldItem); - - // Replace the old chained item with new item in the MMContainer before we - // actually replace the old item in the chain - - if (!replaceChainedItemInMMContainer(oldItem, *newItemHdl)) { - // This should never happen since we currently hold an valid - // parent handle. None of its chained items can be removed - throw std::runtime_error(folly::sformat( - "chained item cannot be replaced in MM container, oldItem={}, " - "newItem={}, parent={}", - oldItem.toString(), newItemHdl->toString(), parent.toString())); - } - - XDCHECK(!oldItem.isInMMContainer()); - XDCHECK(newItemHdl->isInMMContainer()); - +void CacheAllocator::replaceInChainLocked(Item& oldItem, + WriteHandle& newItemHdl, + const Item& parent, + bool fromMove) { auto head = findChainedItem(parent); XDCHECK(head != nullptr); XDCHECK_EQ(reinterpret_cast( @@ -709,16 +827,62 @@ CacheAllocator::replaceChainedItemLocked(Item& oldItem, oldItem.asChainedItem().getNext(compressor_), compressor_); oldItem.asChainedItem().setNext(nullptr, compressor_); - // this should not result in 0 refcount. We are bumping down the internal - // refcount. If it did, we would leak an item. - oldItem.decRef(); - XDCHECK_LT(0u, oldItem.getRefCount()) << oldItem.toString(); + //if called from moveChainedItem then ref will be zero, else + //greater than 0 + if (fromMove) { + //if this is the head chained item, release the handle now + //while refCount > 1 so that the destructor does not + //call releaseBackToAllocator since we want recycle oldItem + if (head) { + head.reset(); + XDCHECK_EQ(1u, oldItem.getRefCount()); + } + oldItem.decRef(); + XDCHECK_EQ(0u, oldItem.getRefCount()) << oldItem.toString(); + } else { + oldItem.decRef(); + XDCHECK_LT(0u, oldItem.getRefCount()) << oldItem.toString(); + } // increment refcount to indicate parent owns this similar to addChainedItem // Since this is an internal refcount, we dont include it in active handle // tracking. 
- newItemHdl->incRef(); + auto ret = newItemHdl->incRef(); + XDCHECK(ret == RefcountWithFlags::incResult::incOk); +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::replaceChainedItemLocked(Item& oldItem, + WriteHandle newItemHdl, + const Item& parent) { + XDCHECK(newItemHdl != nullptr); + XDCHECK_GE(1u, oldItem.getRefCount()); + + // grab the handle to the old item so that we can return this. Also, we need + // to drop the refcount the parent holds on oldItem by manually calling + // decRef. To do that safely we need to have a proper outstanding handle. + auto oldItemHdl = acquire(&oldItem); + XDCHECK_GE(2u, oldItem.getRefCount()); + + // Replace the old chained item with new item in the MMContainer before we + // actually replace the old item in the chain + + if (!replaceChainedItemInMMContainer(oldItem, *newItemHdl)) { + // This should never happen since we currently hold an valid + // parent handle. None of its chained items can be removed + throw std::runtime_error(folly::sformat( + "chained item cannot be replaced in MM container, oldItem={}, " + "newItem={}, parent={}", + oldItem.toString(), newItemHdl->toString(), parent.toString())); + } + + XDCHECK(!oldItem.isInMMContainer()); + XDCHECK(newItemHdl->isInMMContainer()); + + replaceInChainLocked(oldItem, newItemHdl, parent, false); + return oldItemHdl; } @@ -732,8 +896,8 @@ CacheAllocator::releaseBackToAllocator(Item& it, throw std::runtime_error( folly::sformat("cannot release this item: {}", it.toString())); } - - const auto allocInfo = allocator_->getAllocInfo(it.getMemory()); + const auto tid = getTierId(it); + const auto allocInfo = allocator_[tid]->getAllocInfo(it.getMemory()); if (ctx == RemoveContext::kEviction) { const auto timeNow = util::getCurrentTimeSec(); @@ -744,21 +908,23 @@ CacheAllocator::releaseBackToAllocator(Item& it, stats_.perPoolEvictionAgeSecs_[allocInfo.poolId].trackValue(refreshTime); } - (*stats_.fragmentationSize)[allocInfo.poolId][allocInfo.classId].sub( + (*stats_.fragmentationSize)[tid][allocInfo.poolId][allocInfo.classId].sub( util::getFragmentation(*this, it)); // Chained items can only end up in this place if the user has allocated // memory for a chained item but has decided not to insert the chained item // to a parent item and instead drop the chained item handle. In this case, // we free the chained item directly without calling remove callback. - if (it.isChainedItem()) { + // + // Except if we are moving a chained item between tiers - + // then it == toRecycle and we will want the normal recycle path + if (it.isChainedItem() && &it != toRecycle) { if (toRecycle) { throw std::runtime_error( folly::sformat("Can not recycle a chained item {}, toRecyle", it.toString(), toRecycle->toString())); } - - allocator_->free(&it); + allocator_[tid]->free(&it); return ReleaseRes::kReleased; } @@ -825,17 +991,19 @@ CacheAllocator::releaseBackToAllocator(Item& it, while (head) { auto next = head->getNext(compressor_); - + const auto ctid = getTierId(head); const auto childInfo = - allocator_->getAllocInfo(static_cast(head)); - (*stats_.fragmentationSize)[childInfo.poolId][childInfo.classId].sub( + allocator_[ctid]->getAllocInfo(static_cast(head)); + (*stats_.fragmentationSize)[ctid][childInfo.poolId][childInfo.classId].sub( util::getFragmentation(*this, *head)); removeFromMMContainer(*head); // If this chained item is marked as moving, we will not free it. 
// We must capture the moving state before we do the decRef when - // we know the item must still be valid + // we know the item must still be valid. Item cannot be marked as + // exclusive. Only parent can be marked as such and even parent needs + // to be unmark prior to calling releaseBackToAllocator. const bool wasMoving = head->isMoving(); XDCHECK(!head->isMarkedForEviction()); @@ -847,22 +1015,21 @@ CacheAllocator::releaseBackToAllocator(Item& it, // If the item is already moving and we already decremented the // refcount, we don't need to free this item. We'll let the slab // release thread take care of that - if (!wasMoving) { - if (childRef != 0) { - throw std::runtime_error(folly::sformat( - "chained item refcount is not zero. We cannot proceed! " - "Ref: {}, Chained Item: {}", - childRef, head->toString())); - } + XDCHECK(!wasMoving); + if (childRef != 0) { + throw std::runtime_error(folly::sformat( + "chained item refcount is not zero. We cannot proceed! " + "Ref: {}, Chained Item: {}", + childRef, head->toString())); + } - // Item is not moving and refcount is 0, we can proceed to - // free it or recylce the memory - if (head == toRecycle) { - XDCHECK(ReleaseRes::kReleased != res); - res = ReleaseRes::kRecycled; - } else { - allocator_->free(head); - } + // Item is not moving and refcount is 0, we can proceed to + // free it or recylce the memory + if (head == toRecycle) { + XDCHECK(ReleaseRes::kReleased != res); + res = ReleaseRes::kRecycled; + } else { + allocator_[ctid]->free(head); } stats_.numChainedChildItems.dec(); @@ -872,23 +1039,24 @@ CacheAllocator::releaseBackToAllocator(Item& it, } if (&it == toRecycle) { + XDCHECK_EQ(it.getRefCount(),0u); XDCHECK(ReleaseRes::kReleased != res); res = ReleaseRes::kRecycled; } else { XDCHECK(it.isDrained()); - allocator_->free(&it); + allocator_[tid]->free(&it); } return res; } template -bool CacheAllocator::incRef(Item& it) { - if (it.incRef()) { - ++handleCount_.tlStats(); - return true; - } - return false; +RefcountWithFlags::incResult CacheAllocator::incRef(Item& it) { + auto ret = it.incRef(); + if (ret == RefcountWithFlags::incResult::incOk) { + ++handleCount_.tlStats(); + } + return ret; } template @@ -908,11 +1076,19 @@ CacheAllocator::acquire(Item* it) { SCOPE_FAIL { stats_.numRefcountOverflow.inc(); }; - if (LIKELY(incRef(*it))) { - return WriteHandle{it, *this}; - } else { - // item is being evicted - return WriteHandle{}; + while (true) { + auto incRes = incRef(*it); + if (LIKELY(incRes == RefcountWithFlags::incResult::incOk)) { + return WriteHandle{it, *this}; + } else if (incRes == RefcountWithFlags::incResult::incFailedEviction){ + // item is being evicted + return WriteHandle{}; + } else { + // item is being moved - wait for completion + WriteHandle handle; + if (tryGetHandleWithWaitContextForMovingItem(*it, handle)) + return handle; + } } } @@ -955,6 +1131,25 @@ bool CacheAllocator::replaceInMMContainer(Item& oldItem, } } +template +bool CacheAllocator::replaceInMMContainer(Item* oldItem, + Item& newItem) { + return replaceInMMContainer(*oldItem, newItem); +} + +template +bool CacheAllocator::replaceInMMContainer(EvictionIterator& oldItemIt, + Item& newItem) { + auto& oldContainer = getMMContainer(*oldItemIt); + auto& newContainer = getMMContainer(newItem); + + // This function is used for eviction across tiers + XDCHECK(&oldContainer != &newContainer); + oldContainer.remove(oldItemIt); + + return newContainer.add(newItem); +} + template bool CacheAllocator::replaceChainedItemInMMContainer( Item& oldItem, Item& 
newItem) { @@ -1054,7 +1249,6 @@ CacheAllocator::insertOrReplace(const WriteHandle& handle) { : std::unique_lock(); replaced = accessContainer_->insertOrReplace(*(handle.getInternal())); - if (replaced && replaced->isNvmClean() && !replaced->isNvmEvicted()) { // item is to be replaced and the destructor will be executed // upon memory released, mark it in nvm to avoid destructor @@ -1100,19 +1294,73 @@ CacheAllocator::insertOrReplace(const WriteHandle& handle) { return replaced; } +/* Next two methods are used to asynchronously move Item between memory tiers. + * + * The thread, which moves Item, allocates new Item in the tier we are moving to + * and calls moveRegularItem() method. This method does the following: + * 1. Update the access container with the new item from the tier we are + * moving to. This Item has moving flag set. + * 2. Copy data from the old Item to the new one. + * + * Concurrent threads which are getting handle to the same key: + * 1. When a handle is created it checks if the moving flag is set + * 2. If so, Handle implementation creates waitContext and adds it to the + * MoveCtx by calling handleWithWaitContextForMovingItem() method. + * 3. Wait until the moving thread will complete its job. + */ template -bool CacheAllocator::moveRegularItem(Item& oldItem, - WriteHandle& newItemHdl) { - XDCHECK(config_.moveCb); - util::LatencyTracker tracker{stats_.moveRegularLatency_}; +bool +CacheAllocator::tryGetHandleWithWaitContextForMovingItem(Item& item, WriteHandle& handle) { + auto shard = getShardForKey(item.getKey()); + auto& movesMap = getMoveMapForShard(shard); + { + auto lock = getMoveLockForShard(shard); - if (!oldItem.isAccessible() || oldItem.isExpired()) { - return false; + // item might have been evicted or moved before the lock was acquired + if (!item.isMoving()) + return false; + + WriteHandle hdl{*this}; + auto waitContext = hdl.getItemWaitContext(); + + auto ret = movesMap.try_emplace(item.getKey(), std::make_unique()); + ret.first->second->addWaiter(std::move(waitContext)); + + handle = std::move(hdl); + return true; } +} + +template +size_t CacheAllocator::wakeUpWaitersLocked(folly::StringPiece key, + WriteHandle&& handle) { + std::unique_ptr ctx; + auto shard = getShardForKey(key); + auto& movesMap = getMoveMapForShard(shard); + { + auto lock = getMoveLockForShard(shard); + movesMap.eraseInto(key, [&](auto &&key, auto &&value) { + ctx = std::move(value); + }); + } + + if (ctx) { + ctx->setItemHandle(std::move(handle)); + return ctx->numWaiters(); + } + + return 0; +} + +template +bool CacheAllocator::moveRegularItem( + Item& oldItem, WriteHandle& newItemHdl) { + XDCHECK(oldItem.isMoving()); + XDCHECK(!oldItem.isExpired()); + // TODO: should we introduce new latency tracker. E.g. evictRegularLatency_ + // ??? util::LatencyTracker tracker{stats_.evictRegularLatency_}; XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize()); - XDCHECK_EQ(reinterpret_cast(&getMMContainer(oldItem)), - reinterpret_cast(&getMMContainer(*newItemHdl))); // take care of the flags before we expose the item to be accessed. this // will ensure that when another thread removes the item from RAM, we issue @@ -1121,52 +1369,44 @@ bool CacheAllocator::moveRegularItem(Item& oldItem, newItemHdl->markNvmClean(); } - // Execute the move callback. We cannot make any guarantees about the - // consistency of the old item beyond this point, because the callback can - // do more than a simple memcpy() e.g. update external references. 
If there - // are any remaining handles to the old item, it is the caller's - // responsibility to invalidate them. The move can only fail after this - // statement if the old item has been removed or replaced, in which case it - // should be fine for it to be left in an inconsistent state. - config_.moveCb(oldItem, *newItemHdl, nullptr); - // Inside the access container's lock, this checks if the old item is - // accessible and its refcount is zero. If the item is not accessible, - // there is no point to replace it since it had already been removed - // or in the process of being removed. If the item is in cache but the - // refcount is non-zero, it means user could be attempting to remove - // this item through an API such as remove(itemHandle). In this case, - // it is unsafe to replace the old item with a new one, so we should - // also abort. - if (!accessContainer_->replaceIf(oldItem, *newItemHdl, - itemExclusivePredicate)) { - return false; + if (config_.moveCb) { + // Execute the move callback. We cannot make any guarantees about the + // consistency of the old item beyond this point, because the callback can + // do more than a simple memcpy() e.g. update external references. If there + // are any remaining handles to the old item, it is the caller's + // responsibility to invalidate them. The move can only fail after this + // statement if the old item has been removed or replaced, in which case it + // should be fine for it to be left in an inconsistent state. + config_.moveCb(oldItem, *newItemHdl, nullptr); + } else { + std::memcpy(newItemHdl->getMemory(), oldItem.getMemory(), + oldItem.getSize()); } - // Inside the MM container's lock, this checks if the old item exists to - // make sure that no other thread removed it, and only then replaces it. - if (!replaceInMMContainer(oldItem, *newItemHdl)) { - accessContainer_->remove(*newItemHdl); - return false; - } + // Adding the item to mmContainer has to succeed since no one can remove the item + auto& newContainer = getMMContainer(*newItemHdl); + auto mmContainerAdded = newContainer.add(*newItemHdl); + XDCHECK(mmContainerAdded); - // Replacing into the MM container was successful, but someone could have - // called insertOrReplace() or remove() before or after the - // replaceInMMContainer() operation, which would invalidate newItemHdl. - if (!newItemHdl->isAccessible()) { - removeFromMMContainer(*newItemHdl); - return false; - } - // no one can add or remove chained items at this point + auto predicate = [&](const Item& item){ + // we rely on moving flag being set (it should block all readers) + XDCHECK_EQ(item.getRefCount(),0); + XDCHECK(item.isMoving()); + return item.isMoving(); + }; + if (oldItem.hasChainedItem()) { - // safe to acquire handle for a moving Item - auto oldHandle = acquire(&oldItem); - XDCHECK_EQ(1u, oldHandle->getRefCount()) << oldHandle->toString(); XDCHECK(!newItemHdl->hasChainedItem()) << newItemHdl->toString(); try { - auto l = chainedItemLocks_.lockExclusive(oldItem.getKey()); - transferChainLocked(oldHandle, newItemHdl); + auto l = chainedItemLocks_.tryLockExclusive(oldItem.getKey()); + if (l) { + transferChainLocked(oldItem, newItemHdl); + } else { + newContainer.remove(*newItemHdl); + return false; + } } catch (const std::exception& e) { // this should never happen because we drained all the handles. 
XLOGF(DFATAL, "{}", e.what()); @@ -1176,71 +1416,87 @@ bool CacheAllocator::moveRegularItem(Item& oldItem, XDCHECK(!oldItem.hasChainedItem()); XDCHECK(newItemHdl->hasChainedItem()); } + + if (!accessContainer_->replaceIf(oldItem, *newItemHdl, predicate)) { + newContainer.remove(*newItemHdl); + return false; + } + newItemHdl.unmarkNascent(); return true; } template bool CacheAllocator::moveChainedItem(ChainedItem& oldItem, - WriteHandle& newItemHdl) { - XDCHECK(config_.moveCb); + WriteHandle& newItemHdl, + Item& parentItem) { + XDCHECK(parentItem.isMoving()); util::LatencyTracker tracker{stats_.moveChainedLatency_}; - // This item has been unlinked from its parent and we're the only - // owner of it, so we're done here - if (!oldItem.isInMMContainer() || oldItem.isOnlyMoving()) { - return false; - } - - const auto parentKey = oldItem.getParentItem(compressor_).getKey(); - - // Grab lock to prevent anyone else from modifying the chain - auto l = chainedItemLocks_.lockExclusive(parentKey); - - auto parentHandle = - validateAndGetParentHandleForChainedMoveLocked(oldItem, parentKey); - if (!parentHandle) { - return false; + auto& expectedParent = oldItem.getParentItem(compressor_); + const auto parentKey = expectedParent.getKey(); + auto l = chainedItemLocks_.tryLockExclusive(parentKey); + if (!l) { + return false; } + XDCHECK_EQ(&expectedParent,&parentItem); - // once we have the moving sync and valid parent for the old item, check if + // check if // the original allocation was made correctly. If not, we destroy the // allocation to indicate a retry to moving logic above. if (reinterpret_cast( &newItemHdl->asChainedItem().getParentItem(compressor_)) != - reinterpret_cast(&parentHandle->asChainedItem())) { - newItemHdl.reset(); + reinterpret_cast(&parentItem.asChainedItem())) { + XDCHECK(false); return false; } XDCHECK_EQ(reinterpret_cast( &newItemHdl->asChainedItem().getParentItem(compressor_)), - reinterpret_cast(&parentHandle->asChainedItem())); - - // In case someone else had removed this chained item from its parent by now - // So we check again to see if the it has been unlinked from its parent - if (!oldItem.isInMMContainer() || oldItem.isOnlyMoving()) { - return false; - } + reinterpret_cast(&parentItem.asChainedItem())); - auto parentPtr = parentHandle.getInternal(); + auto parentPtr = &parentItem; XDCHECK_EQ(reinterpret_cast(parentPtr), reinterpret_cast(&oldItem.getParentItem(compressor_))); - // Invoke the move callback to fix up any user data related to the chain - config_.moveCb(oldItem, *newItemHdl, parentPtr); + if (config_.moveCb) { + // Execute the move callback. We cannot make any guarantees about the + // consistency of the old item beyond this point, because the callback can + // do more than a simple memcpy() e.g. update external references. If there + // are any remaining handles to the old item, it is the caller's + // responsibility to invalidate them. The move can only fail after this + // statement if the old item has been removed or replaced, in which case it + // should be fine for it to be left in an inconsistent state. + config_.moveCb(oldItem, *newItemHdl, parentPtr); + } else { + std::memcpy(newItemHdl->getMemory(), oldItem.getMemory(), + oldItem.getSize()); + } // Replace the new item in the position of the old one before both in the // parent's chain and the MMContainer. 
- auto oldItemHandle = - replaceChainedItemLocked(oldItem, std::move(newItemHdl), *parentHandle); - XDCHECK(oldItemHandle->isMoving()); - XDCHECK(!oldItemHandle->isInMMContainer()); + XDCHECK_EQ(parentItem.getRefCount(),0); + XDCHECK(parentItem.isMoving()); + XDCHECK(l); + + auto& newContainer = getMMContainer(*newItemHdl); + auto mmContainerAdded = newContainer.add(*newItemHdl); + XDCHECK(mmContainerAdded); + + replaceInChainLocked(oldItem, newItemHdl, parentItem, true); return true; } +template +typename CacheAllocator::NvmCacheT::PutToken +CacheAllocator::createPutToken(Item& item) { + const bool evictToNvmCache = shouldWriteToNvmCache(item); + return evictToNvmCache ? nvmCache_->createPutToken(item.getKey()) + : typename NvmCacheT::PutToken{}; +} + template void CacheAllocator::unlinkItemForEviction(Item& it) { XDCHECK(it.isMarkedForEviction()); @@ -1257,20 +1513,28 @@ void CacheAllocator::unlinkItemForEviction(Item& it) { template std::pair::Item*, typename CacheAllocator::Item*> -CacheAllocator::getNextCandidate(PoolId pid, +CacheAllocator::getNextCandidate(TierId tid, + PoolId pid, ClassId cid, unsigned int& searchTries) { typename NvmCacheT::PutToken token; Item* toRecycle = nullptr; + Item* toRecycleParent = nullptr; Item* candidate = nullptr; - auto& mmContainer = getMMContainer(pid, cid); - - mmContainer.withEvictionIterator([this, pid, cid, &candidate, &toRecycle, - &searchTries, &mmContainer, - &token](auto&& itr) { + bool isExpired = false; + Item* syncItem = nullptr; + bool chainedItem = false; + auto& mmContainer = getMMContainer(tid, pid, cid); + bool lastTier = tid+1 >= getNumTiers(); + + mmContainer.withEvictionIterator([this, tid, pid, cid, &candidate, + &toRecycle, &toRecycleParent, &syncItem, + &chainedItem, + &searchTries, &mmContainer, &lastTier, + &isExpired, &token](auto&& itr) { if (!itr) { ++searchTries; - (*stats_.evictionAttempts)[pid][cid].inc(); + (*stats_.evictionAttempts)[tid][pid][cid].inc(); return; } @@ -1278,50 +1542,82 @@ CacheAllocator::getNextCandidate(PoolId pid, config_.evictionSearchTries > searchTries) && itr) { ++searchTries; - (*stats_.evictionAttempts)[pid][cid].inc(); + (*stats_.evictionAttempts)[tid][pid][cid].inc(); auto* toRecycle_ = itr.get(); - auto* candidate_ = - toRecycle_->isChainedItem() + bool chainedItem_ = toRecycle_->isChainedItem(); + Item* toRecycleParent_ = chainedItem_ ? &toRecycle_->asChainedItem().getParentItem(compressor_) - : toRecycle_; - - const bool evictToNvmCache = shouldWriteToNvmCache(*candidate_); - auto putToken = evictToNvmCache - ? nvmCache_->createPutToken(candidate_->getKey()) - : typename NvmCacheT::PutToken{}; - - if (evictToNvmCache && !putToken.isValid()) { + : nullptr; + // in order to safely check if the expected parent (toRecycleParent_) matches + // the current parent on the chained item, we need to take the chained + // item lock so we are sure that nobody else will be editing the chain + auto l_ = chainedItem_ + ? 
chainedItemLocks_.tryLockExclusive(toRecycleParent_->getKey()) + : decltype(chainedItemLocks_.tryLockExclusive(toRecycle_->getKey()))(); + + if (chainedItem_ && + ( !l_ || &toRecycle_->asChainedItem().getParentItem(compressor_) + != toRecycleParent_) ) { + ++itr; + continue; + } + Item* candidate_; + Item* syncItem_; + //sync on the parent item for chained items to move to next tier + if (!lastTier && chainedItem_) { + syncItem_ = toRecycleParent_; + candidate_ = toRecycle_; + } else if (lastTier && chainedItem_) { + candidate_ = toRecycleParent_; + syncItem_ = toRecycleParent_; + } else { + candidate_ = toRecycle_; + syncItem_ = toRecycle_; + } + // if it's last tier, the item will be evicted + // need to create put token before marking it exclusive + const bool evictToNvmCache = lastTier && shouldWriteToNvmCache(*candidate_); + + auto token_ = evictToNvmCache + ? nvmCache_->createPutToken(candidate_->getKey()) + : typename NvmCacheT::PutToken{}; + + if (evictToNvmCache && !token_.isValid()) { stats_.evictFailConcurrentFill.inc(); ++itr; continue; } - - auto markedForEviction = candidate_->markForEviction(); - if (!markedForEviction) { + + auto marked = (lastTier || candidate_->isExpired()) ? syncItem_->markForEviction() : syncItem_->markMoving(); + if (!marked) { if (candidate_->hasChainedItem()) { stats_.evictFailParentAC.inc(); } else { stats_.evictFailAC.inc(); } ++itr; + XDCHECK_EQ(toRecycle,nullptr); + XDCHECK_EQ(candidate,nullptr); continue; } - + + XDCHECK(syncItem_->isMoving() || syncItem_->isMarkedForEviction()); + toRecycleParent = toRecycleParent_; + chainedItem = chainedItem_; // markForEviction to make sure no other thead is evicting the item - // nor holding a handle to that item + // nor holding a handle to that item if this is last tier + // since we won't be moving the item to the next tier toRecycle = toRecycle_; candidate = candidate_; - token = std::move(putToken); - - // Check if parent changed for chained items - if yes, we cannot - // remove the child from the mmContainer as we will not be evicting - // it. We could abort right here, but we need to cleanup in case - // unmarkForEviction() returns 0 - so just go through normal path. - if (!toRecycle_->isChainedItem() || - &toRecycle->asChainedItem().getParentItem(compressor_) == candidate) { - mmContainer.remove(itr); + isExpired = candidate_->isExpired(); + token = std::move(token_); + if (chainedItem) { + XDCHECK(l_); + XDCHECK_EQ(toRecycleParent,&toRecycle_->asChainedItem().getParentItem(compressor_)); } + mmContainer.remove(itr); + return; } }); @@ -1332,25 +1628,106 @@ CacheAllocator::getNextCandidate(PoolId pid, XDCHECK(toRecycle); XDCHECK(candidate); - XDCHECK(candidate->isMarkedForEviction()); + XDCHECK(candidate->isMoving() || candidate->isMarkedForEviction()); + + auto evictedToNext = (lastTier || isExpired) ? 
nullptr + : tryEvictToNextMemoryTier(*candidate, false); + if (!evictedToNext) { + // failed to move a chained item - so evict the entire chain + if (candidate->isChainedItem()) { + // candidate should be the parent now + XDCHECK(toRecycleParent->isMoving()); + XDCHECK_EQ(candidate,toRecycle); + candidate = toRecycleParent; // but now we evict the chain and in + // doing so recycle the child + } + // if insertOrReplace was called during the move, + // the candidate will not be accessible (the replace failed during tryEvict) + // - that is why we failed to + // evict to the next tier, and insertOrReplace + // will remove it from NVM cache; + // however, if the candidate is accessible, + // that means the allocation in the next + // tier failed - so we will continue to + // evict the item to NVM cache + bool failedToReplace = !candidate->isAccessible(); + if (!token.isValid() && !failedToReplace) { + token = createPutToken(*candidate); + } + // tryEvictToNextMemoryTier can fail if: + // a) allocation of the new item fails; in that case, + // it should still be possible to mark the item for eviction. + // b) another thread calls insertOrReplace and the item + // is no longer accessible + // + // in case we are on the last tier, we would have already marked the item + // as exclusive since we will not be moving it to the next tier + // but rather just evicting it altogether, so there is no need to + // markForEvictionWhenMoving + auto ret = (lastTier || isExpired) ? true : candidate->markForEvictionWhenMoving(); + XDCHECK(ret); + + unlinkItemForEviction(*candidate); + + if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate) + && !failedToReplace) { + nvmCache_->put(*candidate, std::move(token)); + } + // wake up any readers that wait for the move to complete + // it's safe to do now, as we have the item marked exclusive and + // no other reader can be added to the waiters list + wakeUpWaiters(*candidate, {}); + } else { + XDCHECK(!evictedToNext->isMarkedForEviction() && !evictedToNext->isMoving()); + XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); + XDCHECK(!candidate->isAccessible()); + XDCHECK(candidate->getKey() == evictedToNext->getKey()); + + (*stats_.numWritebacks)[tid][pid][cid].inc(); + if (chainedItem) { + XDCHECK(toRecycleParent->isMoving()); + XDCHECK_EQ(evictedToNext->getRefCount(),2u); + (*stats_.chainedItemEvictions)[tid][pid][cid].inc(); + // check if by releasing the item we intend to, we actually + // recycle the candidate.
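+ // Passing toRecycle lets releaseBackToAllocator() report kRecycled when
+ // the child allocation we want to reuse is freed as part of the chain.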
+ auto ret = releaseBackToAllocator(*candidate, RemoveContext::kEviction, + /* isNascent */ false, toRecycle); + XDCHECK_EQ(ret,ReleaseRes::kRecycled); + evictedToNext.reset(); // once we unmark moving, threads will try and alloc; drop + // the handle now - and the refcount will drop to 1 + auto ref = toRecycleParent->unmarkMoving(); + if (UNLIKELY(ref == 0)) { + wakeUpWaiters(*toRecycleParent,{}); + const auto res = + releaseBackToAllocator(*toRecycleParent, RemoveContext::kNormal, false); + XDCHECK(res == ReleaseRes::kReleased); + } else { + auto parentHandle = acquire(toRecycleParent); + if (parentHandle) { + wakeUpWaiters(*toRecycleParent,std::move(parentHandle)); + } // if the parent handle is null, that means some other thread + // would have called wakeUpWaiters with a null handle and released + // the parent back to the allocator + } + } else { + wakeUpWaiters(*candidate, std::move(evictedToNext)); + } + } - unlinkItemForEviction(*candidate); + XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); - if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate)) { - nvmCache_->put(*candidate, std::move(token)); - } return {candidate, toRecycle}; } template typename CacheAllocator::Item* -CacheAllocator::findEviction(PoolId pid, ClassId cid) { +CacheAllocator::findEviction(TierId tid, PoolId pid, ClassId cid) { // Keep searching for a candidate until we were able to evict it // or until the search limit has been exhausted unsigned int searchTries = 0; while (config_.evictionSearchTries == 0 || config_.evictionSearchTries > searchTries) { - auto [candidate, toRecycle] = getNextCandidate(pid, cid, searchTries); + auto [candidate, toRecycle] = getNextCandidate(tid, pid, cid, searchTries); // Reached the end of the eviction queue but couldn't find a candidate, // start again. @@ -1361,9 +1738,9 @@ CacheAllocator::findEviction(PoolId pid, ClassId cid) { // NULL. If `ref` == 0 then it means that we are the last holder of // that item. if (candidate->hasChainedItem()) { - (*stats_.chainedItemEvictions)[pid][cid].inc(); + (*stats_.chainedItemEvictions)[tid][pid][cid].inc(); } else { - (*stats_.regularItemEvictions)[pid][cid].inc(); + (*stats_.regularItemEvictions)[tid][pid][cid].inc(); } if (auto eventTracker = getEventTracker()) { @@ -1372,6 +1749,7 @@ CacheAllocator::findEviction(PoolId pid, ClassId cid) { candidate->getConfiguredTTL().count()); } + XDCHECK(!candidate->isChainedItem()); // check if by releasing the item we intend to, we actually // recycle the candidate.
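+ // Any chained-item candidate was already folded into its parent inside
+ // getNextCandidate(), so only non-chained candidates reach this release.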
auto ret = releaseBackToAllocator(*candidate, RemoveContext::kEviction, @@ -1431,6 +1809,117 @@ bool CacheAllocator::shouldWriteToNvmCacheExclusive( return true; } +template +typename CacheAllocator::WriteHandle +CacheAllocator::tryEvictToNextMemoryTier( + TierId tid, PoolId pid, Item& item, bool fromBgThread) { + + TierId nextTier = tid; // TODO - calculate this based on some admission policy + while (++nextTier < getNumTiers()) { // try to evict down to the next memory tier + // always evict item from the nextTier to make room for new item + bool evict = true; + + // allocateInternal might trigger another eviction + WriteHandle newItemHdl{}; + Item* parentItem; + bool chainedItem = false; + if (item.isChainedItem()) { + chainedItem = true; + parentItem = &item.asChainedItem().getParentItem(compressor_); + XDCHECK(parentItem->isMoving()); + XDCHECK(item.isChainedItem() && item.getRefCount() == 1); + XDCHECK_EQ(0, parentItem->getRefCount()); + newItemHdl = allocateChainedItemInternalTier(*parentItem, + item.getSize(), + nextTier); + } else { + // this assert can fail if parent changed + XDCHECK(item.isMoving()); + XDCHECK(item.getRefCount() == 0); + newItemHdl = allocateInternalTier(nextTier, pid, + item.getKey(), + item.getSize(), + item.getCreationTime(), + item.getExpiryTime(), + fromBgThread, + evict); + } + + if (newItemHdl) { + XDCHECK_EQ(newItemHdl->getSize(), item.getSize()); + bool moveSuccess = chainedItem + ? moveChainedItem(item.asChainedItem(), + newItemHdl, *parentItem) + : moveRegularItem(item, newItemHdl); + if (!moveSuccess) { + return WriteHandle{}; + } + if (!chainedItem) { + XDCHECK_EQ(newItemHdl->getKey(),item.getKey()); + item.unmarkMoving(); + } + return newItemHdl; + } else { + return WriteHandle{}; + } + } + + return {}; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::tryEvictToNextMemoryTier(Item& item, bool fromBgThread) { + auto tid = getTierId(item); + auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId; + return tryEvictToNextMemoryTier(tid, pid, item, fromBgThread); +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::tryPromoteToNextMemoryTier( + TierId tid, PoolId pid, Item& item, bool fromBgThread) { + if (item.isExpired()) { return {}; } + TierId nextTier = tid; + while (nextTier > 0) { // try to promote up to the next memory tier + auto toPromoteTier = nextTier - 1; + --nextTier; + + // always evict item from the toPromoteTier to make room for new item + bool evict = true; + + // allocateInternal might trigger another eviction + auto newItemHdl = allocateInternalTier(toPromoteTier, pid, + item.getKey(), + item.getSize(), + item.getCreationTime(), + item.getExpiryTime(), + fromBgThread, + true); + + if (newItemHdl) { + XDCHECK_EQ(newItemHdl->getSize(), item.getSize()); + if (!moveRegularItem(item, newItemHdl)) { + return WriteHandle{}; + } + item.unmarkMoving(); + return newItemHdl; + } else { + return WriteHandle{}; + } + } + + return {}; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::tryPromoteToNextMemoryTier(Item& item, bool fromBgThread) { + auto tid = getTierId(item); + auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId; + return tryPromoteToNextMemoryTier(tid, pid, item, fromBgThread); +} + template typename CacheAllocator::RemoveRes CacheAllocator::remove(typename Item::Key key) { @@ -1631,21 +2120,57 @@ void CacheAllocator::invalidateNvm(Item& item) { } } +template +TierId +CacheAllocator::getTierId(const Item& item) const { + return 
getTierId(item.getMemory()); +} + +template +TierId +CacheAllocator::getTierId(const void* ptr) const { + for (TierId tid = 0; tid < getNumTiers(); tid++) { + if (allocator_[tid]->isMemoryInAllocator(ptr)) + return tid; + } + + throw std::invalid_argument("Item does not belong to any tier!"); +} + template typename CacheAllocator::MMContainer& CacheAllocator::getMMContainer(const Item& item) const noexcept { + const auto tid = getTierId(item); const auto allocInfo = - allocator_->getAllocInfo(static_cast(&item)); - return getMMContainer(allocInfo.poolId, allocInfo.classId); + allocator_[tid]->getAllocInfo(static_cast(&item)); + return getMMContainer(tid, allocInfo.poolId, allocInfo.classId); } template typename CacheAllocator::MMContainer& -CacheAllocator::getMMContainer(PoolId pid, +CacheAllocator::getMMContainer(TierId tid, + PoolId pid, ClassId cid) const noexcept { - XDCHECK_LT(static_cast(pid), mmContainers_.size()); - XDCHECK_LT(static_cast(cid), mmContainers_[pid].size()); - return *mmContainers_[pid][cid]; + XDCHECK_LT(static_cast(tid), mmContainers_.size()); + XDCHECK_LT(static_cast(pid), mmContainers_[tid].size()); + XDCHECK_LT(static_cast(cid), mmContainers_[tid][pid].size()); + return *mmContainers_[tid][pid][cid]; +} + +template +MMContainerStat CacheAllocator::getMMContainerStat( + TierId tid, PoolId pid, ClassId cid) const noexcept { + if(static_cast(tid) >= mmContainers_.size()) { + return MMContainerStat{}; + } + if (static_cast(pid) >= mmContainers_[tid].size()) { + return MMContainerStat{}; + } + if (static_cast(cid) >= mmContainers_[tid][pid].size()) { + return MMContainerStat{}; + } + return mmContainers_[tid][pid][cid] ? mmContainers_[tid][pid][cid]->getStats() + : MMContainerStat{}; } template @@ -1839,23 +2364,25 @@ void CacheAllocator::markUseful(const ReadHandle& handle, template bool CacheAllocator::recordAccessInMMContainer(Item& item, AccessMode mode) { + const auto tid = getTierId(item); const auto allocInfo = - allocator_->getAllocInfo(static_cast(&item)); - (*stats_.cacheHits)[allocInfo.poolId][allocInfo.classId].inc(); + allocator_[tid]->getAllocInfo(static_cast(&item)); + (*stats_.cacheHits)[tid][allocInfo.poolId][allocInfo.classId].inc(); // track recently accessed items if needed if (UNLIKELY(config_.trackRecentItemsForDump)) { ring_->trackItem(reinterpret_cast(&item), item.getSize()); } - auto& mmContainer = getMMContainer(allocInfo.poolId, allocInfo.classId); + auto& mmContainer = getMMContainer(tid, allocInfo.poolId, allocInfo.classId); return mmContainer.recordAccess(item, mode); } template uint32_t CacheAllocator::getUsableSize(const Item& item) const { + const auto tid = getTierId(item); const auto allocSize = - allocator_->getAllocInfo(static_cast(&item)).allocSize; + allocator_[tid]->getAllocInfo(static_cast(&item)).allocSize; return item.isChainedItem() ? allocSize - ChainedItem::getRequiredSize(0) : allocSize - Item::getRequiredSize(item.getKey(), 0); @@ -1864,8 +2391,10 @@ uint32_t CacheAllocator::getUsableSize(const Item& item) const { template typename CacheAllocator::SampleItem CacheAllocator::getSampleItem() { - size_t nvmCacheSize = nvmCache_ ? nvmCache_->getUsableSize() : 0; - size_t ramCacheSize = allocator_->getMemorySizeInclAdvised(); + // TODO: is using random tier a good idea? + auto tid = folly::Random::rand32() % getNumTiers(); + static size_t nvmCacheSize = nvmCache_ ? 
nvmCache_->getUsableSize() : 0; + static size_t ramCacheSize = allocator_[tid]->getMemorySizeInclAdvised(); bool fromNvm = folly::Random::rand64(0, nvmCacheSize + ramCacheSize) >= ramCacheSize; @@ -1874,19 +2403,18 @@ CacheAllocator::getSampleItem() { } // Sampling from DRAM cache - auto item = reinterpret_cast(allocator_->getRandomAlloc()); + auto item = reinterpret_cast(allocator_[tid]->getRandomAlloc()); if (!item) { return SampleItem{false /* fromNvm */}; } // Check that item returned is the same that was sampled - auto sharedHdl = std::make_shared(findInternal(item->getKey())); if (sharedHdl->get() != item) { return SampleItem{false /* fromNvm */}; } - const auto allocInfo = allocator_->getAllocInfo(item->getMemory()); + const auto allocInfo = allocator_[tid]->getAllocInfo(item->getMemory()); // Convert the Item to IOBuf to make SampleItem auto iobuf = folly::IOBuf{ @@ -1905,28 +2433,33 @@ CacheAllocator::getSampleItem() { template std::vector CacheAllocator::dumpEvictionIterator( - PoolId pid, ClassId cid, size_t numItems) { + PoolId pid, ClassId cid, size_t numItems) { if (numItems == 0) { return {}; } - if (static_cast(pid) >= mmContainers_.size() || - static_cast(cid) >= mmContainers_[pid].size()) { + // Always evict from the lowest layer. + int tid = getNumTiers() - 1; + + if (static_cast(tid) >= mmContainers_.size() || + static_cast(pid) >= mmContainers_[tid].size() || + static_cast(cid) >= mmContainers_[tid][pid].size()) { throw std::invalid_argument( - folly::sformat("Invalid PoolId: {} and ClassId: {}.", pid, cid)); + folly::sformat("Invalid TierId: {} and PoolId: {} and ClassId: {}.", tid, pid, cid)); } std::vector content; - auto& mm = *mmContainers_[pid][cid]; - auto evictItr = mm.getEvictionIterator(); - size_t i = 0; - while (evictItr && i < numItems) { - content.push_back(evictItr->toString()); - ++evictItr; - ++i; + while (tid >= 0) { + auto& mm = *mmContainers_[tid][pid][cid]; + mm.withEvictionIterator([&content, numItems](auto&& itr) { + while (itr && content.size() < numItems) { + content.push_back(itr->toString()); + ++itr; + } + }); + --tid; } - return content; } @@ -2102,19 +2635,50 @@ PoolId CacheAllocator::addPool( std::shared_ptr resizeStrategy, bool ensureProvisionable) { folly::SharedMutex::WriteHolder w(poolsResizeAndRebalanceLock_); - auto pid = allocator_->addPool(name, size, allocSizes, ensureProvisionable); + + PoolId pid = 0; + size_t totalCacheSize = 0; + + for (TierId tid = 0; tid < getNumTiers(); tid++) { + totalCacheSize += allocator_[tid]->getMemorySize(); + } + + for (TierId tid = 0; tid < getNumTiers(); tid++) { + auto tierSizeRatio = + static_cast(allocator_[tid]->getMemorySize()) / totalCacheSize; + size_t tierPoolSize = static_cast(tierSizeRatio * size); + + // TODO: what if we manage to add pool only in one tier? 
+ // we should probably remove that on failure + auto res = allocator_[tid]->addPool( + name, tierPoolSize, allocSizes, ensureProvisionable); + XDCHECK(tid == 0 || res == pid); + pid = res; + } + createMMContainers(pid, std::move(config)); setRebalanceStrategy(pid, std::move(rebalanceStrategy)); setResizeStrategy(pid, std::move(resizeStrategy)); + + if (backgroundEvictor_.size()) { + for (size_t id = 0; id < backgroundEvictor_.size(); id++) + backgroundEvictor_[id]->setAssignedMemory(getAssignedMemoryToBgWorker(id, backgroundEvictor_.size(), 0)); + } + + if (backgroundPromoter_.size()) { + for (size_t id = 0; id < backgroundPromoter_.size(); id++) + backgroundPromoter_[id]->setAssignedMemory(getAssignedMemoryToBgWorker(id, backgroundPromoter_.size(), 1)); + } + return pid; } template void CacheAllocator::overridePoolRebalanceStrategy( PoolId pid, std::shared_ptr rebalanceStrategy) { - if (static_cast(pid) >= mmContainers_.size()) { + if (static_cast(pid) >= mmContainers_[0].size()) { throw std::invalid_argument(folly::sformat( - "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[0].size())); } setRebalanceStrategy(pid, std::move(rebalanceStrategy)); } @@ -2122,9 +2686,9 @@ void CacheAllocator::overridePoolRebalanceStrategy( template void CacheAllocator::overridePoolResizeStrategy( PoolId pid, std::shared_ptr resizeStrategy) { - if (static_cast(pid) >= mmContainers_.size()) { + if (static_cast(pid) >= mmContainers_[0].size()) { throw std::invalid_argument(folly::sformat( - "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[0].size())); } setResizeStrategy(pid, std::move(resizeStrategy)); } @@ -2136,14 +2700,14 @@ void CacheAllocator::overridePoolOptimizeStrategy( } template -void CacheAllocator::overridePoolConfig(PoolId pid, +void CacheAllocator::overridePoolConfig(TierId tid, PoolId pid, const MMConfig& config) { - if (static_cast(pid) >= mmContainers_.size()) { + // TODO: add generic tier id checking + if (static_cast(pid) >= mmContainers_[tid].size()) { throw std::invalid_argument(folly::sformat( - "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[tid].size())); } - - auto& pool = allocator_->getPool(pid); + auto& pool = allocator_[tid]->getPool(pid); for (unsigned int cid = 0; cid < pool.getNumClassId(); ++cid) { MMConfig mmConfig = config; mmConfig.addExtraConfig( @@ -2151,29 +2715,35 @@ void CacheAllocator::overridePoolConfig(PoolId pid, ? pool.getAllocationClass(static_cast(cid)) .getAllocsPerSlab() : 0); - DCHECK_NOTNULL(mmContainers_[pid][cid].get()); - mmContainers_[pid][cid]->setConfig(mmConfig); + DCHECK_NOTNULL(mmContainers_[tid][pid][cid].get()); + mmContainers_[tid][pid][cid]->setConfig(mmConfig); } } template void CacheAllocator::createMMContainers(const PoolId pid, MMConfig config) { - auto& pool = allocator_->getPool(pid); + // pools on each layer should have the same number of class id, etc. + // TODO: think about deduplication + auto& pool = allocator_[0]->getPool(pid); + for (unsigned int cid = 0; cid < pool.getNumClassId(); ++cid) { config.addExtraConfig( config_.trackTailHits ? 
pool.getAllocationClass(static_cast(cid)) .getAllocsPerSlab() : 0); - mmContainers_[pid][cid].reset(new MMContainer(config, compressor_)); + for (TierId tid = 0; tid < getNumTiers(); tid++) { + mmContainers_[tid][pid][cid].reset(new MMContainer(config, compressor_)); + } } } template PoolId CacheAllocator::getPoolId( folly::StringPiece name) const noexcept { - return allocator_->getPoolId(name.str()); + // each tier has the same pools + return allocator_[0]->getPoolId(name.str()); } // The Function returns a consolidated vector of Release Slab @@ -2216,7 +2786,9 @@ std::set CacheAllocator::filterCompactCachePools( template std::set CacheAllocator::getRegularPoolIds() const { folly::SharedMutex::ReadHolder r(poolsResizeAndRebalanceLock_); - return filterCompactCachePools(allocator_->getPoolIds()); + // TODO - get rid of the duplication - right now, each tier + // holds pool objects with mostly the same info + return filterCompactCachePools(allocator_[0]->getPoolIds()); } template @@ -2241,10 +2813,9 @@ std::set CacheAllocator::getRegularPoolIdsForResize() // getAdvisedMemorySize - then pools may be overLimit even when // all slabs are not allocated. Otherwise, pools may be overLimit // only after all slabs are allocated. - // - return (allocator_->allSlabsAllocated()) || - (allocator_->getAdvisedMemorySize() != 0) - ? filterCompactCachePools(allocator_->getPoolsOverLimit()) + return (allocator_[currentTier()]->allSlabsAllocated()) || + (allocator_[currentTier()]->getAdvisedMemorySize() != 0) + ? filterCompactCachePools(allocator_[currentTier()]->getPoolsOverLimit()) : std::set{}; } @@ -2253,9 +2824,89 @@ const std::string CacheAllocator::getCacheName() const { return config_.cacheName; } +template +size_t CacheAllocator::getPoolSize(PoolId poolId) const { + size_t poolSize = 0; + for (auto& allocator : allocator_) { + const auto& pool = allocator->getPool(poolId); + poolSize += pool.getPoolSize(); + } + return poolSize; +} + template PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { - const auto& pool = allocator_->getPool(poolId); + // this pool ref is just used to get class ids, which will be the + // same across tiers + const auto& pool = allocator_[currentTier()]->getPool(poolId); + const auto& allocSizes = pool.getAllocSizes(); + auto mpStats = pool.getStats(); + const auto& classIds = mpStats.classIds; + + // check if this is a compact cache. + bool isCompactCache = false; + { + folly::SharedMutex::ReadHolder lock(compactCachePoolsLock_); + isCompactCache = isCompactCachePool_[poolId]; + } + + std::unordered_map cacheStats; + uint64_t totalHits = 0; + // cacheStats is only meaningful for pools that are not compact caches. + // TODO export evictions, numItems etc from compact cache directly. 
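+ // Every tier shares the same pool/class layout, so the per-class counters
+ // below can simply be summed across tiers into a single aggregated view.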
+ if (!isCompactCache) { + for (const ClassId cid : classIds) { + uint64_t allocAttempts = 0, evictionAttempts = 0, allocFailures = 0, + fragmentationSize = 0, classHits = 0, chainedItemEvictions = 0, + regularItemEvictions = 0, numWritebacks = 0; + MMContainerStat mmContainerStats; + for (TierId tid = 0; tid < getNumTiers(); tid++) { + allocAttempts += (*stats_.allocAttempts)[tid][poolId][cid].get(); + evictionAttempts += (*stats_.evictionAttempts)[tid][poolId][cid].get(); + allocFailures += (*stats_.allocFailures)[tid][poolId][cid].get(); + fragmentationSize += (*stats_.fragmentationSize)[tid][poolId][cid].get(); + classHits += (*stats_.cacheHits)[tid][poolId][cid].get(); + chainedItemEvictions += (*stats_.chainedItemEvictions)[tid][poolId][cid].get(); + regularItemEvictions += (*stats_.regularItemEvictions)[tid][poolId][cid].get(); + numWritebacks += (*stats_.numWritebacks)[tid][poolId][cid].get(); + mmContainerStats += getMMContainerStat(tid, poolId, cid); + XDCHECK(mmContainers_[tid][poolId][cid], + folly::sformat("Tid {}, Pid {}, Cid {} not initialized.", tid, poolId, cid)); + } + cacheStats.insert( + {cid, + {allocSizes[cid], + allocAttempts, + evictionAttempts, + allocFailures, + fragmentationSize, + classHits, + chainedItemEvictions, + regularItemEvictions, + numWritebacks, + mmContainerStats}}); + totalHits += classHits; + } + } + + PoolStats ret; + ret.isCompactCache = isCompactCache; + //pool name is also shared among tiers + ret.poolName = allocator_[currentTier()]->getPoolName(poolId); + ret.poolSize = pool.getPoolSize(); + ret.poolUsableSize = pool.getPoolUsableSize(); + ret.poolAdvisedSize = pool.getPoolAdvisedSize(); + ret.cacheStats = std::move(cacheStats); + ret.mpStats = std::move(mpStats); + ret.numPoolGetHits = totalHits; + ret.evictionAgeSecs = stats_.perPoolEvictionAgeSecs_[poolId].estimate(); + + return ret; +} + +template +PoolStats CacheAllocator::getPoolStats(TierId tid, PoolId poolId) const { + const auto& pool = allocator_[tid]->getPool(poolId); const auto& allocSizes = pool.getAllocSizes(); auto mpStats = pool.getStats(); const auto& classIds = mpStats.classIds; @@ -2273,27 +2924,28 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { // TODO export evictions, numItems etc from compact cache directly. 
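+ // Unlike the aggregating overload above, this per-tier variant reads the
+ // [tid] slice of each counter directly.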
if (!isCompactCache) { for (const ClassId cid : classIds) { - uint64_t classHits = (*stats_.cacheHits)[poolId][cid].get(); - XDCHECK(mmContainers_[poolId][cid], - folly::sformat("Pid {}, Cid {} not initialized.", poolId, cid)); + uint64_t classHits = (*stats_.cacheHits)[tid][poolId][cid].get(); + XDCHECK(mmContainers_[tid][poolId][cid], + folly::sformat("Tid {}, Pid {}, Cid {} not initialized.", tid, poolId, cid)); cacheStats.insert( {cid, - {allocSizes[cid], (*stats_.allocAttempts)[poolId][cid].get(), - (*stats_.evictionAttempts)[poolId][cid].get(), - (*stats_.allocFailures)[poolId][cid].get(), - (*stats_.fragmentationSize)[poolId][cid].get(), classHits, - (*stats_.chainedItemEvictions)[poolId][cid].get(), - (*stats_.regularItemEvictions)[poolId][cid].get(), - mmContainers_[poolId][cid]->getStats()} - - }); + {allocSizes[cid], + (*stats_.allocAttempts)[tid][poolId][cid].get(), + (*stats_.evictionAttempts)[tid][poolId][cid].get(), + (*stats_.allocFailures)[tid][poolId][cid].get(), + (*stats_.fragmentationSize)[tid][poolId][cid].get(), + classHits, + (*stats_.chainedItemEvictions)[tid][poolId][cid].get(), + (*stats_.regularItemEvictions)[tid][poolId][cid].get(), + (*stats_.numWritebacks)[tid][poolId][cid].get(), + getMMContainerStat(tid, poolId, cid)}}); totalHits += classHits; } } PoolStats ret; ret.isCompactCache = isCompactCache; - ret.poolName = allocator_->getPoolName(poolId); + ret.poolName = allocator_[tid]->getPoolName(poolId); ret.poolSize = pool.getPoolSize(); ret.poolUsableSize = pool.getPoolUsableSize(); ret.poolAdvisedSize = pool.getPoolAdvisedSize(); @@ -2305,22 +2957,32 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { return ret; } +template +ACStats CacheAllocator::getACStats(TierId tid, + PoolId poolId, + ClassId classId) const { + const auto& pool = allocator_[tid]->getPool(poolId); + const auto& ac = pool.getAllocationClass(classId); + + auto stats = ac.getStats(); + stats.allocLatencyNs = (*stats_.classAllocLatency)[tid][poolId][classId]; + return stats; +} + template PoolEvictionAgeStats CacheAllocator::getPoolEvictionAgeStats( PoolId pid, unsigned int slabProjectionLength) const { PoolEvictionAgeStats stats; - - const auto& pool = allocator_->getPool(pid); + const auto& pool = allocator_[currentTier()]->getPool(pid); const auto& allocSizes = pool.getAllocSizes(); for (ClassId cid = 0; cid < static_cast(allocSizes.size()); ++cid) { - auto& mmContainer = getMMContainer(pid, cid); + auto& mmContainer = getMMContainer(currentTier(), pid, cid); const auto numItemsPerSlab = - allocator_->getPool(pid).getAllocationClass(cid).getAllocsPerSlab(); + allocator_[currentTier()]->getPool(pid).getAllocationClass(cid).getAllocsPerSlab(); const auto projectionLength = numItemsPerSlab * slabProjectionLength; stats.classEvictionAgeStats[cid] = mmContainer.getEvictionAgeStat(projectionLength); } - return stats; } @@ -2359,7 +3021,7 @@ void CacheAllocator::releaseSlab(PoolId pid, } try { - auto releaseContext = allocator_->startSlabRelease( + auto releaseContext = allocator_[currentTier()]->startSlabRelease( pid, victim, receiver, mode, hint, [this]() -> bool { return shutDownInProgress_; }); @@ -2368,15 +3030,15 @@ void CacheAllocator::releaseSlab(PoolId pid, return; } - releaseSlabImpl(releaseContext); - if (!allocator_->allAllocsFreed(releaseContext)) { + releaseSlabImpl(currentTier(), releaseContext); + if (!allocator_[currentTier()]->allAllocsFreed(releaseContext)) { throw std::runtime_error( folly::sformat("Was not able to free all allocs. 
PoolId: {}, AC: {}", releaseContext.getPoolId(), releaseContext.getClassId())); } - allocator_->completeSlabRelease(releaseContext); + allocator_[currentTier()]->completeSlabRelease(releaseContext); } catch (const exception::SlabReleaseAborted& e) { stats_.numAbortedSlabReleases.inc(); throw exception::SlabReleaseAborted(folly::sformat( @@ -2387,8 +3049,7 @@ void CacheAllocator::releaseSlab(PoolId pid, } template -SlabReleaseStats CacheAllocator::getSlabReleaseStats() - const noexcept { +SlabReleaseStats CacheAllocator::getSlabReleaseStats() const noexcept { std::lock_guard l(workersMutex_); return SlabReleaseStats{stats_.numActiveSlabReleases.get(), stats_.numReleasedForRebalance.get(), @@ -2406,7 +3067,7 @@ SlabReleaseStats CacheAllocator::getSlabReleaseStats() } template -void CacheAllocator::releaseSlabImpl( +void CacheAllocator::releaseSlabImpl(TierId tid, const SlabReleaseContext& releaseContext) { auto startTime = std::chrono::milliseconds(util::getCurrentTimeMs()); bool releaseStuck = false; @@ -2434,6 +3095,8 @@ void CacheAllocator::releaseSlabImpl( // 3. If 2 is successful, Move or Evict // 4. Move on to the next item if current item is freed for (auto alloc : releaseContext.getActiveAllocations()) { + Item& item = *static_cast(alloc); + // Need to mark an item for release before proceeding // If we can't mark as moving, it means the item is already freed const bool isAlreadyFreed = @@ -2442,8 +3105,6 @@ void CacheAllocator::releaseSlabImpl( continue; } - Item& item = *static_cast(alloc); - // Try to move this item and make sure we can free the memory const bool isMoved = moveForSlabRelease(releaseContext, item, throttler); @@ -2451,7 +3112,7 @@ void CacheAllocator::releaseSlabImpl( if (!isMoved) { evictForSlabRelease(releaseContext, item, throttler); } - XDCHECK(allocator_->isAllocFreed(releaseContext, alloc)); + XDCHECK(allocator_[tid]->isAllocFreed(releaseContext, alloc)); } } @@ -2465,34 +3126,64 @@ void CacheAllocator::throttleWith(util::Throttler& t, } } +template +typename RefcountWithFlags::Value CacheAllocator::unmarkMovingAndWakeUpWaiters(Item &item, WriteHandle handle) +{ + auto ret = item.unmarkMoving(); + wakeUpWaiters(item, std::move(handle)); + return ret; +} + template bool CacheAllocator::moveForSlabRelease( const SlabReleaseContext& ctx, Item& oldItem, util::Throttler& throttler) { + if (!config_.moveCb) { - return false; + return false; } - bool isMoved = false; auto startTime = util::getCurrentTimeSec(); - WriteHandle newItemHdl = allocateNewItemForOldItem(oldItem); + WriteHandle newItemHdl{}; + Item *parentItem; + bool chainedItem = oldItem.isChainedItem(); for (unsigned int itemMovingAttempts = 0; itemMovingAttempts < config_.movingTries; ++itemMovingAttempts) { stats_.numMoveAttempts.inc(); - // Nothing to move and the key is likely also bogus for chained items. + // Nothing to move - in the case that tryMoving failed + // for chained items we would have already evicted the entire chain. if (oldItem.isOnlyMoving()) { - oldItem.unmarkMoving(); + XDCHECK(!oldItem.isChainedItem()); + auto ret = unmarkMovingAndWakeUpWaiters(oldItem, {}); + XDCHECK(ret == 0); const auto res = releaseBackToAllocator(oldItem, RemoveContext::kNormal, false); XDCHECK(res == ReleaseRes::kReleased); return true; } + throttleWith(throttler, [&] { + XLOGF(WARN, + "Spent {} seconds, slab release still trying to move Item: {}. 
" + "Pool: {}, Class: {}.", + util::getCurrentTimeSec() - startTime, oldItem.toString(), + ctx.getPoolId(), ctx.getClassId()); + }); + if (!newItemHdl) { - // try to allocate again if it previously wasn't successful - newItemHdl = allocateNewItemForOldItem(oldItem); + if (chainedItem) { + parentItem = &oldItem.asChainedItem().getParentItem(compressor_); + XDCHECK(parentItem->isMoving()); + XDCHECK(oldItem.isChainedItem() && oldItem.getRefCount() == 1); + XDCHECK_EQ(0, parentItem->getRefCount()); + newItemHdl = + allocateChainedItemInternalTier(*parentItem, oldItem.getSize(), getTierId(oldItem)); + } else { + XDCHECK(oldItem.isMoving()); + newItemHdl = allocateNewItemForOldItem(oldItem); + } } // if we have a valid handle, try to move, if not, we retry. @@ -2502,14 +3193,6 @@ bool CacheAllocator::moveForSlabRelease( break; } } - - throttleWith(throttler, [&] { - XLOGF(WARN, - "Spent {} seconds, slab release still trying to move Item: {}. " - "Pool: {}, Class: {}.", - util::getCurrentTimeSec() - startTime, oldItem.toString(), - ctx.getPoolId(), ctx.getClassId()); - }); } // Return false if we've exhausted moving tries. @@ -2522,7 +3205,7 @@ bool CacheAllocator::moveForSlabRelease( // that's identical to this one to replace it. Here we just need to wait // until all users have dropped the item handles before we can proceed. startTime = util::getCurrentTimeSec(); - while (!oldItem.isOnlyMoving()) { + while (!chainedItem && !oldItem.isOnlyMoving()) { throttleWith(throttler, [&] { XLOGF(WARN, "Spent {} seconds, slab release still waiting for refcount to " @@ -2531,20 +3214,41 @@ bool CacheAllocator::moveForSlabRelease( ctx.getPoolId(), ctx.getClassId()); }); } - const auto allocInfo = allocator_->getAllocInfo(oldItem.getMemory()); - allocator_->free(&oldItem); - - (*stats_.fragmentationSize)[allocInfo.poolId][allocInfo.classId].sub( + auto tid = getTierId(oldItem); + const auto allocInfo = allocator_[tid]->getAllocInfo(oldItem.getMemory()); + if (chainedItem) { + newItemHdl.reset(); + auto ref = parentItem->unmarkMoving(); + if (UNLIKELY(ref == 0)) { + wakeUpWaiters(*parentItem,{}); + const auto res = + releaseBackToAllocator(*parentItem, RemoveContext::kNormal, false); + XDCHECK(res == ReleaseRes::kReleased); + return true; + } else { + XDCHECK_NE(ref,0); + auto parentHdl = acquire(parentItem); + if (parentHdl) { + wakeUpWaiters(*parentItem,std::move(parentHdl)); + } + } + } else { + auto ref = unmarkMovingAndWakeUpWaiters(oldItem, std::move(newItemHdl)); + XDCHECK(ref == 0); + } + allocator_[tid]->free(&oldItem); + (*stats_.fragmentationSize)[tid][allocInfo.poolId][allocInfo.classId].sub( util::getFragmentation(*this, oldItem)); stats_.numMoveSuccesses.inc(); return true; + } template -typename CacheAllocator::ReadHandle +typename CacheAllocator::WriteHandle CacheAllocator::validateAndGetParentHandleForChainedMoveLocked( const ChainedItem& item, const Key& parentKey) { - ReadHandle parentHandle{}; + WriteHandle parentHandle{}; try { parentHandle = findInternal(parentKey); // If the parent is not the same as the parent of the chained item, @@ -2576,33 +3280,38 @@ CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { return {}; } - // Set up the destination for the move. Since oldChainedItem would be - // marked as moving, it won't be picked for eviction. + // Set up the destination for the move. Since oldChainedItem would + // be marked as moving, it won't be picked for eviction. 
auto newItemHdl = - allocateChainedItemInternal(parentHandle, oldChainedItem.getSize()); + allocateChainedItemInternal(parentHandle, oldItem.getSize()); if (!newItemHdl) { return {}; } - XDCHECK_EQ(newItemHdl->getSize(), oldChainedItem.getSize()); + XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize()); auto parentPtr = parentHandle.getInternal(); XDCHECK_EQ(reinterpret_cast(parentPtr), reinterpret_cast( - &oldChainedItem.getParentItem(compressor_))); + &newItemHdl->asChainedItem().getParentItem(compressor_))); return newItemHdl; } const auto allocInfo = - allocator_->getAllocInfo(static_cast(&oldItem)); + allocator_[getTierId(oldItem)]->getAllocInfo(static_cast(&oldItem)); + + bool evict = !config_.insertToFirstFreeTier || getTierId(oldItem) == getNumTiers() - 1; // Set up the destination for the move. Since oldItem would have the moving // bit set, it won't be picked for eviction. - auto newItemHdl = allocateInternal(allocInfo.poolId, - oldItem.getKey(), - oldItem.getSize(), - oldItem.getCreationTime(), - oldItem.getExpiryTime()); + auto newItemHdl = allocateInternalTier(getTierId(oldItem), + allocInfo.poolId, + oldItem.getKey(), + oldItem.getSize(), + oldItem.getCreationTime(), + oldItem.getExpiryTime(), + false, + evict); if (!newItemHdl) { return {}; } @@ -2617,100 +3326,59 @@ CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { template bool CacheAllocator::tryMovingForSlabRelease( Item& oldItem, WriteHandle& newItemHdl) { - // By holding onto a user-level synchronization object, we ensure moving - // a regular item or chained item is synchronized with any potential - // user-side mutation. - std::unique_ptr syncObj; - if (config_.movingSync) { - if (!oldItem.isChainedItem()) { - syncObj = config_.movingSync(oldItem.getKey()); - } else { - // Copy the key so we have a valid key to work with if the chained - // item is still valid. - const std::string parentKey = - oldItem.asChainedItem().getParentItem(compressor_).getKey().str(); - if (oldItem.isOnlyMoving()) { - // If chained item no longer has a refcount, its parent is already - // being released, so we abort this try to moving. - return false; + std::unique_ptr syncObj; + if (config_.movingSync) { + if (!oldItem.isChainedItem()) { + syncObj = config_.movingSync(oldItem.getKey()); + } else { + // Copy the key so we have a valid key to work with if the chained + // item is still valid. + const std::string parentKey = + oldItem.asChainedItem().getParentItem(compressor_).getKey().str(); + syncObj = config_.movingSync(parentKey); + } + if (syncObj && !syncObj->isValid()) { + return false; } - syncObj = config_.movingSync(parentKey); - } - // We need to differentiate between the following three scenarios: - // 1. nullptr indicates no move sync required for this particular item - // 2. moveSync.isValid() == true meaning we've obtained the sync - // 3. moveSync.isValid() == false meaning we need to abort and retry - if (syncObj && !syncObj->isValid()) { - return false; - } - } + } + //move can fail if another thread calls insertOrReplace + //in this case oldItem is no longer valid (not accessible, + //it gets removed from MMContainer and evictForSlabRelease + //will send it back to the allocator + bool ret = oldItem.isChainedItem() + ? moveChainedItem(oldItem.asChainedItem(), newItemHdl, + oldItem.asChainedItem().getParentItem(compressor_)) + : moveRegularItem(oldItem, newItemHdl); + removeFromMMContainer(oldItem); + return ret; +} - return oldItem.isChainedItem() - ? 
moveChainedItem(oldItem.asChainedItem(), newItemHdl) - : moveRegularItem(oldItem, newItemHdl); +template +void CacheAllocator::wakeUpWaiters(Item& item, WriteHandle handle) +{ + wakeUpWaitersLocked(item.getKey(), std::move(handle)); } template void CacheAllocator::evictForSlabRelease( const SlabReleaseContext& ctx, Item& item, util::Throttler& throttler) { auto startTime = util::getCurrentTimeSec(); + while (true) { + //we can't rely on an item being marked moving because + //it may have previously been a chained item stats_.numEvictionAttempts.inc(); - // if the item is already in a state where only the exclusive bit is set, - // nothing needs to be done. We simply need to call unmarkMoving and free - // the item. - if (item.isOnlyMoving()) { - item.unmarkMoving(); - const auto res = - releaseBackToAllocator(item, RemoveContext::kNormal, false); - XDCHECK(ReleaseRes::kReleased == res); - return; - } - - // Since we couldn't move, we now evict this item. Owning handle will be - // the item's handle for regular/normal items and will be the parent - // handle for chained items. - auto owningHandle = - item.isChainedItem() - ? evictChainedItemForSlabRelease(item.asChainedItem()) - : evictNormalItemForSlabRelease(item); - - // we managed to evict the corresponding owner of the item and have the - // last handle for the owner. - if (owningHandle) { - const auto allocInfo = - allocator_->getAllocInfo(static_cast(&item)); - if (owningHandle->hasChainedItem()) { - (*stats_.chainedItemEvictions)[allocInfo.poolId][allocInfo.classId] - .inc(); - } else { - (*stats_.regularItemEvictions)[allocInfo.poolId][allocInfo.classId] - .inc(); - } - - stats_.numEvictionSuccesses.inc(); - - // we have the last handle. no longer need to hold on to the exclusive bit - item.unmarkMoving(); - - // manually decrement the refcount to call releaseBackToAllocator - const auto ref = decRef(*owningHandle); - XDCHECK(ref == 0); - const auto res = releaseBackToAllocator(*owningHandle.release(), - RemoveContext::kEviction, false); - XDCHECK(res == ReleaseRes::kReleased); - return; - } - if (shutDownInProgress_) { - item.unmarkMoving(); - allocator_->abortSlabRelease(ctx); - throw exception::SlabReleaseAborted( - folly::sformat("Slab Release aborted while trying to evict" - " Item: {} Pool: {}, Class: {}.", - item.toString(), ctx.getPoolId(), ctx.getClassId())); + if (item.isMoving()) { + auto ref = unmarkMovingAndWakeUpWaiters(item, {}); + allocator_[getTierId(item)]->abortSlabRelease(ctx); + throw exception::SlabReleaseAborted( + folly::sformat("Slab Release aborted while trying to evict" + " Item: {} Pool: {}, Class: {}.", + item.toString(), ctx.getPoolId(), ctx.getClassId())); + } } throttleWith(throttler, [&] { XLOGF(WARN, @@ -2725,146 +3393,93 @@ void CacheAllocator::evictForSlabRelease( .toString()) : ""); }); - } -} - -template -typename CacheAllocator::WriteHandle -CacheAllocator::evictNormalItemForSlabRelease(Item& item) { - XDCHECK(item.isMoving()); - - if (item.isOnlyMoving()) { - return WriteHandle{}; - } - - auto predicate = [](const Item& it) { return it.getRefCount() == 0; }; + // if the item is already in a state where only the exclusive bit is set, + // nothing needs to be done. We simply need to call unmarkMoving and free + // the item. 
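+ // isOnlyMoving() means the moving bit holds the last reference, so no
+ // other thread can resurrect the item and it can be freed directly.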
+ if (item.isOnlyMoving()) { + auto ref = unmarkMovingAndWakeUpWaiters(item, {}); + XDCHECK(ref == 0); + const auto res = + releaseBackToAllocator(item, RemoveContext::kNormal, false); + XDCHECK(ReleaseRes::kReleased == res); + return; + } - const bool evictToNvmCache = shouldWriteToNvmCache(item); - auto token = evictToNvmCache ? nvmCache_->createPutToken(item.getKey()) - : typename NvmCacheT::PutToken{}; + typename NvmCacheT::PutToken token; + bool isChainedItem = item.isChainedItem(); + Item* evicted; + Item *expectedParent = isChainedItem + ? &item.asChainedItem().getParentItem(compressor_) + : nullptr; + if (isChainedItem) { + XDCHECK(expectedParent->isMoving()); + XDCHECK_EQ(expectedParent,&item.asChainedItem().getParentItem(compressor_)); + if (expectedParent != &item.asChainedItem().getParentItem(compressor_)) { + XDCHECK_EQ(expectedParent,&item.asChainedItem().getParentItem(compressor_)); + throw std::runtime_error(folly::sformat( + "Slab release aborted while evicting " + "item {}", item.toString())); + } + evicted = expectedParent; + } else { + evicted = &item; + } + XDCHECK(evicted->isMoving()); + token = createPutToken(*evicted); + auto ret = evicted->markForEvictionWhenMoving(); + XDCHECK(ret); + // unmark the child so it will be freed + // TODO entire chain just gets evicted since moveForSlabRelease + // returns false + XDCHECK(!item.isMoving()); + unlinkItemForEviction(*evicted); + // wake up any readers that wait for the move to complete + // it's safe to do now, as we have the item marked exclusive and + // no other reader can be added to the waiters list + wakeUpWaiters(*evicted, {}); + + if (token.isValid() && shouldWriteToNvmCacheExclusive(*evicted)) { + nvmCache_->put(*evicted, std::move(token)); + } - // We remove the item from both access and mm containers. It doesn't matter - // if someone else calls remove on the item at this moment, the item cannot - // be freed as long as it's marked for eviction. - auto handle = accessContainer_->removeIf(item, std::move(predicate)); + const auto tid = getTierId(*evicted); + const auto allocInfo = + allocator_[tid]->getAllocInfo(static_cast(evicted)); + if (evicted->hasChainedItem()) { + (*stats_.chainedItemEvictions)[tid][allocInfo.poolId][allocInfo.classId].inc(); + } else { + (*stats_.regularItemEvictions)[tid][allocInfo.poolId][allocInfo.classId].inc(); + } - if (!handle) { - return handle; - } + stats_.numEvictionSuccesses.inc(); - XDCHECK_EQ(reinterpret_cast(handle.get()), - reinterpret_cast(&item)); - XDCHECK_EQ(1u, handle->getRefCount()); - removeFromMMContainer(item); + XDCHECK(evicted->getRefCount() == 0); + const auto res = + releaseBackToAllocator(*evicted, RemoveContext::kEviction, false); - // now that we are the only handle and we actually removed something from - // the RAM cache, we enqueue it to nvmcache. - if (evictToNvmCache && shouldWriteToNvmCacheExclusive(item)) { - nvmCache_->put(*handle, std::move(token)); + const bool isAlreadyFreed = + !markMovingForSlabRelease(ctx, &item, throttler); + if (!isAlreadyFreed) { + continue; + } + + return; } - - return handle; } template +template typename CacheAllocator::WriteHandle -CacheAllocator::evictChainedItemForSlabRelease(ChainedItem& child) { - XDCHECK(child.isMoving()); - - // We have the child marked as moving, but dont know anything about the - // state of the parent. Unlike the case of regular eviction where we are - // sure that the child is inside the MMContainer, ensuring its parent is - // valid, we can not make any assumptions here. 
We try to find the parent - // first through the access container and then verify that the parent's - // chain points to the child before cleaning up the parent. If the parent - // was in the process of being re-allocated or child was being removed - // concurrently, we would synchronize here on one of the checks. - Item& expectedParent = child.getParentItem(compressor_); - - // Grab exclusive lock since we are modifying the chain. at this point, we - // dont know the state of the parent. so we need to do some validity checks - // after we have the chained item lock to ensure that we got the lock off of - // a valid state. - const std::string parentKey = expectedParent.getKey().str(); - auto l = chainedItemLocks_.lockExclusive(parentKey); - - // check if the child is still in mmContainer and the expected parent is - // valid under the chained item lock. - if (expectedParent.getKey() != parentKey || !child.isInMMContainer() || - child.isOnlyMoving() || - &expectedParent != &child.getParentItem(compressor_) || - !expectedParent.isAccessible() || !expectedParent.hasChainedItem()) { - return {}; - } - - // search if the child is present in the chain - auto parentHandle = findInternal(parentKey); - if (!parentHandle || parentHandle != &expectedParent) { - return {}; - } - - ChainedItem* head = nullptr; - { // scope for the handle - auto headHandle = findChainedItem(expectedParent); - head = headHandle ? &headHandle->asChainedItem() : nullptr; - } - - bool found = false; - while (head) { - if (head == &child) { - found = true; - break; - } - head = head->getNext(compressor_); - } - - if (!found) { - return {}; - } - - // if we found the child in the parent's chain, we remove it and ensure that - // the handle we obtained was the last one. Before that, create a put token - // to guard any racing cache find to avoid item re-appearing in NvmCache. - const bool evictToNvmCache = shouldWriteToNvmCache(expectedParent); - - auto token = evictToNvmCache - ? nvmCache_->createPutToken(expectedParent.getKey()) - : typename NvmCacheT::PutToken{}; - - if (!accessContainer_->removeIf(expectedParent, - parentEvictForSlabReleasePredicate)) { - return {}; - } - - // at this point, we should be the last handle owner - XDCHECK_EQ(1u, parentHandle->getRefCount()); - - // We remove the parent from both access and mm containers. It doesn't - // matter if someone else calls remove on the parent at this moment, it - // cannot be freed since we hold an active item handle - removeFromMMContainer(*parentHandle); - - // In case someone else had removed this chained item from its parent by now - // So we check again to see if it has been unlinked from its parent - if (!child.isInMMContainer() || child.isOnlyMoving()) { - return {}; - } - - // check after removing from the MMContainer that the parent is still not - // being marked as moving. If parent is moving, it will release the child - // item and we will wait for that. - if (parentHandle->isMoving()) { - return {}; - } +CacheAllocator::removeIf(Item& item, Fn&& predicate) { + auto handle = accessContainer_->removeIf(item, std::forward(predicate)); - // now that we are the only handle and we actually removed something from - // the RAM cache, we enqueue it to nvmcache. 
- if (evictToNvmCache && shouldWriteToNvmCacheExclusive(*parentHandle)) { - DCHECK(parentHandle->hasChainedItem()); - nvmCache_->put(*parentHandle, std::move(token)); + if (handle) { + XDCHECK_EQ(reinterpret_cast(handle.get()), + reinterpret_cast(&item)); + removeFromMMContainer(item); } - return parentHandle; + return handle; } template @@ -2875,14 +3490,7 @@ bool CacheAllocator::removeIfExpired(const ReadHandle& handle) { // We remove the item from both access and mm containers. // We want to make sure the caller is the only one holding the handle. - auto removedHandle = - accessContainer_->removeIf(*(handle.getInternal()), itemExpiryPredicate); - if (removedHandle) { - removeFromMMContainer(*(handle.getInternal())); - return true; - } - - return false; + return (bool)removeIf(*(handle.getInternal()), itemExpiryPredicate); } template @@ -2900,25 +3508,81 @@ bool CacheAllocator::markMovingForSlabRelease( // At first, we assume this item was already freed bool itemFreed = true; + Item *syncItem = nullptr; bool markedMoving = false; - const auto fn = [&markedMoving, &itemFreed](void* memory) { + TierId tid = getTierId(alloc); + const auto fn = [this, tid, &syncItem, &markedMoving, &itemFreed](void* memory) { // Since this callback is executed, the item is not yet freed itemFreed = false; Item* item = static_cast(memory); - if (item->markMoving()) { - markedMoving = true; - } + auto allocInfo = allocator_[tid]->getAllocInfo(memory); + auto pid = allocInfo.poolId; + auto cid = allocInfo.classId; + auto& mmContainer = getMMContainer(tid, pid, cid); + mmContainer.withContainerLock([this, &mmContainer, + &syncItem, &item, &markedMoving]() { + //we rely on the mmContainer lock to safely check that the item is + //currently in the mmContainer (no other threads are currently allocating + //this item). This is needed to sync on the case where a chained item + //is being released back to allocator and it's parent ref could be + //invalid. We need a valid parent ref in order to mark a chained item + //as moving since we sync on the parent by marking it as moving. + if (!item->isInMMContainer()) { + return; + } + bool chainedItem_ = item->isChainedItem(); + XDCHECK_EQ(&getMMContainer(*item),&mmContainer); + XDCHECK_EQ(item->isChainedItem(),chainedItem_); + Item* syncItem_ = chainedItem_ + ? &item->asChainedItem().getParentItem(compressor_) + : item; + // in order to safely check if the expected parent (syncItem_) matches + // the current parent on the chained item, we need to take the chained + // item lock so we are sure that nobody else will be editing the chain + auto l_ = chainedItem_ + ? chainedItemLocks_.tryLockExclusive(syncItem_->getKey()) + : decltype(chainedItemLocks_.tryLockExclusive(syncItem_->getKey()))(); + + XDCHECK_EQ(item->isChainedItem(),chainedItem_); + if (chainedItem_ && + ( !l_ || &item->asChainedItem().getParentItem(compressor_) != syncItem_) ) { + markedMoving = false; + return; + } + if (syncItem_->markMoving()) { + markedMoving = true; + syncItem = syncItem_; + } + }); }; auto startTime = util::getCurrentTimeSec(); while (true) { - allocator_->processAllocForRelease(ctx, alloc, fn); + allocator_[tid]->processAllocForRelease(ctx, alloc, fn); // If item is already freed we give up trying to mark the item moving // and return false, otherwise if marked as moving, we return true. if (itemFreed) { return false; } else if (markedMoving) { + Item* item = static_cast(alloc); + XDCHECK(syncItem->isMoving()); + XDCHECK(item->isChainedItem() + ? 
item->asChainedItem().getParentItem(compressor_).isMoving() + : item->isMoving()) << item->toString() << "\n" << syncItem->toString(); + if ( ( item->isChainedItem() && + !item->asChainedItem().getParentItem(compressor_).isMoving() ) + || (!item->isChainedItem() && !item->isMoving()) ) { + throw std::runtime_error( + folly::sformat("Slab Release aborted - failed to mark" + " as moving for Item: {}. Pool: {}, Class: {}. Parent is {}", + item->toString(), ctx.getPoolId(), + ctx.getClassId(), + item->isChainedItem() + ? item->asChainedItem().getParentItem(compressor_).toString() + : "none")); + + } return true; } @@ -2927,13 +3591,14 @@ bool CacheAllocator::markMovingForSlabRelease( itemFreed = true; if (shutDownInProgress_) { - allocator_->abortSlabRelease(ctx); + allocator_[tid]->abortSlabRelease(ctx); throw exception::SlabReleaseAborted( folly::sformat("Slab Release aborted while still trying to mark" " as moving for Item: {}. Pool: {}, Class: {}.", static_cast(alloc)->toString(), ctx.getPoolId(), ctx.getClassId())); } + stats_.numMoveAttempts.inc(); throttleWith(throttler, [&] { XLOGF(WARN, "Spent {} seconds, slab release still trying to mark as moving for " @@ -2950,12 +3615,15 @@ template CCacheT* CacheAllocator::addCompactCache(folly::StringPiece name, size_t size, Args&&... args) { + if (getNumTiers() != 1) + throw std::runtime_error("TODO: compact cache for multi-tier Cache not supported."); + if (!config_.isCompactCacheEnabled()) { throw std::logic_error("Compact cache is not enabled"); } folly::SharedMutex::WriteHolder lock(compactCachePoolsLock_); - auto poolId = allocator_->addPool(name, size, {Slab::kSize}); + auto poolId = allocator_[0]->addPool(name, size, {Slab::kSize}); isCompactCachePool_[poolId] = true; auto ptr = std::make_unique( @@ -3051,8 +3719,13 @@ folly::IOBufQueue CacheAllocator::saveStateToIOBuf() { for (PoolId pid : pools) { for (unsigned int cid = 0; cid < (*stats_.fragmentationSize)[pid].size(); ++cid) { + uint64_t fragmentationSize = 0; + for (TierId tid = 0; tid < getNumTiers(); tid++) { + fragmentationSize += (*stats_.fragmentationSize)[tid][pid][cid].get(); + } metadata_.fragmentationSize()[pid][static_cast(cid)] = - (*stats_.fragmentationSize)[pid][cid].get(); + fragmentationSize; + } if (isCompactCachePool_[pid]) { metadata_.compactCachePools()->push_back(pid); @@ -3064,12 +3737,15 @@ folly::IOBufQueue CacheAllocator::saveStateToIOBuf() { *metadata_.numChainedChildItems() = stats_.numChainedChildItems.get(); *metadata_.numAbortedSlabReleases() = stats_.numAbortedSlabReleases.get(); + // TODO: implement serialization for multiple tiers auto serializeMMContainers = [](MMContainers& mmContainers) { MMSerializationTypeContainer state; - for (unsigned int i = 0; i < mmContainers.size(); ++i) { + for (unsigned int i = 0; i < 1 /* TODO: */ ; ++i) { for (unsigned int j = 0; j < mmContainers[i].size(); ++j) { - if (mmContainers[i][j]) { - state.pools_ref()[i][j] = mmContainers[i][j]->saveState(); + for (unsigned int k = 0; k < mmContainers[i][j].size(); ++k) { + if (mmContainers[i][j][k]) { + state.pools_ref()[j][k] = mmContainers[i][j][k]->saveState(); + } } } } @@ -3079,7 +3755,8 @@ folly::IOBufQueue CacheAllocator::saveStateToIOBuf() { serializeMMContainers(mmContainers_); AccessSerializationType accessContainerState = accessContainer_->saveState(); - MemoryAllocator::SerializationType allocatorState = allocator_->saveState(); + // TODO: foreach allocator + MemoryAllocator::SerializationType allocatorState = allocator_[0]->saveState(); 
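+ // Note: only tier 0's allocator state is persisted here, matching
+ // deserializeMMContainers() below, which restores just the topmost tier.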
CCacheManager::SerializationType ccState = compactCacheManager_->saveState();
 
 AccessSerializationType chainedItemAccessContainerState =
@@ -3104,6 +3781,8 @@ bool CacheAllocator<CacheTrait>::stopWorkers(std::chrono::seconds timeout) {
   success &= stopPoolResizer(timeout);
   success &= stopMemMonitor(timeout);
   success &= stopReaper(timeout);
+  success &= stopBackgroundEvictor(timeout);
+  success &= stopBackgroundPromoter(timeout);
   return success;
 }
@@ -3141,6 +3820,8 @@ CacheAllocator<CacheTrait>::shutDown() {
       (shmShutDownStatus == ShmShutDownRes::kSuccess);
   shmManager_.reset();
 
+  // TODO: save per-tier state
+
   if (shmShutDownSucceeded) {
     if (!nvmShutDownStatusOpt || *nvmShutDownStatusOpt)
       return ShutDownStatus::kSuccess;
@@ -3204,22 +3885,26 @@ CacheAllocator<CacheTrait>::deserializeMMContainers(
   const auto container = deserializer.deserialize<MMSerializationTypeContainer>();
 
-  MMContainers mmContainers;
+  /* TODO: right now, we create empty containers because deserialization
+   * only works for a single (topmost) tier. */
+  MMContainers mmContainers{getNumTiers()};
 
   for (auto& kvPool : *container.pools_ref()) {
     auto i = static_cast<PoolId>(kvPool.first);
     auto& pool = getPool(i);
     for (auto& kv : kvPool.second) {
       auto j = static_cast<ClassId>(kv.first);
-      MMContainerPtr ptr =
-          std::make_unique<MMContainer>(kv.second,
-                                        compressor);
-      auto config = ptr->getConfig();
-      config.addExtraConfig(config_.trackTailHits
-                                ? pool.getAllocationClass(j).getAllocsPerSlab()
-                                : 0);
-      ptr->setConfig(config);
-      mmContainers[i][j] = std::move(ptr);
+      for (TierId tid = 0; tid < getNumTiers(); tid++) {
+        MMContainerPtr ptr =
+            std::make_unique<MMContainer>(kv.second,
+                                          compressor);
+        auto config = ptr->getConfig();
+        config.addExtraConfig(config_.trackTailHits
+                                  ? pool.getAllocationClass(j).getAllocsPerSlab()
+                                  : 0);
+        ptr->setConfig(config);
+        mmContainers[tid][i][j] = std::move(ptr);
+      }
     }
   }
   // We need to drop the unevictableMMContainer in the deserializer.
@@ -3286,8 +3971,19 @@ void CacheAllocator<CacheTrait>::initStats() {
   // deserialize the fragmentation size of each thread.
   for (const auto& pid : *metadata_.fragmentationSize()) {
     for (const auto& cid : pid.second) {
-      (*stats_.fragmentationSize)[pid.first][cid.first].set(
-          static_cast<uint64_t>(cid.second));
+      // in multi-tier we serialized the sum - there is no way to recover
+      // the per-tier values, so for now split the total evenly across tiers
+      // TODO: proper multi-tier serialization
+      uint64_t total = static_cast<uint64_t>(cid.second);
+      uint64_t part = total / getNumTiers();
+      uint64_t sum = 0;
+      for (TierId tid = 1; tid < getNumTiers(); tid++) {
+        (*stats_.fragmentationSize)[tid][pid.first][cid.first].set(part);
+        sum += part;
+      }
+      uint64_t leftover = total - sum;
+      (*stats_.fragmentationSize)[0][pid.first][cid.first].set(leftover);
+      }
     }
   }
 
@@ -3360,6 +4056,8 @@ GlobalCacheStats CacheAllocator<CacheTrait>::getGlobalCacheStats() const {
   ret.nvmCacheEnabled = nvmCache_ ? 
nvmCache_->isEnabled() : false;
   ret.reaperStats = getReaperStats();
   ret.rebalancerStats = getRebalancerStats();
+  ret.evictionStats = getBackgroundMoverStats(MoverDir::Evict);
+  ret.promotionStats = getBackgroundMoverStats(MoverDir::Promote);
   ret.numActiveHandles = getNumActiveHandles();
 
   ret.isNewRamCache = cacheCreationTime_ == cacheInstanceCreationTime_;
@@ -3371,11 +4069,14 @@
 template <typename CacheTrait>
 CacheMemoryStats CacheAllocator<CacheTrait>::getCacheMemoryStats() const {
-  const auto totalCacheSize = allocator_->getMemorySize();
-  const auto configuredTotalCacheSize = allocator_->getMemorySizeInclAdvised();
-
+  size_t totalCacheSize = 0;
+  size_t configuredTotalCacheSize = 0;
+  for (auto& allocator : allocator_) {
+    totalCacheSize += allocator->getMemorySize();
+    configuredTotalCacheSize += allocator->getMemorySizeInclAdvised();
+  }
   auto addSize = [this](size_t a, PoolId pid) {
-    return a + allocator_->getPool(pid).getPoolSize();
+    return a + allocator_[currentTier()]->getPool(pid).getPoolSize();
   };
   const auto regularPoolIds = getRegularPoolIds();
   const auto ccCachePoolIds = getCCachePoolIds();
@@ -3388,9 +4089,9 @@ CacheMemoryStats CacheAllocator<CacheTrait>::getCacheMemoryStats() const {
           configuredTotalCacheSize,
           configuredRegularCacheSize,
           configuredCompactCacheSize,
-          allocator_->getAdvisedMemorySize(),
+          allocator_[currentTier()]->getAdvisedMemorySize(),
           memMonitor_ ? memMonitor_->getMaxAdvisePct() : 0,
-          allocator_->getUnreservedMemorySize(),
+          allocator_[currentTier()]->getUnreservedMemorySize(),
           nvmCache_ ? nvmCache_->getSize() : 0,
           util::getMemAvailable(),
           util::getRSSBytes()};
@@ -3527,6 +4228,64 @@ bool CacheAllocator<CacheTrait>::startNewReaper(
   return true;
 }
 
+template <typename CacheTrait>
+auto CacheAllocator<CacheTrait>::getAssignedMemoryToBgWorker(size_t evictorId, size_t numWorkers, TierId tid)
+{
+  std::vector assignedMemory;
+  // TODO: for now, only evict from tier 0
+  auto pools = filterCompactCachePools(allocator_[tid]->getPoolIds());
+  for (const auto pid : pools) {
+    const auto& mpStats = getPoolByTid(pid, tid).getStats();
+    for (const auto cid : mpStats.classIds) {
+      if (backgroundWorkerId(tid, pid, cid, numWorkers) == evictorId) {
+        assignedMemory.emplace_back(tid, pid, cid);
+      }
+    }
+  }
+  return assignedMemory;
+}
+
+template <typename CacheTrait>
+bool CacheAllocator<CacheTrait>::startNewBackgroundEvictor(
+    std::chrono::milliseconds interval,
+    std::shared_ptr<BackgroundMoverStrategy> strategy,
+    size_t threads) {
+  XDCHECK(threads > 0);
+  backgroundEvictor_.resize(threads);
+  bool result = true;
+
+  for (size_t i = 0; i < threads; i++) {
+    auto ret = startNewWorker("BackgroundEvictor" + std::to_string(i), backgroundEvictor_[i], interval, *this, strategy, MoverDir::Evict);
+    result = result && ret;
+
+    if (result) {
+      backgroundEvictor_[i]->setAssignedMemory(getAssignedMemoryToBgWorker(i, backgroundEvictor_.size(), 0));
+    }
+  }
+  return result;
+}
+
+template <typename CacheTrait>
+bool CacheAllocator<CacheTrait>::startNewBackgroundPromoter(
+    std::chrono::milliseconds interval,
+    std::shared_ptr<BackgroundMoverStrategy> strategy,
+    size_t threads) {
+  XDCHECK(threads > 0);
+  XDCHECK(getNumTiers() > 1);
+  backgroundPromoter_.resize(threads);
+  bool result = true;
+
+  for (size_t i = 0; i < threads; i++) {
+    auto ret = startNewWorker("BackgroundPromoter" + std::to_string(i), backgroundPromoter_[i], interval, *this, strategy, MoverDir::Promote);
+    result = result && ret;
+
+    if (result) {
+      backgroundPromoter_[i]->setAssignedMemory(getAssignedMemoryToBgWorker(i, backgroundPromoter_.size(), 1));
+    }
+  }
+  return result;
+}
+
 template <typename CacheTrait>
 bool CacheAllocator<CacheTrait>::stopPoolRebalancer(
     std::chrono::seconds 
timeout) {
@@ -3575,6 +4334,26 @@ bool CacheAllocator<CacheTrait>::stopReaper(std::chrono::seconds timeout) {
   return res;
 }
 
+template <typename CacheTrait>
+bool CacheAllocator<CacheTrait>::stopBackgroundEvictor(std::chrono::seconds timeout) {
+  bool result = true;
+  for (size_t i = 0; i < backgroundEvictor_.size(); i++) {
+    auto ret = stopWorker("BackgroundEvictor", backgroundEvictor_[i], timeout);
+    result = result && ret;
+  }
+  return result;
+}
+
+template <typename CacheTrait>
+bool CacheAllocator<CacheTrait>::stopBackgroundPromoter(std::chrono::seconds timeout) {
+  bool result = true;
+  for (size_t i = 0; i < backgroundPromoter_.size(); i++) {
+    auto ret = stopWorker("BackgroundPromoter", backgroundPromoter_[i], timeout);
+    result = result && ret;
+  }
+  return result;
+}
+
 template <typename CacheTrait>
 bool CacheAllocator<CacheTrait>::cleanupStrayShmSegments(
     const std::string& cacheDir, bool posix) {
@@ -3583,6 +4362,8 @@
     // cache dir exists. clean up only if there are no other processes
     // attached. if another process was attached, the following would fail.
     ShmManager::cleanup(cacheDir, posix);
+
+    // TODO: cleanup per-tier state
   } catch (const std::exception& e) {
     XLOGF(ERR, "Error cleaning up {}. Exception: ", cacheDir, e.what());
     return false;
@@ -3592,7 +4373,8 @@ bool CacheAllocator<CacheTrait>::cleanupStrayShmSegments(
   // Any other concurrent process can not be attached to the segments or
   // even if it does, we want to mark it for destruction.
   ShmManager::removeByName(cacheDir, detail::kShmInfoName, posix);
-  ShmManager::removeByName(cacheDir, detail::kShmCacheName, posix);
+  ShmManager::removeByName(cacheDir, detail::kShmCacheName
+                           + std::to_string(0 /* TODO: per tier */), posix);
   ShmManager::removeByName(cacheDir, detail::kShmHashTableName, posix);
   ShmManager::removeByName(cacheDir, detail::kShmChainedItemHashTableName,
                            posix);
@@ -3606,14 +4388,16 @@ uint64_t CacheAllocator<CacheTrait>::getItemPtrAsOffset(const void* ptr) {
   // the two differ (e.g. Mac OS 12) - causing templating instantiation
   // errors downstream.
 
+  auto tid = getTierId(ptr);
+
   // if this succeeds, the address is valid within the cache.
- allocator_->getAllocInfo(ptr); + allocator_[tid]->getAllocInfo(ptr); if (!isOnShm_ || !shmManager_) { throw std::invalid_argument("Shared memory not used"); } - const auto& shm = shmManager_->getShmByName(detail::kShmCacheName); + const auto& shm = shmManager_->getShmByName(detail::kShmCacheName + std::to_string(tid)); return reinterpret_cast(ptr) - reinterpret_cast(shm.getCurrentMapping().addr); diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index 4c55496853..b3201daf76 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include @@ -38,6 +40,7 @@ #include #pragma GCC diagnostic pop +#include "cachelib/allocator/BackgroundMover.h" #include "cachelib/allocator/CCacheManager.h" #include "cachelib/allocator/Cache.h" #include "cachelib/allocator/CacheAllocatorConfig.h" @@ -57,6 +60,7 @@ #include "cachelib/allocator/PoolOptimizer.h" #include "cachelib/allocator/PoolRebalancer.h" #include "cachelib/allocator/PoolResizer.h" +#include "cachelib/allocator/PrivateMemoryManager.h" #include "cachelib/allocator/ReadOnlySharedCacheView.h" #include "cachelib/allocator/Reaper.h" #include "cachelib/allocator/RebalanceStrategy.h" @@ -710,6 +714,11 @@ class CacheAllocator : public CacheBase { // @return the full usable size for this item uint32_t getUsableSize(const Item& item) const; + // gets the allocation class assigned to BG worker + auto getAssignedMemoryToBgWorker(size_t evictorId, size_t numWorkers, TierId tid); + bool shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid); + size_t backgroundWorkerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers); + // Get a random item from memory // This is useful for profiling and sampling cachelib managed memory // @@ -806,7 +815,7 @@ class CacheAllocator : public CacheBase { // @param config new config for the pool // // @throw std::invalid_argument if the poolId is invalid - void overridePoolConfig(PoolId pid, const MMConfig& config); + void overridePoolConfig(TierId tid, PoolId pid, const MMConfig& config); // update an existing pool's rebalance strategy // @@ -847,8 +856,9 @@ class CacheAllocator : public CacheBase { // @return true if the operation succeeded. false if the size of the pool is // smaller than _bytes_ // @throw std::invalid_argument if the poolId is invalid. + // TODO: should call shrinkPool for specific tier? bool shrinkPool(PoolId pid, size_t bytes) { - return allocator_->shrinkPool(pid, bytes); + return allocator_[currentTier()]->shrinkPool(pid, bytes); } // grow an existing pool by _bytes_. This will fail if there is no @@ -857,8 +867,9 @@ class CacheAllocator : public CacheBase { // @return true if the pool was grown. false if the necessary number of // bytes were not available. // @throw std::invalid_argument if the poolId is invalid. + // TODO: should call growPool for specific tier? bool growPool(PoolId pid, size_t bytes) { - return allocator_->growPool(pid, bytes); + return allocator_[currentTier()]->growPool(pid, bytes); } // move bytes from one pool to another. The source pool should be at least @@ -871,7 +882,7 @@ class CacheAllocator : public CacheBase { // correct size to do the transfer. 
// @throw std::invalid_argument if src or dest is invalid pool
   bool resizePools(PoolId src, PoolId dest, size_t bytes) override {
-    return allocator_->resizePools(src, dest, bytes);
+    return allocator_[currentTier()]->resizePools(src, dest, bytes);
   }
 
   // Add a new compact cache with given name and size
@@ -1053,6 +1064,11 @@
   // @param reaperThrottleConfig throttling config
   bool startNewReaper(std::chrono::milliseconds interval,
                       util::Throttler::Config reaperThrottleConfig);
+
+  bool startNewBackgroundPromoter(std::chrono::milliseconds interval,
+      std::shared_ptr<BackgroundMoverStrategy> strategy, size_t threads);
+  bool startNewBackgroundEvictor(std::chrono::milliseconds interval,
+      std::shared_ptr<BackgroundMoverStrategy> strategy, size_t threads);
 
   // Stop existing workers with a timeout
   bool stopPoolRebalancer(std::chrono::seconds timeout = std::chrono::seconds{
@@ -1062,6 +1078,8 @@
       0});
   bool stopMemMonitor(std::chrono::seconds timeout = std::chrono::seconds{0});
   bool stopReaper(std::chrono::seconds timeout = std::chrono::seconds{0});
+  bool stopBackgroundEvictor(std::chrono::seconds timeout = std::chrono::seconds{0});
+  bool stopBackgroundPromoter(std::chrono::seconds timeout = std::chrono::seconds{0});
 
   // Set pool optimization to either true or false
   //
@@ -1076,12 +1094,13 @@
   // @throw std::invalid_argument if the memory does not belong to this
   //        cache allocator
   AllocInfo getAllocInfo(const void* memory) const {
-    return allocator_->getAllocInfo(memory);
+    return allocator_[getTierId(memory)]->getAllocInfo(memory);
   }
 
   // return the ids for the set of existing pools in this cache.
   std::set<PoolId> getPoolIds() const override final {
-    return allocator_->getPoolIds();
+    // all tiers have the same pool ids. TODO: deduplicate
+    return allocator_[0]->getPoolIds();
   }
 
   // return a list of pool ids that are backing compact caches. This includes
@@ -1093,18 +1112,22 @@
   // return the pool with specified id.
   const MemoryPool& getPool(PoolId pid) const override final {
-    return allocator_->getPool(pid);
+    return allocator_[currentTier()]->getPool(pid);
+  }
+
+  const MemoryPool& getPoolByTid(PoolId pid, TierId tid) const override final {
+    return allocator_[tid]->getPool(pid);
   }
 
   // calculate the number of slabs to be advised/reclaimed in each pool
   PoolAdviseReclaimData calcNumSlabsToAdviseReclaim() override final {
     auto regularPoolIds = getRegularPoolIds();
-    return allocator_->calcNumSlabsToAdviseReclaim(regularPoolIds);
+    return allocator_[currentTier()]->calcNumSlabsToAdviseReclaim(regularPoolIds);
   }
 
   // update number of slabs to advise in the cache
   void updateNumSlabsToAdvise(int32_t numSlabsToAdvise) override final {
-    allocator_->updateNumSlabsToAdvise(numSlabsToAdvise);
+    allocator_[currentTier()]->updateNumSlabsToAdvise(numSlabsToAdvise);
   }
 
   // returns a valid PoolId corresponding to the name or kInvalidPoolId if the
@@ -1112,8 +1135,9 @@
   PoolId getPoolId(folly::StringPiece name) const noexcept;
 
   // returns the pool's name by its poolId.
-  std::string getPoolName(PoolId poolId) const override {
-    return allocator_->getPoolName(poolId);
+  std::string getPoolName(PoolId poolId) const {
+    // all tiers have the same pool names.
+    return allocator_[0]->getPoolName(poolId);
   }
 
   // get stats related to all kinds of slab release events.
@@ -1145,6 +1169,52 @@
     auto stats = reaper_ ? 
reaper_->getStats() : ReaperStats{}; return stats; } + + // returns the background mover stats + BackgroundMoverStats getBackgroundMoverStats(MoverDir direction) const { + + auto stats = BackgroundMoverStats{}; + if (direction == MoverDir::Evict) { + for (auto &bg : backgroundEvictor_) + stats += bg->getStats(); + } else if (direction == MoverDir::Promote) { + for (auto &bg : backgroundPromoter_) + stats += bg->getStats(); + } + return stats; + + } + + + std::map>> + getBackgroundMoverClassStats(MoverDir direction) const { + std::map>> stats; + + if (direction == MoverDir::Evict) { + for (auto &bg : backgroundEvictor_) { + for (auto &tid : bg->getClassStats()) { + for (auto &pid : tid.second) { + for (auto &cid : pid.second) { + stats[tid.first][pid.first][cid.first] += cid.second; + } + } + } + } + } else if (direction == MoverDir::Promote) { + for (auto &bg : backgroundPromoter_) { + for (auto &tid : bg->getClassStats()) { + for (auto &pid : tid.second) { + for (auto &cid : pid.second) { + stats[tid.first][pid.first][cid.first] += cid.second; + } + } + } + } + } + + return stats; + } + // returns the pool rebalancer stats RebalancerStats getRebalancerStats() const { @@ -1171,6 +1241,8 @@ class CacheAllocator : public CacheBase { // pool stats by pool id PoolStats getPoolStats(PoolId pid) const override final; + // pool stats by tier id and pool id + PoolStats getPoolStats(TierId tid, PoolId pid) const; // This can be expensive so it is not part of PoolStats PoolEvictionAgeStats getPoolEvictionAgeStats( @@ -1185,6 +1257,9 @@ class CacheAllocator : public CacheBase { // return cache's memory usage stats CacheMemoryStats getCacheMemoryStats() const override final; + // return stats for Allocation Class + ACStats getACStats(TierId tid, PoolId pid, ClassId cid) const override final; + // return the nvm cache stats map util::StatsMap getNvmCacheStatsMap() const override final; @@ -1294,6 +1369,7 @@ class CacheAllocator : public CacheBase { sizeof(typename RefcountWithFlags::Value) + sizeof(uint32_t) + sizeof(uint32_t) + sizeof(KAllocation)) == sizeof(Item), "vtable overhead"); + // Check for CompressedPtr single/multi tier support static_assert(32 == sizeof(Item), "item overhead is 32 bytes"); // make sure there is no overhead in ChainedItem on top of a regular Item @@ -1318,7 +1394,7 @@ class CacheAllocator : public CacheBase { private: // wrapper around Item's refcount and active handle tracking - FOLLY_ALWAYS_INLINE bool incRef(Item& it); + FOLLY_ALWAYS_INLINE RefcountWithFlags::incResult incRef(Item& it); FOLLY_ALWAYS_INLINE RefcountWithFlags::Value decRef(Item& it); // drops the refcount and if needed, frees the allocation back to the memory @@ -1388,11 +1464,14 @@ class CacheAllocator : public CacheBase { using MMContainerPtr = std::unique_ptr; using MMContainers = - std::array, - MemoryPoolManager::kMaxPools>; + std::vector, + MemoryPoolManager::kMaxPools>>; void createMMContainers(const PoolId pid, MMConfig config); + TierId getTierId(const Item& item) const; + TierId getTierId(const void* ptr) const; + // acquire the MMContainer corresponding to the the Item's class and pool. // // @return pointer to the MMContainer. @@ -1400,7 +1479,12 @@ class CacheAllocator : public CacheBase { // allocation from the memory allocator. MMContainer& getMMContainer(const Item& item) const noexcept; - MMContainer& getMMContainer(PoolId pid, ClassId cid) const noexcept; + MMContainer& getMMContainer(TierId tid, PoolId pid, ClassId cid) const noexcept; + + // Get stats of the specified pid and cid. 
+  // If such mmcontainer is not valid (pool id or cid out of bound)
+  // or the mmcontainer is not initialized, return an empty stat.
+  MMContainerStat getMMContainerStat(TierId tid, PoolId pid, ClassId cid) const noexcept;
 
   // create a new cache allocation. The allocation can be initialized
   // appropriately and made accessible through insert or insertOrReplace.
@@ -1430,7 +1514,26 @@
                             Key key,
                             uint32_t size,
                             uint32_t creationTime,
-                            uint32_t expiryTime);
+                            uint32_t expiryTime,
+                            bool fromBgThread = false);
+
+  // create a new cache allocation on specific memory tier.
+  // For description see allocateInternal.
+  //
+  // @param tid id a memory tier
+  // @param fromBgThread whether this function was called from a bg
+  //        thread - this is used to decide whether bg thread should
+  //        be woken in case there is no free memory
+  // @param evict whether to evict an item from tier tid in case there
+  //        is not enough memory
+  WriteHandle allocateInternalTier(TierId tid,
+                                   PoolId id,
+                                   Key key,
+                                   uint32_t size,
+                                   uint32_t creationTime,
+                                   uint32_t expiryTime,
+                                   bool fromBgThread,
+                                   bool evict);
 
   // Allocate a chained item
   //
@@ -1449,6 +1552,26 @@
   WriteHandle allocateChainedItemInternal(const ReadHandle& parent,
                                           uint32_t size);
 
+  // Allocate a chained item to a specific tier
+  //
+  // The resulting chained item does not have a parent item yet
+  // and if we fail to link to the chain for any reason
+  // the chained item will be freed once the handle is dropped.
+  //
+  // The parent item parameter here is mainly used to find the
+  // correct pool to allocate memory for this chained item
+  //
+  // @param parent   parent item
+  // @param size     the size for the chained allocation
+  // @param tid      the tier to allocate on
+  //
+  // @return handle to the chained allocation
+  // @throw std::invalid_argument if the size requested is invalid or
+  //        if the item is invalid
+  WriteHandle allocateChainedItemInternalTier(const Item& parent,
+                                              uint32_t size,
+                                              TierId tid);
+
   // Given an item and its parentKey, validate that the parentKey
   // corresponds to an item that's the parent of the supplied item.
   //
@@ -1458,17 +1581,17 @@
   // @return handle to the parent item if the validations pass
   //         otherwise, an empty Handle is returned.
   //
-  ReadHandle validateAndGetParentHandleForChainedMoveLocked(
+  WriteHandle validateAndGetParentHandleForChainedMoveLocked(
       const ChainedItem& item, const Key& parentKey);
 
   // Given an existing item, allocate a new one for the
   // existing one to later be moved into.
   //
-  // @param oldItem    the item we want to allocate a new item for
+  // @param item    reference to the item we want to allocate a new item for
   //
   // @return  handle to the newly allocated item
   //
-  WriteHandle allocateNewItemForOldItem(const Item& oldItem);
+  WriteHandle allocateNewItemForOldItem(const Item& item);
 
   // internal helper that grabs a refcounted handle to the item. This does
   // not record the access to reflect in the mmContainer.
@@ -1522,10 +1645,7 @@
   //              not exist.
   FOLLY_ALWAYS_INLINE WriteHandle findFastImpl(Key key, AccessMode mode);
 
-  // Moves a regular item to a different slab. This should only be used during
-  // slab release after the item's exclusive bit has been set. The user supplied
-  // callback is responsible for copying the contents and fixing the semantics
-  // of chained item.
+  // Moves a regular item to a different memory tier.
 //
   // @param oldItem     Reference to the item being moved
   // @param newItemHdl  Reference to the handle of the new item being moved into
   //
   // @return true  If the move was completed, and the containers were updated
   //               successfully.
   bool moveRegularItem(Item& oldItem, WriteHandle& newItemHdl);
 
+  // Moves a chained item to a different memory tier.
+  //
+  // @param oldItem     Reference to the item being moved
+  // @param newItemHdl  Reference to the handle of the new item being moved into
+  // @param parentItem  Reference to the parent item
+  //
+  // @return true  If the move was completed, and the containers were updated
+  //               successfully.
+  bool moveChainedItem(ChainedItem& oldItem, WriteHandle& newItemHdl, Item& parentItem);
+
   // template class for viewAsChainedAllocs that takes either ReadHandle or
   // WriteHandle
   template <typename Handle>
@@ -1545,29 +1675,12 @@
   template <typename Handle>
   folly::IOBuf convertToIOBufT(Handle& handle);
 
-  // Moves a chained item to a different slab. This should only be used during
-  // slab release after the item's exclusive bit has been set. The user supplied
-  // callback is responsible for copying the contents and fixing the semantics
-  // of chained item.
-  //
-  // Note: If we have successfully moved the old item into the new, the
-  //       newItemHdl is reset and no longer usable by the caller.
-  //
-  // @param oldItem      Reference to the item being moved
-  // @param newItemHdl   Reference to the handle of the new item being
-  //                     moved into
-  //
-  // @return true  If the move was completed, and the containers were updated
-  //               successfully.
-  bool moveChainedItem(ChainedItem& oldItem, WriteHandle& newItemHdl);
-
   // Transfers the chain ownership from parent to newParent. Parent
   // will be unmarked as having chained allocations. Parent will not be null
   // after calling this API.
   //
-  // Parent and NewParent must be valid handles to items with same key and
-  // parent must have chained items and parent handle must be the only
-  // outstanding handle for parent. New parent must be without any chained item
+  // newParent must be a valid handle to an item with the same key as parent,
+  // and parent must have chained items. New parent must be without any chained item
   // handles.
   //
   // Chained item lock for the parent's key needs to be held in exclusive mode.
@@ -1576,7 +1689,7 @@
   // @param parent    the current parent of the chain we want to transfer
   // @param newParent the new parent for the chain
   //
   // @throw if any of the conditions for parent or newParent are not met.
-  void transferChainLocked(WriteHandle& parent, WriteHandle& newParent);
+  void transferChainLocked(Item& parent, WriteHandle& newParent);
 
   // replace a chained item in the existing chain. This needs to be called
   // with the chained item lock held exclusive
@@ -1590,6 +1703,24 @@
                                        WriteHandle newItemHdl,
                                        const Item& parent);
 
+  //
+  // Performs the actual inplace replace - it is called from
+  // moveChainedItem and replaceChainedItemLocked;
+  // the caller must hold the chained item lock
+  //
+  // @param oldItem   the item we are replacing in the chain
+  // @param newItem   the item we are replacing it with
+  // @param parent    the parent for the chain
+  // @param fromMove  used to determine if the replace was called from
+  //                  moveChainedItem - we avoid the handle destructor
+  //                  in this case.
+  //
+  // @return handle to the oldItem
+  void replaceInChainLocked(Item& oldItem,
+                            WriteHandle& newItemHdl,
+                            const Item& parent,
+                            bool fromMove);
+
   // Insert an item into MM container. 
The caller must hold a valid handle for // the item. // @@ -1607,6 +1738,10 @@ class CacheAllocator : public CacheBase { // false if the item is not in MMContainer bool removeFromMMContainer(Item& item); + using EvictionIterator = typename MMContainer::LockedIterator; + + WriteHandle acquire(EvictionIterator& it) { return acquire(it.get()); } + // Replaces an item in the MMContainer with another item, at the same // position. // @@ -1617,6 +1752,8 @@ class CacheAllocator : public CacheBase { // destination item did not exist in the container, or if the // source item already existed. bool replaceInMMContainer(Item& oldItem, Item& newItem); + bool replaceInMMContainer(Item* oldItem, Item& newItem); + bool replaceInMMContainer(EvictionIterator& oldItemIt, Item& newItem); // Replaces an item in the MMContainer with another item, at the same // position. Or, if the two chained items belong to two different MM @@ -1676,15 +1813,17 @@ class CacheAllocator : public CacheBase { // Implementation to find a suitable eviction from the container. The // two parameters together identify a single container. // + // @param tid the id of the tier to look for evictions inside // @param pid the id of the pool to look for evictions inside // @param cid the id of the class to look for evictions inside // @return An evicted item or nullptr if there is no suitable candidate found // within the configured number of attempts. - Item* findEviction(PoolId pid, ClassId cid); + Item* findEviction(TierId tid, PoolId pid, ClassId cid); // Get next eviction candidate from MMContainer, remove from AccessContainer, // MMContainer and insert into NVMCache if enabled. // + // @param tid the id of the tier to look for evictions inside // @param pid the id of the pool to look for evictions inside // @param cid the id of the class to look for evictions inside // @param searchTries number of search attempts so far. @@ -1692,11 +1831,44 @@ class CacheAllocator : public CacheBase { // @return pair of [candidate, toRecycle]. Pair of null if reached the end of // the eviction queue or no suitable candidate found // within the configured number of attempts - std::pair getNextCandidate(PoolId pid, + std::pair getNextCandidate(TierId tid, + PoolId pid, ClassId cid, unsigned int& searchTries); - using EvictionIterator = typename MMContainer::LockedIterator; + // Try to move the item down to the next memory tier + // + // @param tid current tier ID of the item + // @param pid the pool ID the item belong to. + // @param item the item to evict + // + // @return valid handle to the item. This will be the last + // handle to the item. On failure an empty handle. 
+  WriteHandle tryEvictToNextMemoryTier(TierId tid, PoolId pid, Item& item, bool fromBgThread);
+
+  WriteHandle tryPromoteToNextMemoryTier(TierId tid, PoolId pid, Item& item, bool fromBgThread);
+
+  WriteHandle tryPromoteToNextMemoryTier(Item& item, bool fromBgThread);
+
+  // Wakes up waiters if there are any
+  //
+  // @param item    wakes waiters that are waiting on that item
+  // @param handle  handle to pass to the waiters
+  void wakeUpWaiters(Item& item, WriteHandle handle);
+
+  // Unmarks item as moving and wakes up any waiters waiting on that item
+  //
+  // @param item    wakes waiters that are waiting on that item
+  // @param handle  handle to pass to the waiters
+  typename RefcountWithFlags::Value unmarkMovingAndWakeUpWaiters(Item& item, WriteHandle handle);
+
+  // Try to move the item down to the next memory tier
+  //
+  // @param item the item to evict
+  //
+  // @return valid handle to the item. This will be the last
+  //         handle to the item. On failure an empty handle.
+  WriteHandle tryEvictToNextMemoryTier(Item& item, bool fromBgThread);
 
   // Deserialize CacheAllocatorMetadata and verify the version
   //
@@ -1710,7 +1882,7 @@
       const typename Item::PtrCompressor& compressor);
 
   unsigned int reclaimSlabs(PoolId id, size_t numSlabs) final {
-    return allocator_->reclaimSlabsAndGrow(id, numSlabs);
+    return allocator_[currentTier()]->reclaimSlabsAndGrow(id, numSlabs);
   }
 
   FOLLY_ALWAYS_INLINE EventTracker* getEventTracker() const {
@@ -1769,7 +1941,7 @@
       const void* hint = nullptr) final;
 
   // @param releaseContext slab release context
-  void releaseSlabImpl(const SlabReleaseContext& releaseContext);
+  void releaseSlabImpl(TierId tid, const SlabReleaseContext& releaseContext);
 
   // @return true when successfully marked as moving,
   //         false when this item has already been freed
@@ -1782,13 +1954,14 @@
   //
   //
   // @param ctx         slab release context
-  // @param item        old item to be moved elsewhere
+  // @param oldItem     old item to be moved elsewhere
+  // @param handle      handle to the item or to its parent (if chained)
   // @param throttler   slow this function down as not to take too much cpu
   //
   // @return  true  if the item has been moved
   //          false if we have exhausted moving attempts
   bool moveForSlabRelease(const SlabReleaseContext& ctx,
-                          Item& item,
+                          Item& oldItem,
                           util::Throttler& throttler);
 
   // "Move" (by copying) the content in this item to another memory
@@ -1811,18 +1984,13 @@
                            Item& item,
                            util::Throttler& throttler);
 
-  // Helper function to evict a normal item for slab release
-  //
-  // @return last handle for corresponding to item on success. empty handle on
-  //         failure. caller can retry if needed.
-  WriteHandle evictNormalItemForSlabRelease(Item& item);
+  typename NvmCacheT::PutToken createPutToken(Item& item);
 
-  // Helper function to evict a child item for slab release
-  // As a side effect, the parent item is also evicted
+  // Helper function to remove an item if the predicate is true.
   //
-  // @return last handle to the parent item of the child on success. empty
-  //         handle on failure. caller can retry.
-  WriteHandle evictChainedItemForSlabRelease(ChainedItem& item);
+  // @return last handle to the item on success. empty handle on failure.
+  template <typename Fn>
+  WriteHandle removeIf(Item& item, Fn&& predicate);
 
   // Helper function to remove an item if expired.
   //
@@ -1841,10 +2009,169 @@
   // primitives. 
So we consciously exempt ourselves here from TSAN data race
    // detection.
    folly::annotate_ignore_thread_sanitizer_guard g(__FILE__, __LINE__);
-    auto slabsSkipped = allocator_->forEachAllocation(std::forward<Fn>(f));
+    auto slabsSkipped = allocator_[currentTier()]->forEachAllocation(std::forward<Fn>(f));
     stats().numReaperSkippedSlabs.add(slabsSkipped);
   }
 
+  // exposed for the background evictor to iterate through the memory and evict
+  // in batch. This should improve the insertion path for tiered memory configs
+  size_t traverseAndEvictItems(unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) {
+    util::LatencyTracker tracker{stats().bgEvictLatency_, batch};
+    auto& mmContainer = getMMContainer(tid, pid, cid);
+    size_t evictions = 0;
+    size_t evictionCandidates = 0;
+    std::vector<Item*> candidates;
+    candidates.reserve(batch);
+
+    size_t tries = 0;
+    mmContainer.withEvictionIterator([&tries, &candidates, &batch, &mmContainer, this](auto&& itr) {
+      while (candidates.size() < batch &&
+             (config_.maxEvictionPromotionHotness == 0 || tries < config_.maxEvictionPromotionHotness) &&
+             itr) {
+        tries++;
+        Item* candidate = itr.get();
+        XDCHECK(candidate);
+
+        if (candidate->isChainedItem()) {
+          throw std::runtime_error("Not supported for chained items");
+        }
+
+        if (candidate->markMoving()) {
+          mmContainer.remove(itr);
+          candidates.push_back(candidate);
+        } else {
+          ++itr;
+        }
+      }
+    });
+
+    for (Item* candidate : candidates) {
+      auto evictedToNext = tryEvictToNextMemoryTier(*candidate, true /* from BgThread */);
+      if (!evictedToNext) {
+        auto token = createPutToken(*candidate);
+
+        auto ret = candidate->markForEvictionWhenMoving();
+        XDCHECK(ret);
+
+        unlinkItemForEviction(*candidate);
+        // wake up any readers that wait for the move to complete
+        // it's safe to do now, as we have the item marked exclusive and
+        // no other reader can be added to the waiters list
+        wakeUpWaiters(*candidate, WriteHandle{});
+
+        if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate)) {
+          nvmCache_->put(*candidate, std::move(token));
+        }
+      } else {
+        evictions++;
+        XDCHECK(!evictedToNext->isMarkedForEviction() && !evictedToNext->isMoving());
+        XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving());
+        XDCHECK(!candidate->isAccessible());
+        XDCHECK(candidate->getKey() == evictedToNext->getKey());
+
+        wakeUpWaiters(*candidate, std::move(evictedToNext));
+      }
+      XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving());
+
+      if (candidate->hasChainedItem()) {
+        (*stats_.chainedItemEvictions)[tid][pid][cid].inc();
+      } else {
+        (*stats_.regularItemEvictions)[tid][pid][cid].inc();
+      }
+
+      // it's safe to recycle the item here as there are no more
+      // references and the item could not have been marked as moving
+      // by another thread since it's detached from MMContainer.
+      auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction,
+                                        /* isNascent */ false);
+      XDCHECK(res == ReleaseRes::kReleased);
+    }
+    return evictions;
+  }
+
+  size_t traverseAndPromoteItems(unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) {
+    util::LatencyTracker tracker{stats().bgPromoteLatency_, batch};
+    auto& mmContainer = getMMContainer(tid, pid, cid);
+    size_t promotions = 0;
+    std::vector<Item*> candidates;
+    candidates.reserve(batch);
+
+    size_t tries = 0;
+
+    mmContainer.withPromotionIterator([&tries, &candidates, &batch, &mmContainer, this](auto&& itr) {
+      while (candidates.size() < batch && (config_.maxEvictionPromotionHotness == 0 || tries < config_.maxEvictionPromotionHotness) && itr) {
+        tries++;
+        Item* candidate = itr.get();
+        XDCHECK(candidate);
+
+        if (candidate->isChainedItem()) {
+          throw std::runtime_error("Not supported for chained items");
+        }
+
+        // TODO: only allow it for read-only items?
+        // or implement mvcc
+        if (candidate->markMoving()) {
+          // promotions should rarely fail since we already marked moving
+          mmContainer.remove(itr);
+          candidates.push_back(candidate);
+        }
+
+        ++itr;
+      }
+    });
+
+    for (Item* candidate : candidates) {
+      auto promoted = tryPromoteToNextMemoryTier(*candidate, true);
+      if (promoted) {
+        promotions++;
+        XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving());
+        // it's safe to recycle the item here as there are no more
+        // references and the item could not have been marked as moving
+        // by another thread since it's detached from MMContainer.
+        //
+        // but we need to wake up waiters before releasing
+        // since candidate's key can change after being sent
+        // back to allocator
+        wakeUpWaiters(*candidate, std::move(promoted));
+        auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction,
+                                          /* isNascent */ false);
+        XDCHECK(res == ReleaseRes::kReleased);
+      } else {
+        // we failed to allocate a new item, this item is no longer moving
+        auto ref = candidate->unmarkMoving();
+        if (UNLIKELY(ref == 0)) {
+          wakeUpWaiters(*candidate, {});
+          const auto res =
+              releaseBackToAllocator(*candidate,
                                     RemoveContext::kNormal, false);
+          XDCHECK(res == ReleaseRes::kReleased);
+        } else if (candidate->isAccessible()) {
+          // case where we failed to allocate in the target tier -
+          // item is still present in accessContainer;
+          // item is no longer moving - acquire and
+          // wake up waiters with this handle
+          auto hdl = acquire(candidate);
+          insertInMMContainer(*hdl);
+          wakeUpWaiters(*candidate, std::move(hdl));
+        } else if (!candidate->isAccessible()) {
+          // case where we failed to replace in access
+          // container due to another thread calling insertOrReplace -
+          // unmark moving and return null handle
+          wakeUpWaiters(*candidate, {});
+          if (UNLIKELY(ref == 0)) {
+            const auto res =
+                releaseBackToAllocator(*candidate, RemoveContext::kNormal,
+                                       false);
+            XDCHECK(res == ReleaseRes::kReleased);
+          }
+        } else {
+          XDCHECK(false);
+        }
+      }
+    }
+    return promotions;
+  }
+
   // returns true if nvmcache is enabled and we should write this item to
   // nvmcache. 
bool shouldWriteToNvmCache(const Item& item); @@ -1885,10 +2212,12 @@ class CacheAllocator : public CacheBase { std::unique_ptr& worker, std::chrono::seconds timeout = std::chrono::seconds{0}); - ShmSegmentOpts createShmCacheOpts(); - std::unique_ptr createNewMemoryAllocator(); - std::unique_ptr restoreMemoryAllocator(); - std::unique_ptr restoreCCacheManager(); + ShmSegmentOpts createShmCacheOpts(TierId tid); + PrivateSegmentOpts createPrivateSegmentOpts(TierId tid); + std::unique_ptr createPrivateAllocator(TierId tid); + std::unique_ptr createNewMemoryAllocator(TierId tid); + std::unique_ptr restoreMemoryAllocator(TierId tid); + std::unique_ptr restoreCCacheManager(TierId tid); PoolIds filterCompactCachePools(const PoolIds& poolIds) const; @@ -1908,7 +2237,7 @@ class CacheAllocator : public CacheBase { } typename Item::PtrCompressor createPtrCompressor() const { - return allocator_->createPtrCompressor(); + return typename Item::PtrCompressor(allocator_); } // helper utility to throttle and optionally log. @@ -1931,9 +2260,14 @@ class CacheAllocator : public CacheBase { // @param type the type of initialization // @return nullptr if the type is invalid - // @return pointer to memory allocator + // @return vector of pointers to memory allocator // @throw std::runtime_error if type is invalid - std::unique_ptr initAllocator(InitMemType type); + std::vector> initAllocator(InitMemType type); + + std::vector> createPrivateAllocators(); + std::vector> createAllocators(); + std::vector> restoreAllocators(); + // @param type the type of initialization // @return nullptr if the type is invalid // @return pointer to access container @@ -1945,18 +2279,14 @@ class CacheAllocator : public CacheBase { std::optional saveNvmCache(); void saveRamCache(); - static bool itemExclusivePredicate(const Item& item) { - return item.getRefCount() == 0; + static bool itemSlabMovePredicate(const Item& item) { + return item.isMoving() && item.getRefCount() == 0; } static bool itemExpiryPredicate(const Item& item) { return item.getRefCount() == 1 && item.isExpired(); } - static bool parentEvictForSlabReleasePredicate(const Item& item) { - return item.getRefCount() == 1 && !item.isMoving(); - } - std::unique_ptr createDeserializer(); // Execute func on each item. `func` can throw exception but must ensure @@ -1995,6 +2325,100 @@ class CacheAllocator : public CacheBase { // BEGIN private members + TierId currentTier() const { + // TODO: every function which calls this method should be refactored. + // We should go case by case and either make such function work on + // all tiers or expose separate parameter to describe the tier ID. + return 0; + } + + unsigned getNumTiers() const { + return config_.memoryTierConfigs.size(); + } + + size_t memoryTierSize(TierId tid) const; + + bool tryGetHandleWithWaitContextForMovingItem(Item& item, WriteHandle& handle); + + size_t wakeUpWaitersLocked(folly::StringPiece key, WriteHandle&& handle); + + class MoveCtx { + public: + MoveCtx() {} + + ~MoveCtx() { + // prevent any further enqueue to waiters + // Note: we don't need to hold locks since no one can enqueue + // after this point. + wakeUpWaiters(); + } + + // record the item handle. Upon destruction we will wake up the waiters + // and pass a clone of the handle to the callBack. 
By default we pass
+    // a null handle
+    void setItemHandle(WriteHandle _it) { it = std::move(_it); }
+
+    // enqueue a waiter into the waiter list
+    // @param waiter   WaitContext
+    void addWaiter(std::shared_ptr<WaitContext<ReadHandle>> waiter) {
+      XDCHECK(waiter);
+      waiters.push_back(std::move(waiter));
+    }
+
+    size_t numWaiters() const { return waiters.size(); }
+
+   private:
+    // notify all pending waiters that are waiting for the fetch.
+    void wakeUpWaiters() {
+      bool refcountOverflowed = false;
+      for (auto& w : waiters) {
+        // If refcount overflowed earlier, then we will return miss to
+        // all subsequent waiters.
+        if (refcountOverflowed) {
+          w->set(WriteHandle{});
+          continue;
+        }
+
+        try {
+          w->set(it.clone());
+        } catch (const exception::RefcountOverflow&) {
+          // We'll return a miss to the user's pending read,
+          // so we should enqueue a delete via NvmCache.
+          // TODO: cache.remove(it);
+          refcountOverflowed = true;
+        }
+      }
+    }
+
+    WriteHandle it; // will be set when Context is being filled
+    std::vector<std::shared_ptr<WaitContext<ReadHandle>>> waiters; // list of
+                                                                   // waiters
+  };
+
+  using MoveMap =
+      folly::F14ValueMap<folly::StringPiece,
+                         std::unique_ptr<MoveCtx>,
+                         folly::HeterogeneousAccessHash<folly::StringPiece>>;
+
+  static size_t getShardForKey(folly::StringPiece key) {
+    return folly::Hash()(key) % kShards;
+  }
+
+  MoveMap& getMoveMapForShard(size_t shard) {
+    return movesMap_[shard].movesMap_;
+  }
+
+  MoveMap& getMoveMap(folly::StringPiece key) {
+    return getMoveMapForShard(getShardForKey(key));
+  }
+
+  std::unique_lock<std::mutex> getMoveLockForShard(size_t shard) {
+    return std::unique_lock<std::mutex>(moveLock_[shard].moveLock_);
+  }
+
+  std::unique_lock<std::mutex> getMoveLock(folly::StringPiece key) {
+    return getMoveLockForShard(getShardForKey(key));
+  }
+
   // Whether the memory allocator for this cache allocator was created on shared
   // memory. The hash table, chained item hash table etc is also created on
   // shared memory except for temporary shared memory mode when they're created
@@ -2007,6 +2431,8 @@
   // is not persisted when cache process exits.
   std::unique_ptr<TempShmMapping> tempShm_;
 
+  std::unique_ptr<PrivateMemoryManager> privMemManager_;
+
   std::unique_ptr<ShmManager> shmManager_;
 
   // Deserialize data to restore cache allocator. Used only while attaching to
@@ -2020,9 +2446,10 @@
   const MMConfig mmConfig_{};
 
   // the memory allocator for allocating out of the available memory.
-  std::unique_ptr<MemoryAllocator> allocator_;
+  std::vector<std::unique_ptr<MemoryAllocator>> allocator_;
 
   // compact cache allocator manager
+  // TODO: per tier? 
std::unique_ptr compactCacheManager_; // compact cache instances reside here when user "add" or "attach" compact @@ -2072,6 +2499,10 @@ class CacheAllocator : public CacheBase { // free memory monitor std::unique_ptr memMonitor_; + + // background evictor + std::vector>> backgroundEvictor_; + std::vector>> backgroundPromoter_; // check whether a pool is a slabs pool std::array isCompactCachePool_{}; @@ -2084,6 +2515,22 @@ class CacheAllocator : public CacheBase { // poolResizer_, poolOptimizer_, memMonitor_, reaper_ mutable std::mutex workersMutex_; + static constexpr size_t kShards = 8192; // TODO: need to define right value + + struct MovesMapShard { + alignas(folly::hardware_destructive_interference_size) MoveMap movesMap_; + }; + + struct MoveLock { + alignas(folly::hardware_destructive_interference_size) std::mutex moveLock_; + }; + + // a map of all pending moves + std::vector movesMap_; + + // a map of move locks for each shard + std::vector moveLock_; + // time when the ram cache was first created const uint32_t cacheCreationTime_{0}; @@ -2117,6 +2564,7 @@ class CacheAllocator : public CacheBase { // Make this friend to give access to acquire and release friend ReadHandle; friend ReaperAPIWrapper; + friend BackgroundMoverAPIWrapper; friend class CacheAPIWrapperForNvm; friend class FbInternalRuntimeUpdateWrapper; friend class objcache2::ObjectCache; diff --git a/cachelib/allocator/CacheAllocatorConfig.h b/cachelib/allocator/CacheAllocatorConfig.h index 74a86e9789..d4a9bd04a9 100644 --- a/cachelib/allocator/CacheAllocatorConfig.h +++ b/cachelib/allocator/CacheAllocatorConfig.h @@ -31,6 +31,7 @@ #include "cachelib/allocator/MemoryTierCacheConfig.h" #include "cachelib/allocator/NvmAdmissionPolicy.h" #include "cachelib/allocator/PoolOptimizeStrategy.h" +#include "cachelib/allocator/BackgroundMoverStrategy.h" #include "cachelib/allocator/RebalanceStrategy.h" #include "cachelib/allocator/Util.h" #include "cachelib/common/EventInterface.h" @@ -267,6 +268,16 @@ class CacheAllocatorConfig { std::chrono::seconds regularInterval, std::chrono::seconds ccacheInterval, uint32_t ccacheStepSizePercent); + + // Enable the background evictor - scans a tier to look for objects + // to evict to the next tier + CacheAllocatorConfig& enableBackgroundEvictor( + std::shared_ptr backgroundMoverStrategy, + std::chrono::milliseconds regularInterval, size_t threads); + + CacheAllocatorConfig& enableBackgroundPromoter( + std::shared_ptr backgroundMoverStrategy, + std::chrono::milliseconds regularInterval, size_t threads); // This enables an optimization for Pool rebalancing and resizing. // The rough idea is to ensure only the least useful items are evicted when @@ -300,6 +311,9 @@ class CacheAllocatorConfig { // Library team if you find yourself customizing this. 
CacheAllocatorConfig& setThrottlerConfig(util::Throttler::Config config);
 
+  // Insert items to first free memory tier
+  CacheAllocatorConfig& enableInsertToFirstFreeTier();
+
   // Passes in a callback to initialize an event tracker when the allocator
   // starts
   CacheAllocatorConfig& setEventTracker(EventTrackerSharedPtr&&);
@@ -342,6 +356,17 @@
            poolOptimizeStrategy != nullptr;
   }
 
+  // @return whether background evictor thread is enabled
+  bool backgroundEvictorEnabled() const noexcept {
+    return backgroundEvictorInterval.count() > 0 &&
+           backgroundEvictorStrategy != nullptr;
+  }
+
+  bool backgroundPromoterEnabled() const noexcept {
+    return backgroundPromoterInterval.count() > 0 &&
+           backgroundPromoterStrategy != nullptr;
+  }
+
   // @return whether memory monitor is enabled
   bool memMonitoringEnabled() const noexcept {
     return memMonitorConfig.mode != MemoryMonitor::Disabled &&
@@ -455,6 +480,16 @@
   // The slab release process is considered as being stuck if it does not
   // make any progress for the below threshold
   std::chrono::milliseconds slabReleaseStuckThreshold{std::chrono::seconds(60)};
+
+  // rebalance to avoid alloc failures.
+  std::shared_ptr<BackgroundMoverStrategy> backgroundEvictorStrategy;
+  std::shared_ptr<BackgroundMoverStrategy> backgroundPromoterStrategy;
+  // time interval to sleep between runs of the background evictor
+  std::chrono::milliseconds backgroundEvictorInterval{std::chrono::milliseconds{1000}};
+  std::chrono::milliseconds backgroundPromoterInterval{std::chrono::milliseconds{1000}};
+
+  size_t backgroundEvictorThreads{1};
+  size_t backgroundPromoterThreads{1};
 
   // time interval to sleep between iterations of pool size optimization,
   // for regular pools and compact caches
@@ -495,6 +530,11 @@
   // ABOVE are the config for various cache workers
   //
 
+  // if turned off, always insert new elements to topmost memory tier.
+  // if turned on, insert new elements to the first free memory tier or evict
+  // memory from the bottom one if the memory cache is full
+  bool insertToFirstFreeTier = false;
+
   // the number of tries to search for an item to evict
   // 0 means it's infinite
   unsigned int evictionSearchTries{50};
@@ -594,6 +634,25 @@
   // If true, we will delay worker start until user explicitly calls
   // CacheAllocator::startCacheWorkers()
   bool delayCacheWorkersStart{false};
+
+  // see MultiTierDataMovement.md
+  double promotionAcWatermark{4.0};
+  double lowEvictionAcWatermark{2.0};
+  double highEvictionAcWatermark{5.0};
+  double numDuplicateElements{0.0}; // inclusiveness of the cache
+  double syncPromotion{0.0}; // can promotion be done synchronously in user thread
+
+  uint64_t evictorThreads{1};
+  uint64_t promoterThreads{1};
+
+  uint64_t maxEvictionBatch{40};
+  uint64_t maxPromotionBatch{10};
+
+  uint64_t minEvictionBatch{1};
+  uint64_t minPromotionBatch{1};
+
+  uint64_t maxEvictionPromotionHotness{60};
+
   friend CacheT;
@@ -611,6 +670,12 @@
       {MemoryTierCacheConfig::fromShm().setRatio(1)}};
 };
 
+template <typename T>
+CacheAllocatorConfig<T>& CacheAllocatorConfig<T>::enableInsertToFirstFreeTier() {
+  insertToFirstFreeTier = true;
+  return *this;
+}
+
 template <typename T>
 CacheAllocatorConfig<T>& CacheAllocatorConfig<T>::setCacheName(
     const std::string& _cacheName) {
@@ -933,6 +998,26 @@ CacheAllocatorConfig<T>& CacheAllocatorConfig<T>::enablePoolRebalancing(
   return *this;
 }
 
+template <typename T>
+CacheAllocatorConfig<T>& CacheAllocatorConfig<T>::enableBackgroundEvictor(
+    std::shared_ptr<BackgroundMoverStrategy> strategy,
+    std::chrono::milliseconds interval, size_t evictorThreads) {
+  backgroundEvictorStrategy = strategy;
+  backgroundEvictorInterval = interval;
+  backgroundEvictorThreads = evictorThreads;
+  return *this;
+}
+
+template <typename T>
+CacheAllocatorConfig<T>& CacheAllocatorConfig<T>::enableBackgroundPromoter(
+    std::shared_ptr<BackgroundMoverStrategy> strategy,
+    std::chrono::milliseconds interval, size_t promoterThreads) {
+  backgroundPromoterStrategy = strategy;
+  backgroundPromoterInterval = interval;
+  backgroundPromoterThreads = promoterThreads;
+  return *this;
+}
+
 template <typename T>
 CacheAllocatorConfig<T>& CacheAllocatorConfig<T>::enablePoolResizing(
     std::shared_ptr<RebalanceStrategy> resizeStrategy,
@@ -1170,6 +1255,7 @@ std::map<std::string, std::string> CacheAllocatorConfig<T>::serialize() const {
   configMap["nvmAdmissionMinTTL"] = std::to_string(nvmAdmissionMinTTL);
   configMap["delayCacheWorkersStart"] = delayCacheWorkersStart ? 
"true" : "false"; + configMap["insertToFirstFreeTier"] = std::to_string(insertToFirstFreeTier); mergeWithPrefix(configMap, throttleConfig.serialize(), "throttleConfig"); mergeWithPrefix(configMap, chainedItemAccessConfig.serialize(), diff --git a/cachelib/allocator/CacheItem.h b/cachelib/allocator/CacheItem.h index afee315cbb..b7ae24fe6b 100644 --- a/cachelib/allocator/CacheItem.h +++ b/cachelib/allocator/CacheItem.h @@ -46,6 +46,9 @@ class BaseAllocatorTest; template class AllocatorHitStatsTest; +template +class AllocatorMemoryTiersTest; + template class MapTest; @@ -309,7 +312,7 @@ class CACHELIB_PACKED_ATTR CacheItem { // // @return true on success, failure if item is marked as exclusive // @throw exception::RefcountOverflow on ref count overflow - FOLLY_ALWAYS_INLINE bool incRef() { + FOLLY_ALWAYS_INLINE RefcountWithFlags::incResult incRef() { try { return ref_.incRef(); } catch (exception::RefcountOverflow& e) { @@ -473,6 +476,8 @@ class CACHELIB_PACKED_ATTR CacheItem { FRIEND_TEST(ItemTest, NonStringKey); template friend class facebook::cachelib::tests::AllocatorHitStatsTest; + template + friend class facebook::cachelib::tests::AllocatorMemoryTiersTest; }; // A chained item has a hook pointing to the next chained item. The hook is diff --git a/cachelib/allocator/CacheStats.cpp b/cachelib/allocator/CacheStats.cpp index 1c0440e94a..453d3a0abb 100644 --- a/cachelib/allocator/CacheStats.cpp +++ b/cachelib/allocator/CacheStats.cpp @@ -23,18 +23,21 @@ namespace cachelib { namespace detail { void Stats::init() { - cacheHits = std::make_unique(); - allocAttempts = std::make_unique(); - evictionAttempts = std::make_unique(); - fragmentationSize = std::make_unique(); - allocFailures = std::make_unique(); - chainedItemEvictions = std::make_unique(); - regularItemEvictions = std::make_unique(); + cacheHits = std::make_unique(); + allocAttempts = std::make_unique(); + evictionAttempts = std::make_unique(); + fragmentationSize = std::make_unique(); + allocFailures = std::make_unique(); + chainedItemEvictions = std::make_unique(); + regularItemEvictions = std::make_unique(); + numWritebacks = std::make_unique(); auto initToZero = [](auto& a) { - for (auto& s : a) { - for (auto& c : s) { + for (auto& t : a) { + for (auto& p : t) { + for (auto& c : p) { c.set(0); } + } } }; @@ -44,6 +47,9 @@ void Stats::init() { initToZero(*fragmentationSize); initToZero(*chainedItemEvictions); initToZero(*regularItemEvictions); + initToZero(*numWritebacks); + + classAllocLatency = std::make_unique(); } template @@ -51,7 +57,7 @@ struct SizeVerify {}; void Stats::populateGlobalCacheStats(GlobalCacheStats& ret) const { #ifndef SKIP_SIZE_VERIFY - SizeVerify a = SizeVerify<16176>{}; + SizeVerify a = SizeVerify<16192>{}; std::ignore = a; #endif ret.numCacheGets = numCacheGets.get(); @@ -100,6 +106,8 @@ void Stats::populateGlobalCacheStats(GlobalCacheStats& ret) const { ret.numNvmItemDestructorAllocErrors = numNvmItemDestructorAllocErrors.get(); ret.allocateLatencyNs = this->allocateLatency_.estimate(); + ret.bgEvictLatencyNs = this->bgEvictLatency_.estimate(); + ret.bgPromoteLatencyNs = this->bgPromoteLatency_.estimate(); ret.moveChainedLatencyNs = this->moveChainedLatency_.estimate(); ret.moveRegularLatencyNs = this->moveRegularLatency_.estimate(); ret.nvmLookupLatencyNs = this->nvmLookupLatency_.estimate(); @@ -114,20 +122,43 @@ void Stats::populateGlobalCacheStats(GlobalCacheStats& ret) const { ret.nvmEvictionSecondsToExpiry = this->nvmEvictionSecondsToExpiry_.estimate(); ret.nvmPutSize = this->nvmPutSize_.estimate(); 
- auto accum = [](const PerPoolClassAtomicCounters& c) { - uint64_t sum = 0; - for (const auto& x : c) { - for (const auto& v : x) { - sum += v.get(); - } + auto accum = [](const PerTierPerPoolClassAtomicCounters& t) { + std::vector stat; + for (const auto& c : t) { + uint64_t sum = 0; + for (const auto& x : c) { + for (const auto& v : x) { + sum += v.get(); + } + } + stat.push_back(sum); + } + return stat; + }; + + auto accumTL = [](const PerTierPerPoolClassTLCounters& t) { + std::vector stat; + for (const auto& c : t) { + uint64_t sum = 0; + for (const auto& x : c) { + for (const auto& v : x) { + sum += v.get(); + } + } + stat.push_back(sum); } - return sum; + return stat; }; ret.allocAttempts = accum(*allocAttempts); ret.evictionAttempts = accum(*evictionAttempts); ret.allocFailures = accum(*allocFailures); - ret.numEvictions = accum(*chainedItemEvictions); - ret.numEvictions += accum(*regularItemEvictions); + auto chainedEvictions = accum(*chainedItemEvictions); + auto regularEvictions = accum(*regularItemEvictions); + for (TierId tid = 0; tid < chainedEvictions.size(); tid++) { + ret.numEvictions.push_back(chainedEvictions[tid] + regularEvictions[tid]); + } + ret.numWritebacks = accum(*numWritebacks); + ret.numCacheHits = accumTL(*cacheHits); ret.invalidAllocs = invalidAllocs.get(); ret.numRefcountOverflow = numRefcountOverflow.get(); @@ -143,6 +174,18 @@ void Stats::populateGlobalCacheStats(GlobalCacheStats& ret) const { } // namespace detail +MMContainerStat& MMContainerStat::operator+=(const MMContainerStat& other) { + + size += other.size; + oldestTimeSec = std::min(oldestTimeSec,other.oldestTimeSec); + lruRefreshTime = std::max(lruRefreshTime,other.lruRefreshTime); + numHotAccesses += other.numHotAccesses; + numColdAccesses += other.numColdAccesses; + numWarmAccesses += other.numWarmAccesses; + numTailAccesses += other.numTailAccesses; + return *this; +} + PoolStats& PoolStats::operator+=(const PoolStats& other) { auto verify = [](bool isCompatible) { if (!isCompatible) { @@ -180,6 +223,7 @@ PoolStats& PoolStats::operator+=(const PoolStats& other) { d.allocFailures += s.allocFailures; d.fragmentationSize += s.fragmentationSize; d.numHits += s.numHits; + d.numWritebacks += s.numWritebacks; d.chainedItemEvictions += s.chainedItemEvictions; d.regularItemEvictions += s.regularItemEvictions; } @@ -235,6 +279,14 @@ uint64_t PoolStats::numEvictions() const noexcept { return n; } +uint64_t PoolStats::numWritebacks() const noexcept { + uint64_t n = 0; + for (const auto& s : cacheStats) { + n += s.second.numWritebacks; + } + return n; +} + uint64_t PoolStats::numItems() const noexcept { uint64_t n = 0; for (const auto& s : cacheStats) { @@ -243,6 +295,14 @@ uint64_t PoolStats::numItems() const noexcept { return n; } +uint64_t PoolStats::numHits() const noexcept { + uint64_t n = 0; + for (const auto& s : cacheStats) { + n += s.second.numHits; + } + return n; +} + uint64_t PoolStats::numAllocFailures() const { uint64_t n = 0; for (const auto& s : cacheStats) { diff --git a/cachelib/allocator/CacheStats.h b/cachelib/allocator/CacheStats.h index 0bdfc5db5d..cda2690bf8 100644 --- a/cachelib/allocator/CacheStats.h +++ b/cachelib/allocator/CacheStats.h @@ -25,6 +25,7 @@ #include "cachelib/allocator/memory/Slab.h" #include "cachelib/common/FastStats.h" #include "cachelib/common/PercentileStats.h" +#include "cachelib/common/RollingStats.h" #include "cachelib/common/Time.h" namespace facebook { @@ -77,22 +78,25 @@ struct PoolEvictionAgeStats { // Stats for MM container struct MMContainerStat { // 
number of elements in the container.
-  size_t size;
+  size_t size{0};

   // what is the unix timestamp in seconds of the oldest element existing in
   // the container.
-  uint64_t oldestTimeSec;
+  uint64_t oldestTimeSec{0};

   // refresh time for LRU
-  uint64_t lruRefreshTime;
+  uint64_t lruRefreshTime{0};

   // TODO: Make the MMContainerStat generic by moving the Lru/2Q specific
   // stats inside MMType and exporting them through a generic stats interface.
   // number of hits in each lru.
-  uint64_t numHotAccesses;
-  uint64_t numColdAccesses;
-  uint64_t numWarmAccesses;
-  uint64_t numTailAccesses;
+  uint64_t numHotAccesses{0};
+  uint64_t numColdAccesses{0};
+  uint64_t numWarmAccesses{0};
+  uint64_t numTailAccesses{0};
+
+  // aggregate stats together (across tiers)
+  MMContainerStat& operator+=(const MMContainerStat& other);
 };

 // cache related stats for a given allocation class.
@@ -113,13 +117,16 @@ struct CacheStat {
   uint64_t fragmentationSize{0};

   // number of hits for this container.
-  uint64_t numHits;
+  uint64_t numHits{0};

   // number of evictions from this class id that was of a chained item
-  uint64_t chainedItemEvictions;
+  uint64_t chainedItemEvictions{0};

   // number of regular items that were evicted from this classId
-  uint64_t regularItemEvictions;
+  uint64_t regularItemEvictions{0};
+
+  // number of items that were moved to the next tier
+  uint64_t numWritebacks{0};

   // the stats from the mm container
   MMContainerStat containerStat;
@@ -196,12 +203,18 @@ struct PoolStats {
   // number of evictions for this pool
   uint64_t numEvictions() const noexcept;

+  // number of writebacks for this pool
+  uint64_t numWritebacks() const noexcept;
+
   // number of all items in this pool
   uint64_t numItems() const noexcept;

   // total number of allocations currently in this pool
   uint64_t numActiveAllocs() const noexcept;

+  // total number of hits across all alloc classes in this pool
+  uint64_t numHits() const noexcept;
+
   // number of hits for an alloc class in this pool
   uint64_t numHitsForClass(ClassId cid) const {
     return cacheStats.at(cid).numHits;
@@ -300,6 +313,26 @@ struct RebalancerStats {
   uint64_t avgPickTimeMs{0};
 };

+// Mover Stats
+struct BackgroundMoverStats {
+  // the number of items this worker moved by looking at pools/classes stats
+  uint64_t numMovedItems{0};
+  // number of times the background worker thread ran
+ uint64_t runCount{0}; + // total number of classes + uint64_t totalClasses{0}; + // eviction size + uint64_t totalBytesMoved{0}; + + BackgroundMoverStats& operator+=(const BackgroundMoverStats& rhs) { + numMovedItems += rhs.numMovedItems; + runCount += rhs.runCount; + totalClasses += rhs.totalClasses; + totalBytesMoved += rhs.totalBytesMoved; + return *this; + } +}; + // CacheMetadata type to export struct CacheMetadata { // allocator_version @@ -320,6 +353,11 @@ struct Stats; // Stats that apply globally in cache and // the ones that are aggregated over all pools struct GlobalCacheStats { + // background eviction stats + BackgroundMoverStats evictionStats; + + BackgroundMoverStats promotionStats; + // number of calls to CacheAllocator::find uint64_t numCacheGets{0}; @@ -426,16 +464,22 @@ struct GlobalCacheStats { uint64_t numNvmItemRemovedSetSize{0}; // number of attempts to allocate an item - uint64_t allocAttempts{0}; + std::vector allocAttempts; // number of eviction attempts - uint64_t evictionAttempts{0}; + std::vector evictionAttempts; // number of failures to allocate an item due to internal error - uint64_t allocFailures{0}; + std::vector allocFailures; // number of evictions across all the pools in the cache. - uint64_t numEvictions{0}; + std::vector numEvictions; + + // number of writebacks across all the pools in the cache. + std::vector numWritebacks; + + // number of hits per tier across all the pools in the cache. + std::vector numCacheHits; // number of allocation attempts with invalid input params. uint64_t invalidAllocs{0}; @@ -466,6 +510,8 @@ struct GlobalCacheStats { // latency and percentile stats of various cachelib operations util::PercentileStats::Estimates allocateLatencyNs{}; + util::PercentileStats::Estimates bgEvictLatencyNs{}; + util::PercentileStats::Estimates bgPromoteLatencyNs{}; util::PercentileStats::Estimates moveChainedLatencyNs{}; util::PercentileStats::Estimates moveRegularLatencyNs{}; util::PercentileStats::Estimates nvmLookupLatencyNs{}; diff --git a/cachelib/allocator/CacheStatsInternal.h b/cachelib/allocator/CacheStatsInternal.h index b2a5f8c469..da48df2d8f 100644 --- a/cachelib/allocator/CacheStatsInternal.h +++ b/cachelib/allocator/CacheStatsInternal.h @@ -21,6 +21,7 @@ #include "cachelib/allocator/Cache.h" #include "cachelib/allocator/memory/MemoryAllocator.h" #include "cachelib/common/AtomicCounter.h" +#include "cachelib/common/RollingStats.h" namespace facebook { namespace cachelib { @@ -188,6 +189,8 @@ struct Stats { // latency stats of various cachelib operations mutable util::PercentileStats allocateLatency_; + mutable util::PercentileStats bgEvictLatency_; + mutable util::PercentileStats bgPromoteLatency_; mutable util::PercentileStats moveChainedLatency_; mutable util::PercentileStats moveRegularLatency_; mutable util::PercentileStats nvmLookupLatency_; @@ -211,23 +214,34 @@ struct Stats { // we're currently writing into flash. 
mutable util::PercentileStats nvmPutSize_; - using PerPoolClassAtomicCounters = + using PerTierPerPoolClassAtomicCounters = std::array< std::array, - MemoryPoolManager::kMaxPools>; + MemoryPoolManager::kMaxPools>, + CacheBase::kMaxTiers>; // count of a stat for a specific allocation class - using PerPoolClassTLCounters = + using PerTierPerPoolClassTLCounters = std::array< std::array, - MemoryPoolManager::kMaxPools>; + MemoryPoolManager::kMaxPools>, + CacheBase::kMaxTiers>; // hit count for every alloc class in every pool - std::unique_ptr cacheHits{}; - std::unique_ptr allocAttempts{}; - std::unique_ptr evictionAttempts{}; - std::unique_ptr allocFailures{}; - std::unique_ptr fragmentationSize{}; - std::unique_ptr chainedItemEvictions{}; - std::unique_ptr regularItemEvictions{}; + std::unique_ptr cacheHits{}; + std::unique_ptr allocAttempts{}; + std::unique_ptr evictionAttempts{}; + std::unique_ptr allocFailures{}; + std::unique_ptr fragmentationSize{}; + std::unique_ptr chainedItemEvictions{}; + std::unique_ptr regularItemEvictions{}; + std::unique_ptr numWritebacks{}; + + using PerTierPoolClassRollingStats = std::array< + std::array, + MemoryPoolManager::kMaxPools>, + CacheBase::kMaxTiers>; + + // rolling latency tracking for every alloc class in every pool + std::unique_ptr classAllocLatency{}; // Eviction failures due to parent cannot be removed from access container AtomicCounter evictFailParentAC{0}; diff --git a/cachelib/allocator/FreeThresholdStrategy.cpp b/cachelib/allocator/FreeThresholdStrategy.cpp new file mode 100644 index 0000000000..4a900c2cb1 --- /dev/null +++ b/cachelib/allocator/FreeThresholdStrategy.cpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) Intel and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "cachelib/allocator/FreeThresholdStrategy.h"
+
+#include <algorithm>
+
+namespace facebook {
+namespace cachelib {
+
+FreeThresholdStrategy::FreeThresholdStrategy(double lowEvictionAcWatermark,
+                                             double highEvictionAcWatermark,
+                                             uint64_t maxEvictionBatch,
+                                             uint64_t minEvictionBatch)
+    : lowEvictionAcWatermark(lowEvictionAcWatermark),
+      highEvictionAcWatermark(highEvictionAcWatermark),
+      maxEvictionBatch(maxEvictionBatch),
+      minEvictionBatch(minEvictionBatch) {}
+
+std::vector<size_t> FreeThresholdStrategy::calculateBatchSizes(
+    const CacheBase& cache,
+    std::vector<MemoryDescriptorType> acVec) {
+  std::vector<size_t> batches{};
+  for (auto [tid, pid, cid] : acVec) {
+    auto stats = cache.getACStats(tid, pid, cid);
+    if ((1 - stats.usageFraction()) * 100 >= highEvictionAcWatermark) {
+      batches.push_back(0);
+    } else {
+      auto toFreeMemPercent =
+          highEvictionAcWatermark - (1 - stats.usageFraction()) * 100;
+      auto toFreeItems = static_cast<size_t>(
+          toFreeMemPercent * (stats.totalSlabs() * Slab::kSize) / stats.allocSize);
+      batches.push_back(toFreeItems);
+    }
+  }
+
+  if (batches.size() == 0) {
+    return batches;
+  }
+
+  auto maxBatch = *std::max_element(batches.begin(), batches.end());
+  if (maxBatch == 0)
+    return batches;
+
+  std::transform(
+      batches.begin(), batches.end(), batches.begin(), [&](auto numItems) {
+        if (numItems == 0) {
+          return 0UL;
+        }
+
+        auto cappedBatchSize = maxEvictionBatch * numItems / maxBatch;
+        if (cappedBatchSize < minEvictionBatch)
+          return minEvictionBatch;
+        else
+          return cappedBatchSize;
+      });
+
+  return batches;
+}
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/FreeThresholdStrategy.h b/cachelib/allocator/FreeThresholdStrategy.h
new file mode 100644
index 0000000000..94316bfe82
--- /dev/null
+++ b/cachelib/allocator/FreeThresholdStrategy.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cachelib/allocator/BackgroundMoverStrategy.h"
+#include "cachelib/allocator/Cache.h"
+
+namespace facebook {
+namespace cachelib {
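To make the scaling in calculateBatchSizes() above concrete, here is a small self-contained sketch of the same capping arithmetic. The input numbers are hypothetical; maxEvictionBatch = 40 and minEvictionBatch = 5 mirror the defaults in the header below.

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
      const uint64_t maxEvictionBatch = 40, minEvictionBatch = 5;
      // raw per-class item counts needed to reach the high watermark
      std::vector<uint64_t> batches{0, 10, 100};
      const uint64_t maxBatch = *std::max_element(batches.begin(), batches.end());
      std::transform(batches.begin(), batches.end(), batches.begin(),
                     [&](uint64_t n) -> uint64_t {
                       if (n == 0) return 0; // class already at its watermark
                       // scale so the largest class gets the full batch
                       uint64_t capped = maxEvictionBatch * n / maxBatch;
                       return std::max(capped, minEvictionBatch);
                     });
      for (auto b : batches) std::cout << b << ' '; // prints: 0 5 40
      return 0;
    }

Note how every nonzero request is floored at minEvictionBatch so that small classes still make progress once they fall below the watermark.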
+// Background eviction strategy that frees memory in an allocation class
+// until its free-space watermark is met.
+class FreeThresholdStrategy : public BackgroundMoverStrategy {
+ public:
+  FreeThresholdStrategy(double lowEvictionAcWatermark,
+                        double highEvictionAcWatermark,
+                        uint64_t maxEvictionBatch,
+                        uint64_t minEvictionBatch);
+  ~FreeThresholdStrategy() {}
+
+  std::vector<size_t> calculateBatchSizes(
+      const CacheBase& cache,
+      std::vector<MemoryDescriptorType> acVecs);
+
+ private:
+  double lowEvictionAcWatermark{2.0};
+  double highEvictionAcWatermark{5.0};
+  uint64_t maxEvictionBatch{40};
+  uint64_t minEvictionBatch{5};
+};
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/Handle.h b/cachelib/allocator/Handle.h
index 11d2bed2be..06c21bffe4 100644
--- a/cachelib/allocator/Handle.h
+++ b/cachelib/allocator/Handle.h
@@ -400,6 +400,12 @@ struct ReadHandleImpl {
     }
   }

+ protected:
+  friend class ReadHandleImpl;
+  // Method used only by ReadHandleImpl ctor
+  void discard() {
+    it_.store(nullptr, std::memory_order_relaxed);
+  }
  private:
   // we are waiting on Item* to be set to a value. One of the valid values is
   // nullptr. So choose something that we dont expect to indicate a ptr
@@ -479,7 +485,8 @@ struct ReadHandleImpl {
   // Handle which has the item already
   FOLLY_ALWAYS_INLINE ReadHandleImpl(Item* it, CacheT& alloc) noexcept
-      : alloc_(&alloc), it_(it) {}
+      : alloc_(&alloc), it_(it) {
+  }

   // handle that has a wait context allocated. Used for async handles
   // In this case, the it_ will be filled in asynchronously and multiple
diff --git a/cachelib/allocator/MM2Q-inl.h b/cachelib/allocator/MM2Q-inl.h
index ba388d40a4..7f00b96131 100644
--- a/cachelib/allocator/MM2Q-inl.h
+++ b/cachelib/allocator/MM2Q-inl.h
@@ -247,6 +247,12 @@ MM2Q::Container::getEvictionIterator() const noexcept {
   return LockedIterator{std::move(l), lru_.rbegin()};
 }

+template <typename T, MM2Q::Hook<T> T::*HookPtr>
+template <typename F>
+void MM2Q::Container<T, HookPtr>::withContainerLock(F&& fun) {
+  lruMutex_->lock_combine([this, &fun]() { fun(); });
+}
+
 template <typename T, MM2Q::Hook<T> T::*HookPtr>
 template <typename F>
 void MM2Q::Container<T, HookPtr>::withEvictionIterator(F&& fun) {
@@ -258,6 +264,16 @@ void MM2Q::Container<T, HookPtr>::withEvictionIterator(F&& fun) {
   }
 }

+// returns the head of the hot queue for promotion
+template <typename T, MM2Q::Hook<T> T::*HookPtr>
+template <typename F>
+void
+MM2Q::Container<T, HookPtr>::withPromotionIterator(F&& fun) {
+  lruMutex_->lock_combine([this, &fun]() {
+    fun(LockedIterator{LockHolder{}, lru_.begin(LruType::Hot)});
+  });
+}
+
 template <typename T, MM2Q::Hook<T> T::*HookPtr>
 void MM2Q::Container<T, HookPtr>::removeLocked(T& node, bool doRebalance) noexcept {
diff --git a/cachelib/allocator/MM2Q.h b/cachelib/allocator/MM2Q.h
index 982eca21f9..292a612cde 100644
--- a/cachelib/allocator/MM2Q.h
+++ b/cachelib/allocator/MM2Q.h
@@ -68,6 +68,7 @@ class MM2Q {
   enum LruType { Warm, WarmTail, Hot, Cold, ColdTail, NumTypes };

   // Config class for MM2Q
+  // TODO: implement support for useCombinedLockForIterators
   struct Config {
     // Create from serialized config
     explicit Config(SerializationConfigType configState)
@@ -501,6 +502,15 @@ class MM2Q {
     // Iterator passed as parameter.
     template <typename F>
     void withEvictionIterator(F&& f);
+
+    // Execute provided function under container lock.
+    template <typename F>
+    void withContainerLock(F&& f);
+
+    // Execute provided function under container lock. Function gets
+    // an iterator passed as parameter.
+ template + void withPromotionIterator(F&& f); // get the current config as a copy Config getConfig() const; diff --git a/cachelib/allocator/MMLru-inl.h b/cachelib/allocator/MMLru-inl.h index d35759f212..427753f853 100644 --- a/cachelib/allocator/MMLru-inl.h +++ b/cachelib/allocator/MMLru-inl.h @@ -218,6 +218,12 @@ MMLru::Container::getEvictionIterator() const noexcept { return LockedIterator{std::move(l), lru_.rbegin()}; } +template T::*HookPtr> +template +void MMLru::Container::withContainerLock(F&& fun) { + lruMutex_->lock_combine([this, &fun]() { fun(); }); +} + template T::*HookPtr> template void MMLru::Container::withEvictionIterator(F&& fun) { @@ -229,6 +235,18 @@ void MMLru::Container::withEvictionIterator(F&& fun) { } } +template T::*HookPtr> +template +void +MMLru::Container::withPromotionIterator(F&& fun) { + if (config_.useCombinedLockForIterators) { + lruMutex_->lock_combine([this, &fun]() { fun(Iterator{lru_.begin()}); }); + } else { + LockHolder lck{*lruMutex_}; + fun(Iterator{lru_.begin()}); + } +} + template T::*HookPtr> void MMLru::Container::ensureNotInsertionPoint(T& node) noexcept { // If we are removing the insertion point node, grow tail before we remove diff --git a/cachelib/allocator/MMLru.h b/cachelib/allocator/MMLru.h index 29c6d02689..ad6c4b784b 100644 --- a/cachelib/allocator/MMLru.h +++ b/cachelib/allocator/MMLru.h @@ -230,12 +230,13 @@ class MMLru { // lruInsertionPointSpec = 2, we insert at a point 1/4th from tail uint8_t lruInsertionPointSpec{0}; + // Whether to use combined locking for withEvictionIterator. + bool useCombinedLockForIterators{true}; + // Minimum interval between reconfigurations. If 0, reconfigure is never // called. std::chrono::seconds mmReconfigureIntervalSecs{}; - // Whether to use combined locking for withEvictionIterator. - bool useCombinedLockForIterators{false}; }; // The container object which can be used to keep track of objects of type @@ -376,6 +377,13 @@ class MMLru { template void withEvictionIterator(F&& f); + // Execute provided function under container lock. + template + void withContainerLock(F&& f); + + template + void withPromotionIterator(F&& f); + // get copy of current config Config getConfig() const; diff --git a/cachelib/allocator/MMTinyLFU-inl.h b/cachelib/allocator/MMTinyLFU-inl.h index 46640b24ca..46a760909d 100644 --- a/cachelib/allocator/MMTinyLFU-inl.h +++ b/cachelib/allocator/MMTinyLFU-inl.h @@ -220,6 +220,13 @@ MMTinyLFU::Container::getEvictionIterator() const noexcept { return LockedIterator{std::move(l), *this}; } +template T::*HookPtr> +template +void MMTinyLFU::Container::withContainerLock(F&& fun) { + LockHolder l(lruMutex_); + fun(); +} + template T::*HookPtr> template void MMTinyLFU::Container::withEvictionIterator(F&& fun) { @@ -227,6 +234,13 @@ void MMTinyLFU::Container::withEvictionIterator(F&& fun) { fun(getEvictionIterator()); } +template T::*HookPtr> +template +void +MMTinyLFU::Container::withPromotionIterator(F&& fun) { + throw std::runtime_error("Not supported"); +} + template T::*HookPtr> void MMTinyLFU::Container::removeLocked(T& node) noexcept { if (isTiny(node)) { diff --git a/cachelib/allocator/MMTinyLFU.h b/cachelib/allocator/MMTinyLFU.h index c8f2699264..a0d4386b83 100644 --- a/cachelib/allocator/MMTinyLFU.h +++ b/cachelib/allocator/MMTinyLFU.h @@ -496,6 +496,13 @@ class MMTinyLFU { // iterator passed as parameter. template void withEvictionIterator(F&& f); + + // Execute provided function under container lock. 
+ template + void withContainerLock(F&& f); + + template + void withPromotionIterator(F&& f); // for saving the state of the lru // diff --git a/cachelib/allocator/MemoryTierCacheConfig.h b/cachelib/allocator/MemoryTierCacheConfig.h index 1b9477c048..ee579a5386 100644 --- a/cachelib/allocator/MemoryTierCacheConfig.h +++ b/cachelib/allocator/MemoryTierCacheConfig.h @@ -16,11 +16,14 @@ #pragma once +#include "cachelib/common/Utils.h" #include "cachelib/shm/ShmCommon.h" namespace facebook { namespace cachelib { class MemoryTierCacheConfig { + using bitmask_type = util::NumaBitMask; + public: // Creates instance of MemoryTierCacheConfig for Posix/SysV Shared memory. static MemoryTierCacheConfig fromShm() { return MemoryTierCacheConfig(); } @@ -39,12 +42,12 @@ class MemoryTierCacheConfig { size_t getRatio() const noexcept { return ratio; } // Allocate memory only from specified NUMA nodes - MemoryTierCacheConfig& setMemBind(const NumaBitMask& _numaNodes) { + MemoryTierCacheConfig& setMemBind(const bitmask_type& _numaNodes) { numaNodes = _numaNodes; return *this; } - const NumaBitMask& getMemBind() const noexcept { return numaNodes; } + const bitmask_type& getMemBind() const noexcept { return numaNodes; } size_t calculateTierSize(size_t totalCacheSize, size_t partitionNum) const { // TODO: Call this method when tiers are enabled in allocator @@ -71,7 +74,7 @@ class MemoryTierCacheConfig { size_t ratio{1}; // Numa node(s) to bind the tier - NumaBitMask numaNodes; + bitmask_type numaNodes; // TODO: introduce a container for tier settings when adding support for // file-mapped memory diff --git a/cachelib/allocator/PoolOptimizer.cpp b/cachelib/allocator/PoolOptimizer.cpp index 8d67762be8..d101231a04 100644 --- a/cachelib/allocator/PoolOptimizer.cpp +++ b/cachelib/allocator/PoolOptimizer.cpp @@ -51,6 +51,8 @@ void PoolOptimizer::optimizeRegularPoolSizes() { void PoolOptimizer::optimizeCompactCacheSizes() { try { + // TODO: should optimizer look at each tier individually? + // If yes, then resizePools should be per-tier auto strategy = cache_.getPoolOptimizeStrategy(); if (!strategy) { strategy = strategy_; diff --git a/cachelib/allocator/PrivateMemoryManager.cpp b/cachelib/allocator/PrivateMemoryManager.cpp new file mode 100644 index 0000000000..afcf1b2202 --- /dev/null +++ b/cachelib/allocator/PrivateMemoryManager.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "cachelib/allocator/PrivateMemoryManager.h"
+
+#include <folly/ScopeGuard.h>
+
+namespace facebook {
+namespace cachelib {
+
+PrivateMemoryManager::~PrivateMemoryManager() {
+  for (auto& entry : mappings) {
+    util::munmapMemory(entry.first, entry.second);
+  }
+}
+
+void* PrivateMemoryManager::createMapping(size_t size,
+                                          PrivateSegmentOpts opts) {
+  void* addr = util::mmapAlignedZeroedMemory(opts.alignment, size);
+  auto guard = folly::makeGuard([&]() {
+    util::munmapMemory(addr, size);
+    mappings.erase(addr);
+  });
+
+  XDCHECK_EQ(reinterpret_cast<uintptr_t>(addr) & (opts.alignment - 1), 0ULL);
+
+  if (!opts.memBindNumaNodes.empty()) {
+    util::mbindMemory(addr, size, MPOL_BIND, opts.memBindNumaNodes, 0);
+  }
+
+  mappings.emplace(addr, size);
+
+  guard.dismiss();
+  return addr;
+}
+} // namespace cachelib
+} // namespace facebook
\ No newline at end of file
diff --git a/cachelib/allocator/PrivateMemoryManager.h b/cachelib/allocator/PrivateMemoryManager.h
new file mode 100644
index 0000000000..7880ca928a
--- /dev/null
+++ b/cachelib/allocator/PrivateMemoryManager.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <unordered_map>
+
+#include "cachelib/common/Utils.h"
+
+namespace facebook {
+namespace cachelib {
+
+struct PrivateSegmentOpts {
+  size_t alignment{1}; // alignment for mapping.
+  util::NumaBitMask memBindNumaNodes;
+};
+
+class PrivateMemoryManager {
+ public:
+  PrivateMemoryManager() {}
+  ~PrivateMemoryManager();
+
+  void* createMapping(size_t size, PrivateSegmentOpts opts);
+
+ private:
+  std::unordered_map<void*, size_t> mappings;
+};
+
+} // namespace cachelib
+} // namespace facebook
\ No newline at end of file
diff --git a/cachelib/allocator/PromotionStrategy.h b/cachelib/allocator/PromotionStrategy.h
new file mode 100644
index 0000000000..1022aca0f8
--- /dev/null
+++ b/cachelib/allocator/PromotionStrategy.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cachelib/allocator/BackgroundMoverStrategy.h"
+#include "cachelib/allocator/Cache.h"
+
+namespace facebook {
+namespace cachelib {
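As an aside on the PrivateMemoryManager above: it pairs an aligned anonymous mapping with an optional NUMA bind. A minimal standalone sketch of that pattern using the raw Linux calls follows; this is not the library's implementation, error handling is trimmed, and mbind() requires the libnuma headers.

    #include <numaif.h>   // mbind, MPOL_BIND (from libnuma)
    #include <sys/mman.h>
    #include <cassert>
    #include <cstddef>

    int main() {
      const size_t size = 1 << 20;
      // anonymous private mapping, zero-filled by the kernel
      void* addr = mmap(nullptr, size, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      assert(addr != MAP_FAILED);
      unsigned long nodemask = 1UL << 0; // bind the range to NUMA node 0
      // best effort here: may fail on single-node machines; real code checks errno
      mbind(addr, size, MPOL_BIND, &nodemask, sizeof(nodemask) * 8, 0);
      munmap(addr, size);
      return 0;
    }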
+// Background promotion strategy that promotes items from the next-lower
+// tier while the upper tier still has free space below the watermark.
+class PromotionStrategy : public BackgroundMoverStrategy {
+ public:
+  PromotionStrategy(uint64_t promotionAcWatermark,
+                    uint64_t maxPromotionBatch,
+                    uint64_t minPromotionBatch)
+      : promotionAcWatermark(promotionAcWatermark),
+        maxPromotionBatch(maxPromotionBatch),
+        minPromotionBatch(minPromotionBatch) {}
+  ~PromotionStrategy() {}
+
+  std::vector<size_t> calculateBatchSizes(
+      const CacheBase& cache,
+      std::vector<MemoryDescriptorType> acVec) {
+    std::vector<size_t> batches{};
+    for (auto [tid, pid, cid] : acVec) {
+      XDCHECK(tid > 0);
+      auto stats = cache.getACStats(tid - 1, pid, cid);
+      if ((1 - stats.usageFraction()) * 100 < promotionAcWatermark)
+        batches.push_back(0);
+      else {
+        auto maxPossibleItemsToPromote = static_cast<size_t>(
+            (promotionAcWatermark - (1 - stats.usageFraction()) * 100) *
+            (stats.totalSlabs() * Slab::kSize) / stats.allocSize);
+        batches.push_back(maxPossibleItemsToPromote);
+      }
+    }
+
+    if (batches.size() == 0) {
+      return batches;
+    }
+
+    auto maxBatch = *std::max_element(batches.begin(), batches.end());
+    if (maxBatch == 0)
+      return batches;
+
+    std::transform(
+        batches.begin(), batches.end(), batches.begin(), [&](auto numItems) {
+          if (numItems == 0) {
+            return 0UL;
+          }
+
+          auto cappedBatchSize = maxPromotionBatch * numItems / maxBatch;
+          if (cappedBatchSize < minPromotionBatch)
+            return minPromotionBatch;
+          else
+            return cappedBatchSize;
+        });
+
+    return batches;
+  }
+
+ private:
+  double promotionAcWatermark{4.0};
+  uint64_t maxPromotionBatch{40};
+  uint64_t minPromotionBatch{5};
+};
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/Refcount.h b/cachelib/allocator/Refcount.h
index 107e10735e..b254093d07 100644
--- a/cachelib/allocator/Refcount.h
+++ b/cachelib/allocator/Refcount.h
@@ -130,30 +130,41 @@ class FOLLY_PACK_ATTR RefcountWithFlags {
   RefcountWithFlags& operator=(const RefcountWithFlags&) = delete;
   RefcountWithFlags(RefcountWithFlags&&) = delete;
   RefcountWithFlags& operator=(RefcountWithFlags&&) = delete;
-
+  enum incResult {
+    incOk,
+    incFailedMoving,
+    incFailedEviction
+  };
   // Bumps up the reference count only if the new count will be strictly less
   // than or equal to the maxCount and the item is not exclusive
-  // @return true if refcount is bumped. false otherwise (if item is exclusive)
+  // @return incOk if refcount is bumped. incFailedMoving or incFailedEviction
+  //         otherwise (if the item is marked as moving or for eviction)
   // @throw exception::RefcountOverflow if new count would be greater than
   //        maxCount
-  FOLLY_ALWAYS_INLINE bool incRef() {
-    auto predicate = [](const Value curValue) {
-      Value bitMask = getAdminRef<kExclusive>();
-
-      const bool exlusiveBitIsSet = curValue & bitMask;
-      if (UNLIKELY((curValue & kAccessRefMask) == (kAccessRefMask))) {
-        throw exception::RefcountOverflow("Refcount maxed out.");
-      }
-
-      // Check if the item is not marked for eviction
-      return !exlusiveBitIsSet || ((curValue & kAccessRefMask) != 0);
-    };
-
-    auto newValue = [](const Value curValue) {
-      return (curValue + static_cast<Value>(1));
-    };
-
-    return atomicUpdateValue(predicate, newValue);
+  FOLLY_ALWAYS_INLINE incResult incRef() {
+    incResult res = incOk;
+    auto predicate = [&res](const Value curValue) {
+      Value bitMask = getAdminRef<kExclusive>();
+
+      const bool exclusiveBitIsSet = curValue & bitMask;
+      if (UNLIKELY((curValue & kAccessRefMask) == (kAccessRefMask))) {
+        throw exception::RefcountOverflow("Refcount maxed out.");
+      } else if (exclusiveBitIsSet && (curValue & kAccessRefMask) == 0) {
+        res = incFailedEviction;
+        return false;
+      } else if (exclusiveBitIsSet) {
+        res = incFailedMoving;
+        return false;
+      }
+      res = incOk;
+      return true;
+    };
+
+    auto newValue = [](const Value curValue) {
+      return (curValue + static_cast<Value>(1));
+    };
+
+    atomicUpdateValue(predicate, newValue);
+    return res;
   }

   // Bumps down the reference count
@@ -322,11 +333,17 @@ class FOLLY_PACK_ATTR RefcountWithFlags {
   bool markMoving() {
     Value linkedBitMask = getAdminRef<kLinked>();
     Value exclusiveBitMask = getAdminRef<kExclusive>();
+    Value isChainedItemFlag = getFlag<kIsChainedItem>();

-    auto predicate = [linkedBitMask, exclusiveBitMask](const Value curValue) {
+    auto predicate = [linkedBitMask, exclusiveBitMask, isChainedItemFlag](const Value curValue) {
       const bool unlinked = !(curValue & linkedBitMask);
       const bool alreadyExclusive = curValue & exclusiveBitMask;
+      const bool isChained = curValue & isChainedItemFlag;

+      // a chained item can have a ref count == 1; this just means it is
+      // linked in the chain
+      if ((curValue & kAccessRefMask) > (isChained ? 1 : 0)) {
+        return false;
+      }
       if (unlinked || alreadyExclusive) {
         return false;
       }
diff --git a/cachelib/allocator/datastruct/DList.h b/cachelib/allocator/datastruct/DList.h
index 2e872c8ee0..4d862b1908 100644
--- a/cachelib/allocator/datastruct/DList.h
+++ b/cachelib/allocator/datastruct/DList.h
@@ -221,6 +221,10 @@ class DList {
       curr_ = dir_ == Direction::FROM_HEAD ? dlist_->head_ : dlist_->tail_;
     }

+    Direction getDirection() noexcept {
+      return dir_;
+    }
+
    protected:
     void goForward() noexcept;
     void goBackward() noexcept;
diff --git a/cachelib/allocator/datastruct/MultiDList-inl.h b/cachelib/allocator/datastruct/MultiDList-inl.h
index e20510d4fc..cd79b600c5 100644
--- a/cachelib/allocator/datastruct/MultiDList-inl.h
+++ b/cachelib/allocator/datastruct/MultiDList-inl.h
@@ -25,12 +25,26 @@ void MultiDList<T, HookPtr>::Iterator::goForward() noexcept {
   }
   // Move iterator forward
   ++currIter_;
-  // If we land at the rend of this list, move to the previous list.
-  while (index_ != kInvalidIndex &&
-         currIter_ == mlist_.lists_[index_]->rend()) {
-    --index_;
-    if (index_ != kInvalidIndex) {
-      currIter_ = mlist_.lists_[index_]->rbegin();
+
+  if (currIter_.getDirection() == DListIterator::Direction::FROM_HEAD) {
+    // If we land at the end of this list, move to the next list.
+    while (index_ != kInvalidIndex && index_ != mlist_.lists_.size() &&
+           currIter_ == mlist_.lists_[index_]->end()) {
+      ++index_;
+      if (index_ != kInvalidIndex && index_ != mlist_.lists_.size()) {
+        currIter_ = mlist_.lists_[index_]->begin();
+      } else {
+        return;
+      }
+    }
+  } else {
+    // If we land at the rend of this list, move to the previous list.
+    while (index_ != kInvalidIndex &&
+           currIter_ == mlist_.lists_[index_]->rend()) {
+      --index_;
+      if (index_ != kInvalidIndex) {
+        currIter_ = mlist_.lists_[index_]->rbegin();
+      }
+    }
   }
 }
@@ -71,6 +85,25 @@ void MultiDList<T, HookPtr>::Iterator::initToValidRBeginFrom(
                   : mlist_.lists_[index_]->rbegin();
 }

+template <typename T, DListHook<T> T::*HookPtr>
+void MultiDList<T, HookPtr>::Iterator::initToValidBeginFrom(
+    size_t listIdx) noexcept {
+  // Find the first non-empty list.
+  index_ = listIdx;
+  while (index_ != mlist_.lists_.size() &&
+         mlist_.lists_[index_]->size() == 0) {
+    ++index_;
+  }
+  if (index_ == mlist_.lists_.size()) {
+    // we reached the end - index_ should be
+    // set to the invalid index
+    index_ = std::numeric_limits<size_t>::max();
+  }
+  currIter_ = index_ == std::numeric_limits<size_t>::max()
+                  ? mlist_.lists_[0]->begin()
+                  : mlist_.lists_[index_]->begin();
+}
+
 template <typename T, DListHook<T> T::*HookPtr>
 typename MultiDList<T, HookPtr>::Iterator&
 MultiDList<T, HookPtr>::Iterator::operator++() noexcept {
@@ -97,7 +130,16 @@ typename MultiDList<T, HookPtr>::Iterator MultiDList<T, HookPtr>::rbegin(
   if (listIdx >= lists_.size()) {
     throw std::invalid_argument("Invalid list index for MultiDList iterator.");
   }
-  return MultiDList<T, HookPtr>::Iterator(*this, listIdx);
+  return MultiDList<T, HookPtr>::Iterator(*this, listIdx, false);
+}
+
+template <typename T, DListHook<T> T::*HookPtr>
+typename MultiDList<T, HookPtr>::Iterator MultiDList<T, HookPtr>::begin(
+    size_t listIdx) const {
+  if (listIdx >= lists_.size()) {
+    throw std::invalid_argument("Invalid list index for MultiDList iterator.");
+  }
+  return MultiDList<T, HookPtr>::Iterator(*this, listIdx, true);
 }

 template <typename T, DListHook<T> T::*HookPtr>
diff --git a/cachelib/allocator/datastruct/MultiDList.h b/cachelib/allocator/datastruct/MultiDList.h
index 1a59baa715..bd7be00bd4 100644
--- a/cachelib/allocator/datastruct/MultiDList.h
+++ b/cachelib/allocator/datastruct/MultiDList.h
@@ -110,14 +110,18 @@ class MultiDList {
     }

     explicit Iterator(const MultiDList& mlist,
-                      size_t listIdx) noexcept
+                      size_t listIdx, bool head) noexcept
         : currIter_(mlist.lists_[mlist.lists_.size() - 1]->rbegin()),
           mlist_(mlist) {
       XDCHECK_LT(listIdx, mlist.lists_.size());
-      initToValidRBeginFrom(listIdx);
+      if (head) {
+        initToValidBeginFrom(listIdx);
+      } else {
+        initToValidRBeginFrom(listIdx);
+      }
       // We should either point to an element or the end() iterator
       // which has an invalid index_.
-      XDCHECK(index_ == kInvalidIndex || currIter_.get() != nullptr);
+      XDCHECK(index_ == kInvalidIndex || index_ == mlist.lists_.size() || currIter_.get() != nullptr);
     }

     virtual ~Iterator() = default;
@@ -169,6 +173,9 @@ class MultiDList {
     // reset iterator to the beginning of a specific queue
     void initToValidRBeginFrom(size_t listIdx) noexcept;
+
+    // reset iterator to the head of a specific queue
+    void initToValidBeginFrom(size_t listIdx) noexcept;

     // Index of current list
     size_t index_{0};
@@ -184,6 +191,9 @@ class MultiDList {
   // provides an iterator starting from the tail of a specific list.
   Iterator rbegin(size_t idx) const;
+
+  // provides an iterator starting from the head of a specific list.
+  Iterator begin(size_t idx) const;
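The FROM_HEAD branch added above walks forward within the current list and, once it is exhausted, advances to the next non-empty list. A plain-std::list stand-in for that traversal order, useful for checking the boundary handling:

    #include <cassert>
    #include <list>
    #include <vector>

    int main() {
      std::vector<std::list<int>> lists{{1, 2}, {}, {3}};
      std::vector<int> order;
      size_t idx = 0;
      auto it = lists[0].begin();
      while (idx < lists.size()) {
        if (it == lists[idx].end()) { // exhausted: move to the next list
          if (++idx == lists.size()) break;
          it = lists[idx].begin();
          continue;
        }
        order.push_back(*it++);
      }
      assert((order == std::vector<int>{1, 2, 3})); // skips the empty list
      return 0;
    }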
  // Iterator to compare against for the end.
  Iterator rend() const noexcept;
diff --git a/cachelib/allocator/memory/AllocationClass.cpp b/cachelib/allocator/memory/AllocationClass.cpp
index 71089153e9..512df86bbe 100644
--- a/cachelib/allocator/memory/AllocationClass.cpp
+++ b/cachelib/allocator/memory/AllocationClass.cpp
@@ -50,7 +50,7 @@ AllocationClass::AllocationClass(ClassId classId,
       poolId_(poolId),
       allocationSize_(allocSize),
       slabAlloc_(s),
-      freedAllocations_{slabAlloc_.createPtrCompressor<FreeAlloc>()} {
+      freedAllocations_{slabAlloc_.createSingleTierPtrCompressor<FreeAlloc>()} {
   checkState();
 }

@@ -102,7 +102,7 @@ AllocationClass::AllocationClass(
       currSlab_(s.getSlabForIdx(*object.currSlabIdx())),
       slabAlloc_(s),
       freedAllocations_(*object.freedAllocationsObject(),
-                        slabAlloc_.createPtrCompressor<FreeAlloc>()),
+                        slabAlloc_.createSingleTierPtrCompressor<FreeAlloc>()),
       canAllocate_(*object.canAllocate()) {
   if (!slabAlloc_.isRestorable()) {
     throw std::logic_error("The allocation class cannot be restored.");
@@ -356,9 +356,10 @@ std::pair<bool, std::vector<void*>> AllocationClass::pruneFreeAllocs(
   // allocated slab, release any freed allocations belonging to this slab.
   // Set the bit to true if the corresponding allocation is freed, false
   // otherwise.
-  FreeList freeAllocs{slabAlloc_.createPtrCompressor<FreeAlloc>()};
-  FreeList notInSlab{slabAlloc_.createPtrCompressor<FreeAlloc>()};
-  FreeList inSlab{slabAlloc_.createPtrCompressor<FreeAlloc>()};
+  FreeList freeAllocs{slabAlloc_.createSingleTierPtrCompressor<FreeAlloc>()};
+  FreeList notInSlab{slabAlloc_.createSingleTierPtrCompressor<FreeAlloc>()};
+  FreeList inSlab{slabAlloc_.createSingleTierPtrCompressor<FreeAlloc>()};
+
   lock_->lock_combine([&]() {
     // Take the allocation class free list offline
diff --git a/cachelib/allocator/memory/AllocationClass.h b/cachelib/allocator/memory/AllocationClass.h
index d45a45c6cd..269887f207 100644
--- a/cachelib/allocator/memory/AllocationClass.h
+++ b/cachelib/allocator/memory/AllocationClass.h
@@ -445,7 +445,7 @@ class AllocationClass {
   struct CACHELIB_PACKED_ATTR FreeAlloc {
     using CompressedPtr = facebook::cachelib::CompressedPtr;
     using PtrCompressor =
-        facebook::cachelib::PtrCompressor<FreeAlloc, SlabAllocator>;
+        facebook::cachelib::SingleTierPtrCompressor<FreeAlloc, SlabAllocator>;
     SListHook<FreeAlloc> hook_{};
   };
diff --git a/cachelib/allocator/memory/CompressedPtr.h b/cachelib/allocator/memory/CompressedPtr.h
index 029abd91b9..d664063ea3 100644
--- a/cachelib/allocator/memory/CompressedPtr.h
+++ b/cachelib/allocator/memory/CompressedPtr.h
@@ -27,9 +27,12 @@ namespace cachelib {

 class SlabAllocator;

+template <typename PtrType, typename AllocatorContainer>
+class PtrCompressor;
+
 // This CompressedPtr makes decompression fast by staying away from division and
-// modulo arithmetic and doing those during the compression time. We most often
-// decompress a CompressedPtr than compress a pointer while creating one. This
+// modulo arithmetic and doing those during the compression time. We more often
+// decompress a CompressedPtr than compress a pointer while creating one. This
 // is used for pointer compression by the memory allocator.
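The comment continued below describes packing a tier index next to the slab and alloc indices inside the compressed word. The following toy packing illustrates the idea; the field widths here are hypothetical and are not CacheLib's actual layout.

    #include <cassert>
    #include <cstdint>

    // Hypothetical layout: [1 bit tier][19 bits slab idx][12 bits alloc idx].
    constexpr uint32_t kAllocBits = 12, kSlabBits = 19;

    uint32_t compress(uint32_t tier, uint32_t slab, uint32_t alloc) {
      return (tier << (kSlabBits + kAllocBits)) | (slab << kAllocBits) | alloc;
    }

    int main() {
      uint32_t c = compress(1, 5, 42);
      assert((c >> (kSlabBits + kAllocBits)) == 1);               // tier id
      assert(((c >> kAllocBits) & ((1u << kSlabBits) - 1)) == 5); // slab index
      assert((c & ((1u << kAllocBits) - 1)) == 42);               // alloc index
      return 0;
    }

Decompression is then just shifts and masks, which is why the costly division and modulo work is pushed to compression time.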
// We compress pointers by storing the tier index, slab index and alloc index of @@ -173,12 +176,14 @@ class CACHELIB_PACKED_ATTR CompressedPtr { } friend SlabAllocator; + template + friend class PtrCompressor; }; template -class PtrCompressor { +class SingleTierPtrCompressor { public: - explicit PtrCompressor(const AllocatorT& allocator) noexcept + explicit SingleTierPtrCompressor(const AllocatorT& allocator) noexcept : allocator_(allocator) {} const CompressedPtr compress(const PtrType* uncompressed) const { @@ -190,11 +195,11 @@ class PtrCompressor { allocator_.unCompress(compressed, false /* isMultiTiered */)); } - bool operator==(const PtrCompressor& rhs) const noexcept { + bool operator==(const SingleTierPtrCompressor& rhs) const noexcept { return &allocator_ == &rhs.allocator_; } - bool operator!=(const PtrCompressor& rhs) const noexcept { + bool operator!=(const SingleTierPtrCompressor& rhs) const noexcept { return !(*this == rhs); } @@ -202,5 +207,53 @@ class PtrCompressor { // memory allocator that does the pointer compression. const AllocatorT& allocator_; }; + +template +class PtrCompressor { + public: + explicit PtrCompressor(const AllocatorContainer& allocators) noexcept + : allocators_(allocators) {} + + const CompressedPtr compress(const PtrType* uncompressed) const { + if (uncompressed == nullptr) + return CompressedPtr{}; + + TierId tid; + for (tid = 0; tid < allocators_.size(); tid++) { + if (allocators_[tid]->isMemoryInAllocator( + static_cast(uncompressed))) + break; + } + + bool isMultiTiered = allocators_.size() > 1; + auto cptr = allocators_[tid]->compress(uncompressed, isMultiTiered); + if (isMultiTiered) { // config has multiple tiers + cptr.setTierId(tid); + } + return cptr; + } + + PtrType* unCompress(const CompressedPtr compressed) const { + if (compressed.isNull()) { + return nullptr; + } + bool isMultiTiered = allocators_.size() > 1; + auto& allocator = *allocators_[compressed.getTierId(isMultiTiered)]; + return static_cast( + allocator.unCompress(compressed, isMultiTiered)); + } + + bool operator==(const PtrCompressor& rhs) const noexcept { + return &allocators_ == &rhs.allocators_; + } + + bool operator!=(const PtrCompressor& rhs) const noexcept { + return !(*this == rhs); + } + + private: + // memory allocator that does the pointer compression. + const AllocatorContainer& allocators_; +}; } // namespace cachelib } // namespace facebook diff --git a/cachelib/allocator/memory/MemoryAllocator.h b/cachelib/allocator/memory/MemoryAllocator.h index 1ce58857de..a77d23494c 100644 --- a/cachelib/allocator/memory/MemoryAllocator.h +++ b/cachelib/allocator/memory/MemoryAllocator.h @@ -516,12 +516,13 @@ class MemoryAllocator { using CompressedPtr = facebook::cachelib::CompressedPtr; template using PtrCompressor = - facebook::cachelib::PtrCompressor; - + facebook::cachelib::PtrCompressor>>; + template - PtrCompressor createPtrCompressor() { - return slabAllocator_.createPtrCompressor(); - } + using SingleTierPtrCompressor = + facebook::cachelib::PtrCompressor; // compress a given pointer to a valid allocation made out of this allocator // through an allocate() or nullptr. 
Calling this otherwise with invalid
@@ -646,6 +647,13 @@ class MemoryAllocator {
     memoryPoolManager_.updateNumSlabsToAdvise(numSlabs);
   }

+  // returns true if ptr points to memory which is managed by this
+  // allocator
+  bool isMemoryInAllocator(const void *ptr) {
+    return ptr && ptr >= slabAllocator_.getSlabMemoryBegin()
+        && ptr < slabAllocator_.getSlabMemoryEnd();
+  }
+
  private:
   // @param memory  pointer to the memory.
   // @return the MemoryPool corresponding to the memory.
diff --git a/cachelib/allocator/memory/MemoryAllocatorStats.h b/cachelib/allocator/memory/MemoryAllocatorStats.h
index 74ebbe64dd..acda9ee530 100644
--- a/cachelib/allocator/memory/MemoryAllocatorStats.h
+++ b/cachelib/allocator/memory/MemoryAllocatorStats.h
@@ -20,6 +20,7 @@
 #include
 #include "cachelib/allocator/memory/Slab.h"
+#include "cachelib/common/RollingStats.h"

 namespace facebook {
 namespace cachelib {
@@ -47,6 +48,9 @@ struct ACStats {
   // true if the allocation class is full.
   bool full;

+  // Rolling allocation latency (in ns)
+  util::RollingStats allocLatencyNs;
+
   constexpr unsigned long long totalSlabs() const noexcept {
     return freeSlabs + usedSlabs;
   }

   constexpr size_t getTotalFreeMemory() const noexcept {
     return Slab::kSize * freeSlabs + freeAllocs * allocSize;
   }
+
+  constexpr double usageFraction() const noexcept {
+    if (usedSlabs == 0)
+      return 0.0;
+
+    // cast to double so integer division does not truncate the fraction to 0
+    return static_cast<double>(activeAllocs) / (usedSlabs * allocsPerSlab);
+  }
+
+  constexpr size_t totalAllocatedSize() const noexcept {
+    return activeAllocs * allocSize;
+  }
 };

 // structure to query stats corresponding to a MemoryPool
diff --git a/cachelib/allocator/memory/SlabAllocator.h b/cachelib/allocator/memory/SlabAllocator.h
index d82cf5b947..a80a54672c 100644
--- a/cachelib/allocator/memory/SlabAllocator.h
+++ b/cachelib/allocator/memory/SlabAllocator.h
@@ -318,8 +318,19 @@ class SlabAllocator {
   }

   template <typename PtrType>
-  PtrCompressor<PtrType, SlabAllocator> createPtrCompressor() const {
-    return PtrCompressor<PtrType, SlabAllocator>(*this);
+  SingleTierPtrCompressor<PtrType, SlabAllocator> createSingleTierPtrCompressor() const {
+    return SingleTierPtrCompressor<PtrType, SlabAllocator>(*this);
+  }
+
+  // returns starting address of memory we own.
+  const Slab* getSlabMemoryBegin() const noexcept {
+    return reinterpret_cast<const Slab*>(memoryStart_);
+  }
+
+  // returns first byte after the end of memory region we own.
+  const Slab* getSlabMemoryEnd() const noexcept {
+    return reinterpret_cast<const Slab*>(reinterpret_cast<uint8_t*>(memoryStart_) +
+                                         memorySize_);
   }

  private:
@@ -339,12 +350,6 @@ class SlabAllocator {
   // @throw std::invalid_argument if the state is invalid.
   void checkState() const;

-  // returns first byte after the end of memory region we own.
-  const Slab* getSlabMemoryEnd() const noexcept {
-    return reinterpret_cast<const Slab*>(reinterpret_cast<uint8_t*>(memoryStart_) +
-                                         memorySize_);
-  }
-
   // returns true if we have slabbed all the memory that is available to us.
   // false otherwise.
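The cast added in usageFraction() above matters because all three operands are integers; without it the quotient truncates before the conversion to double. A minimal illustration:

    #include <cassert>

    int main() {
      unsigned long long activeAllocs = 300, usedSlabs = 2, allocsPerSlab = 200;
      // integer division happens first, then the result converts to double
      double truncated = activeAllocs / (usedSlabs * allocsPerSlab);
      // promoting one operand forces floating-point division
      double correct =
          static_cast<double>(activeAllocs) / (usedSlabs * allocsPerSlab);
      assert(truncated == 0.0);
      assert(correct == 0.75);
      return 0;
    }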
bool allMemorySlabbed() const noexcept { diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp index 3e4847251f..a08ee04e6d 100644 --- a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp +++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp @@ -21,11 +21,16 @@ namespace cachelib { namespace tests { using LruAllocatorMemoryTiersTest = AllocatorMemoryTiersTest; - +//using LruTestAllocatorMemoryTiersTest = AllocatorMemoryTiersTest; // TODO(MEMORY_TIER): add more tests with different eviction policies -TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValid1) { - this->testMultiTiersValid1(); -} +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersInvalid) { this->testMultiTiersInvalid(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValid) { this->testMultiTiersValid(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValidStats) { this->testMultiTiersValidStats(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValidMixed) { this->testMultiTiersValidMixed(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersBackgroundMovers ) { this->testMultiTiersBackgroundMovers(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersRemoveDuringEviction) { this->testMultiTiersRemoveDuringEviction(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersReplaceDuringEviction) { this->testMultiTiersReplaceDuringEviction(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersReplaceDuringEvictionWithReader) { this->testMultiTiersReplaceDuringEvictionWithReader(); } } // end of namespace tests } // end of namespace cachelib diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h index a0d1513990..5da2d01b5d 100644 --- a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h +++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h @@ -19,6 +19,14 @@ #include "cachelib/allocator/CacheAllocatorConfig.h" #include "cachelib/allocator/MemoryTierCacheConfig.h" #include "cachelib/allocator/tests/TestBase.h" +#include "cachelib/allocator/FreeThresholdStrategy.h" +#include "cachelib/allocator/PromotionStrategy.h" + +#include +#include +#include +#include +#include namespace facebook { namespace cachelib { @@ -26,15 +34,388 @@ namespace tests { template class AllocatorMemoryTiersTest : public AllocatorTest { + private: + template + void testMultiTiersAsyncOpDuringMove(std::unique_ptr& alloc, + PoolId& pool, bool& quit, MvCallback&& moveCb) { + typename AllocatorT::Config config; + config.setCacheSize(4 * Slab::kSize); + config.enableCachePersistence("/tmp"); + config.configureMemoryTiers({ + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind(std::string("0")), + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind(std::string("0")) + }); + + config.enableMovingOnSlabRelease(moveCb, {} /* ChainedItemsMoveSync */, + -1 /* movingAttemptsLimit */); + + alloc = std::make_unique(AllocatorT::SharedMemNew, config); + ASSERT(alloc != nullptr); + pool = alloc->addPool("default", alloc->getCacheMemoryStats().ramCacheSize); + + int i = 0; + while(!quit) { + auto handle = alloc->allocate(pool, std::to_string(++i), std::string("value").size()); + ASSERT(handle != nullptr); + ASSERT_NO_THROW(alloc->insertOrReplace(handle)); + } + } + public: - void testMultiTiersValid1() { + void testMultiTiersInvalid() { + typename AllocatorT::Config config; + config.setCacheSize(100 * Slab::kSize); + ASSERT_NO_THROW(config.configureMemoryTiers( + {MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + 
std::string("0")), + MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0"))})); + } + + void testMultiTiersValid() { typename AllocatorT::Config config; config.setCacheSize(100 * Slab::kSize); + config.enableCachePersistence("/tmp"); ASSERT_NO_THROW(config.configureMemoryTiers( {MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( std::string("0")), MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( std::string("0"))})); + + auto alloc = std::make_unique(AllocatorT::SharedMemNew, config); + ASSERT(alloc != nullptr); + + auto pool = alloc->addPool("default", alloc->getCacheMemoryStats().ramCacheSize); + auto handle = alloc->allocate(pool, "key", std::string("value").size()); + ASSERT(handle != nullptr); + ASSERT_NO_THROW(alloc->insertOrReplace(handle)); + } + + void testMultiTiersValidStats() { + typename AllocatorT::Config config; + size_t nSlabs = 20; + config.setCacheSize(nSlabs * Slab::kSize); + config.enableCachePersistence("/tmp"); + ASSERT_NO_THROW(config.configureMemoryTiers( + {MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0")), + MemoryTierCacheConfig::fromShm().setRatio(2).setMemBind( + std::string("0"))})); + + auto alloc = std::make_unique(AllocatorT::SharedMemNew, config); + ASSERT(alloc != nullptr); + size_t keyLen = 8; + auto pool = alloc->addPool("default", alloc->getCacheMemoryStats().ramCacheSize); + std::vector valsize = {1000}; + std::vector itemCount; + std::vector evictCount; + for (uint32_t tid = 0; tid < 2; tid++) { + this->fillUpPoolUntilEvictions(*alloc, tid, pool, valsize, keyLen); + auto stats = alloc->getPoolStats(tid, pool); + const auto& classIds = stats.mpStats.classIds; + uint32_t prev = 0; + ClassId cid = 0; + for (const ClassId c : classIds) { + uint32_t currSize = stats.cacheStats[c].allocSize; + if (prev <= valsize[0] && valsize[0] <= currSize) { + cid = c; + break; + } + prev = currSize; + } + + std::cout << "Tid: " << tid << " cid: " << static_cast(cid) + << " items: " << stats.cacheStats[cid].numItems() + << " evicts: " << stats.cacheStats[cid].numEvictions() + << std::endl; + ASSERT_GE(stats.cacheStats[cid].numItems(), 1); + ASSERT_EQ(stats.cacheStats[cid].numEvictions(), 1); + itemCount.push_back(stats.cacheStats[cid].numItems()); + evictCount.push_back(stats.cacheStats[cid].numEvictions()); + //first tier should have some writebacks to second tier + //second tier should not have any writebacks since it + //is last memory tier + if (tid == 0) { + ASSERT_EQ(stats.cacheStats[cid].numWritebacks, 1); + } else { + ASSERT_EQ(0, stats.cacheStats[cid].numWritebacks); + } + } + for (uint32_t tid = 1; tid < 2; tid++) { + ASSERT_NE(itemCount[tid],itemCount[tid-1]); + ASSERT_EQ(evictCount[tid],evictCount[tid-1]); + } + } + + void testMultiTiersBackgroundMovers() { + typename AllocatorT::Config config; + config.setCacheSize(10 * Slab::kSize); + config.enableCachePersistence("/tmp"); + config.usePosixForShm(); + config.configureMemoryTiers({ + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind(std::string("0")), + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind(std::string("0")) + }); + config.enableBackgroundEvictor(std::make_shared(2, 10, 100, 40), + std::chrono::milliseconds(10),1); + config.enableBackgroundPromoter(std::make_shared(5, 4, 2), + std::chrono::milliseconds(10),1); + + auto allocator = std::make_unique(AllocatorT::SharedMemNew, config); + ASSERT(allocator != nullptr); + const size_t numBytes = allocator->getCacheMemoryStats().ramCacheSize; + + auto poolId = 
allocator->addPool("default", numBytes); + + const unsigned int keyLen = 100; + const unsigned int size = 100; + unsigned int allocs = 0; + + //we should work on pool stats because filluppooluntil evictions + //will finish once we evict an item from tier 0 to tier 1 and + //there will be unallocated memory left. + while (allocs < 174760) { + const auto key = this->getRandomNewKey(*allocator, keyLen); + ASSERT_EQ(allocator->find(key), nullptr); + auto handle = util::allocateAccessible(*allocator, poolId, key, size); + allocs++; + } + + const auto key = this->getRandomNewKey(*allocator, keyLen); + auto handle = util::allocateAccessible(*allocator, poolId, key, size); + ASSERT_NE(nullptr, handle); + const uint8_t cid = allocator->getAllocInfo(handle->getMemory()).classId; + ASSERT_EQ(cid,5); + auto stats = allocator->getGlobalCacheStats(); + auto slabStats = allocator->getACStats(0,0,cid); + const auto& mpStats = allocator->getPoolByTid(poolId, 0).getStats(); + //cache is 10MB should move about 1MB to reach 10% free + uint32_t approxEvict = (1024*1024)/mpStats.acStats.at(cid).allocSize; + while (stats.evictionStats.numMovedItems < approxEvict*0.95 && (1-slabStats.usageFraction()) >= 0.095) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + stats = allocator->getGlobalCacheStats(); + slabStats = allocator->getACStats(0,0,cid); + } + ASSERT_GE(1-slabStats.usageFraction(),0.095); + + auto perclassEstats = allocator->getBackgroundMoverClassStats(MoverDir::Evict); + auto perclassPstats = allocator->getBackgroundMoverClassStats(MoverDir::Promote); + + ASSERT_GE(stats.evictionStats.numMovedItems,1); + ASSERT_GE(stats.evictionStats.runCount,1); + ASSERT_GE(stats.promotionStats.numMovedItems,1); + + ASSERT_GE(perclassEstats[0][0][cid], 1); + ASSERT_GE(perclassPstats[1][0][cid], 1); + + } + + void testMultiTiersValidMixed() { + typename AllocatorT::Config config; + config.setCacheSize(100 * Slab::kSize); + config.enableCachePersistence("/tmp"); + ASSERT_NO_THROW(config.configureMemoryTiers( + {MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0")), + MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0"))})); + + auto alloc = std::make_unique(AllocatorT::SharedMemNew, config); + ASSERT(alloc != nullptr); + + auto pool = alloc->addPool("default", alloc->getCacheMemoryStats().ramCacheSize); + auto handle = alloc->allocate(pool, "key", std::string("value").size()); + ASSERT(handle != nullptr); + ASSERT_NO_THROW(alloc->insertOrReplace(handle)); + } + + void testMultiTiersRemoveDuringEviction() { + std::unique_ptr alloc; + PoolId pool; + std::unique_ptr t; + folly::Latch latch(1); + bool quit = false; + + auto moveCb = [&] (typename AllocatorT::Item& oldItem, + typename AllocatorT::Item& newItem, + typename AllocatorT::Item* /* parentPtr */) { + + auto key = oldItem.getKey(); + t = std::make_unique([&](){ + // remove() function is blocked by wait context + // till item is moved to next tier. 
Hence, we should
+        // notify the latch before calling remove()
+        latch.count_down();
+        alloc->remove(key);
+      });
+      // wait till the async thread is running
+      latch.wait();
+      memcpy(newItem.getMemory(), oldItem.getMemory(), oldItem.getSize());
+      quit = true;
+    };
+
+    testMultiTiersAsyncOpDuringMove(alloc, pool, quit, moveCb);
+
+    t->join();
+  }
+
+  void testMultiTiersReplaceDuringEviction() {
+    std::unique_ptr<AllocatorT> alloc;
+    PoolId pool;
+    std::unique_ptr<std::thread> t;
+    folly::Latch latch(1);
+    bool quit = false;
+
+    auto moveCb = [&] (typename AllocatorT::Item& oldItem,
+                       typename AllocatorT::Item& newItem,
+                       typename AllocatorT::Item* /* parentPtr */) {
+      auto key = oldItem.getKey();
+      if (!quit) {
+        // we need to replace only once because subsequent allocate calls
+        // will cause evictions recursively
+        quit = true;
+        t = std::make_unique<std::thread>([&](){
+          auto handle = alloc->allocate(pool, key, std::string("new value").size());
+          // insertOrReplace() is blocked by the wait context till the item
+          // is moved to the next tier, hence we should notify the latch
+          // before calling insertOrReplace()
+          latch.count_down();
+          ASSERT_NO_THROW(alloc->insertOrReplace(handle));
+        });
+        // wait till the async thread is running
+        latch.wait();
+      }
+      memcpy(newItem.getMemory(), oldItem.getMemory(), oldItem.getSize());
+    };
+
+    testMultiTiersAsyncOpDuringMove(alloc, pool, quit, moveCb);
+
+    t->join();
+
+  }
+
+
+  void gdb_sync1() {}
+  void gdb_sync2() {}
+  void gdb_sync3() {}
+  using ReadHandle = typename AllocatorT::ReadHandle;
+  void testMultiTiersReplaceDuringEvictionWithReader() {
+    sem_unlink("/gdb1_sem");
+    sem_t *sem = sem_open("/gdb1_sem", O_CREAT | O_EXCL, S_IRUSR | S_IWUSR, 0);
+    int gdbfd = open("/tmp/gdb1.gdb", O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR);
+    char gdbcmds[] =
+        "set attached=1\n"
+        "break gdb_sync1\n"
+        "break gdb_sync2\n"
+        "break moveRegularItemWithSync\n"
+        "c\n"
+        "set scheduler-locking on\n"
+        "thread 1\n"
+        "c\n"
+        "thread 4\n"
+        "c\n"
+        "thread 5\n"
+        "break nativeFutexWaitImpl thread 5\n"
+        "c\n"
+        "thread 4\n"
+        "break nativeFutexWaitImpl thread 4\n"
+        "c\n"
+        "thread 1\n"
+        "break releaseBackToAllocator\n"
+        "c\n"
+        "c\n"
+        "thread 5\n"
+        "c\n"
+        "thread 4\n"
+        "c\n"
+        "thread 1\n"
+        "break gdb_sync3\n"
+        "c\n"
+        "quit\n";
+    int ret = write(gdbfd, gdbcmds, strlen(gdbcmds));
+    int ppid = getpid(); // parent pid
+    //int pid = 0;
+    int pid = fork();
+    if (pid == 0) {
+      sem_wait(sem);
+      sem_close(sem);
+      sem_unlink("/gdb1_sem");
+      char cmdpid[256];
+      sprintf(cmdpid, "%d", ppid);
+      int f = execlp("gdb", "gdb", "--pid", cmdpid, "--batch-silent", "--command=/tmp/gdb1.gdb", (char*) 0);
+      ASSERT(f != -1);
+    }
+    sem_post(sem);
+    // wait for gdb to attach (the gdb script above sets attached=1)
+    int attached = 0;
+    while (attached == 0);
+
+    std::unique_ptr<AllocatorT> alloc;
+    PoolId pool;
+    bool quit = false;
+
+    typename AllocatorT::Config config;
+    config.setCacheSize(4 * Slab::kSize);
+    config.enableCachePersistence("/tmp");
+    config.configureMemoryTiers({
+        MemoryTierCacheConfig::fromShm()
+            .setRatio(1).setMemBind(std::string("0")),
+        MemoryTierCacheConfig::fromShm()
+            .setRatio(1).setMemBind(std::string("0"))
+    });
+
+    alloc = std::make_unique<AllocatorT>(AllocatorT::SharedMemNew, config);
+    ASSERT(alloc != nullptr);
+    pool = alloc->addPool("default", alloc->getCacheMemoryStats().ramCacheSize);
+
+    int i = 0;
+    typename AllocatorT::Item* evicted;
+    std::unique_ptr<std::thread> t;
+    std::unique_ptr<std::thread> r;
+    while (!quit) {
+      auto handle = alloc->allocate(pool, std::to_string(++i), std::string("value").size());
+      ASSERT(handle != nullptr);
+      if (i == 1) {
+        evicted = static_cast<typename AllocatorT::Item*>(handle.get());
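These tests sequence the racing thread against the move callback with folly::Latch; a minimal standalone sketch of that handshake, assuming folly is available:

    #include <folly/synchronization/Latch.h>
    #include <cassert>
    #include <thread>

    int main() {
      folly::Latch latch(1);
      bool ready = false;
      std::thread t([&] {
        ready = true;       // the racing operation's setup
        latch.count_down(); // signal that this thread is running
      });
      latch.wait();         // block until the racing thread has started
      assert(ready);        // count_down/wait synchronize, so this is safe
      t.join();
      return 0;
    }

The count_down()/wait() pair establishes a happens-before edge, which is why the tests can read state written by the other thread right after wait() returns.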
folly::Latch latch_t(1); + t = std::make_unique([&](){ + auto handleNew = alloc->allocate(pool, std::to_string(1), std::string("new value").size()); + ASSERT(handleNew != nullptr); + latch_t.count_down(); + //first breakpoint will be this one because + //thread 1 still has more items to fill up the + //cache before an evict is evicted + gdb_sync1(); + ASSERT(evicted->isMoving()); + //need to suspend thread 1 - who is doing the eviction + //gdb will do this for us + folly::Latch latch(1); + r = std::make_unique([&](){ + ASSERT(evicted->isMoving()); + latch.count_down(); + auto handleEvict = alloc->find(std::to_string(1)); + //does find block until done moving?? yes + while (evicted->isMarkedForEviction()); //move will fail + XDCHECK(handleEvict == nullptr) << handleEvict->toString(); + ASSERT(handleEvict == nullptr); + }); + latch.wait(); + gdb_sync2(); + alloc->insertOrReplace(handleNew); + ASSERT(!evicted->isAccessible()); //move failed + quit = true; + }); + latch_t.wait(); + } + ASSERT_NO_THROW(alloc->insertOrReplace(handle)); + } + t->join(); + r->join(); + gdb_sync3(); } }; } // namespace tests diff --git a/cachelib/allocator/tests/AllocatorResizeTest.h b/cachelib/allocator/tests/AllocatorResizeTest.h index d65205ac74..883dd9c056 100644 --- a/cachelib/allocator/tests/AllocatorResizeTest.h +++ b/cachelib/allocator/tests/AllocatorResizeTest.h @@ -966,23 +966,23 @@ class AllocatorResizeTest : public AllocatorTest { for (i = 1; i <= numItersToMaxAdviseAway + 1; i++) { alloc.memMonitor_->adviseAwaySlabs(); std::this_thread::sleep_for(std::chrono::seconds{2}); - ASSERT_EQ(alloc.allocator_->getAdvisedMemorySize(), i * perIterAdvSize); + ASSERT_EQ(alloc.allocator_[0 /* TODO - extend test */]->getAdvisedMemorySize(), i * perIterAdvSize); } i--; // This should fail alloc.memMonitor_->adviseAwaySlabs(); std::this_thread::sleep_for(std::chrono::seconds{2}); - auto totalAdvisedAwayMemory = alloc.allocator_->getAdvisedMemorySize(); + auto totalAdvisedAwayMemory = alloc.allocator_[0 /* TODO - extend test */]->getAdvisedMemorySize(); ASSERT_EQ(totalAdvisedAwayMemory, i * perIterAdvSize); // Try to reclaim back for (i = 1; i <= numItersToMaxAdviseAway + 1; i++) { alloc.memMonitor_->reclaimSlabs(); std::this_thread::sleep_for(std::chrono::seconds{2}); - ASSERT_EQ(alloc.allocator_->getAdvisedMemorySize(), + ASSERT_EQ(alloc.allocator_[0 /* TODO - extend test */]->getAdvisedMemorySize(), totalAdvisedAwayMemory - i * perIterAdvSize); } - totalAdvisedAwayMemory = alloc.allocator_->getAdvisedMemorySize(); + totalAdvisedAwayMemory = alloc.allocator_[0 /* TODO - extend test */]->getAdvisedMemorySize(); ASSERT_EQ(totalAdvisedAwayMemory, 0); } } diff --git a/cachelib/allocator/tests/AllocatorTypeTest.cpp b/cachelib/allocator/tests/AllocatorTypeTest.cpp index 1e98af29f2..ad38588bcb 100644 --- a/cachelib/allocator/tests/AllocatorTypeTest.cpp +++ b/cachelib/allocator/tests/AllocatorTypeTest.cpp @@ -288,8 +288,8 @@ TYPED_TEST(BaseAllocatorTest, AddChainedItemMultiThreadWithMovingAndSync) { this->testAddChainedItemMultithreadWithMovingAndSync(); } -TYPED_TEST(BaseAllocatorTest, TransferChainWhileMoving) { - this->testTransferChainWhileMoving(); +TYPED_TEST(BaseAllocatorTest, TransferChainAfterMoving) { + this->testTransferChainAfterMoving(); } TYPED_TEST(BaseAllocatorTest, AddAndPopChainedItemMultithread) { @@ -409,6 +409,7 @@ TYPED_TEST(BaseAllocatorTest, RateMap) { this->testRateMap(); } TYPED_TEST(BaseAllocatorTest, StatSnapshotTest) { this->testStatSnapshotTest(); } +TYPED_TEST(BaseAllocatorTest, BasicMultiTier) 
{this->testBasicMultiTier(); } namespace { // the tests that cannot be done by TYPED_TEST. diff --git a/cachelib/allocator/tests/BaseAllocatorTest.h b/cachelib/allocator/tests/BaseAllocatorTest.h index f503d59f61..e7bf0db8b6 100644 --- a/cachelib/allocator/tests/BaseAllocatorTest.h +++ b/cachelib/allocator/tests/BaseAllocatorTest.h @@ -3655,6 +3655,16 @@ class BaseAllocatorTest : public AllocatorTest { sourceAlloc); otherThread.join(); + // in our new version with marking item as moving, move attempts + // will only fail if there is a concurrent set to that item, in + // this case if the handle to an item is held, the slab release + // will keep trying to mark the item as moving - we currently + // don't have a counter for that (but this test assumes that + // if handle is held then moveForSlabRelease will retry, + // that is where the move attempts counter is incremented) + // + // as a fix, we increment the move attempts counter during + // markMovingForSlabRelase too XLOG(INFO, "Number of move retry attempts: ", allocator.getSlabReleaseStats().numMoveAttempts); ASSERT_GT(allocator.getSlabReleaseStats().numMoveAttempts, 1); @@ -4098,15 +4108,16 @@ class BaseAllocatorTest : public AllocatorTest { // Check that item is in the expected container. bool findItem(AllocatorT& allocator, typename AllocatorT::Item* item) { auto& container = allocator.getMMContainer(*item); - auto itr = container.getEvictionIterator(); bool found = false; - while (itr) { - if (itr.get() == item) { - found = true; - break; + container.withEvictionIterator([&found, &item](auto&& itr) { + while (itr) { + if (itr.get() == item) { + found = true; + break; + } + ++itr; } - ++itr; - } + }); return found; } @@ -4256,13 +4267,13 @@ class BaseAllocatorTest : public AllocatorTest { // Had a bug: D4799860 where we allocated the wrong size for chained item { const auto parentAllocInfo = - alloc.allocator_->getAllocInfo(itemHandle->getMemory()); + alloc.allocator_[0 /* TODO - extend test */]->getAllocInfo(itemHandle->getMemory()); const auto child1AllocInfo = - alloc.allocator_->getAllocInfo(chainedItemHandle->getMemory()); + alloc.allocator_[0 /* TODO - extend test */]->getAllocInfo(chainedItemHandle->getMemory()); const auto child2AllocInfo = - alloc.allocator_->getAllocInfo(chainedItemHandle2->getMemory()); + alloc.allocator_[0 /* TODO - extend test */]->getAllocInfo(chainedItemHandle2->getMemory()); const auto child3AllocInfo = - alloc.allocator_->getAllocInfo(chainedItemHandle3->getMemory()); + alloc.allocator_[0 /* TODO - extend test */]->getAllocInfo(chainedItemHandle3->getMemory()); const auto parentCid = parentAllocInfo.classId; const auto child1Cid = child1AllocInfo.classId; @@ -4831,7 +4842,7 @@ class BaseAllocatorTest : public AllocatorTest { std::memcpy(newItem.getMemory(), oldItem.getMemory(), oldItem.getSize()); ++numMoves; - }); + }, {}, 1000000 /* lots of moving tries */); AllocatorT alloc(config); const size_t numBytes = alloc.getCacheMemoryStats().ramCacheSize; @@ -4872,7 +4883,7 @@ class BaseAllocatorTest : public AllocatorTest { } /* sleep override */ - std::this_thread::sleep_for(std::chrono::milliseconds(100)); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); } }; @@ -4880,7 +4891,7 @@ class BaseAllocatorTest : public AllocatorTest { auto releaseFn = [&] { for (unsigned int i = 0; i < 5;) { /* sleep override */ - std::this_thread::sleep_for(std::chrono::milliseconds(100)); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); ClassId cid = static_cast(i); alloc.releaseSlab(pid, 
cid, SlabReleaseMode::kRebalance); @@ -5097,7 +5108,7 @@ class BaseAllocatorTest : public AllocatorTest { auto releaseFn = [&] { for (unsigned int i = 0; i < 5;) { /* sleep override */ - std::this_thread::sleep_for(std::chrono::milliseconds(100)); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); ClassId cid = static_cast(i); alloc.releaseSlab(pid, cid, SlabReleaseMode::kRebalance); @@ -5156,9 +5167,10 @@ class BaseAllocatorTest : public AllocatorTest { lookupFn("yolo"); } - // while a chained item could be moved, try to transfer its parent and - // validate that move succeeds correctly. - void testTransferChainWhileMoving() { + // while a chained item could be moved - it is sync on parent moving bit. + // try to transfer its parent after we moved and + // validate that transfer succeeds correctly. + void testTransferChainAfterMoving() { // create an allocator worth 10 slabs. typename AllocatorT::Config config; config.configureChainedItems(); @@ -5179,15 +5191,13 @@ class BaseAllocatorTest : public AllocatorTest { struct TestSyncObj : public AllocatorT::SyncObj { TestSyncObj(std::mutex& m, std::atomic& firstTime, - folly::Baton<>& startedMoving, - folly::Baton<>& changedParent) + folly::Baton<>& startedMoving) : l(m) { if (!firstTime) { return; } firstTime = false; startedMoving.post(); - changedParent.wait(); } std::lock_guard l; @@ -5200,9 +5210,6 @@ class BaseAllocatorTest : public AllocatorTest { // baton to indicate that the move process has started so that we can // switch the parent folly::Baton<> startedMoving; - // baton to indicate that the parent has been switched so that the move - // process can proceed - folly::Baton<> changedParent; const size_t numMovingAttempts = 100; std::atomic numMoves{0}; @@ -5214,11 +5221,10 @@ class BaseAllocatorTest : public AllocatorTest { oldItem.getSize()); ++numMoves; }, - [&m, &startedMoving, &changedParent, - &firstTimeMovingSync](typename Item::Key key) { + [&m, &startedMoving, &firstTimeMovingSync](typename Item::Key key) { XLOG(ERR) << "Moving" << key; return std::make_unique(m, firstTimeMovingSync, - startedMoving, changedParent); + startedMoving); }, numMovingAttempts); @@ -5248,24 +5254,19 @@ class BaseAllocatorTest : public AllocatorTest { auto slabRelease = std::async(releaseFn); startedMoving.wait(); + // wait for slab release to complete. + slabRelease.wait(); // we know moving sync is held now. { auto newParent = alloc.allocate(pid, movingKey, 600); - auto parent = alloc.findToWrite(movingKey); + auto parent = alloc.findToWrite(movingKey); //parent is marked moving during moved, once finished we will get handle alloc.transferChainAndReplace(parent, newParent); } - // indicate that we changed the parent. This should abort the current - // moving attempt, re-allocate the item and eventually succeed in moving. - changedParent.post(); - - // wait for slab release to complete. 
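The synchronization change in testTransferChainAfterMoving above removes the changedParent baton: the test now lets the slab release run to completion before swapping the parent, rather than forcing a failed move mid-flight. A minimal standalone sketch of the remaining one-way baton handshake (folly::Baton is the real API; the worker body here is illustrative only):

```cpp
#include <folly/synchronization/Baton.h>
#include <thread>

int main() {
  folly::Baton<> startedMoving;

  std::thread mover([&] {
    // ... acquire the moving sync and begin moving the item ...
    startedMoving.post(); // signal the test thread that moving has started
    // ... complete the move ...
  });

  startedMoving.wait(); // the move is now in flight
  mover.join();         // stands in for slabRelease.wait() in the test
  // ... now safe to allocate the new parent and transferChainAndReplace ...
  return 0;
}
```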
- slabRelease.wait(); - EXPECT_EQ(numMoves, 1); auto slabReleaseStats = alloc.getSlabReleaseStats(); - EXPECT_EQ(slabReleaseStats.numMoveAttempts, 2); + EXPECT_EQ(slabReleaseStats.numMoveAttempts, 1); EXPECT_EQ(slabReleaseStats.numMoveSuccesses, 1); auto handle = alloc.find(movingKey); @@ -5494,8 +5495,12 @@ class BaseAllocatorTest : public AllocatorTest { ASSERT_TRUE(big->isInMMContainer()); auto& mmContainer = alloc.getMMContainer(*big); - auto itr = mmContainer.getEvictionIterator(); - ASSERT_EQ(big.get(), &(*itr)); + + typename AllocatorT::Item* evictionCandidate = nullptr; + mmContainer.withEvictionIterator( + [&evictionCandidate](auto&& itr) { evictionCandidate = itr.get(); }); + + ASSERT_EQ(big.get(), evictionCandidate); alloc.remove("hello"); } @@ -5509,8 +5514,11 @@ class BaseAllocatorTest : public AllocatorTest { ASSERT_TRUE(small2->isInMMContainer()); auto& mmContainer = alloc.getMMContainer(*small2); - auto itr = mmContainer.getEvictionIterator(); - ASSERT_EQ(small2.get(), &(*itr)); + + typename AllocatorT::Item* evictionCandidate = nullptr; + mmContainer.withEvictionIterator( + [&evictionCandidate](auto&& itr) { evictionCandidate = itr.get(); }); + ASSERT_EQ(small2.get(), evictionCandidate); alloc.remove("hello"); } @@ -5972,7 +5980,6 @@ class BaseAllocatorTest : public AllocatorTest { EXPECT_EQ(nullptr, util::allocateAccessible(alloc, poolId, "large", largeSize)); - std::this_thread::sleep_for(std::chrono::seconds{1}); // trigger the slab rebalance EXPECT_EQ(nullptr, util::allocateAccessible(alloc, poolId, "large", largeSize)); @@ -6308,6 +6315,86 @@ class BaseAllocatorTest : public AllocatorTest { }); EXPECT_EQ(intervalNameExists, 4); } + + void testSingleTierMemoryAllocatorSize() { + typename AllocatorT::Config config; + static constexpr size_t cacheSize = 100 * 1024 * 1024; /* 100 MB */ + config.setCacheSize(cacheSize); + config.enableCachePersistence(folly::sformat("/tmp/single-tier-test/{}", ::getpid())); + + AllocatorT alloc(AllocatorT::SharedMemNew, config); + + EXPECT_LE(alloc.allocator_[0]->getMemorySize(), cacheSize); + } + + void testSingleTierMemoryAllocatorSizeAnonymous() { + typename AllocatorT::Config config; + static constexpr size_t cacheSize = 100 * 1024 * 1024; /* 100 MB */ + config.setCacheSize(cacheSize); + + AllocatorT alloc(config); + + EXPECT_LE(alloc.allocator_[0]->getMemorySize(), cacheSize); + } + + void testBasicMultiTier() { + using Item = typename AllocatorT::Item; + const static std::string data = "data"; + + std::set movedKeys; + auto moveCb = [&](const Item& oldItem, Item& newItem, Item* /* parentPtr */) { + std::memcpy(newItem.getMemory(), oldItem.getMemory(), oldItem.getSize()); + movedKeys.insert(oldItem.getKey().str()); + }; + + typename AllocatorT::Config config; + static constexpr size_t cacheSize = 100 * 1024 * 1024; /* 100 MB */ + config.setCacheSize(100 * 1024 * 1024); /* 100 MB */ + config.enableCachePersistence(folly::sformat("/tmp/multi-tier-test/{}", ::getpid())); + config.configureMemoryTiers({ + MemoryTierCacheConfig::fromShm().setRatio(1) + .setMemBind(std::string("0")), + MemoryTierCacheConfig::fromShm().setRatio(1) + .setMemBind(std::string("0")), + }); + config.enableMovingOnSlabRelease(moveCb); + + AllocatorT alloc(AllocatorT::SharedMemNew, config); + + EXPECT_EQ(alloc.allocator_.size(), 2); + EXPECT_LE(alloc.allocator_[0]->getMemorySize(), cacheSize / 2); + EXPECT_LE(alloc.allocator_[1]->getMemorySize(), cacheSize / 2); + + const size_t numBytes = alloc.getCacheMemoryStats().ramCacheSize; + auto pid = 
alloc.addPool("default", numBytes); + + static constexpr size_t numOps = cacheSize / 1024; + for (int i = 0; i < numOps; i++) { + std::string key = std::to_string(i); + auto h = alloc.allocate(pid, key, 1024); + EXPECT_TRUE(h); + + std::memcpy(h->getMemory(), data.data(), data.size()); + + alloc.insertOrReplace(h); + } + + EXPECT_TRUE(movedKeys.size() > 0); + + size_t movedButStillInMemory = 0; + for (const auto &k : movedKeys) { + auto h = alloc.find(k); + + if (h) { + movedButStillInMemory++; + /* All moved elements should be in the second tier. */ + EXPECT_TRUE(alloc.allocator_[1]->isMemoryInAllocator(h->getMemory())); + EXPECT_EQ(data, std::string((char*)h->getMemory(), data.size())); + } + } + + EXPECT_TRUE(movedButStillInMemory > 0); + } }; } // namespace tests } // namespace cachelib diff --git a/cachelib/allocator/tests/CacheBaseTest.cpp b/cachelib/allocator/tests/CacheBaseTest.cpp index 928fcc0c67..dae14c5335 100644 --- a/cachelib/allocator/tests/CacheBaseTest.cpp +++ b/cachelib/allocator/tests/CacheBaseTest.cpp @@ -33,7 +33,10 @@ class CacheBaseTest : public CacheBase, public SlabAllocatorTestBase { const std::string getCacheName() const override { return cacheName; } bool isObjectCache() const override { return false; } const MemoryPool& getPool(PoolId) const override { return memoryPool_; } + //TODO: support tiers + const MemoryPool& getPoolByTid(PoolId, TierId tid) const override { return memoryPool_; } PoolStats getPoolStats(PoolId) const override { return PoolStats(); } + ACStats getACStats(TierId, PoolId, ClassId) const { return ACStats(); }; AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId) const override { return AllSlabReleaseEvents{}; } diff --git a/cachelib/allocator/tests/ItemHandleTest.cpp b/cachelib/allocator/tests/ItemHandleTest.cpp index d992a84011..5213166816 100644 --- a/cachelib/allocator/tests/ItemHandleTest.cpp +++ b/cachelib/allocator/tests/ItemHandleTest.cpp @@ -39,6 +39,8 @@ struct TestItem { using ChainedItem = int; void reset() {} + + folly::StringPiece getKey() const { return folly::StringPiece(); } }; struct TestNvmCache; @@ -80,6 +82,12 @@ struct TestAllocator { void adjustHandleCountForThread_private(int i) { tlRef_.tlStats() += i; } + bool addWaitContextForMovingItem( + folly::StringPiece key, + std::shared_ptr> waiter) { + return false; + } + util::FastStats tlRef_; }; } // namespace diff --git a/cachelib/allocator/tests/ItemTest.cpp b/cachelib/allocator/tests/ItemTest.cpp index 70dd1277fe..54bac1945a 100644 --- a/cachelib/allocator/tests/ItemTest.cpp +++ b/cachelib/allocator/tests/ItemTest.cpp @@ -82,6 +82,8 @@ TEST(ItemTest, ExpiryTime) { EXPECT_TRUE(result); EXPECT_EQ(tenMins, item->getConfiguredTTL()); + // So that exclusive bit will be set + item->markAccessible(); // Test that writes fail while the item is moving result = item->markMoving(); EXPECT_TRUE(result); diff --git a/cachelib/allocator/tests/MM2QTest.cpp b/cachelib/allocator/tests/MM2QTest.cpp index e11dd95f5a..0e01ffa56f 100644 --- a/cachelib/allocator/tests/MM2QTest.cpp +++ b/cachelib/allocator/tests/MM2QTest.cpp @@ -223,6 +223,19 @@ void MMTypeTest::testIterate(std::vector>& nodes, } } +template +void MMTypeTest::testIterateHot(std::vector>& nodes, + Container& c) { + auto it = nodes.rbegin(); + c.withPromotionIterator([&it,&c](auto &&it2q) { + while (it2q && c.isHot(*it2q)) { + ASSERT_EQ(it2q->getId(), (*it)->getId()); + ++it2q; + ++it; + } + }); +} + template void MMTypeTest::testMatch(std::string expected, MMTypeTest::Container& c) { @@ -238,6 +251,23 @@ void 
MMTypeTest::testMatch(std::string expected, ASSERT_EQ(expected, actual); } +template +void MMTypeTest::testMatchHot(std::string expected, + MMTypeTest::Container& c) { + int index = -1; + std::string actual; + c.withPromotionIterator([&c,&actual,&index](auto &&it2q) { + while (it2q) { + ++index; + actual += folly::stringPrintf( + "%d:%s, ", it2q->getId(), + (c.isHot(*it2q) ? "H" : (c.isCold(*it2q) ? "C" : "W"))); + ++it2q; + } + }); + ASSERT_EQ(expected, actual); +} + TEST_F(MM2QTest, DetailedTest) { MM2Q::Config config; config.lruRefreshTime = 0; @@ -259,8 +289,11 @@ TEST_F(MM2QTest, DetailedTest) { } testIterate(nodes, c); + testIterateHot(nodes, c); testMatch("0:C, 1:C, 2:C, 3:C, 4:H, 5:H, ", c); + testMatchHot("5:H, 4:H, 3:C, 2:C, 1:C, 0:C, ", c); + // Move 3 to top of the hot cache c.recordAccess(*(nodes[4]), AccessMode::kRead); testMatch("0:C, 1:C, 2:C, 3:C, 5:H, 4:H, ", c); diff --git a/cachelib/allocator/tests/MMTypeTest.h b/cachelib/allocator/tests/MMTypeTest.h index d38f6ce2c1..dbc55677ea 100644 --- a/cachelib/allocator/tests/MMTypeTest.h +++ b/cachelib/allocator/tests/MMTypeTest.h @@ -147,7 +147,9 @@ class MMTypeTest : public testing::Test { void testRecordAccessBasic(Config c); void testSerializationBasic(Config c); void testIterate(std::vector>& nodes, Container& c); + void testIterateHot(std::vector>& nodes, Container& c); void testMatch(std::string expected, Container& c); + void testMatchHot(std::string expected, Container& c); size_t getListSize(const Container& c, typename MMType::LruType list); void verifyIterationVariants(Container& c); }; diff --git a/cachelib/allocator/tests/MemoryTiersTest.cpp b/cachelib/allocator/tests/MemoryTiersTest.cpp index ed35115c0c..f618dcbb06 100644 --- a/cachelib/allocator/tests/MemoryTiersTest.cpp +++ b/cachelib/allocator/tests/MemoryTiersTest.cpp @@ -109,7 +109,7 @@ class MemoryTiersTest : public AllocatorTest { void validatePoolSize(PoolId poolId, std::unique_ptr& allocator, size_t expectedSize) { - size_t actualSize = allocator->getPool(poolId).getPoolSize(); + size_t actualSize = allocator->getPoolSize(poolId); EXPECT_EQ(actualSize, expectedSize); } @@ -119,9 +119,9 @@ class MemoryTiersTest : public AllocatorTest { size_t numTiers = 2) { if (isSizeValid) { auto pool = alloc->addPool("validPoolSize", poolSize); - EXPECT_LE(alloc->getPool(pool).getPoolSize(), poolSize); + EXPECT_LE(alloc->getPoolSize(pool), poolSize); if (poolSize >= numTiers * Slab::kSize) - EXPECT_GE(alloc->getPool(pool).getPoolSize(), + EXPECT_GE(alloc->getPoolSize(pool), poolSize - numTiers * Slab::kSize); } else { EXPECT_THROW(alloc->addPool("invalidPoolSize", poolSize), @@ -172,6 +172,84 @@ TEST_F(LruMemoryTiersTest, TestInvalid2TierConfigRatioNotSet) { TEST_F(LruMemoryTiersTest, TestInvalid2TierConfigSizesNeCacheSize) { EXPECT_THROW(createTestCacheConfig({0, 0}), std::invalid_argument); } + +TEST_F(LruMemoryTiersTest, TestPoolAllocations) { + std::vector totalCacheSizes = {8 * GB, 2 * GB}; + + static const size_t numExtraSizes = 4; + static const size_t numExtraSlabs = 20; + + for (size_t i = 0; i < numExtraSizes; i++) { + totalCacheSizes.push_back(totalCacheSizes.back() + + (folly::Random::rand64() % numExtraSlabs) * + Slab::kSize); + } + + size_t min_ratio = 1; + size_t max_ratio = 111; + + static const size_t numCombinations = 10; + + for (auto totalCacheSize : totalCacheSizes) { + for (size_t k = 0; k < numCombinations; k++) { + const size_t i = folly::Random::rand32() % max_ratio + min_ratio; + const size_t j = folly::Random::rand32() % max_ratio + min_ratio; 
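For context on the random ratio pairs {i, j} generated in these pool tests: each tier receives a share of the total cache proportional to its ratio. A standalone sketch of that arithmetic; calculateTierSize is named in the diff, but this reimplementation of it is an assumption:

```cpp
#include <cassert>
#include <cstddef>

// Assumed behaviour of MemoryTierCacheConfig::calculateTierSize: split the
// total cache proportionally to this tier's ratio.
size_t calculateTierSize(size_t totalCacheSize, size_t ratio, size_t sumRatios) {
  return totalCacheSize * ratio / sumRatios;
}

int main() {
  constexpr size_t GB = 1024UL * 1024 * 1024;
  // two tiers with ratios 1 and 3: the second tier gets three times the memory
  assert(calculateTierSize(8 * GB, 1, 4) == 2 * GB);
  assert(calculateTierSize(8 * GB, 3, 4) == 6 * GB);
  return 0;
}
```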
+ LruAllocatorConfig cfg = + createTestCacheConfig({i, j}, + /* usePosix */ true, totalCacheSize); + basicCheck(cfg, totalCacheSize); + + std::unique_ptr<LruAllocator> alloc = std::unique_ptr<LruAllocator>( + new LruAllocator(LruAllocator::SharedMemNew, cfg)); + + size_t size = (folly::Random::rand64() % + (alloc->getCacheMemoryStats().ramCacheSize - Slab::kSize)) + + Slab::kSize; + testAddPool(alloc, size, true); + } + } +} + +TEST_F(LruMemoryTiersTest, TestPoolInvalidAllocations) { + std::vector<size_t> totalCacheSizes = {48 * MB, 51 * MB, 256 * MB, + 1 * GB, 5 * GB, 8 * GB}; + size_t min_ratio = 1; + size_t max_ratio = 111; + + static const size_t numCombinations = 10; + + for (auto totalCacheSize : totalCacheSizes) { + for (size_t k = 0; k < numCombinations; k++) { + const size_t i = folly::Random::rand32() % max_ratio + min_ratio; + const size_t j = folly::Random::rand32() % max_ratio + min_ratio; + LruAllocatorConfig cfg = + createTestCacheConfig({i, j}, + /* usePosix */ true, totalCacheSize); + + std::unique_ptr<LruAllocator> alloc = nullptr; + try { + alloc = std::unique_ptr<LruAllocator>( + new LruAllocator(LruAllocator::SharedMemNew, cfg)); + } catch(...) { + // an exception is expected only if the cache is too small + size_t sum_ratios = std::accumulate( + cfg.getMemoryTierConfigs().begin(), cfg.getMemoryTierConfigs().end(), 0UL, + [](const size_t i, const MemoryTierCacheConfig& config) { + return i + config.getRatio(); + }); + auto tier1slabs = cfg.getMemoryTierConfigs()[0].calculateTierSize(cfg.getCacheSize(), sum_ratios) / Slab::kSize; + auto tier2slabs = cfg.getMemoryTierConfigs()[1].calculateTierSize(cfg.getCacheSize(), sum_ratios) / Slab::kSize; + EXPECT_TRUE(tier1slabs <= 2 || tier2slabs <= 2); + + continue; + } + + size_t size = (folly::Random::rand64() % (100 * GB)) + + alloc->getCacheMemoryStats().ramCacheSize; + testAddPool(alloc, size, false); + } + } +} } // namespace tests } // namespace cachelib } // namespace facebook diff --git a/cachelib/allocator/tests/RebalanceStrategyTest.cpp b/cachelib/allocator/tests/RebalanceStrategyTest.cpp index cb5c4cfd51..849483892d 100644 --- a/cachelib/allocator/tests/RebalanceStrategyTest.cpp +++ b/cachelib/allocator/tests/RebalanceStrategyTest.cpp @@ -214,6 +214,9 @@ class RebalanceStrategyTest : public testing::Test { config.poolRebalancerFreeAllocThreshold = 20; initAllocatorConfigForStrategy(config, LruTailAge); + //TODO: why does this fail with orig. value of 8?
+ //on upstream this fails too, it always reports 4 instead + //of the original test value, which is 8 expected slabs doWork(config, true, 8); } diff --git a/cachelib/allocator/tests/RefCountTest.cpp b/cachelib/allocator/tests/RefCountTest.cpp index 1f31894ddc..f26f76aa4d 100644 --- a/cachelib/allocator/tests/RefCountTest.cpp +++ b/cachelib/allocator/tests/RefCountTest.cpp @@ -101,7 +101,7 @@ void RefCountTest::testBasic() { ASSERT_FALSE(ref.template isFlagSet()); for (uint32_t i = 0; i < RefcountWithFlags::kAccessRefMask; i++) { - ASSERT_TRUE(ref.incRef()); + ASSERT_EQ(ref.incRef(),RefcountWithFlags::incOk); } // Incrementing past the max will fail @@ -152,11 +152,11 @@ void RefCountTest::testBasic() { ASSERT_FALSE(ref.template isFlagSet()); // conditionally set flags - ASSERT_FALSE((ref.markMoving())); + ASSERT_FALSE(ref.markMoving()); ref.markInMMContainer(); // only first one succeeds - ASSERT_TRUE((ref.markMoving())); - ASSERT_FALSE((ref.markMoving())); + ASSERT_TRUE(ref.markMoving()); + ASSERT_FALSE(ref.markMoving()); ref.unmarkInMMContainer(); ref.template setFlag(); @@ -214,20 +214,6 @@ void RefCountTest::testMarkForEvictionAndMoving() { ASSERT_EQ(ret, 0); } - { - // can mark moving when ref count > 0 - RefcountWithFlags ref; - ref.markInMMContainer(); - - ref.incRef(); - - ASSERT_TRUE(ref.markMoving()); - - ref.unmarkInMMContainer(); - auto ret = ref.unmarkMoving(); - ASSERT_EQ(ret, 1); - } - { // cannot mark for eviction when ref count > 0 RefcountWithFlags ref; diff --git a/cachelib/allocator/tests/SimpleRebalancingTest.h b/cachelib/allocator/tests/SimpleRebalancingTest.h index 634882c730..3f1869ede3 100644 --- a/cachelib/allocator/tests/SimpleRebalancingTest.h +++ b/cachelib/allocator/tests/SimpleRebalancingTest.h @@ -104,7 +104,7 @@ class SimpleRebalanceTest : public testing::Test { // Sleep for 2 seconds to let the rebalancing work /* sleep override */ - std::this_thread::sleep_for(std::chrono::seconds(3)); + std::this_thread::sleep_for(std::chrono::seconds(10)); // Evicted keys shouldn't be in the allocator anymore ASSERT_FALSE(evictedKeys.empty()); diff --git a/cachelib/allocator/tests/TestBase-inl.h b/cachelib/allocator/tests/TestBase-inl.h index bf7355c87d..79dc6f44be 100644 --- a/cachelib/allocator/tests/TestBase-inl.h +++ b/cachelib/allocator/tests/TestBase-inl.h @@ -98,6 +98,30 @@ void AllocatorTest::fillUpPoolUntilEvictions( } while (allocs != 0); } +template +void AllocatorTest::fillUpPoolUntilEvictions( + AllocatorT& alloc, + TierId tid, + PoolId poolId, + const std::vector& sizes, + unsigned int keyLen) { + unsigned int allocs = 0; + do { + allocs = 0; + for (const auto size : sizes) { + const auto key = getRandomNewKey(alloc, keyLen); + ASSERT_EQ(alloc.find(key), nullptr); + const size_t prev = alloc.getPoolByTid(poolId, tid).getCurrentAllocSize(); + auto handle = util::allocateAccessible(alloc, poolId, key, size); + if (handle && prev != alloc.getPoolByTid(poolId, tid).getCurrentAllocSize()) { + // this means we did not cause an eviction. 
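Referring back to the RefCountTest hunk above: incRef() now returns an enum (checked against RefcountWithFlags::incOk) instead of a bool, so callers can tell why an increment failed. A hedged sketch of that shape; the failure names beyond incOk and the bit layout are assumptions, not the real RefcountWithFlags encoding:

```cpp
#include <atomic>
#include <cstdint>

class RefcountSketch {
 public:
  enum IncResult { incOk, incFailedMoving, incFailedEviction };
  static constexpr uint64_t kMovingBit = 1ULL << 62;
  static constexpr uint64_t kExclusiveBit = 1ULL << 63;

  IncResult incRef() {
    uint64_t old = bits_.load(std::memory_order_relaxed);
    do {
      if (old & kExclusiveBit) return incFailedEviction; // marked for eviction
      if (old & kMovingBit) return incFailedMoving;      // being moved
      // (the real RefcountWithFlags also rejects increments past its
      // access-ref mask; overflow handling is omitted in this sketch)
    } while (!bits_.compare_exchange_weak(old, old + 1));
    return incOk;
  }

 private:
  std::atomic<uint64_t> bits_{0};
};
```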
+ ASSERT_GE(handle->getSize(), size); + allocs++; + } + } + } while (allocs != 0); +} + template void AllocatorTest::testAllocWithoutEviction( AllocatorT& alloc, @@ -312,7 +336,7 @@ void AllocatorTest::testShmIsRemoved( ASSERT_FALSE(AllocatorT::ShmManager::segmentExists( config.getCacheDir(), detail::kShmHashTableName, config.usePosixShm)); ASSERT_FALSE(AllocatorT::ShmManager::segmentExists( - config.getCacheDir(), detail::kShmCacheName, config.usePosixShm)); + config.getCacheDir(), detail::kShmCacheName + std::to_string(0), config.usePosixShm)); ASSERT_FALSE(AllocatorT::ShmManager::segmentExists( config.getCacheDir(), detail::kShmChainedItemHashTableName, config.usePosixShm)); @@ -326,7 +350,7 @@ void AllocatorTest::testShmIsNotRemoved( ASSERT_TRUE(AllocatorT::ShmManager::segmentExists( config.getCacheDir(), detail::kShmHashTableName, config.usePosixShm)); ASSERT_TRUE(AllocatorT::ShmManager::segmentExists( - config.getCacheDir(), detail::kShmCacheName, config.usePosixShm)); + config.getCacheDir(), detail::kShmCacheName + std::to_string(0), config.usePosixShm)); ASSERT_TRUE(AllocatorT::ShmManager::segmentExists( config.getCacheDir(), detail::kShmChainedItemHashTableName, config.usePosixShm)); diff --git a/cachelib/allocator/tests/TestBase.h b/cachelib/allocator/tests/TestBase.h index 54032e3257..858da2bb95 100644 --- a/cachelib/allocator/tests/TestBase.h +++ b/cachelib/allocator/tests/TestBase.h @@ -69,6 +69,11 @@ class AllocatorTest : public SlabAllocatorTestBase { PoolId pid, const std::vector& sizes, unsigned int keyLen); + void fillUpPoolUntilEvictions(AllocatorT& alloc, + TierId tid, + PoolId pid, + const std::vector& sizes, + unsigned int keyLen); void fillUpOneSlab(AllocatorT& alloc, PoolId poolId, const uint32_t size, diff --git a/cachelib/cachebench/cache/Cache-inl.h b/cachelib/cachebench/cache/Cache-inl.h index ab2908558f..f1c5248718 100644 --- a/cachelib/cachebench/cache/Cache-inl.h +++ b/cachelib/cachebench/cache/Cache-inl.h @@ -46,6 +46,16 @@ Cache::Cache(const CacheConfig& config, config_.getRebalanceStrategy(), std::chrono::seconds(config_.poolRebalanceIntervalSec)); + allocatorConfig_.enableBackgroundEvictor( + config_.getBackgroundEvictorStrategy(), + std::chrono::milliseconds(config_.backgroundEvictorIntervalMilSec), + config_.evictorThreads); + + allocatorConfig_.enableBackgroundPromoter( + config_.getBackgroundPromoterStrategy(), + std::chrono::milliseconds(config_.backgroundPromoterIntervalMilSec), + config_.promoterThreads); + if (config_.moveOnSlabRelease && movingSync != nullptr) { allocatorConfig_.enableMovingOnSlabRelease( [](Item& oldItem, Item& newItem, Item* parentPtr) { @@ -94,12 +104,20 @@ Cache::Cache(const CacheConfig& config, allocatorConfig_.configureMemoryTiers(config_.memoryTierConfigs); } + allocatorConfig_.insertToFirstFreeTier = config_.insertToFirstFreeTier; + auto cleanupGuard = folly::makeGuard([&] { if (!nvmCacheFilePath_.empty()) { util::removePath(nvmCacheFilePath_); } }); + allocatorConfig_.maxEvictionBatch = config_.maxEvictionBatch; + allocatorConfig_.maxPromotionBatch = config_.maxPromotionBatch; + allocatorConfig_.minEvictionBatch = config_.minEvictionBatch; + allocatorConfig_.minPromotionBatch = config_.minPromotionBatch; + allocatorConfig_.maxEvictionPromotionHotness = config_.maxEvictionPromotionHotness; + if (config_.enableItemDestructorCheck) { auto removeCB = [&](const typename Allocator::DestructorData& data) { if (!itemRecords_.validate(data)) { @@ -629,28 +647,36 @@ double Cache::getNvmBytesWritten() const { template Stats 
Cache::getStats() const { - PoolStats aggregate = cache_->getPoolStats(pools_[0]); - auto usageFraction = - 1.0 - (static_cast(aggregate.freeMemoryBytes())) / - aggregate.poolUsableSize; + Stats ret; - ret.poolUsageFraction.push_back(usageFraction); - for (size_t pid = 1; pid < pools_.size(); pid++) { - auto poolStats = cache_->getPoolStats(static_cast(pid)); - usageFraction = 1.0 - (static_cast(poolStats.freeMemoryBytes())) / - poolStats.poolUsableSize; - ret.poolUsageFraction.push_back(usageFraction); - aggregate += poolStats; + for (TierId tid = 0; tid < cache_->getNumTiers(); tid++) { + PoolStats aggregate = cache_->getPoolStats(tid,pools_[0]); + auto usageFraction = + 1.0 - (static_cast(aggregate.freeMemoryBytes())) / + aggregate.poolUsableSize; + ret.poolUsageFraction[tid].push_back(usageFraction); + for (size_t pid = 1; pid < pools_.size(); pid++) { + auto poolStats = cache_->getPoolStats(tid, static_cast(pid)); + usageFraction = 1.0 - (static_cast(poolStats.freeMemoryBytes())) / + poolStats.poolUsableSize; + ret.poolUsageFraction[tid].push_back(usageFraction); + aggregate += poolStats; + } + ret.numEvictions.push_back(aggregate.numEvictions()); + ret.numWritebacks.push_back(aggregate.numWritebacks()); + ret.numCacheHits.push_back(aggregate.numHits()); + ret.numItems.push_back(aggregate.numItems()); } - std::map> allocationClassStats{}; + std::map>> allocationClassStats{}; for (size_t pid = 0; pid < pools_.size(); pid++) { PoolId poolId = static_cast(pid); auto poolStats = cache_->getPoolStats(poolId); auto cids = poolStats.getClassIds(); - for (auto [cid, stats] : poolStats.mpStats.acStats) { - allocationClassStats[poolId][cid] = stats; + for (TierId tid = 0; tid < cache_->getNumTiers(); tid++) { + for (auto cid : cids) + allocationClassStats[tid][pid][cid] = cache_->getACStats(tid, pid, cid); } } @@ -659,8 +685,21 @@ Stats Cache::getStats() const { const auto navyStats = cache_->getNvmCacheStatsMap().toMap(); ret.allocationClassStats = allocationClassStats; - ret.numEvictions = aggregate.numEvictions(); - ret.numItems = aggregate.numItems(); + + ret.backgndEvicStats.nEvictedItems = + cacheStats.evictionStats.numMovedItems; + ret.backgndEvicStats.nTraversals = + cacheStats.evictionStats.runCount; + ret.backgndEvicStats.nClasses = + cacheStats.evictionStats.totalClasses; + ret.backgndEvicStats.evictionSize = + cacheStats.evictionStats.totalBytesMoved; + + ret.backgndPromoStats.nPromotedItems = + cacheStats.promotionStats.numMovedItems; + ret.backgndPromoStats.nTraversals = + cacheStats.promotionStats.runCount; + ret.evictAttempts = cacheStats.evictionAttempts; ret.allocAttempts = cacheStats.allocAttempts; ret.allocFailures = cacheStats.allocFailures; @@ -704,6 +743,8 @@ Stats Cache::getStats() const { static_cast(itemRecords_.count()) - totalDestructor_; ret.cacheAllocateLatencyNs = cacheStats.allocateLatencyNs; + ret.cacheBgEvictLatencyNs = cacheStats.bgEvictLatencyNs; + ret.cacheBgPromoteLatencyNs = cacheStats.bgPromoteLatencyNs; ret.cacheFindLatencyNs = cacheFindLatency_.estimate(); // Populate counters. 
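getStats() above now keys its per-class statistics by tier as well as pool and class. The triple-nested map reads more easily behind an alias; a sketch where the names follow the diff but the concrete typedefs and the alias itself are suggestions:

```cpp
#include <cstdint>
#include <map>

using TierId = int32_t;
using PoolId = int8_t;
using ClassId = int8_t;
struct ACStats {}; // stand-in for the real allocation-class stats

// tier -> pool -> allocation class -> value
template <typename V>
using PerTierPoolClass = std::map<TierId, std::map<PoolId, std::map<ClassId, V>>>;

using AllocationClassStats = PerTierPoolClass<ACStats>;
using BackgroundMoveCounts = PerTierPoolClass<uint64_t>;
```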
@@ -712,6 +753,9 @@ Stats Cache::getStats() const { ret.nvmCounters = cache_->getNvmCacheStatsMap().toMap(); } + ret.backgroundEvictionClasses = cache_->getBackgroundMoverClassStats(MoverDir::Evict); + ret.backgroundPromotionClasses = cache_->getBackgroundMoverClassStats(MoverDir::Promote); + // nvm stats from navy if (!isRamOnly() && !navyStats.empty()) { auto lookup = [&navyStats](const std::string& key) { diff --git a/cachelib/cachebench/cache/Cache.h b/cachelib/cachebench/cache/Cache.h index fa0a2ea556..a85c1efb66 100644 --- a/cachelib/cachebench/cache/Cache.h +++ b/cachelib/cachebench/cache/Cache.h @@ -325,6 +325,10 @@ class Cache { // return the stats for the pool. PoolStats getPoolStats(PoolId pid) const { return cache_->getPoolStats(pid); } + ACStats getACStats(TierId tid, PoolId pid, ClassId cid) const { + return cache_->getACStats(tid, pid, cid); + } + // return the total number of inconsistent operations detected since start. unsigned int getInconsistencyCount() const { return inconsistencyCount_.load(std::memory_order_relaxed); diff --git a/cachelib/cachebench/cache/CacheStats.h b/cachelib/cachebench/cache/CacheStats.h index c673c4b406..4285462abf 100644 --- a/cachelib/cachebench/cache/CacheStats.h +++ b/cachelib/cachebench/cache/CacheStats.h @@ -26,15 +26,44 @@ DECLARE_string(report_ac_memory_usage_stats); namespace facebook { namespace cachelib { namespace cachebench { + +struct BackgroundEvictionStats { + // number of items this worker evicted, based on pool/class stats + uint64_t nEvictedItems{0}; + + // number of times the background worker thread ran (traversals) + uint64_t nTraversals{0}; + + // number of classes + uint64_t nClasses{0}; + + // total bytes of evicted items + uint64_t evictionSize{0}; +}; + +struct BackgroundPromotionStats { + // number of items this worker promoted, based on pool/class stats + uint64_t nPromotedItems{0}; + + // number of times the background worker thread ran (traversals) + uint64_t nTraversals{0}; +}; + struct Stats { - uint64_t numEvictions{0}; - uint64_t numItems{0}; + BackgroundEvictionStats backgndEvicStats; + BackgroundPromotionStats backgndPromoStats; + ReaperStats reaperStats; + + std::vector<uint64_t> numEvictions; + std::vector<uint64_t> numWritebacks; + std::vector<uint64_t> numCacheHits; + std::vector<uint64_t> numItems; - uint64_t evictAttempts{0}; - uint64_t allocAttempts{0}; - uint64_t allocFailures{0}; + std::vector<uint64_t> evictAttempts{0}; + std::vector<uint64_t> allocAttempts{0}; + std::vector<uint64_t> allocFailures{0}; - std::vector<double> poolUsageFraction; + std::map<TierId, std::vector<double>> poolUsageFraction; uint64_t numCacheGets{0}; uint64_t numCacheGetMiss{0}; @@ -62,6 +91,8 @@ struct Stats { uint64_t numNvmItemRemovedSetSize{0}; util::PercentileStats::Estimates cacheAllocateLatencyNs; + util::PercentileStats::Estimates cacheBgEvictLatencyNs; + util::PercentileStats::Estimates cacheBgPromoteLatencyNs; util::PercentileStats::Estimates cacheFindLatencyNs; double nvmReadLatencyMicrosP50{0}; @@ -101,36 +132,71 @@ struct Stats { uint64_t invalidDestructorCount{0}; int64_t unDestructedItemCount{0}; - std::map<PoolId, std::map<ClassId, ACStats>> allocationClassStats; + std::map<TierId, std::map<PoolId, std::map<ClassId, ACStats>>> allocationClassStats; // populate the counters related to nvm usage. Cache implementation can decide // what to populate since not all of those are interesting when running // cachebench. std::unordered_map<std::string, uint64_t> nvmCounters; + std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>> backgroundEvictionClasses; + std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>> backgroundPromotionClasses; + // errors from the nvm engine.
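Because counters such as numItems are now per-tier vectors, flat counters have to be produced by summing across tiers (as exportStats does later in this file with std::accumulate). A tiny check with invented values:

```cpp
#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

int main() {
  std::vector<uint64_t> numItems{120, 45}; // invented per-tier item counts
  const uint64_t total =
      std::accumulate(numItems.begin(), numItems.end(), uint64_t{0});
  assert(total == 165);
  return 0;
}
```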
std::unordered_map nvmErrors; void render(std::ostream& out) const { auto totalMisses = getTotalMisses(); const double overallHitRatio = invertPctFn(totalMisses, numCacheGets); - out << folly::sformat("Items in RAM : {:,}", numItems) << std::endl; - out << folly::sformat("Items in NVM : {:,}", numNvmItems) << std::endl; - - out << folly::sformat("Alloc Attempts: {:,} Success: {:.2f}%", - allocAttempts, - invertPctFn(allocFailures, allocAttempts)) - << std::endl; - out << folly::sformat("Evict Attempts: {:,} Success: {:.2f}%", - evictAttempts, - pctFn(numEvictions, evictAttempts)) - << std::endl; - out << folly::sformat("RAM Evictions : {:,}", numEvictions) << std::endl; + const auto nTiers = numItems.size(); + for (TierId tid = 0; tid < nTiers; tid++) { + out << folly::sformat("Items in Tier {} : {:,}", tid, numItems[tid]) << std::endl; + } + out << folly::sformat("Items in NVM : {:,}", numNvmItems) << std::endl; + for (TierId tid = 0; tid < nTiers; tid++) { + out << folly::sformat("Tier {} Alloc Attempts: {:,}\n" + "Tier {} Alloc Success: {:.2f}%", + tid, allocAttempts[tid], + tid, invertPctFn(allocFailures[tid], allocAttempts[tid])) + << std::endl; + } + for (TierId tid = 0; tid < nTiers; tid++) { + out << folly::sformat( + "Tier {} Evict Attempts: {:,}\n" + "Tier {} Success: {:.2f}%", + tid, evictAttempts[tid], + tid, pctFn(numEvictions[tid], evictAttempts[tid])) + << std::endl; + } + for (TierId tid = 0; tid < nTiers; tid++) { + out << folly::sformat("Tier {} Evictions: {:,}\n" + "Tier {} Writebacks: {:,}\n" + "Tier {} Success: {:.2f}%", + tid, numEvictions[tid], + tid, numWritebacks[tid], + tid, invertPctFn(numEvictions[tid] - numWritebacks[tid], numEvictions[tid])) + << std::endl; + } + auto foreachAC = [&](auto &map, auto cb) { + for (auto &tidStats : map) { + for (auto &pidStat : tidStats.second) { + for (auto &cidStat : pidStat.second) { + cb(tidStats.first, pidStat.first, cidStat.first, cidStat.second); + } + } + } + }; - for (auto pid = 0U; pid < poolUsageFraction.size(); pid++) { - out << folly::sformat("Fraction of pool {:,} used : {:.2f}", pid, - poolUsageFraction[pid]) - << std::endl; + for (auto entry : poolUsageFraction) { + auto tid = entry.first; + auto usageFraction = entry.second; + for (auto pid = 0U; pid < usageFraction.size(); pid++) { + out << folly::sformat("Tier {} fraction of pool {:,} used : {:.2f}", + tid, + pid, + usageFraction[pid]) + << std::endl; + } } if (FLAGS_report_ac_memory_usage_stats != "") { @@ -155,58 +221,57 @@ struct Stats { }; auto foreachAC = [&](auto cb) { - for (auto& pidStat : allocationClassStats) { - for (auto& cidStat : pidStat.second) { - cb(pidStat.first, cidStat.first, cidStat.second); + for (auto& tidStat : allocationClassStats) { + for (auto& pidStat : tidStat.second) { + for (auto& cidStat : pidStat.second) { + cb(tidStat.first, pidStat.first, cidStat.first, cidStat.second); + } } } }; - foreachAC([&](auto pid, auto cid, auto stats) { + foreachAC([&](auto tid, auto pid, auto cid, auto stats) { auto [allocSizeSuffix, allocSize] = formatMemory(stats.allocSize); auto [memorySizeSuffix, memorySize] = - formatMemory(stats.activeAllocs * stats.allocSize); - out << folly::sformat("pid{:2} cid{:4} {:8.2f}{} memorySize: {:8.2f}{}", - pid, cid, allocSize, allocSizeSuffix, memorySize, - memorySizeSuffix) - << std::endl; - }); - - foreachAC([&](auto pid, auto cid, auto stats) { - auto [allocSizeSuffix, allocSize] = formatMemory(stats.allocSize); + formatMemory(stats.totalAllocatedSize()); // If the pool is not full, extrapolate usageFraction 
for AC assuming it + // will grow at the same rate. This value will be the same for all ACs. - double acUsageFraction; - if (poolUsageFraction[pid] < 1.0) { - acUsageFraction = poolUsageFraction[pid]; - } else if (stats.usedSlabs == 0) { - acUsageFraction = 0.0; - } else { - acUsageFraction = - stats.activeAllocs / (stats.usedSlabs * stats.allocsPerSlab); - } + const auto acUsageFraction = (poolUsageFraction.at(tid)[pid] < 1.0) + ? poolUsageFraction.at(tid)[pid] + : stats.usageFraction(); out << folly::sformat( - "pid{:2} cid{:4} {:8.2f}{} usageFraction: {:4.2f}", pid, cid, - allocSize, allocSizeSuffix, acUsageFraction) + "tid{:2} pid{:2} cid{:4} {:8.2f}{} usage fraction: {:4.2f}\n" + "tid{:2} pid{:2} cid{:4} {:8.2f}{} memory size in {}: {:8.2f}\n" + "tid{:2} pid{:2} cid{:4} {:8.2f}{} rolling avg alloc latency in ns: {:8.2f}", + tid, pid, cid, allocSize, allocSizeSuffix, acUsageFraction, + tid, pid, cid, allocSize, allocSizeSuffix, memorySizeSuffix, memorySize, + tid, pid, cid, allocSize, allocSizeSuffix, stats.allocLatencyNs.estimate()) << std::endl; }); } + out << folly::sformat("Tier 0 Background Evicted Items : {:,}", + backgndEvicStats.nEvictedItems) << std::endl; + out << folly::sformat("Tier 0 Background Traversals : {:,}", + backgndEvicStats.nTraversals) << std::endl; if (numCacheGets > 0) { out << folly::sformat("Cache Gets : {:,}", numCacheGets) << std::endl; out << folly::sformat("Hit Ratio : {:6.2f}%", overallHitRatio) << std::endl; - + for (TierId tid = 0; tid < numCacheHits.size(); tid++) { + double tierHitRatio = pctFn(numCacheHits[tid], numCacheGets); + out << folly::sformat("Tier {} Hit Ratio : {:6.2f}%", tid, tierHitRatio) + << std::endl; + } if (FLAGS_report_api_latency) { auto printLatencies = [&out](folly::StringPiece cat, const util::PercentileStats::Estimates& latency) { auto fmtLatency = [&out, &cat](folly::StringPiece pct, double val) { - out << folly::sformat("{:20} {:8} : {:>10.2f} ns\n", cat, pct, - val); + out << folly::sformat("{:20} {:8} in ns: {:>10.2f}\n", cat, pct, val); }; fmtLatency("p50", latency.p50); @@ -221,9 +286,35 @@ struct Stats { printLatencies("Cache Find API latency", cacheFindLatencyNs); printLatencies("Cache Allocate API latency", cacheAllocateLatencyNs); + printLatencies("Cache Background Eviction API latency", cacheBgEvictLatencyNs); + printLatencies("Cache Background Promotion API latency", cacheBgPromoteLatencyNs); } } + if (!backgroundEvictionClasses.empty() && backgndEvicStats.nEvictedItems > 0) { + out << "== Class Background Eviction Counters Map ==" << std::endl; + foreachAC(backgroundEvictionClasses, [&](auto tid, auto pid, auto cid, auto evicted){ + out << folly::sformat("tid{:2} pid{:2} cid{:4} evicted: {:4}", + tid, pid, cid, evicted) << std::endl; + }); + } + if (!backgroundPromotionClasses.empty() && backgndPromoStats.nPromotedItems > 0) { + out << "== Class Background Promotion Counters Map ==" << std::endl; + foreachAC(backgroundPromotionClasses, [&](auto tid, auto pid, auto cid, auto promoted){ + out << folly::sformat("tid{:2} pid{:2} cid{:4} promoted: {:4}", + tid, pid, cid, promoted) << std::endl; + }); + } + + if (reaperStats.numReapedItems > 0) { + out << folly::sformat("Reaper reaped: {:,} visited: {:,} traversals: {:,} avg traversal time: {:,}", + reaperStats.numReapedItems, reaperStats.numVisitedItems, + reaperStats.numTraversals, reaperStats.avgTraversalTimeMs) + << std::endl; + } + if (numNvmGets > 0 || numNvmDeletes > 0 || numNvmPuts > 0) { const double ramHitRatio = invertPctFn(numCacheGetMiss, numCacheGets);
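render() above defines foreachAC twice as a local lambda walking the tier/pool/class maps. The traversal generalizes naturally to a small free-standing template; a sketch (not the patch's code, just the same walk factored out):

```cpp
#include <map>

// Walks a tier -> pool -> class map and invokes cb(tid, pid, cid, value).
template <typename Map, typename Cb>
void forEachAC(const Map& map, Cb cb) {
  for (const auto& [tid, pools] : map) {
    for (const auto& [pid, classes] : pools) {
      for (const auto& [cid, value] : classes) {
        cb(tid, pid, cid, value);
      }
    }
  }
}
```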
const double nvmHitRatio = invertPctFn(numNvmGetMiss, numNvmGets); @@ -269,15 +360,15 @@ struct Stats { double devWriteAmp = pctFn(numNvmNandBytesWritten, numNvmBytesWritten) / 100.0; - out << folly::sformat("NVM bytes written (physical) : {:6.2f} GB\n", + out << folly::sformat("NVM bytes written (physical) in GB : {:6.2f}\n", numNvmBytesWritten / GB); - out << folly::sformat("NVM bytes written (logical) : {:6.2f} GB\n", + out << folly::sformat("NVM bytes written (logical) in GB : {:6.2f}\n", numNvmLogicalBytesWritten / GB); - out << folly::sformat("NVM bytes written (nand) : {:6.2f} GB\n", + out << folly::sformat("NVM bytes written (nand) in GB : {:6.2f}\n", numNvmNandBytesWritten / GB); - out << folly::sformat("NVM app write amplification : {:6.2f}\n", + out << folly::sformat("NVM app write amplification : {:6.2f}\n", appWriteAmp); - out << folly::sformat("NVM dev write amplification : {:6.2f}\n", + out << folly::sformat("NVM dev write amplification : {:6.2f}\n", devWriteAmp); } const double putSuccessPct = @@ -286,62 +377,57 @@ struct Stats { numNvmPuts); const double cleanEvictPct = pctFn(numNvmCleanEvict, numNvmEvictions); const double getCoalescedPct = pctFn(numNvmGetCoalesced, numNvmGets); - out << folly::sformat("{:14}: {:15,}, {:10}: {:6.2f}%", - "NVM Gets", - numNvmGets, - "Coalesced", - getCoalescedPct) + out << folly::sformat("{:30}: {:10,}\n" + "{:30}: {:10.2f}", + "NVM Gets", numNvmGets, + "NVM Coalesced in pct", getCoalescedPct) << std::endl; out << folly::sformat( - "{:14}: {:15,}, {:10}: {:6.2f}%, {:8}: {:6.2f}%, {:16}: " - "{:8,}, {:16}: {:8,}", - "NVM Puts", - numNvmPuts, - "Success", - putSuccessPct, - "Clean", - pctFn(numNvmPutFromClean, numNvmPuts), - "AbortsFromDel", - numNvmAbortedPutOnTombstone, - "AbortsFromGet", - numNvmAbortedPutOnInflightGet) + "{:30}: {:10,}\n" + "{:30}: {:10.2f}\n" + "{:30}: {:10.2f}\n" + "{:30}: {:10,}\n" + "{:30}: {:10,}", + "NVM Puts", numNvmPuts, + "NVM Puts Success in pct", putSuccessPct, + "NVM Puts from Clean in pct", pctFn(numNvmPutFromClean, numNvmPuts), + "NVM AbortsFromDel", numNvmAbortedPutOnTombstone, + "NVM AbortsFromGet", numNvmAbortedPutOnInflightGet) << std::endl; out << folly::sformat( - "{:14}: {:15,}, {:10}: {:6.2f}%, {:8}: {:7,}," - " {:16}: {:8,}", - "NVM Evicts", - numNvmEvictions, - "Clean", - cleanEvictPct, - "Unclean", - numNvmUncleanEvict, - "Double", - numNvmCleanDoubleEvict) + "{:30}: {:10,}\n" + "{:30}: {:10.2f}\n" + "{:30}: {:10,}\n" + "{:30}: {:10,}", + "NVM Evicts", numNvmEvictions, + "NVM Clean Evicts in pct", cleanEvictPct, + "NVM Unclean Evicts", numNvmUncleanEvict, + "NVM Clean Double Evicts", numNvmCleanDoubleEvict) << std::endl; const double skippedDeletesPct = pctFn(numNvmSkippedDeletes, numNvmDeletes); - out << folly::sformat("{:14}: {:15,} {:14}: {:6.2f}%", - "NVM Deletes", - numNvmDeletes, - "Skipped Deletes", - skippedDeletesPct) + out << folly::sformat("{:30}: {:10,}\n" + "{:30}: {:10.2f}", + "NVM Deletes", numNvmDeletes, + "NVM Skipped Deletes in pct", skippedDeletesPct) << std::endl; if (numNvmExceededMaxRetry > 0) { - out << folly::sformat("{}: {}", "NVM max read retry reached", + out << folly::sformat("{:30}: {:10,}", "NVM max read retry reached", numNvmExceededMaxRetry) << std::endl; } if (slabsReleased > 0) { out << folly::sformat( - "Released {:,} slabs\n" - " Moves : attempts: {:10,}, success: {:6.2f}%\n" - " Evictions : attempts: {:10,}, success: {:6.2f}%", + "Released slabs: {:,}\n" + "Slab Move attempts: {:10,}\n" + "Slab Move success in pct: {:6.2f}\n" + "Slab Eviction attempts: 
{:10,}\n" + "Slab Eviction success in pct: {:6.2f}", slabsReleased, moveAttemptsForSlabRelease, pctFn(moveSuccessesForSlabRelease, moveAttemptsForSlabRelease), evictionAttemptsForSlabRelease, - pctFn(evictionSuccessesForSlabRelease, - evictionAttemptsForSlabRelease)) + pctFn(evictionSuccessesForSlabRelease, evictionAttemptsForSlabRelease)) << std::endl; } @@ -359,8 +445,13 @@ struct Stats { } if (numCacheEvictions > 0) { - out << folly::sformat("Total eviction executed {}", numCacheEvictions) - << std::endl; + out << folly::sformat("Total evictions executed : {:10,}", numCacheEvictions) + << std::endl; + out << folly::sformat("Total background evictions: {:10,}", totalbgevicted) + << std::endl; + } + if (totalpromoted > 0) { + out << folly::sformat("Total promotions : {:10,}", totalpromoted) << std::endl; } } @@ -368,46 +459,43 @@ struct Stats { return numNvmGets > 0 ? numNvmGetMiss : numCacheGetMiss; } - std::tuple getHitRatios( - const Stats& prevStats) const { - double overallHitRatio = 0.0; - double ramHitRatio = 0.0; - double nvmHitRatio = 0.0; - - if (numCacheGets > prevStats.numCacheGets) { - auto totalMisses = getTotalMisses(); - auto prevTotalMisses = prevStats.getTotalMisses(); - - overallHitRatio = invertPctFn(totalMisses - prevTotalMisses, - numCacheGets - prevStats.numCacheGets); - - ramHitRatio = invertPctFn(numCacheGetMiss - prevStats.numCacheGetMiss, - numCacheGets - prevStats.numCacheGets); - } - - if (numNvmGets > prevStats.numNvmGets) { - nvmHitRatio = invertPctFn(numNvmGetMiss - prevStats.numNvmGetMiss, - numNvmGets - prevStats.numNvmGets); + double getOverallHitRatio(const Stats& prevStats) const { + auto totalMisses = getTotalMisses(); + auto prevTotalMisses = prevStats.getTotalMisses(); + if (numCacheGets <= prevStats.numCacheGets || + totalMisses <= prevTotalMisses) { + return 0.0; } - return std::make_tuple(overallHitRatio, ramHitRatio, nvmHitRatio); + return invertPctFn(totalMisses - prevTotalMisses, + numCacheGets - prevStats.numCacheGets); } // Render the stats based on the delta between overall stats and previous // stats. It can be used to render the stats in the last time period. 
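Before the delta-based render() below, a quick numeric check of the interval math used by getOverallHitRatio (all numbers invented; invertPctFn is reconstructed from usage as 100 minus pctFn, which is an assumption):

```cpp
#include <cassert>
#include <cstdint>

// Same shape as the helpers in this file; pctFn's zero-total case follows
// the change later in this diff (returns 100.0, not 0).
double pctFn(uint64_t ops, uint64_t total) {
  return total == 0 ? 100.0 : 100.0 * static_cast<double>(ops) / total;
}
double invertPctFn(uint64_t ops, uint64_t total) { return 100.0 - pctFn(ops, total); }

int main() {
  // previous snapshot: 1000 gets, 300 misses; current: 1500 gets, 400 misses
  const double intervalHitRatio = invertPctFn(400 - 300, 1500 - 1000);
  assert(intervalHitRatio == 80.0); // 100 new misses over 500 new gets
  return 0;
}
```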
void render(const Stats& prevStats, std::ostream& out) const { - if (numCacheGets > prevStats.numCacheGets) { - auto [overallHitRatio, ramHitRatio, nvmHitRatio] = - getHitRatios(prevStats); + auto totalMisses = getTotalMisses(); + auto prevTotalMisses = prevStats.getTotalMisses(); + if (numCacheGets > prevStats.numCacheGets && + totalMisses >= prevTotalMisses) { + const double overallHitRatio = invertPctFn( + totalMisses - prevTotalMisses, numCacheGets - prevStats.numCacheGets); out << folly::sformat("Cache Gets : {:,}", numCacheGets - prevStats.numCacheGets) << std::endl; - out << folly::sformat("Hit Ratio : {:6.2f}%", overallHitRatio) + out << folly::sformat("Overall Hit Ratio in pct: {:6.2f}", overallHitRatio) << std::endl; + const double ramHitRatio = + invertPctFn(numCacheGetMiss - prevStats.numCacheGetMiss, + numCacheGets - prevStats.numCacheGets); + const double nvmHitRatio = + invertPctFn(numNvmGetMiss - prevStats.numNvmGetMiss, + numNvmGets - prevStats.numNvmGets); + out << folly::sformat( - "RAM Hit Ratio : {:6.2f}%\n" - "NVM Hit Ratio : {:6.2f}%\n", + "RAM Hit Ratio in pct: {:6.2f}\n" + "NVM Hit Ratio in pct: {:6.2f}\n", ramHitRatio, nvmHitRatio); } } @@ -418,12 +506,15 @@ struct Stats { }; auto totalMisses = getTotalMisses(); - counters["num_items"] = numItems; + //TODO: per tier + counters["num_items"] = std::accumulate(numItems.begin(),numItems.end(),0); counters["num_nvm_items"] = numNvmItems; counters["hit_rate"] = calcInvertPctFn(totalMisses, numCacheGets); counters["find_latency_p99"] = cacheFindLatencyNs.p99; counters["alloc_latency_p99"] = cacheAllocateLatencyNs.p99; + counters["bg_evict_latency_p99"] = cacheBgEvictLatencyNs.p99; + counters["bg_promote_latency_p99"] = cacheBgPromoteLatencyNs.p99; counters["ram_hit_rate"] = calcInvertPctFn(numCacheGetMiss, numCacheGets); counters["nvm_hit_rate"] = calcInvertPctFn(numCacheGetMiss, numCacheGets); @@ -492,7 +583,7 @@ struct Stats { private: static double pctFn(uint64_t ops, uint64_t total) { return total == 0 - ? 0 + ? 100.0 : 100.0 * static_cast(ops) / static_cast(total); } diff --git a/cachelib/cachebench/runner/CacheStressor.h b/cachelib/cachebench/runner/CacheStressor.h index b222fa421f..41090c93a3 100644 --- a/cachelib/cachebench/runner/CacheStressor.h +++ b/cachelib/cachebench/runner/CacheStressor.h @@ -77,7 +77,7 @@ class CacheStressor : public Stressor { std::unique_lock lock; CacheStressSyncObj(CacheStressor& s, std::string itemKey) - : lock{s.chainedItemAcquireUniqueLock(itemKey)} {} + : lock{s.chainedItemTryAcquireUniqueLock(itemKey)} {} }; movingSync = [this](typename CacheT::Item::Key key) { return std::make_unique(*this, key.str()); @@ -247,6 +247,10 @@ class CacheStressor : public Stressor { using Lock = std::unique_lock; return lockEnabled_ ? Lock{getLock(key)} : Lock{}; } + auto chainedItemTryAcquireUniqueLock(Key key) { + using Lock = std::unique_lock; + return lockEnabled_ ? Lock{getLock(key), std::try_to_lock} : Lock{}; + } // populate the input item handle according to the stress setup. 
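chainedItemTryAcquireUniqueLock above switches the moving sync to a non-blocking acquire, so the slab-release path can back off and retry instead of deadlocking against a stressor thread that already holds the chained-item lock. The std::try_to_lock pattern in isolation (names here are illustrative):

```cpp
#include <mutex>

std::mutex chainedItemLock; // stands in for the stressor's per-key lock

bool tryMoveUnderLock() {
  std::unique_lock<std::mutex> lk(chainedItemLock, std::try_to_lock);
  if (!lk.owns_lock()) {
    return false; // lock is busy; the move attempt is retried later
  }
  // ... perform the move while holding the lock ...
  return true;
}
```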
void populateItem(WriteHandle& handle, const std::string& itemValue = "") { diff --git a/cachelib/cachebench/test_configs/consistency/navy-multi-tier.json b/cachelib/cachebench/test_configs/consistency/navy-multi-tier.json new file mode 100644 index 0000000000..076550bc5c --- /dev/null +++ b/cachelib/cachebench/test_configs/consistency/navy-multi-tier.json @@ -0,0 +1,54 @@ +{ + "cache_config" : { + "cacheSizeMB" : 300, + "poolRebalanceIntervalSec" : 1, + "moveOnSlabRelease" : true, + + "cacheDir": "/tmp/mem-tier2", + "memoryTiers" : [ + { + "ratio": 1, + "memBindNodes": 0 + }, + { + "ratio": 1, + "memBindNodes": 0 + } + ], + + "numPools" : 2, + "poolSizes" : [0.5, 0.5], + "allocFactor" : 2.0, + "nvmCacheSizeMB" : 1024 + }, + "test_config" : + { + + "checkConsistency" : true, + + "numOps" : 60000, + "numThreads" : 20, + "numKeys" : 200000, + + + "keySizeRange" : [1, 8, 64], + "keySizeRangeProbability" : [0.5, 0.5], + + "valSizeRange" : [256, 1024, 4096, 8192], + "valSizeRangeProbability" : [0.2, 0.7, 0.1], + + "chainedItemLengthRange" : [1, 2, 4, 32], + "chainedItemLengthRangeProbability" : [0.8, 0.18, 0.02], + + "chainedItemValSizeRange" : [1, 128, 256, 1024, 4096, 20480], + "chainedItemValSizeRangeProbability" : [0.1, 0.1, 0.2, 0.3, 0.3], + + "getRatio" : 0.8, + "setRatio" : 0.1, + "delRatio" : 0.0, + "addChainedRatio" : 0.05, + "keyPoolDistribution": [0.5, 0.5], + "opPoolDistribution" : [0.5, 0.5] + } + +} diff --git a/cachelib/cachebench/test_configs/consistency/navy.json b/cachelib/cachebench/test_configs/consistency/navy.json index 73b016a50f..b95b056d31 100644 --- a/cachelib/cachebench/test_configs/consistency/navy.json +++ b/cachelib/cachebench/test_configs/consistency/navy.json @@ -14,8 +14,8 @@ "checkConsistency" : true, - "numOps" : 30000000, - "numThreads" : 40, + "numOps" : 600000, + "numThreads" : 20, "numKeys" : 200000, diff --git a/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-4GB-DRAM-4GB-PMEM.json b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-4GB-DRAM-4GB-PMEM.json new file mode 100644 index 0000000000..d9acdf7c6c --- /dev/null +++ b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-4GB-DRAM-4GB-PMEM.json @@ -0,0 +1,42 @@ +{ + "cache_config": { + "cacheSizeMB": 8192, + "poolRebalanceIntervalSec": 0, + "cacheDir": "/tmp/mem-tiers", + "memoryTiers" : [ + { + "ratio": 1, + "memBindNodes": 0 + }, + { + "ratio": 1, + "memBindNodes": 0 + } + ] + }, + "test_config": + { + "addChainedRatio": 0.0, + "delRatio": 0.0, + "enableLookaside": true, + "getRatio": 0.7684563460126871, + "keySizeRange": [ + 1, + 8, + 64 + ], + "keySizeRangeProbability": [ + 0.3, + 0.7 + ], + "loneGetRatio": 0.2315436539873129, + "numKeys": 71605574, + "numOps": 5000000, + "numThreads": 24, + "popDistFile": "pop.json", + + "setRatio": 0.0, + "valSizeDistFile": "sizes.json" + } + +} diff --git a/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-DRAM.json b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-DRAM.json new file mode 100644 index 0000000000..6d47e08b74 --- /dev/null +++ b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-DRAM.json @@ -0,0 +1,32 @@ +{ + "cache_config": { + "cacheSizeMB": 8192, + "poolRebalanceIntervalSec": 0, + "cacheDir": "/tmp/mem-tier" + }, + "test_config": + { + "addChainedRatio": 0.0, + "delRatio": 0.0, + "enableLookaside": true, + "getRatio": 0.7684563460126871, + "keySizeRange": [ + 1, + 8, + 64 + ], + 
"keySizeRangeProbability": [ + 0.3, + 0.7 + ], + "loneGetRatio": 0.2315436539873129, + "numKeys": 71605574, + "numOps": 5000000, + "numThreads": 24, + "popDistFile": "pop.json", + + "setRatio": 0.0, + "valSizeDistFile": "sizes.json" + } + +} diff --git a/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-PMEM.json b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-PMEM.json new file mode 100644 index 0000000000..4feab55154 --- /dev/null +++ b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-PMEM.json @@ -0,0 +1,38 @@ +{ + "cache_config": { + "cacheSizeMB": 8192, + "poolRebalanceIntervalSec": 0, + "cacheDir": "/tmp/mem-tier", + "memoryTiers" : [ + { + "ratio": 1, + "memBindNodes": 0 + } + ] + }, + "test_config": + { + "addChainedRatio": 0.0, + "delRatio": 0.0, + "enableLookaside": true, + "getRatio": 0.7684563460126871, + "keySizeRange": [ + 1, + 8, + 64 + ], + "keySizeRangeProbability": [ + 0.3, + 0.7 + ], + "loneGetRatio": 0.2315436539873129, + "numKeys": 71605574, + "numOps": 5000000, + "numThreads": 24, + "popDistFile": "pop.json", + + "setRatio": 0.0, + "valSizeDistFile": "sizes.json" + } + +} diff --git a/cachelib/cachebench/test_configs/simple_tiers_test.json b/cachelib/cachebench/test_configs/simple_tiers_test.json index 182bb514cb..58302b9f20 100644 --- a/cachelib/cachebench/test_configs/simple_tiers_test.json +++ b/cachelib/cachebench/test_configs/simple_tiers_test.json @@ -1,14 +1,18 @@ // @nolint instantiates a small cache and runs a quick run of basic operations. { "cache_config" : { - "cacheSizeMB" : 512, - "usePosixShm" : false, + "cacheSizeMB" : 1024, "cacheDir" : "/tmp/mem-tiers", "memoryTiers" : [ + { + "ratio": 1, + "memBindNodes": "0" + }, { "ratio": 1, "memBindNodes": "0" } + ], "poolRebalanceIntervalSec" : 1, "moveOnSlabRelease" : false, @@ -19,7 +23,7 @@ "test_config" : { "numOps" : 100000, "numThreads" : 32, - "numKeys" : 1000000, + "numKeys" : 2000000, "keySizeRange" : [1, 8, 64], "keySizeRangeProbability" : [0.3, 0.7], @@ -33,4 +37,4 @@ "keyPoolDistribution": [0.4, 0.6], "opPoolDistribution" : [0.5, 0.5] } - } \ No newline at end of file + } diff --git a/cachelib/cachebench/test_configs/small_moving_bg.json b/cachelib/cachebench/test_configs/small_moving_bg.json new file mode 100644 index 0000000000..c4838f42b5 --- /dev/null +++ b/cachelib/cachebench/test_configs/small_moving_bg.json @@ -0,0 +1,35 @@ +// @nolint like default.json, but moves items during slab release instead of evicting them. 
+{ + "cache_config" : { + "cacheSizeMB" : 2248, + "cacheDir": "/tmp/mem-tier5", + "memoryTiers" : [ + { + "ratio": 1, + "memBindNodes": 0 + }, { + "ratio": 1, + "memBindNodes": 0 + } + ], + "poolRebalanceIntervalSec" : 1, + "moveOnSlabRelease" : true, + "rebalanceMinSlabs" : 2, + "evictorThreads": 2, + "promoterThreads": 2 + }, + "test_config" : + { + "preallocateCache" : true, + "numOps" : 20000000, + "numThreads" : 32, + "numKeys" : 250000, + "generator": "online", + "keySizeRange" : [1, 8, 32, 64, 128, 256, 512], + "keySizeRangeProbability" : [0.1, 0.1, 0.2, 0.2, 0.3, 0.1], + "valSizeRange" : [1, 128, 512, 1024, 4096, 10240, 20480, 40960, 60000], + "valSizeRangeProbability" : [0.1, 0.1, 0.1, 0.2, 0.2, 0.1, 0.1, 0.1], + "getRatio" : 0.70, + "setRatio" : 0.30 + } + } diff --git a/cachelib/cachebench/util/CacheConfig.cpp b/cachelib/cachebench/util/CacheConfig.cpp index 0676a4ab67..af5d7b4f64 100644 --- a/cachelib/cachebench/util/CacheConfig.cpp +++ b/cachelib/cachebench/util/CacheConfig.cpp @@ -19,6 +19,8 @@ #include "cachelib/allocator/HitsPerSlabStrategy.h" #include "cachelib/allocator/LruTailAgeStrategy.h" #include "cachelib/allocator/RandomStrategy.h" +#include "cachelib/allocator/FreeThresholdStrategy.h" +#include "cachelib/allocator/PromotionStrategy.h" namespace facebook { namespace cachelib { @@ -28,6 +30,9 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) { JSONSetVal(configJson, cacheDir); JSONSetVal(configJson, cacheSizeMB); JSONSetVal(configJson, poolRebalanceIntervalSec); + JSONSetVal(configJson, backgroundEvictorIntervalMilSec); + JSONSetVal(configJson, backgroundPromoterIntervalMilSec); + JSONSetVal(configJson, backgroundEvictorStrategy); JSONSetVal(configJson, moveOnSlabRelease); JSONSetVal(configJson, rebalanceStrategy); JSONSetVal(configJson, rebalanceMinSlabs); @@ -44,6 +49,8 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) { JSONSetVal(configJson, tryLockUpdate); JSONSetVal(configJson, lruIpSpec); JSONSetVal(configJson, useCombinedLockForIterators); + + JSONSetVal(configJson, insertToFirstFreeTier); JSONSetVal(configJson, lru2qHotPct); JSONSetVal(configJson, lru2qColdPct); @@ -102,10 +109,27 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) { JSONSetVal(configJson, nvmAdmissionRetentionTimeThreshold); JSONSetVal(configJson, customConfigJson); + + //Background related configs + JSONSetVal(configJson, lowEvictionAcWatermark); + JSONSetVal(configJson, highEvictionAcWatermark); + JSONSetVal(configJson, minAcAllocationWatermark); + JSONSetVal(configJson, maxAcAllocationWatermark); + JSONSetVal(configJson, numDuplicateElements); + JSONSetVal(configJson, syncPromotion); + JSONSetVal(configJson, evictorThreads); + JSONSetVal(configJson, promoterThreads); + JSONSetVal(configJson, promotionAcWatermark); + JSONSetVal(configJson, maxEvictionBatch); + JSONSetVal(configJson, maxPromotionBatch); + JSONSetVal(configJson, minEvictionBatch); + JSONSetVal(configJson, minPromotionBatch); + JSONSetVal(configJson, maxEvictionPromotionHotness); + // if you added new fields to the configuration, update the JSONSetVal // to make them available for the json configs and increment the size // below - checkCorrectSize(); + checkCorrectSize(); if (numPools != poolSizes.size()) { throw std::invalid_argument(folly::sformat( @@ -141,6 +165,20 @@ MemoryTierConfig::MemoryTierConfig(const folly::dynamic& configJson) { checkCorrectSize(); } + +std::shared_ptr CacheConfig::getBackgroundEvictorStrategy() const { + if (backgroundEvictorIntervalMilSec == 0) { + return 
diff --git a/cachelib/cachebench/util/CacheConfig.cpp b/cachelib/cachebench/util/CacheConfig.cpp
index 0676a4ab67..af5d7b4f64 100644
--- a/cachelib/cachebench/util/CacheConfig.cpp
+++ b/cachelib/cachebench/util/CacheConfig.cpp
@@ -19,6 +19,8 @@
 #include "cachelib/allocator/HitsPerSlabStrategy.h"
 #include "cachelib/allocator/LruTailAgeStrategy.h"
 #include "cachelib/allocator/RandomStrategy.h"
+#include "cachelib/allocator/FreeThresholdStrategy.h"
+#include "cachelib/allocator/PromotionStrategy.h"
 
 namespace facebook {
 namespace cachelib {
@@ -28,6 +30,9 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) {
   JSONSetVal(configJson, cacheDir);
   JSONSetVal(configJson, cacheSizeMB);
   JSONSetVal(configJson, poolRebalanceIntervalSec);
+  JSONSetVal(configJson, backgroundEvictorIntervalMilSec);
+  JSONSetVal(configJson, backgroundPromoterIntervalMilSec);
+  JSONSetVal(configJson, backgroundEvictorStrategy);
   JSONSetVal(configJson, moveOnSlabRelease);
   JSONSetVal(configJson, rebalanceStrategy);
   JSONSetVal(configJson, rebalanceMinSlabs);
@@ -44,6 +49,8 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) {
   JSONSetVal(configJson, tryLockUpdate);
   JSONSetVal(configJson, lruIpSpec);
   JSONSetVal(configJson, useCombinedLockForIterators);
+
+  JSONSetVal(configJson, insertToFirstFreeTier);
 
   JSONSetVal(configJson, lru2qHotPct);
   JSONSetVal(configJson, lru2qColdPct);
@@ -102,10 +109,27 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) {
   JSONSetVal(configJson, nvmAdmissionRetentionTimeThreshold);
 
   JSONSetVal(configJson, customConfigJson);
+
+  // Background related configs
+  JSONSetVal(configJson, lowEvictionAcWatermark);
+  JSONSetVal(configJson, highEvictionAcWatermark);
+  JSONSetVal(configJson, minAcAllocationWatermark);
+  JSONSetVal(configJson, maxAcAllocationWatermark);
+  JSONSetVal(configJson, numDuplicateElements);
+  JSONSetVal(configJson, syncPromotion);
+  JSONSetVal(configJson, evictorThreads);
+  JSONSetVal(configJson, promoterThreads);
+  JSONSetVal(configJson, promotionAcWatermark);
+  JSONSetVal(configJson, maxEvictionBatch);
+  JSONSetVal(configJson, maxPromotionBatch);
+  JSONSetVal(configJson, minEvictionBatch);
+  JSONSetVal(configJson, minPromotionBatch);
+  JSONSetVal(configJson, maxEvictionPromotionHotness);
+
   // if you added new fields to the configuration, update the JSONSetVal
   // to make them available for the json configs and increment the size
   // below
-  checkCorrectSize();
+  checkCorrectSize();
 
   if (numPools != poolSizes.size()) {
     throw std::invalid_argument(folly::sformat(
@@ -141,6 +165,20 @@ MemoryTierConfig::MemoryTierConfig(const folly::dynamic& configJson) {
   checkCorrectSize();
 }
 
+std::shared_ptr<BackgroundMoverStrategy> CacheConfig::getBackgroundEvictorStrategy() const {
+  if (backgroundEvictorIntervalMilSec == 0) {
+    return nullptr;
+  }
+  return std::make_shared<FreeThresholdStrategy>(lowEvictionAcWatermark, highEvictionAcWatermark, maxEvictionBatch, minEvictionBatch);
+}
+
+std::shared_ptr<BackgroundMoverStrategy> CacheConfig::getBackgroundPromoterStrategy() const {
+  if (backgroundPromoterIntervalMilSec == 0) {
+    return nullptr;
+  }
+  return std::make_shared<PromotionStrategy>(promotionAcWatermark, maxPromotionBatch, minPromotionBatch);
+}
 } // namespace cachebench
 } // namespace cachelib
 } // namespace facebook
diff --git a/cachelib/cachebench/util/CacheConfig.h b/cachelib/cachebench/util/CacheConfig.h
index 13c0e1e7dc..ec120c900a 100644
--- a/cachelib/cachebench/util/CacheConfig.h
+++ b/cachelib/cachebench/util/CacheConfig.h
@@ -20,6 +20,7 @@
 
 #include "cachelib/allocator/CacheAllocator.h"
 #include "cachelib/allocator/RebalanceStrategy.h"
+#include "cachelib/allocator/BackgroundMoverStrategy.h"
 #include "cachelib/cachebench/util/JSONConfig.h"
 #include "cachelib/common/Ticker.h"
 #include "cachelib/navy/common/Device.h"
@@ -51,7 +52,7 @@ struct MemoryTierConfig : public JSONConfig {
   MemoryTierCacheConfig getMemoryTierCacheConfig() {
     MemoryTierCacheConfig config = MemoryTierCacheConfig::fromShm();
     config.setRatio(ratio);
-    config.setMemBind(NumaBitMask(memBindNodes));
+    config.setMemBind(util::NumaBitMask(memBindNodes));
     return config;
   }
 
@@ -71,7 +72,10 @@ struct CacheConfig : public JSONConfig {
   uint64_t cacheSizeMB{0};
   uint64_t poolRebalanceIntervalSec{0};
+  uint64_t backgroundEvictorIntervalMilSec{0};
+  uint64_t backgroundPromoterIntervalMilSec{0};
   std::string rebalanceStrategy;
+  std::string backgroundEvictorStrategy;
   uint64_t rebalanceMinSlabs{1};
   double rebalanceDiffRatio{0.25};
   bool moveOnSlabRelease{false};
@@ -92,7 +96,9 @@ struct CacheConfig : public JSONConfig {
   bool lruUpdateOnWrite{false};
   bool lruUpdateOnRead{true};
   bool tryLockUpdate{false};
-  bool useCombinedLockForIterators{false};
+  bool useCombinedLockForIterators{true};
+
+  bool insertToFirstFreeTier{false};
 
   // LRU param
   uint64_t lruIpSpec{0};
@@ -252,6 +258,27 @@ struct CacheConfig : public JSONConfig {
   // eviction-age is more than this threshold. 0 means no threshold
   uint32_t nvmAdmissionRetentionTimeThreshold{0};
 
+  // See MultiTierDataMovement.md for a complete description
+  double promotionAcWatermark{4.0};
+  double lowEvictionAcWatermark{2.0};
+  double highEvictionAcWatermark{5.0};
+  double minAcAllocationWatermark{0.0};
+  double maxAcAllocationWatermark{0.0};
+
+  double numDuplicateElements{0.0}; // inclusiveness of the cache
+  double syncPromotion{0.0}; // whether promotion can be done synchronously in the user thread
+
+  uint64_t evictorThreads{1};
+  uint64_t promoterThreads{1};
+
+  uint64_t maxEvictionBatch{40};
+  uint64_t maxPromotionBatch{10};
+
+  uint64_t minEvictionBatch{5};
+  uint64_t minPromotionBatch{5};
+
+  uint64_t maxEvictionPromotionHotness{60};
+
   //
   // Options below are not to be populated with JSON
   //
@@ -287,6 +314,8 @@ struct CacheConfig : public JSONConfig {
   CacheConfig() {}
 
   std::shared_ptr<RebalanceStrategy> getRebalanceStrategy() const;
+  std::shared_ptr<BackgroundMoverStrategy> getBackgroundEvictorStrategy() const;
+  std::shared_ptr<BackgroundMoverStrategy> getBackgroundPromoterStrategy() const;
 };
 } // namespace cachebench
 } // namespace cachelib
 } // namespace facebook
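To make the new watermark knobs concrete: the background evictor is meant to keep an allocation class's free space between `lowEvictionAcWatermark` and `highEvictionAcWatermark`, evicting between `minEvictionBatch` and `maxEvictionBatch` items per pass. The following is a minimal sketch of one plausible batch-sizing rule; `pickEvictionBatch` is hypothetical and only mirrors the intent of `FreeThresholdStrategy`, not its actual code.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

// Hypothetical helper: translate the watermark config into an eviction batch
// size, scaling the batch with how far the class is below the high watermark.
uint64_t pickEvictionBatch(double freePct,
                           double lowWatermark,   // e.g. lowEvictionAcWatermark{2.0}
                           double highWatermark,  // e.g. highEvictionAcWatermark{5.0}
                           uint64_t minBatch,     // e.g. minEvictionBatch{5}
                           uint64_t maxBatch) {   // e.g. maxEvictionBatch{40}
  if (freePct >= highWatermark) {
    return 0; // enough free space in this allocation class, nothing to do
  }
  double pressure = (highWatermark - freePct) / (highWatermark - lowWatermark);
  auto batch = static_cast<uint64_t>(pressure * maxBatch);
  return std::clamp<uint64_t>(batch, minBatch, maxBatch);
}

int main() {
  std::cout << pickEvictionBatch(1.0, 2.0, 5.0, 5, 40) << "\n"; // 40: high pressure
  std::cout << pickEvictionBatch(4.5, 2.0, 5.0, 5, 40) << "\n"; // 6: mild pressure
}
```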
diff --git a/cachelib/common/CMakeLists.txt b/cachelib/common/CMakeLists.txt
index 1e6d1a887c..212f421324 100644
--- a/cachelib/common/CMakeLists.txt
+++ b/cachelib/common/CMakeLists.txt
@@ -39,6 +39,7 @@ target_link_libraries(cachelib_common PUBLIC
   Folly::folly_exception_tracer
   Folly::folly_exception_tracer_base
   Folly::folly_exception_counter
+  numa
   )
 
 install(TARGETS cachelib_common
diff --git a/cachelib/common/Mutex.h b/cachelib/common/Mutex.h
index 1d6e5898f1..15b440d406 100644
--- a/cachelib/common/Mutex.h
+++ b/cachelib/common/Mutex.h
@@ -341,6 +341,7 @@ class RWBucketLocks : public BaseBucketLocks<LockType> {
   using Lock = LockType;
   using ReadLockHolder = ReadLockHolderType;
   using WriteLockHolder = WriteLockHolderType;
+  using LockHolder = std::unique_lock<Lock>;
 
   RWBucketLocks(uint32_t locksPower, std::shared_ptr<Hash> hasher)
       : Base::BaseBucketLocks(locksPower, std::move(hasher)) {}
 
@@ -357,6 +358,11 @@ class RWBucketLocks : public BaseBucketLocks<LockType> {
     return WriteLockHolder{Base::getLock(args...)};
   }
 
+  template <typename... Args>
+  LockHolder tryLockExclusive(Args... args) noexcept {
+    return LockHolder(Base::getLock(args...), std::try_to_lock);
+  }
+
   // try to grab the reader lock for a limit _timeout_ duration
   template <typename... Args>
   ReadLockHolder lockShared(const std::chrono::microseconds& timeout,
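`tryLockExclusive` returns a holder that may not own the lock, so callers must test it before touching the bucket; this lets a background worker skip a contended bucket instead of blocking on it. A self-contained sketch of that call pattern, using a plain `std::shared_mutex` as a stand-in for the lock `RWBucketLocks::getLock()` would return:

```cpp
#include <iostream>
#include <mutex>
#include <shared_mutex>

int main() {
  // Stand-in for the per-bucket lock RWBucketLocks hashes a key to.
  std::shared_mutex bucketLock;

  // Mirrors tryLockExclusive: attempt the write lock without blocking.
  std::unique_lock<std::shared_mutex> holder(bucketLock, std::try_to_lock);
  if (holder.owns_lock()) {
    // Safe to mutate the bucket; the lock releases when holder leaves scope.
    std::cout << "acquired\n";
  } else {
    // Contended: back off rather than stall, e.g. skip this background move.
    std::cout << "busy, skipping\n";
  }
}
```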
diff --git a/cachelib/common/PercentileStats.h b/cachelib/common/PercentileStats.h
index bdd3255eba..c308671ee9 100644
--- a/cachelib/common/PercentileStats.h
+++ b/cachelib/common/PercentileStats.h
@@ -107,16 +107,16 @@ class PercentileStats {
 
 class LatencyTracker {
  public:
-  explicit LatencyTracker(PercentileStats& stats)
-      : stats_(&stats), begin_(std::chrono::steady_clock::now()) {}
+  explicit LatencyTracker(PercentileStats& stats, size_t nSamples = 1)
+      : stats_(&stats), nSamples_(nSamples), begin_(std::chrono::steady_clock::now()) {}
   LatencyTracker() {}
   ~LatencyTracker() {
-    if (stats_) {
+    if (nSamples_ > 0 && stats_) {
       auto tp = std::chrono::steady_clock::now();
       auto diffNanos =
           std::chrono::duration_cast<std::chrono::nanoseconds>(tp - begin_)
               .count();
-      stats_->trackValue(static_cast<double>(diffNanos), tp);
+      stats_->trackValue(static_cast<double>(diffNanos / nSamples_), tp);
     }
   }
 
@@ -124,7 +124,7 @@ class LatencyTracker {
   LatencyTracker& operator=(const LatencyTracker&) = delete;
 
   LatencyTracker(LatencyTracker&& rhs) noexcept
-      : stats_(rhs.stats_), begin_(rhs.begin_) {
+      : stats_(rhs.stats_), nSamples_(rhs.nSamples_), begin_(rhs.begin_) {
     rhs.stats_ = nullptr;
   }
 
@@ -138,6 +138,7 @@ class LatencyTracker {
 
  private:
   PercentileStats* stats_{nullptr};
+  size_t nSamples_{1};
   std::chrono::time_point<std::chrono::steady_clock> begin_;
 };
 } // namespace util
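The `nSamples` parameter added above lets one timed scope account for a whole batch of identical operations: the destructor divides the elapsed nanoseconds by `nSamples`, so a per-item latency is what lands in the percentile stats. A trimmed-down, hypothetical analogue (`BatchTimer` is not the real `LatencyTracker`):

```cpp
#include <chrono>
#include <iostream>
#include <vector>

// Hypothetical simplification of LatencyTracker's nSamples logic.
class BatchTimer {
 public:
  BatchTimer(std::vector<double>& sink, size_t nSamples)
      : sink_(sink), nSamples_(nSamples),
        begin_(std::chrono::steady_clock::now()) {}
  ~BatchTimer() {
    auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
                  std::chrono::steady_clock::now() - begin_)
                  .count();
    // Record per-item latency, mirroring trackValue(diffNanos / nSamples_).
    sink_.push_back(static_cast<double>(ns / nSamples_));
  }

 private:
  std::vector<double>& sink_;
  size_t nSamples_;
  std::chrono::time_point<std::chrono::steady_clock> begin_;
};

int main() {
  std::vector<double> latencies;
  {
    BatchTimer t(latencies, 64); // one scope times a 64-item eviction batch
    volatile long sink = 0;
    for (int i = 0; i < 64; ++i) sink += i; // stand-in for the batch work
  }
  std::cout << "per-item ns: " << latencies.back() << "\n";
}
```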
diff --git a/cachelib/common/RollingStats.h b/cachelib/common/RollingStats.h
new file mode 100644
index 0000000000..4d179681ad
--- /dev/null
+++ b/cachelib/common/RollingStats.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <chrono>
+#include <limits>
+
+#include "cachelib/common/Utils.h"
+
+namespace facebook {
+namespace cachelib {
+namespace util {
+
+class RollingStats {
+ public:
+  // track latency by taking the value of duration directly.
+  void trackValue(double value) {
+    // In the highly unlikely case that cnt_ reaches its numerical limit,
+    // reset the counter and skip this update of the rolling average.
+    if (cnt_ == std::numeric_limits<uint64_t>::max()) {
+      cnt_ = 0;
+      return;
+    }
+    auto ratio = static_cast<double>(cnt_) / (cnt_ + 1);
+    avg_ *= ratio;
+    ++cnt_;
+    avg_ += value / cnt_;
+  }
+
+  // Return the rolling average.
+  double estimate() { return avg_; }
+
+ private:
+  double avg_{0};
+  uint64_t cnt_{0};
+};
+
+class RollingLatencyTracker {
+ public:
+  explicit RollingLatencyTracker(RollingStats& stats)
+      : stats_(&stats), begin_(std::chrono::steady_clock::now()) {}
+  RollingLatencyTracker() {}
+  ~RollingLatencyTracker() {
+    if (stats_) {
+      auto tp = std::chrono::steady_clock::now();
+      auto diffNanos =
+          std::chrono::duration_cast<std::chrono::nanoseconds>(tp - begin_)
+              .count();
+      stats_->trackValue(static_cast<double>(diffNanos));
+    }
+  }
+
+  RollingLatencyTracker(const RollingLatencyTracker&) = delete;
+  RollingLatencyTracker& operator=(const RollingLatencyTracker&) = delete;
+
+  RollingLatencyTracker(RollingLatencyTracker&& rhs) noexcept
+      : stats_(rhs.stats_), begin_(rhs.begin_) {
+    rhs.stats_ = nullptr;
+  }
+
+  RollingLatencyTracker& operator=(RollingLatencyTracker&& rhs) noexcept {
+    if (this != &rhs) {
+      this->~RollingLatencyTracker();
+      new (this) RollingLatencyTracker(std::move(rhs));
+    }
+    return *this;
+  }
+
+ private:
+  RollingStats* stats_{nullptr};
+  std::chrono::time_point<std::chrono::steady_clock> begin_;
+};
+} // namespace util
+} // namespace cachelib
+} // namespace facebook
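`RollingStats::trackValue` is the standard incremental mean, `avg_{n+1} = avg_n * n/(n+1) + value/(n+1)`, which stays equal to the arithmetic mean of all samples seen so far. A quick standalone check of that recurrence against a naive running mean:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<double> samples{5.0, 7.0, 9.0, 11.0};

  double avg = 0;    // incremental mean, as RollingStats computes it
  uint64_t cnt = 0;
  double sum = 0;    // naive mean for comparison
  for (double v : samples) {
    double ratio = static_cast<double>(cnt) / (cnt + 1);
    avg *= ratio;    // down-weight the old average by n/(n+1)
    ++cnt;
    avg += v / cnt;  // add the new sample's 1/(n+1) share

    sum += v;
    // Both columns agree at every step (8.0 at the end).
    std::cout << avg << " vs " << (sum / cnt) << "\n";
  }
}
```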
diff --git a/cachelib/common/Utils.cpp b/cachelib/common/Utils.cpp
index 82ec0bf72e..9b051519dc 100644
--- a/cachelib/common/Utils.cpp
+++ b/cachelib/common/Utils.cpp
@@ -16,6 +16,7 @@
 
 #include
 #include
+#include <numaif.h>
 #include
 #include
 #include
@@ -181,6 +182,22 @@ void* mmapAlignedZeroedMemory(size_t alignment,
   throw std::system_error(errno, std::system_category(), "Cannot mmap");
 }
 
+void munmapMemory(void* addr, size_t size) { munmap(addr, size); }
+
+void mbindMemory(void* addr,
+                 unsigned long len,
+                 int mode,
+                 const NumaBitMask& mask,
+                 unsigned int flags) {
+  auto nodesMask = mask.getNativeBitmask();
+
+  long ret = mbind(addr, len, mode, nodesMask->maskp, nodesMask->size, flags);
+  if (ret != 0) {
+    util::throwSystemError(
+        errno, folly::sformat("mbind() failed: {}", std::strerror(errno)));
+  }
+}
+
 void setMaxLockMemory(uint64_t bytes) {
   struct rlimit rlim {
     bytes, bytes
diff --git a/cachelib/common/Utils.h b/cachelib/common/Utils.h
index 4e4c839ef9..c94a445b4d 100644
--- a/cachelib/common/Utils.h
+++ b/cachelib/common/Utils.h
@@ -18,6 +18,8 @@
 
 #include
 #include
+#include <numa.h>
+#include <numaif.h>
 
 #include
 
@@ -35,6 +37,57 @@ namespace facebook {
 namespace cachelib {
 namespace util {
 
+class NumaBitMask {
+ public:
+  using native_bitmask_type = struct bitmask*;
+
+  NumaBitMask() { nodesMask = numa_allocate_nodemask(); }
+
+  NumaBitMask(const NumaBitMask& other) {
+    nodesMask = numa_allocate_nodemask();
+    copy_bitmask_to_bitmask(other.nodesMask, nodesMask);
+  }
+
+  NumaBitMask(NumaBitMask&& other) {
+    nodesMask = other.nodesMask;
+    other.nodesMask = nullptr;
+  }
+
+  NumaBitMask(const std::string& str) {
+    nodesMask = numa_parse_nodestring_all(str.c_str());
+  }
+
+  ~NumaBitMask() {
+    if (nodesMask) {
+      numa_bitmask_free(nodesMask);
+    }
+  }
+
+  constexpr NumaBitMask& operator=(const NumaBitMask& other) {
+    if (this != &other) {
+      if (!nodesMask) {
+        nodesMask = numa_allocate_nodemask();
+      }
+      copy_bitmask_to_bitmask(other.nodesMask, nodesMask);
+    }
+    return *this;
+  }
+
+  native_bitmask_type getNativeBitmask() const noexcept { return nodesMask; }
+
+  NumaBitMask& setBit(unsigned int n) {
+    numa_bitmask_setbit(nodesMask, n);
+    return *this;
+  }
+
+  bool empty() const noexcept {
+    return numa_bitmask_equal(numa_no_nodes_ptr, nodesMask) == 1;
+  }
+
+ protected:
+  native_bitmask_type nodesMask = nullptr;
+};
+
 // A wrapper class for functions to collect counters.
 // It can be initialized by either
 // 1. folly::StringPiece, double -> void, or
@@ -295,6 +348,25 @@ void* mmapAlignedZeroedMemory(size_t alignment,
                               size_t numBytes,
                               bool noAccess = false);
 
+// destroy the mapping created by mmapAlignedZeroedMemory
+//
+// @param addr  the pointer to the memory to unmap
+// @param size  size of the memory region
+void munmapMemory(void* addr, size_t size);
+
+// binds memory to the NUMA nodes specified by mask.
+//
+// @param addr   the pointer to the memory to bind.
+// @param len    length of the memory.
+// @param mode   binding mode supported by the mbind(2) call
+// @param mask   mask specifying the node ids
+// @param flags  flags supported by the mbind(2) call
+void mbindMemory(void* addr,
+                 unsigned long len,
+                 int mode,
+                 const NumaBitMask& mask,
+                 unsigned int flags);
+
 // get the number of pages in the range which are resident in the process.
 //
 // @param mem   memory start which is page aligned
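The helpers above are what the shm code paths now call to pin a mapping to specific NUMA nodes. Below is a hedged sketch of the same sequence outside CacheLib, using plain `mmap` and raw libnuma calls in place of `mmapAlignedZeroedMemory` and `util::NumaBitMask`; link with `-lnuma`.

```cpp
#include <numa.h>
#include <numaif.h>
#include <sys/mman.h>

#include <cstdio>

int main() {
  const size_t len = 1 << 21; // 2 MiB region
  void* addr = mmap(nullptr, len, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (addr == MAP_FAILED) {
    perror("mmap");
    return 1;
  }

  // Equivalent of util::NumaBitMask("0").getNativeBitmask().
  struct bitmask* nodes = numa_parse_nodestring_all("0");
  if (nodes == nullptr) {
    fprintf(stderr, "bad node string\n");
    return 1;
  }

  // What util::mbindMemory(addr, len, MPOL_BIND, mask, 0) boils down to.
  if (mbind(addr, len, MPOL_BIND, nodes->maskp, nodes->size, 0) != 0) {
    perror("mbind");
  }

  numa_bitmask_free(nodes);
  munmap(addr, len); // counterpart of util::munmapMemory
  return 0;
}
```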
diff --git a/cachelib/external/fbthrift b/cachelib/external/fbthrift
index fb3c6ce37a..cbc3de581f 160000
--- a/cachelib/external/fbthrift
+++ b/cachelib/external/fbthrift
@@ -1 +1 @@
-Subproject commit fb3c6ce37aab5aecbb39c827e0ae84256c64a44b
+Subproject commit cbc3de581fdf36ba474b0c135b9e785e504f1c1e
diff --git a/cachelib/external/fizz b/cachelib/external/fizz
index 5551610370..80ba4b64d1 160000
--- a/cachelib/external/fizz
+++ b/cachelib/external/fizz
@@ -1 +1 @@
-Subproject commit 555161037025db59658ae5d0277c4c3e1e49817e
+Subproject commit 80ba4b64d1138025a3f61e4cd3c826405cd9e8cb
diff --git a/cachelib/external/folly b/cachelib/external/folly
index 017e426621..ce2b95715d 160000
--- a/cachelib/external/folly
+++ b/cachelib/external/folly
@@ -1 +1 @@
-Subproject commit 017e42662179411f83eb24c7100b3af7f8a61518
+Subproject commit ce2b95715de229fcb51bd97410469a3ad4d2bfb2
diff --git a/cachelib/external/wangle b/cachelib/external/wangle
index 68b1ec08f2..44690e7894 160000
--- a/cachelib/external/wangle
+++ b/cachelib/external/wangle
@@ -1 +1 @@
-Subproject commit 68b1ec08f23196e0ad1dd2dfbb2308c095caf440
+Subproject commit 44690e7894842a7127245837b69627d4b964aabd
diff --git a/cachelib/shm/PosixShmSegment.cpp b/cachelib/shm/PosixShmSegment.cpp
index 7d47d061d1..4c19e229fd 100644
--- a/cachelib/shm/PosixShmSegment.cpp
+++ b/cachelib/shm/PosixShmSegment.cpp
@@ -31,6 +31,8 @@
 namespace facebook {
 namespace cachelib {
 
+using NumaBitMask = util::NumaBitMask;
+
 constexpr static mode_t kRWMode = 0666;
 typedef struct stat stat_t;
 
diff --git a/cachelib/shm/ShmCommon.h b/cachelib/shm/ShmCommon.h
index 8db8707515..bc451c46d1 100644
--- a/cachelib/shm/ShmCommon.h
+++ b/cachelib/shm/ShmCommon.h
@@ -15,8 +15,6 @@
  */
 #pragma once
 
-#include <numa.h>
-#include <numaif.h>
 #include
 #include
 #include
@@ -30,6 +28,8 @@
 #include
 #pragma GCC diagnostic pop
 
+#include "cachelib/common/Utils.h"
+
 /* On Mac OS / FreeBSD, mmap(2) syscall does not support these flags */
 #ifndef MAP_LOCKED
 #define MAP_LOCKED 0
@@ -72,62 +72,11 @@ enum PageSizeT {
   ONE_GB,
 };
 
-class NumaBitMask {
- public:
-  using native_bitmask_type = struct bitmask*;
-
-  NumaBitMask() { nodesMask = numa_allocate_nodemask(); }
-
-  NumaBitMask(const NumaBitMask& other) {
-    nodesMask = numa_allocate_nodemask();
-    copy_bitmask_to_bitmask(other.nodesMask, nodesMask);
-  }
-
-  NumaBitMask(NumaBitMask&& other) {
-    nodesMask = other.nodesMask;
-    other.nodesMask = nullptr;
-  }
-
-  NumaBitMask(const std::string& str) {
-    nodesMask = numa_parse_nodestring_all(str.c_str());
-  }
-
-  ~NumaBitMask() {
-    if (nodesMask) {
-      numa_bitmask_free(nodesMask);
-    }
-  }
-
-  constexpr NumaBitMask& operator=(const NumaBitMask& other) {
-    if (this != &other) {
-      if (!nodesMask) {
-        nodesMask = numa_allocate_nodemask();
-      }
-      copy_bitmask_to_bitmask(other.nodesMask, nodesMask);
-    }
-    return *this;
-  }
-
-  native_bitmask_type getNativeBitmask() const noexcept { return nodesMask; }
-
-  NumaBitMask& setBit(unsigned int n) {
-    numa_bitmask_setbit(nodesMask, n);
-    return *this;
-  }
-
-  bool empty() const noexcept {
-    return numa_bitmask_equal(numa_no_nodes_ptr, nodesMask) == 1;
-  }
-
- protected:
-  native_bitmask_type nodesMask = nullptr;
-};
-
 struct ShmSegmentOpts {
   PageSizeT pageSize{PageSizeT::NORMAL};
   bool readOnly{false};
   size_t alignment{1}; // alignment for mapping.
-  NumaBitMask memBindNumaNodes;
+  util::NumaBitMask memBindNumaNodes;
 
   explicit ShmSegmentOpts(PageSizeT p) : pageSize(p) {}
   explicit ShmSegmentOpts(PageSizeT p, bool ro) : pageSize(p), readOnly(ro) {}
diff --git a/cachelib/shm/SysVShmSegment.cpp b/cachelib/shm/SysVShmSegment.cpp
index 29485fa0c4..1cb28da70b 100644
--- a/cachelib/shm/SysVShmSegment.cpp
+++ b/cachelib/shm/SysVShmSegment.cpp
@@ -189,21 +189,6 @@ void shmCtlImpl(int shmid, int cmd, shmid_ds* buf) {
   }
 }
 
-void mbindImpl(void* addr,
-               unsigned long len,
-               int mode,
-               const NumaBitMask& memBindNumaNodes,
-               unsigned int flags) {
-  auto nodesMask = memBindNumaNodes.getNativeBitmask();
-
-  long ret = mbind(addr, len, mode, nodesMask->maskp, nodesMask->size, flags);
-  if (ret != 0) {
-    util::throwSystemError(
-        errno, folly::sformat("mbind() failed: {}", std::strerror(errno)));
-  }
-}
-
 } // namespace detail
 
 void ensureSizeforHugePage(size_t size) {
@@ -300,7 +285,7 @@ void SysVShmSegment::memBind(void* addr) const {
   if (opts_.memBindNumaNodes.empty()) {
     return;
   }
-  detail::mbindImpl(addr, getSize(), MPOL_BIND, opts_.memBindNumaNodes, 0);
+  util::mbindMemory(addr, getSize(), MPOL_BIND, opts_.memBindNumaNodes, 0);
 }
 
 void SysVShmSegment::markForRemoval() {
diff --git a/contrib/build-package.sh b/contrib/build-package.sh
index 755933bd44..f0f3283df0 100755
--- a/contrib/build-package.sh
+++ b/contrib/build-package.sh
@@ -78,9 +78,8 @@ build_tests=
 show_help=
 many_jobs=
 verbose=
-PREFIX="$PWD/opt/cachelib/"
-
-while getopts :BSdhijtvp: param
+install_path=
+while getopts :BSdhijtvI: param
 do
   case $param in
     i) install=yes ;;
@@ -91,7 +90,7 @@ do
     v) verbose=yes ;;
     j) many_jobs=yes ;;
     t) build_tests=yes ;;
-    p) PREFIX=$OPTARG ;;
+    I) install_path=${OPTARG} ; install=yes ;;
     ?) die "unknown option. See -h for help."
   esac
 done
@@ -281,6 +280,7 @@ test -d cachelib || die "expected 'cachelib' directory not found in $PWD"
 
 # After ensuring we are in the correct directory, set the installation prefix
+PREFIX=${install_path:-"$PWD/opt/cachelib/"}
 CMAKE_PARAMS="$CMAKE_PARAMS -DCMAKE_INSTALL_PREFIX=$PREFIX"
 CMAKE_PREFIX_PATH="$PREFIX/lib/cmake:$PREFIX/lib64/cmake:$PREFIX/lib:$PREFIX/lib64:$PREFIX:${CMAKE_PREFIX_PATH:-}"
 export CMAKE_PREFIX_PATH
diff --git a/docker/build.sh b/docker/build.sh
new file mode 100755
index 0000000000..bb82f0142d
--- /dev/null
+++ b/docker/build.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2022, Intel Corporation
+
+#
+# build.sh - runs a Docker container from a Docker image with an environment
+# prepared for running CacheLib builds and tests. It uses the Docker image
+# tagged as described in ./images/build-image.sh.
+#
+# Notes:
+# - set env var 'HOST_WORKDIR' to where the root of this project is on the host machine,
+# - set env vars 'OS' and 'OS_VER' properly to the system/Docker you want to build this
+#   repo on (for proper values take a look at the list of Dockerfiles at the
+#   utils/docker/images directory in this repo), e.g. OS=ubuntu, OS_VER=20.04,
+# - set env var 'CONTAINER_REG' to the container registry address
+#   [and possibly user/org name, and package name], e.g. "<registry_addr>/pmem/CacheLib",
+# - set env var 'DNS_SERVER' if you use one,
+# - set env var 'COMMAND' to execute a specific command within the Docker container or
+#   env var 'TYPE' to pick a command based on one of the predefined types of build (see below).
+#
+
+set -e
+
+source $(dirname ${0})/set-ci-vars.sh
+IMG_VER=${IMG_VER:-devel}
+TAG="${OS}-${OS_VER}-${IMG_VER}"
+IMAGE_NAME=${CONTAINER_REG}:${TAG}
+CONTAINER_NAME=CacheLib-${OS}-${OS_VER}
+WORKDIR=/CacheLib  # working dir within the Docker container
+SCRIPTSDIR=${WORKDIR}/docker
+
+if [[ -z "${OS}" || -z "${OS_VER}" ]]; then
+	echo "ERROR: The variables OS and OS_VER have to be set " \
+		"(e.g. OS=fedora, OS_VER=32)."
+	exit 1
+fi
+
+if [[ -z "${HOST_WORKDIR}" ]]; then
+	echo "ERROR: The variable HOST_WORKDIR has to contain a path to " \
+		"the root of this project on the host machine."
+	exit 1
+fi
+
+if [[ -z "${CONTAINER_REG}" ]]; then
+	echo "ERROR: CONTAINER_REG environment variable is not set " \
+		"(e.g. \"<registry_addr>/<org_name>/<package_name>\")."
+	exit 1
+fi
+
+# Set the command to execute in the Docker container
+COMMAND="./run-build.sh";
+echo "COMMAND to execute within Docker container: ${COMMAND}"
+
+if [ -n "${DNS_SERVER}" ]; then DOCKER_OPTS="${DOCKER_OPTS} --dns=${DNS_SERVER}"; fi
+
+# Check if we are running on a CI (Travis or GitHub Actions)
+[ -n "${GITHUB_ACTIONS}" -o -n "${TRAVIS}" ] && CI_RUN="YES" || CI_RUN="NO"
"${GITHUB_ACTIONS}" ] && DOCKER_OPTS="${DOCKER_OPTS} --tty=true" + + +echo "Running build using Docker image: ${IMAGE_NAME}" + +# Run a container with +# - environment variables set (--env) +# - host directory containing source mounted (-v) +# - working directory set (-w) +docker run --privileged=true --name=${CONTAINER_NAME} -i \ + ${DOCKER_OPTS} \ + --env http_proxy=${http_proxy} \ + --env https_proxy=${https_proxy} \ + --env TERM=xterm-256color \ + --env WORKDIR=${WORKDIR} \ + --env SCRIPTSDIR=${SCRIPTSDIR} \ + --env GITHUB_REPO=${GITHUB_REPO} \ + --env CI_RUN=${CI_RUN} \ + --env TRAVIS=${TRAVIS} \ + --env GITHUB_ACTIONS=${GITHUB_ACTIONS} \ + --env CI_COMMIT=${CI_COMMIT} \ + --env CI_COMMIT_RANGE=${CI_COMMIT_RANGE} \ + --env CI_BRANCH=${CI_BRANCH} \ + --env CI_EVENT_TYPE=${CI_EVENT_TYPE} \ + --env CI_REPO_SLUG=${CI_REPO_SLUG} \ + --env DOC_UPDATE_GITHUB_TOKEN=${DOC_UPDATE_GITHUB_TOKEN} \ + --env DOC_UPDATE_BOT_NAME=${DOC_UPDATE_BOT_NAME} \ + --env DOC_REPO_OWNER=${DOC_REPO_OWNER} \ + --env COVERITY_SCAN_TOKEN=${COVERITY_SCAN_TOKEN} \ + --env COVERITY_SCAN_NOTIFICATION_EMAIL=${COVERITY_SCAN_NOTIFICATION_EMAIL} \ + --env TEST_TIMEOUT=${TEST_TIMEOUT} \ + --env TZ='Europe/Warsaw' \ + --shm-size=4G \ + -v ${HOST_WORKDIR}:${WORKDIR} \ + -v /etc/localtime:/etc/localtime \ + -w ${SCRIPTSDIR} \ + ${IMAGE_NAME} ${COMMAND} + diff --git a/docker/images/build-image.sh b/docker/images/build-image.sh new file mode 100755 index 0000000000..985a6e0ff1 --- /dev/null +++ b/docker/images/build-image.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2016-2021, Intel Corporation +# +# build-image.sh - prepares a Docker image with -based environment for +# testing (or dev) purpose, tagged with ${CONTAINER_REG}:${OS}-${OS_VER}-${IMG_VER}, +# according to the ${OS}-${OS_VER}.Dockerfile file located in the same directory. +# IMG_VER is a version of Docker image (it usually relates to project's release tag) +# and it defaults to "devel". +# + +set -e +IMG_VER=${IMG_VER:-devel} +TAG="${OS}-${OS_VER}-${IMG_VER}" + +if [[ -z "${OS}" || -z "${OS_VER}" ]]; then + echo "ERROR: The variables OS and OS_VER have to be set " \ + "(e.g. OS=fedora, OS_VER=34)." + exit 1 +fi + +if [[ -z "${CONTAINER_REG}" ]]; then + echo "ERROR: CONTAINER_REG environment variable is not set " \ + "(e.g. \"//\")." + exit 1 +fi + +echo "Check if the file ${OS}-${OS_VER}.Dockerfile exists" +if [[ ! -f "${OS}-${OS_VER}.Dockerfile" ]]; then + echo "Error: ${OS}-${OS_VER}.Dockerfile does not exist." + exit 1 +fi + +echo "Build a Docker image tagged with: ${CONTAINER_REG}:${TAG}" +docker build -t ${CONTAINER_REG}:${TAG} \ + --build-arg http_proxy=$http_proxy \ + --build-arg https_proxy=$https_proxy \ + -f ${OS}-${OS_VER}.Dockerfile . 
diff --git a/docker/images/centos-8streams.Dockerfile b/docker/images/centos-8streams.Dockerfile
new file mode 100644
index 0000000000..e0c31226a1
--- /dev/null
+++ b/docker/images/centos-8streams.Dockerfile
@@ -0,0 +1,30 @@
+FROM quay.io/centos/centos:stream8
+
+RUN dnf install -y \
+cmake \
+sudo \
+git \
+tzdata \
+vim \
+gdb \
+clang \
+python36 \
+glibc-devel.i686 \
+xmlto \
+uuid \
+libuuid-devel \
+json-c-devel \
+perf \
+numactl
+
+# updated to fix compile errors and better symbol
+# resolving in VTune
+RUN dnf -y install gcc-toolset-12
+RUN echo "source /opt/rh/gcc-toolset-12/enable" >> /etc/bashrc
+SHELL ["/bin/bash", "--login", "-c"]
+
+COPY ./install-cachelib-deps.sh ./install-cachelib-deps.sh
+RUN ./install-cachelib-deps.sh
+
+COPY ./install-dsa-deps.sh ./install-dsa-deps.sh
+RUN ./install-dsa-deps.sh
diff --git a/docker/images/install-cachelib-deps.sh b/docker/images/install-cachelib-deps.sh
new file mode 100755
index 0000000000..6d8fbdef7b
--- /dev/null
+++ b/docker/images/install-cachelib-deps.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2022, Intel Corporation
+
+git clone -b develop https://github.com/intel/CacheLib CacheLib
+
+./CacheLib/contrib/prerequisites-centos8.sh
+
+for pkg in zstd googleflags googlelog googletest sparsemap fmt folly fizz wangle fbthrift ;
+do
+	sudo ./CacheLib/contrib/build-package.sh -j -I /opt/ "$pkg"
+done
+
+rm -rf CacheLib
diff --git a/docker/images/install-dsa-deps.sh b/docker/images/install-dsa-deps.sh
new file mode 100755
index 0000000000..265011dd70
--- /dev/null
+++ b/docker/images/install-dsa-deps.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+# Copyright 2023, Intel Corporation
+
+# Install idxd-config
+git clone https://github.com/intel/idxd-config.git
+cd idxd-config
+./autogen.sh
+./configure CFLAGS='-g -O2' --prefix=/usr --sysconfdir=/etc --libdir=/usr/lib64
+make
+make check
+sudo make install
+cd ../
+rm -rf idxd-config
+
+# Install DML Library
+git clone --recursive https://github.com/intel/DML.git
+cd DML
+git checkout e44443c24d53552b248b9869b1b16f89cd970f52
+mkdir build
+cd build
+cmake -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_BUILD_TYPE=RelWithDebInfo ..
+cmake --build . --target install
+cd ../../
+rm -rf DML
diff --git a/docker/images/push-image.sh b/docker/images/push-image.sh
new file mode 100755
index 0000000000..8f516b4205
--- /dev/null
+++ b/docker/images/push-image.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2016-2021, Intel Corporation
+
+#
+# push-image.sh - pushes the Docker image tagged as described in
+# ./build-image.sh to the ${CONTAINER_REG}.
+#
+# The script utilizes the ${CONTAINER_REG_USER} and ${CONTAINER_REG_PASS} variables
+# to log in to the ${CONTAINER_REG}. The variables can be set in the CI's
+# configuration for automated builds.
+#
+
+set -e
+IMG_VER=${IMG_VER:-devel}
+TAG="${OS}-${OS_VER}-${IMG_VER}"
+
+if [[ -z "${OS}" || -z "${OS_VER}" ]]; then
+	echo "ERROR: The variables OS and OS_VER have to be set " \
+		"(e.g. OS=fedora, OS_VER=34)."
+	exit 1
+fi
+
+if [[ -z "${CONTAINER_REG}" ]]; then
+	echo "ERROR: CONTAINER_REG environment variable is not set " \
+		"(e.g. \"<registry_addr>/<org_name>/<package_name>\")."
+	exit 1
+fi
+
+if [[ -z "${CONTAINER_REG_USER}" || -z "${CONTAINER_REG_PASS}" ]]; then
+	echo "ERROR: variables CONTAINER_REG_USER=\"${CONTAINER_REG_USER}\" and " \
+		"CONTAINER_REG_PASS=\"${CONTAINER_REG_PASS}\"" \
+		"have to be set properly to allow login to the Container Registry."
+	exit 1
+fi
+
+# Check if the image tagged with ${CONTAINER_REG}:${TAG} exists locally
+if [[ ! $(docker images -a | awk -v pattern="^${CONTAINER_REG}:${TAG}\$" \
+	'$1":"$2 ~ pattern') ]]
+then
+	echo "ERROR: Docker image tagged ${CONTAINER_REG}:${TAG} does not exist locally."
+	exit 1
+fi
+
+echo "Log in to the Container Registry: ${CONTAINER_REG}"
+echo "${CONTAINER_REG_PASS}" | docker login ghcr.io -u="${CONTAINER_REG_USER}" --password-stdin
+
+echo "Push the image to the Container Registry"
+docker push ${CONTAINER_REG}:${TAG}
diff --git a/docker/pull-or-rebuild-image.sh b/docker/pull-or-rebuild-image.sh
new file mode 100755
index 0000000000..dcdcb40e8c
--- /dev/null
+++ b/docker/pull-or-rebuild-image.sh
@@ -0,0 +1,124 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2016-2021, Intel Corporation
+
+#
+# pull-or-rebuild-image.sh - rebuilds the Docker image used in the
+# current build (if necessary) or pulls it from the Container Registry.
+# The Docker image is tagged as described in docker/build-image.sh,
+# but IMG_VER defaults in this script to "latest" (just in case it's
+# used locally without building any images).
+#
+# If the Docker image was rebuilt and all requirements are fulfilled (more
+# details in the push_image function below), the image will be pushed to
+# the ${CONTAINER_REG}.
+#
+# The script rebuilds the Docker image if:
+# 1. the Dockerfile for the current OS version (${OS}-${OS_VER}.Dockerfile)
+#    or any .sh script in the Dockerfiles directory were modified and committed, or
+# 2. the "rebuild" param was passed as the first argument to this script.
+#
+# The script pulls the Docker image if:
+# 1. it does not have to be rebuilt (based on committed changes), or
+# 2. the "pull" param was passed as the first argument to this script.
+#
+
+set -e
+
+source $(dirname ${0})/set-ci-vars.sh
+IMG_VER=${IMG_VER:-latest}
+TAG="${OS}-${OS_VER}-${IMG_VER}"
+IMAGES_DIR_NAME=images
+BASE_DIR=docker/${IMAGES_DIR_NAME}
+
+if [[ -z "${OS}" || -z "${OS_VER}" ]]; then
+	echo "ERROR: The variables OS and OS_VER have to be set properly " \
+		"(e.g. OS=fedora, OS_VER=34)."
+	exit 1
+fi
+
+if [[ -z "${CONTAINER_REG}" ]]; then
+	echo "ERROR: CONTAINER_REG environment variable is not set " \
+		"(e.g. \"<registry_addr>/<org_name>/<package_name>\")."
+	exit 1
+fi
+
+function build_image() {
+	echo "Building the Docker image for the ${OS}-${OS_VER}.Dockerfile"
+	pushd ${IMAGES_DIR_NAME}
+	./build-image.sh
+	popd
+}
+
+function pull_image() {
+	echo "Pull the image '${CONTAINER_REG}:${TAG}' from the Container Registry."
+	docker pull ${CONTAINER_REG}:${TAG}
+}
+
+function push_image {
+	# Check if the image has to be pushed to the Container Registry:
+	# - only the upstream (not forked) repository,
+	# - only the develop or main branch,
+	# - not a pull_request event,
+	# - and the PUSH_IMAGE flag was set for the current build.
+	if [[ "${CI_REPO_SLUG}" == "${GITHUB_REPO}" \
+		&& (${CI_BRANCH} == develop || ${CI_BRANCH} == main) \
+		&& ${CI_EVENT_TYPE} != "pull_request" \
+		&& ${PUSH_IMAGE} == "1" ]]
+	then
+		echo "The image will be pushed to the Container Registry: ${CONTAINER_REG}"
+		pushd ${IMAGES_DIR_NAME}
+		./push-image.sh
+		popd
+	else
+		echo "Skip pushing the image to the Container Registry."
+	fi
+}
+
+# If "rebuild" or "pull" is passed to the script as a param, force rebuild/pull.
+if [[ "${1}" == "rebuild" ]]; then
+	build_image
+	push_image
+	exit 0
+elif [[ "${1}" == "pull" ]]; then
+	pull_image
+	exit 0
+fi
+
+# Determine if we need to rebuild the image or just pull it from
+# the Container Registry, based on committed changes.
+if [ -n "${CI_COMMIT_RANGE}" ]; then + commits=$(git rev-list ${CI_COMMIT_RANGE}) +else + commits=${CI_COMMIT} +fi + +if [[ -z "${commits}" ]]; then + echo "'commits' variable is empty. Docker image will be pulled." +fi + +echo "Commits in the commit range:" +for commit in ${commits}; do echo ${commit}; done + +echo "Files modified within the commit range:" +files=$(for commit in ${commits}; do git diff-tree --no-commit-id --name-only \ + -r ${commit}; done | sort -u) +for file in ${files}; do echo ${file}; done + +# Check if committed file modifications require the Docker image to be rebuilt +for file in ${files}; do + # Check if modified files are relevant to the current build + if [[ ${file} =~ ^(${BASE_DIR})\/(${OS})-(${OS_VER})\.Dockerfile$ ]] \ + || [[ ${file} =~ ^(${BASE_DIR})\/.*\.sh$ ]] + then + build_image + push_image + exit 0 + fi +done + +# Getting here means rebuilding the Docker image isn't required (based on changed files). +# Pull the image from the Container Registry or rebuild anyway, if pull fails. +if ! pull_image; then + build_image + push_image +fi diff --git a/docker/run-build.sh b/docker/run-build.sh new file mode 100755 index 0000000000..bc04819f18 --- /dev/null +++ b/docker/run-build.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2022, Intel Corporation + +set -e + +function sudo_password() { + echo ${USERPASS} | sudo -Sk $* +} + +cd .. +mkdir build +cd build + +source /opt/rh/gcc-toolset-12/enable + +cmake ../cachelib -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=/opt -DCMAKE_BUILD_TYPE=Debug +sudo_password make install -j$(nproc) + +cd /opt/tests && $WORKDIR/run_tests.sh diff --git a/docker/set-ci-vars.sh b/docker/set-ci-vars.sh new file mode 100755 index 0000000000..f6f52132c8 --- /dev/null +++ b/docker/set-ci-vars.sh @@ -0,0 +1,111 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2020-2021, Intel Corporation + +# +# set-ci-vars.sh -- set CI variables common for both: +# Travis and GitHub Actions CIs +# + +set -e + +function get_commit_range_from_last_merge { + # get commit id of the last merge + LAST_MERGE=$(git log --merges --pretty=%H -1) + LAST_COMMIT=$(git log --pretty=%H -1) + RANGE_END="HEAD" + if [ -n "${GITHUB_ACTIONS}" ] && [ "${GITHUB_EVENT_NAME}" == "pull_request" ] && [ "${LAST_MERGE}" == "${LAST_COMMIT}" ]; then + # GitHub Actions commits its own merge in case of pull requests + # so the first merge commit has to be skipped. + + LAST_COMMIT=$(git log --pretty=%H -2 | tail -n1) + LAST_MERGE=$(git log --merges --pretty=%H -2 | tail -n1) + # If still the last commit is a merge commit it means we're manually + # merging changes (probably back from stable branch). We have to use + # left parent of the merge and the current commit for COMMIT_RANGE. + if [ "${LAST_MERGE}" == "${LAST_COMMIT}" ]; then + LAST_MERGE=$(git log --merges --pretty=%P -2 | tail -n1 | cut -d" " -f1) + RANGE_END=${LAST_COMMIT} + fi + elif [ "${LAST_MERGE}" == "${LAST_COMMIT}" ] && + ([ "${TRAVIS_EVENT_TYPE}" == "push" ] || [ "${GITHUB_EVENT_NAME}" == "push" ]); then + # Other case in which last commit equals last merge, is when committing + # a manual merge. Push events don't set proper COMMIT_RANGE. + # It has to be then set: from merge's left parent to the current commit. 
+		LAST_MERGE=$(git log --merges --pretty=%P -1 | cut -d" " -f1)
+	fi
+	if [ "${LAST_MERGE}" == "" ]; then
+		# possible in case of shallow clones
+		# or new repos with no merge commits yet
+		# - pick up the first commit
+		LAST_MERGE=$(git log --pretty=%H | tail -n1)
+	fi
+	COMMIT_RANGE="${LAST_MERGE}..${RANGE_END}"
+	# make sure it works now
+	if ! git rev-list ${COMMIT_RANGE} >/dev/null; then
+		COMMIT_RANGE=""
+	fi
+	echo ${COMMIT_RANGE}
+}
+
+COMMIT_RANGE_FROM_LAST_MERGE=$(get_commit_range_from_last_merge)
+
+if [ -n "${TRAVIS}" ]; then
+	CI_COMMIT=${TRAVIS_COMMIT}
+	CI_COMMIT_RANGE="${TRAVIS_COMMIT_RANGE/.../..}"
+	CI_BRANCH=${TRAVIS_BRANCH}
+	CI_EVENT_TYPE=${TRAVIS_EVENT_TYPE}
+	CI_REPO_SLUG=${TRAVIS_REPO_SLUG}
+
+	# CI_COMMIT_RANGE is usually invalid for force pushes - fix it when used
+	# with a non-upstream repository
+	if [ -n "${CI_COMMIT_RANGE}" -a "${CI_REPO_SLUG}" != "${GITHUB_REPO}" ]; then
+		if ! git rev-list ${CI_COMMIT_RANGE}; then
+			CI_COMMIT_RANGE=${COMMIT_RANGE_FROM_LAST_MERGE}
+		fi
+	fi
+
+	case "${TRAVIS_CPU_ARCH}" in
+	"amd64")
+		CI_CPU_ARCH="x86_64"
+		;;
+	*)
+		CI_CPU_ARCH=${TRAVIS_CPU_ARCH}
+		;;
+	esac
+
+elif [ -n "${GITHUB_ACTIONS}" ]; then
+	CI_COMMIT=${GITHUB_SHA}
+	CI_COMMIT_RANGE=${COMMIT_RANGE_FROM_LAST_MERGE}
+	CI_BRANCH=$(echo ${GITHUB_REF} | cut -d'/' -f3)
+	CI_REPO_SLUG=${GITHUB_REPOSITORY}
+	CI_CPU_ARCH="x86_64" # GitHub Actions supports only x86_64
+
+	case "${GITHUB_EVENT_NAME}" in
+	"schedule")
+		CI_EVENT_TYPE="cron"
+		;;
+	*)
+		CI_EVENT_TYPE=${GITHUB_EVENT_NAME}
+		;;
+	esac
+
+else
+	CI_COMMIT=$(git log --pretty=%H -1)
+	CI_COMMIT_RANGE=${COMMIT_RANGE_FROM_LAST_MERGE}
+	CI_CPU_ARCH="x86_64"
+fi
+
+export CI_COMMIT=${CI_COMMIT}
+export CI_COMMIT_RANGE=${CI_COMMIT_RANGE}
+export CI_BRANCH=${CI_BRANCH}
+export CI_EVENT_TYPE=${CI_EVENT_TYPE}
+export CI_REPO_SLUG=${CI_REPO_SLUG}
+export CI_CPU_ARCH=${CI_CPU_ARCH}
+
+echo CI_COMMIT=${CI_COMMIT}
+echo CI_COMMIT_RANGE=${CI_COMMIT_RANGE}
+echo CI_BRANCH=${CI_BRANCH}
+echo CI_EVENT_TYPE=${CI_EVENT_TYPE}
+echo CI_REPO_SLUG=${CI_REPO_SLUG}
+echo CI_CPU_ARCH=${CI_CPU_ARCH}
diff --git a/examples/single_tier_cache/main.cpp b/examples/single_tier_cache/main.cpp
index de6373622c..9c19dfeea9 100644
--- a/examples/single_tier_cache/main.cpp
+++ b/examples/single_tier_cache/main.cpp
@@ -25,7 +25,7 @@ using CacheConfig = typename Cache::Config;
 using CacheKey = typename Cache::Key;
 using CacheReadHandle = typename Cache::ReadHandle;
 using MemoryTierCacheConfig = typename cachelib::MemoryTierCacheConfig;
-using NumaBitMask = typename cachelib::NumaBitMask;
+using NumaBitMask = typename cachelib::util::NumaBitMask;
 
 // Global cache object and a default cache pool
 std::unique_ptr<Cache> gCache_;
diff --git a/run_code_coverage.sh b/run_code_coverage.sh
new file mode 100755
index 0000000000..7722e262bf
--- /dev/null
+++ b/run_code_coverage.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Build CacheLib with the flag -DCOVERAGE_ENABLED=ON
+
+# Track coverage
+lcov -c -i -b . -d . -o Coverage.baseline
+./run_tests.sh
+lcov -c -d . -b . -o Coverage.out
+lcov -a Coverage.baseline -a Coverage.out -o Coverage.combined
+
+# Generate report
+COVERAGE_DIR='coverage_report'
+genhtml Coverage.combined -o ${COVERAGE_DIR}
+COVERAGE_REPORT="${COVERAGE_DIR}.tgz"
+tar -zcvf ${COVERAGE_REPORT} ${COVERAGE_DIR}
+echo "Created coverage report ${COVERAGE_REPORT}"
+
+# Cleanup
+rm Coverage.baseline Coverage.out Coverage.combined
+rm -rf ${COVERAGE_DIR}
diff --git a/run_tests.sh b/run_tests.sh
new file mode 100755
index 0000000000..6ff2ac65ed
--- /dev/null
+++ b/run_tests.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# Newline-separated list of tests to ignore
+BLACKLIST="allocator-test-NavySetupTest
+allocator-test-NvmCacheTests
+shm-test-test_page_size"
+
+if [ "$1" == "long" ]; then
+	find -type f -executable | grep -vF "$BLACKLIST" | xargs -n1 bash -c
+else
+	find -type f \( -not -name "*bench*" -and -not -name "navy*" \) -executable | grep -vF "$BLACKLIST" | xargs -n1 bash -c
+fi
+
+../bin/cachebench --json_test_config ../test_configs/consistency/navy.json
+../bin/cachebench --json_test_config ../test_configs/consistency/navy-multi-tier.json
+../bin/cachebench --json_test_config ../test_configs/small_moving_bg.json