diff --git a/.github/workflows/build-cachelib-centos.yml b/.github/workflows/build-cachelib-centos-long.yml similarity index 86% rename from .github/workflows/build-cachelib-centos.yml rename to .github/workflows/build-cachelib-centos-long.yml index 3b071a186a..92165f603b 100644 --- a/.github/workflows/build-cachelib-centos.yml +++ b/.github/workflows/build-cachelib-centos-long.yml @@ -1,7 +1,8 @@ name: build-cachelib-centos-latest on: schedule: - - cron: '30 5 * * 1,4' + - cron: '0 7 * * *' + jobs: build-cachelib-centos8-latest: name: "CentOS/latest - Build CacheLib with all dependencies" @@ -33,3 +34,6 @@ jobs: uses: actions/checkout@v2 - name: "build CacheLib using build script" run: ./contrib/build.sh -j -v -T + - name: "run tests" + timeout-minutes: 60 + run: cd opt/cachelib/tests && ../../../run_tests.sh long diff --git a/.github/workflows/build-cachelib-debian.yml b/.github/workflows/build-cachelib-debian.yml index a2ae44a569..5bc3ad3c70 100644 --- a/.github/workflows/build-cachelib-debian.yml +++ b/.github/workflows/build-cachelib-debian.yml @@ -1,7 +1,8 @@ name: build-cachelib-debian-10 on: schedule: - - cron: '30 5 * * 2,6' + - cron: '30 5 * * 0,3' + jobs: build-cachelib-debian-10: name: "Debian/Buster - Build CacheLib with all dependencies" @@ -37,3 +38,6 @@ jobs: uses: actions/checkout@v2 - name: "build CacheLib using build script" run: ./contrib/build.sh -j -v -T + - name: "run tests" + timeout-minutes: 60 + run: cd opt/cachelib/tests && ../../../run_tests.sh diff --git a/.github/workflows/build-cachelib-docker.yml b/.github/workflows/build-cachelib-docker.yml new file mode 100644 index 0000000000..be28bc233c --- /dev/null +++ b/.github/workflows/build-cachelib-docker.yml @@ -0,0 +1,49 @@ +name: build-cachelib-docker +on: + push: + pull_request: + +jobs: + build-cachelib-docker: + name: "CentOS/latest - Build CacheLib with all dependencies" + runs-on: ubuntu-latest + env: + REPO: cachelib + GITHUB_REPO: intel/CacheLib + CONTAINER_REG: ghcr.io/pmem/cachelib + CONTAINER_REG_USER: ${{ secrets.GH_CR_USER }} + CONTAINER_REG_PASS: ${{ secrets.GH_CR_PAT }} + FORCE_IMAGE_ACTION: ${{ secrets.FORCE_IMAGE_ACTION }} + HOST_WORKDIR: ${{ github.workspace }} + WORKDIR: docker + IMG_VER: devel + strategy: + matrix: + CONFIG: ["OS=centos OS_VER=8streams PUSH_IMAGE=1"] + steps: + - name: "System Information" + run: | + echo === uname === + uname -a + echo === /etc/os-release === + cat /etc/os-release + echo === df -hl === + df -hl + echo === free -h === + free -h + echo === top === + top -b -n1 -1 -Eg || timeout 1 top -b -n1 + echo === env === + env + echo === gcc -v === + gcc -v + - name: "checkout sources" + uses: actions/checkout@v2 + with: + fetch-depth: 0 + + - name: Pull the image or rebuild and push it + run: cd $WORKDIR && ${{ matrix.CONFIG }} ./pull-or-rebuild-image.sh $FORCE_IMAGE_ACTION + + - name: Run the build + run: cd $WORKDIR && ${{ matrix.CONFIG }} ./build.sh diff --git a/.github/workflows/build-cachelib.yml b/.github/workflows/build-cachelib.yml deleted file mode 100644 index 15161c40e0..0000000000 --- a/.github/workflows/build-cachelib.yml +++ /dev/null @@ -1,147 +0,0 @@ -# NOTES: -# 1. While Github-Actions enables cache of dependencies, -# Facebook's projects (folly,fizz,wangle,fbthrift) -# are fast-moving targets - so we always checkout the latest version -# (as opposed to using gitactions cache, which is recommended in the -# documentation). -# -# 2. Using docker containers to build on CentOS and Debian, -# Specifically CentOS v8.1.1911 as that -# version is closest to Facebook's internal dev machines. -# -# 3. When using docker containers we install 'sudo', -# as the docker images are typically very minimal and without -# 'sudo', while the ./contrib/ scripts use sudo. -# -# 4. When using the docker containers we install 'git' -# BEFORE getting the CacheLib source code (with the 'checkout' action). -# Otherwise, the 'checkout@v2' action script falls back to downloading -# the git repository files only, without the ".git" directory. -# We need the ".git" directory to updating the git-submodules -# (folly/wangle/fizz/fbthrift). See: -# https://github.com/actions/checkout/issues/126#issuecomment-570288731 -# -# 5. To reduce less-critical (and yet frequent) rebuilds, the jobs -# check the author of the commit, and SKIP the build if -# the author is "svcscm". These commits are automatic updates -# for the folly/fbthrift git-submodules, and can happen several times a day. -# While there is a possiblity that updating the git-submodules breaks -# CacheLib, it is less likely, and will be detected once an actual -# code change commit triggers a full build. -# e.g. https://github.com/facebookincubator/CacheLib/commit/9372a82190dd71a6e2bcb668828cfed9d1bd25c1 -# -# 6. The 'if' condition checking the author name of the commit (see #5 above) -# uses github actions metadata variable: -# 'github.event.head_commit.author.name' -# GitHub have changed in the past the metadata structure and broke -# such conditions. If you need to debug the metadata values, -# see the "dummy-show-github-event" job below. -# E.g. https://github.blog/changelog/2019-10-16-changes-in-github-actions-push-event-payload/ -# As of Jan-2021, the output is: -# { -# "author": { -# "email": "mimi@moo.moo", -# "name": "mimi" -# }, -# "committer": { -# "email": "assafgordon@gmail.com", -# "name": "Assaf Gordon", -# "username": "agordon" -# }, -# "distinct": true, -# "id": "6c3aab0970f4a07cc2af7658756a6ef9d82f3276", -# "message": "gitactions: test", -# "timestamp": "2021-01-26T11:11:57-07:00", -# "tree_id": "741cd1cb802df84362a51e5d01f28788845d08b7", -# "url": "https://github.com/agordon/CacheLib/commit/6c3aab0970f4a07cc2af7658756a6ef9d82f3276" -# } -# -# 7. When checking the commit's author name, we use '...author.name', -# NOT '...author.username' - because the 'svcscm' author does not -# have a github username (see the 'mimi' example above). -# - -name: build-cachelib -on: [push] -jobs: - dummy-show-github-event: - name: "Show GitHub Action event.head_commit variable" - runs-on: ubuntu-latest - steps: - - name: "GitHub Variable Content" - env: - CONTENT: ${{ toJSON(github.event.head_commit) }} - run: echo "$CONTENT" - - - build-cachelib-centos8-1-1911: - if: "!contains(github.event.head_commit.author.name, 'svcscm')" - name: "CentOS/8.1.1911 - Build CacheLib with all dependencies" - runs-on: ubuntu-latest - # Docker container image name - container: "centos:8.1.1911" - steps: - - name: "update packages" - # stock centos has a problem with CMAKE, fails with: - # "cmake: symbol lookup error: cmake: undefined symbol: archive_write_add_filter_zstd" - # updating solves it - run: dnf update -y - - name: "install sudo,git" - run: dnf install -y sudo git cmake gcc - - name: "System Information" - run: | - echo === uname === - uname -a - echo === /etc/os-release === - cat /etc/os-release - echo === df -hl === - df -hl - echo === free -h === - free -h - echo === top === - top -b -n1 -1 -Eg || timeout 1 top -b -n1 - echo === env === - env - echo === gcc -v === - gcc -v - - name: "checkout sources" - uses: actions/checkout@v2 - - name: "Install Prerequisites" - run: ./contrib/build.sh -S -B - - name: "Test: update-submodules" - run: ./contrib/update-submodules.sh - - name: "Install dependency: zstd" - run: ./contrib/build-package.sh -j -v -i zstd - - name: "Install dependency: googleflags" - run: ./contrib/build-package.sh -j -v -i googleflags - - name: "Install dependency: googlelog" - run: ./contrib/build-package.sh -j -v -i googlelog - - name: "Install dependency: googletest" - run: ./contrib/build-package.sh -j -v -i googletest - - name: "Install dependency: sparsemap" - run: ./contrib/build-package.sh -j -v -i sparsemap - - name: "Install dependency: fmt" - run: ./contrib/build-package.sh -j -v -i fmt - - name: "Install dependency: folly" - run: ./contrib/build-package.sh -j -v -i folly - - name: "Install dependency: fizz" - run: ./contrib/build-package.sh -j -v -i fizz - - name: "Install dependency: wangle" - run: ./contrib/build-package.sh -j -v -i wangle - - name: "Install dependency: fbthrift" - run: ./contrib/build-package.sh -j -v -i fbthrift - - name: "build CacheLib" - # Build cachelib in debug mode (-d) and with all tests (-t) - run: ./contrib/build-package.sh -j -v -i -d -t cachelib - - uses: actions/upload-artifact@v2 - if: failure() - with: - name: cachelib-cmake-logs - path: | - build-cachelib/CMakeFiles/*.log - build-cachelib/CMakeCache.txt - build-cachelib/Makefile - build-cachelib/**/Makefile - if-no-files-found: warn - retention-days: 1 - diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml index 4b4897b610..90c8d739c6 100644 --- a/.github/workflows/clang-format-check.yml +++ b/.github/workflows/clang-format-check.yml @@ -1,6 +1,6 @@ # From: https://github.com/marketplace/actions/clang-format-check#multiple-paths name: clang-format Check -on: [pull_request] +on: [] jobs: formatting-check: name: Formatting Check diff --git a/AdmissionPolicies.md b/AdmissionPolicies.md new file mode 100644 index 0000000000..ada91457b3 --- /dev/null +++ b/AdmissionPolicies.md @@ -0,0 +1,24 @@ +# Admission Policies + +## Allocation policies + +- `maxAcAllocationWatermark`: Item is always allocated in topmost tier if at least this +percentage of the AllocationClass is free. +- `minAcAllocationWatermark`: Item is always allocated in bottom tier if only this percent +of the AllocationClass is free. If percentage of free AllocationClasses is between `maxAcAllocationWatermark` +and `minAcAllocationWatermark`: then extra checks (described below) are performed to decide where to put the element. + +By default, allocation will always be performed from the upper tier. + +- `acTopTierEvictionWatermark`: If there is less that this percent of free memory in topmost tier, cachelib will attempt to evict from top tier. This option takes precedence before allocationWatermarks. + +### Extra policies (used only when percentage of free AllocationClasses is between `maxAcAllocationWatermark` +and `minAcAllocationWatermark`) +- `sizeThresholdPolicy`: If item is smaller than this value, always allocate it in upper tier. +- `defaultTierChancePercentage`: Change (0-100%) of allocating item in top tier + +## MMContainer options + +- `lruInsertionPointSpec`: Can be set per tier when LRU2Q is used. Determines where new items are +inserted. 0 = insert to hot queue, 1 = insert to warm queue, 2 = insert to cold queue +- `markUsefulChance`: Per-tier, determines chance of moving item to the head of LRU on access diff --git a/cachelib/allocator/CMakeLists.txt b/cachelib/allocator/CMakeLists.txt index b659770d82..d64fadc932 100644 --- a/cachelib/allocator/CMakeLists.txt +++ b/cachelib/allocator/CMakeLists.txt @@ -117,6 +117,8 @@ if (BUILD_TESTS) add_test (tests/ChainedHashTest.cpp) add_test (tests/AllocatorResizeTypeTest.cpp) add_test (tests/AllocatorHitStatsTypeTest.cpp) + add_test (tests/AllocatorMemoryTiersTest.cpp) + add_test (tests/MemoryTiersTest.cpp) add_test (tests/MultiAllocatorTest.cpp) add_test (tests/NvmAdmissionPolicyTest.cpp) add_test (tests/CacheAllocatorConfigTest.cpp) diff --git a/cachelib/allocator/Cache.h b/cachelib/allocator/Cache.h index 2511a18291..302a70c4d5 100644 --- a/cachelib/allocator/Cache.h +++ b/cachelib/allocator/Cache.h @@ -83,6 +83,9 @@ class CacheBase { CacheBase(CacheBase&&) = default; CacheBase& operator=(CacheBase&&) = default; + // TODO: come up with some reasonable number + static constexpr unsigned kMaxTiers = 2; + // Get a string referring to the cache name for this cache virtual const std::string getCacheName() const = 0; @@ -90,16 +93,20 @@ class CacheBase { virtual bool isObjectCache() const = 0; // Get the reference to a memory pool, for stats purposes + // uses the top most memory tier by default // // @param poolId The pool id to query virtual const MemoryPool& getPool(PoolId poolId) const = 0; - + // Get Pool specific stats (regular pools). This includes stats from the // Memory Pool and also the cache. // // @param poolId the pool id virtual PoolStats getPoolStats(PoolId poolId) const = 0; + virtual AllocationClassBaseStat getAllocationClassStats( + TierId, PoolId pid, ClassId cid) const = 0; + // @param poolId the pool id virtual AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId poolId) const = 0; diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h index 8c7fec31c4..5f86c2a9ad 100644 --- a/cachelib/allocator/CacheAllocator-inl.h +++ b/cachelib/allocator/CacheAllocator-inl.h @@ -16,12 +16,21 @@ #pragma once +#include + namespace facebook { namespace cachelib { template CacheAllocator::CacheAllocator(Config config) : CacheAllocator(InitMemType::kNone, config) { + // TODO(MEMORY_TIER) + if (getNumTiers() > 1 || std::holds_alternative( + memoryTierConfigs[0].getShmTypeOpts())) { + throw std::runtime_error( + "Using custom memory tier or using more than one tier is only " + "supported for Shared Memory."); + } initCommon(false); } @@ -29,12 +38,14 @@ template CacheAllocator::CacheAllocator(SharedMemNewT, Config config) : CacheAllocator(InitMemType::kMemNew, config) { initCommon(false); - shmManager_->removeShm(detail::kShmInfoName); + shmManager_->removeShm(detail::kShmInfoName, + PosixSysVSegmentOpts(config_.isUsingPosixShm())); } template CacheAllocator::CacheAllocator(SharedMemAttachT, Config config) : CacheAllocator(InitMemType::kMemAttach, config) { + /* TODO - per tier? */ for (auto pid : *metadata_.compactCachePools()) { isCompactCachePool_[pid] = true; } @@ -44,7 +55,8 @@ CacheAllocator::CacheAllocator(SharedMemAttachT, Config config) // We will create a new info shm segment on shutDown(). If we don't remove // this info shm segment here and the new info shm segment's size is larger // than this one, creating new one will fail. - shmManager_->removeShm(detail::kShmInfoName); + shmManager_->removeShm(detail::kShmInfoName, + PosixSysVSegmentOpts(config_.isUsingPosixShm())); } template @@ -53,12 +65,13 @@ CacheAllocator::CacheAllocator( : isOnShm_{type != InitMemType::kNone ? true : config.memMonitoringEnabled()}, config_(config.validate()), + memoryTierConfigs(config.getMemoryTierConfigs()), tempShm_(type == InitMemType::kNone && isOnShm_ - ? std::make_unique(config_.size) + ? std::make_unique(config_.getCacheSize()) : nullptr), shmManager_(type != InitMemType::kNone ? std::make_unique(config_.cacheDir, - config_.usePosixShm) + config_.isUsingPosixShm()) : nullptr), deserializer_(type == InitMemType::kMemAttach ? createDeserializer() : nullptr), @@ -67,20 +80,23 @@ CacheAllocator::CacheAllocator( : serialization::CacheAllocatorMetadata{}}, allocator_(initAllocator(type)), compactCacheManager_(type != InitMemType::kMemAttach - ? std::make_unique(*allocator_) - : restoreCCacheManager()), + ? std::make_unique(*allocator_[0] /* TODO: per tier */) + : restoreCCacheManager(0/* TODO: per tier */)), compressor_(createPtrCompressor()), mmContainers_(type == InitMemType::kMemAttach ? deserializeMMContainers(*deserializer_, compressor_) - : MMContainers{}), + : MMContainers{getNumTiers()}), accessContainer_(initAccessContainer( - type, detail::kShmHashTableName, config.accessConfig)), + type, detail::kShmHashTableName, config.accessConfig, config_.isUsingPosixShm())), chainedItemAccessContainer_( initAccessContainer(type, detail::kShmChainedItemHashTableName, - config.chainedItemAccessConfig)), + config.chainedItemAccessConfig, + config_.isUsingPosixShm())), chainedItemLocks_(config_.chainedItemsLockPower, std::make_shared()), + movesMap_(kShards), + moveLock_(kShards), cacheCreationTime_{ type != InitMemType::kMemAttach ? util::getCurrentTimeSec() @@ -105,39 +121,99 @@ CacheAllocator::~CacheAllocator() { } template -std::unique_ptr -CacheAllocator::createNewMemoryAllocator() { +ShmSegmentOpts CacheAllocator::createShmCacheOpts(TierId tid) { ShmSegmentOpts opts; opts.alignment = sizeof(Slab); + opts.typeOpts = memoryTierConfigs[tid].getShmTypeOpts(); + opts.memBindNumaNodes = memoryTierConfigs[tid].getMemBind(); + if (auto *v = std::get_if(&opts.typeOpts)) { + v->usePosix = config_.usePosixShm; + } + + return opts; +} + +template +size_t CacheAllocator::memoryTierSize(TierId tid) const +{ + auto partitions = std::accumulate(memoryTierConfigs.begin(), memoryTierConfigs.end(), 0UL, + [](const size_t i, const MemoryTierCacheConfig& config){ + return i + config.getRatio(); + }); + + return memoryTierConfigs[tid].calculateTierSize(config_.getCacheSize(), partitions); +} + +template +std::vector> +CacheAllocator::createPrivateAllocator() { + std::vector> allocators; + + if (isOnShm_) + allocators.emplace_back(std::make_unique( + getAllocatorConfig(config_), + tempShm_->getAddr(), + config_.getCacheSize())); + else + allocators.emplace_back(std::make_unique( + getAllocatorConfig(config_), + config_.getCacheSize())); + + return allocators; +} + +template +std::unique_ptr +CacheAllocator::createNewMemoryAllocator(TierId tid) { + size_t tierSize = memoryTierSize(tid); return std::make_unique( getAllocatorConfig(config_), shmManager_ - ->createShm(detail::kShmCacheName, config_.size, - config_.slabMemoryBaseAddr, opts) + ->createShm(detail::kShmCacheName + std::to_string(tid), + tierSize, config_.slabMemoryBaseAddr, + createShmCacheOpts(tid)) .addr, - config_.size); + tierSize); } template std::unique_ptr -CacheAllocator::restoreMemoryAllocator() { - ShmSegmentOpts opts; - opts.alignment = sizeof(Slab); +CacheAllocator::restoreMemoryAllocator(TierId tid) { return std::make_unique( deserializer_->deserialize(), shmManager_ - ->attachShm(detail::kShmCacheName, config_.slabMemoryBaseAddr, opts) - .addr, - config_.size, + ->attachShm(detail::kShmCacheName + std::to_string(tid), + config_.slabMemoryBaseAddr, createShmCacheOpts(tid)).addr, + memoryTierSize(tid), config_.disableFullCoredump); } +template +std::vector> +CacheAllocator::createAllocators() { + std::vector> allocators; + for (int tid = 0; tid < getNumTiers(); tid++) { + allocators.emplace_back(createNewMemoryAllocator(tid)); + } + return allocators; +} + +template +std::vector> +CacheAllocator::restoreAllocators() { + std::vector> allocators; + for (int tid = 0; tid < getNumTiers(); tid++) { + allocators.emplace_back(restoreMemoryAllocator(tid)); + } + return allocators; +} + template std::unique_ptr -CacheAllocator::restoreCCacheManager() { +CacheAllocator::restoreCCacheManager(TierId tid) { return std::make_unique( deserializer_->deserialize(), - *allocator_); + *allocator_[tid]); } template @@ -229,20 +305,15 @@ void CacheAllocator::initWorkers() { } template -std::unique_ptr CacheAllocator::initAllocator( +std::vector> +CacheAllocator::initAllocator( InitMemType type) { if (type == InitMemType::kNone) { - if (isOnShm_ == true) { - return std::make_unique( - getAllocatorConfig(config_), tempShm_->getAddr(), config_.size); - } else { - return std::make_unique(getAllocatorConfig(config_), - config_.size); - } + return createPrivateAllocator(); } else if (type == InitMemType::kMemNew) { - return createNewMemoryAllocator(); + return createAllocators(); } else if (type == InitMemType::kMemAttach) { - return restoreMemoryAllocator(); + return restoreAllocators(); } // Invalid type @@ -255,7 +326,8 @@ template std::unique_ptr::AccessContainer> CacheAllocator::initAccessContainer(InitMemType type, const std::string name, - AccessConfig config) { + AccessConfig config, + bool usePosixShm) { if (type == InitMemType::kNone) { return std::make_unique( config, compressor_, @@ -268,7 +340,7 @@ CacheAllocator::initAccessContainer(InitMemType type, name, AccessContainer::getRequiredSize(config.getNumBuckets()), nullptr, - ShmSegmentOpts(config.getPageSize())) + ShmSegmentOpts(config.getPageSize(), false, usePosixShm)) .addr, compressor_, [this](Item* it) -> WriteHandle { return acquire(it); }); @@ -276,7 +348,8 @@ CacheAllocator::initAccessContainer(InitMemType type, return std::make_unique( deserializer_->deserialize(), config, - shmManager_->attachShm(name), + shmManager_->attachShm(name, nullptr, + ShmSegmentOpts(config.getPageSize(), false, usePosixShm)), compressor_, [this](Item* it) -> WriteHandle { return acquire(it); }); } @@ -289,7 +362,8 @@ CacheAllocator::initAccessContainer(InitMemType type, template std::unique_ptr CacheAllocator::createDeserializer() { - auto infoAddr = shmManager_->attachShm(detail::kShmInfoName); + auto infoAddr = shmManager_->attachShm(detail::kShmInfoName, nullptr, + ShmSegmentOpts(PageSizeT::NORMAL, false, config_.isUsingPosixShm())); return std::make_unique( reinterpret_cast(infoAddr.addr), reinterpret_cast(infoAddr.addr) + infoAddr.size); @@ -311,11 +385,12 @@ CacheAllocator::allocate(PoolId poolId, template typename CacheAllocator::WriteHandle -CacheAllocator::allocateInternal(PoolId pid, - typename Item::Key key, - uint32_t size, - uint32_t creationTime, - uint32_t expiryTime) { +CacheAllocator::allocateInternalTier(TierId tid, + PoolId pid, + typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime) { util::LatencyTracker tracker{stats().allocateLatency_}; SCOPE_FAIL { stats_.invalidAllocs.inc(); }; @@ -324,13 +399,30 @@ CacheAllocator::allocateInternal(PoolId pid, const auto requiredSize = Item::getRequiredSize(key, size); // the allocation class in our memory allocator. - const auto cid = allocator_->getAllocationClassId(pid, requiredSize); + const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); + util::RollingLatencyTracker rollTracker{(*stats_.classAllocLatency)[tid][pid][cid]}; + // TODO: per-tier (*stats_.allocAttempts)[pid][cid].inc(); - void* memory = allocator_->allocate(pid, requiredSize); + void *memory = nullptr; + + if (tid == 0 && config_.acTopTierEvictionWatermark > 0.0 + && getAllocationClassStats(tid, pid, cid) + .approxFreePercent < config_.acTopTierEvictionWatermark) { + memory = findEviction(tid, pid, cid); + } + + if (memory == nullptr) { + // TODO: should we try allocate item even if this will result in violating + // acTopTierEvictionWatermark? + memory = allocator_[tid]->allocate(pid, requiredSize); + } + + // TODO: Today disableEviction means do not evict from memory (DRAM). + // Should we support eviction between memory tiers (e.g. from DRAM to next tier)? if (memory == nullptr && !config_.isEvictionDisabled()) { - memory = findEviction(pid, cid); + memory = findEviction(tid, pid, cid); } WriteHandle handle; @@ -341,7 +433,7 @@ CacheAllocator::allocateInternal(PoolId pid, // for example. SCOPE_FAIL { // free back the memory to the allocator since we failed. - allocator_->free(memory); + allocator_[tid]->free(memory); }; handle = acquire(new (memory) Item(key, size, creationTime, expiryTime)); @@ -352,7 +444,7 @@ CacheAllocator::allocateInternal(PoolId pid, } } else { // failed to allocate memory. - (*stats_.allocFailures)[pid][cid].inc(); + (*stats_.allocFailures)[pid][cid].inc(); // TODO: per-tier // wake up rebalancer if (poolRebalancer_) { poolRebalancer_->wakeUp(); @@ -369,6 +461,73 @@ CacheAllocator::allocateInternal(PoolId pid, return handle; } +template +TierId +CacheAllocator::getTargetTierForItem(PoolId pid, + typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime) { + if (getNumTiers() == 1) + return 0; + + if (config_.forceAllocationTier != UINT64_MAX) { + return config_.forceAllocationTier; + } + + const TierId defaultTargetTier = 0; + + const auto requiredSize = Item::getRequiredSize(key, size); + const auto cid = allocator_[defaultTargetTier]->getAllocationClassId(pid, requiredSize); + + auto freePercentage = getAllocationClassStats(defaultTargetTier, pid, cid).approxFreePercent; + + // TODO: COULD we implement BG worker which would move slabs around + // so that there is similar amount of free space in each pool/ac. + // Should this be responsibility of BG evictor? + + if (freePercentage >= config_.maxAcAllocationWatermark) + return defaultTargetTier; + + if (freePercentage <= config_.minAcAllocationWatermark) + return defaultTargetTier + 1; + + // TODO: we can even think about creating different allocation classes for different tiers + // and we could look at possible fragmentation when deciding where to put the item + if (config_.sizeThresholdPolicy) + return requiredSize < config_.sizeThresholdPolicy ? defaultTargetTier : defaultTargetTier + 1; + + // TODO: (e.g. always put chained items to PMEM) + // if (chainedItemsPolicy) + // return item.isChainedItem() ? defaultTargetTier + 1 : defaultTargetTier; + + // TODO: + // if (expiryTimePolicy) + // return (expiryTime - creationTime) < expiryTimePolicy ? defaultTargetTier : defaultTargetTier + 1; + + // TODO: + // if (keyPolicy) // this can be based on key length or some other properties + // return getTargetTierForKey(key); + + // TODO: + // if (compressabilityPolicy) // if compresses well store in PMEM? latency will be higher anyway + // return TODO; + + // TODO: only works for 2 tiers + return (folly::Random::rand32() % 100) < config_.defaultTierChancePercentage ? defaultTargetTier : defaultTargetTier + 1; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::allocateInternal(PoolId pid, + typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime) { + auto tid = getTargetTierForItem(pid, key, size, creationTime, expiryTime); + return allocateInternalTier(tid, pid, key, size, creationTime, expiryTime); +} + template typename CacheAllocator::WriteHandle CacheAllocator::allocateChainedItem(const ReadHandle& parent, @@ -399,21 +558,28 @@ CacheAllocator::allocateChainedItemInternal( // number of bytes required for this item const auto requiredSize = ChainedItem::getRequiredSize(size); - const auto pid = allocator_->getAllocInfo(parent->getMemory()).poolId; - const auto cid = allocator_->getAllocationClassId(pid, requiredSize); + // TODO: is this correct? + auto tid = getTierId(*parent); + const auto pid = allocator_[tid]->getAllocInfo(parent->getMemory()).poolId; + const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); + + util::RollingLatencyTracker rollTracker{(*stats_.classAllocLatency)[tid][pid][cid]}; + + // TODO: per-tier? Right now stats_ are not used in any public periodic + // worker (*stats_.allocAttempts)[pid][cid].inc(); - void* memory = allocator_->allocate(pid, requiredSize); + void* memory = allocator_[tid]->allocate(pid, requiredSize); if (memory == nullptr) { - memory = findEviction(pid, cid); + memory = findEviction(tid, pid, cid); } if (memory == nullptr) { (*stats_.allocFailures)[pid][cid].inc(); return WriteHandle{}; } - SCOPE_FAIL { allocator_->free(memory); }; + SCOPE_FAIL { allocator_[tid]->free(memory); }; auto child = acquire( new (memory) ChainedItem(compressor_.compress(parent.getInternal()), size, @@ -722,8 +888,8 @@ CacheAllocator::releaseBackToAllocator(Item& it, throw std::runtime_error( folly::sformat("cannot release this item: {}", it.toString())); } - - const auto allocInfo = allocator_->getAllocInfo(it.getMemory()); + const auto tid = getTierId(it); + const auto allocInfo = allocator_[tid]->getAllocInfo(it.getMemory()); if (ctx == RemoveContext::kEviction) { const auto timeNow = util::getCurrentTimeSec(); @@ -747,8 +913,7 @@ CacheAllocator::releaseBackToAllocator(Item& it, folly::sformat("Can not recycle a chained item {}, toRecyle", it.toString(), toRecycle->toString())); } - - allocator_->free(&it); + allocator_[tid]->free(&it); return ReleaseRes::kReleased; } @@ -807,7 +972,7 @@ CacheAllocator::releaseBackToAllocator(Item& it, auto next = head->getNext(compressor_); const auto childInfo = - allocator_->getAllocInfo(static_cast(head)); + allocator_[tid]->getAllocInfo(static_cast(head)); (*stats_.fragmentationSize)[childInfo.poolId][childInfo.classId].sub( util::getFragmentation(*this, *head)); @@ -840,7 +1005,7 @@ CacheAllocator::releaseBackToAllocator(Item& it, XDCHECK(ReleaseRes::kReleased != res); res = ReleaseRes::kRecycled; } else { - allocator_->free(head); + allocator_[tid]->free(head); } } @@ -855,7 +1020,7 @@ CacheAllocator::releaseBackToAllocator(Item& it, res = ReleaseRes::kRecycled; } else { XDCHECK(it.isDrained()); - allocator_->free(&it); + allocator_[tid]->free(&it); } return res; @@ -927,6 +1092,25 @@ bool CacheAllocator::replaceInMMContainer(Item& oldItem, } } +template +bool CacheAllocator::replaceInMMContainer(Item* oldItem, + Item& newItem) { + return replaceInMMContainer(*oldItem, newItem); +} + +template +bool CacheAllocator::replaceInMMContainer(EvictionIterator& oldItemIt, + Item& newItem) { + auto& oldContainer = getMMContainer(*oldItemIt); + auto& newContainer = getMMContainer(newItem); + + // This function is used for eviction across tiers + XDCHECK(&oldContainer != &newContainer); + oldContainer.remove(oldItemIt); + + return newContainer.add(newItem); +} + template bool CacheAllocator::replaceChainedItemInMMContainer( Item& oldItem, Item& newItem) { @@ -1072,6 +1256,157 @@ CacheAllocator::insertOrReplace(const WriteHandle& handle) { return replaced; } +/* Next two methods are used to asynchronously move Item between memory tiers. + * + * The thread, which moves Item, allocates new Item in the tier we are moving to + * and calls moveRegularItemWithSync() method. This method does the following: + * 1. Create MoveCtx and put it to the movesMap. + * 2. Update the access container with the new item from the tier we are + * moving to. This Item has kIncomplete flag set. + * 3. Copy data from the old Item to the new one. + * 4. Unset the kIncomplete flag and Notify MoveCtx + * + * Concurrent threads which are getting handle to the same key: + * 1. When a handle is created it checks if the kIncomplete flag is set + * 2. If so, Handle implementation creates waitContext and adds it to the + * MoveCtx by calling addWaitContextForMovingItem() method. + * 3. Wait until the moving thread will complete its job. + */ +template +bool CacheAllocator::addWaitContextForMovingItem( + folly::StringPiece key, std::shared_ptr> waiter) { + auto shard = getShardForKey(key); + auto& movesMap = getMoveMapForShard(shard); + auto lock = getMoveLockForShard(shard); + auto it = movesMap.find(key); + if (it == movesMap.end()) { + return false; + } + auto ctx = it->second.get(); + ctx->addWaiter(std::move(waiter)); + return true; +} + +template +template +typename CacheAllocator::WriteHandle +CacheAllocator::moveRegularItemWithSync( + Item& oldItem, WriteHandle& newItemHdl, P&& predicate) { + XDCHECK(oldItem.isMoving()); + // TODO: should we introduce new latency tracker. E.g. evictRegularLatency_ + // ??? util::LatencyTracker tracker{stats_.evictRegularLatency_}; + + if (!oldItem.isAccessible() || oldItem.isExpired()) { + return {}; + } + + XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize()); + XDCHECK_NE(getTierId(oldItem), getTierId(*newItemHdl)); + + // take care of the flags before we expose the item to be accessed. this + // will ensure that when another thread removes the item from RAM, we issue + // a delete accordingly. See D7859775 for an example + if (oldItem.isNvmClean()) { + newItemHdl->markNvmClean(); + } + + folly::StringPiece key(oldItem.getKey()); + auto shard = getShardForKey(key); + auto& movesMap = getMoveMapForShard(shard); + MoveCtx* ctx(nullptr); + { + auto lock = getMoveLockForShard(shard); + auto res = movesMap.try_emplace(key, std::make_unique()); + if (!res.second) { + return {}; + } + ctx = res.first->second.get(); + } + + auto resHdl = WriteHandle{}; + auto guard = folly::makeGuard([key, this, ctx, shard, &resHdl]() { + auto& movesMap = getMoveMapForShard(shard); + if (resHdl) + resHdl->unmarkIncomplete(); + auto lock = getMoveLockForShard(shard); + ctx->setItemHandle(std::move(resHdl)); + movesMap.erase(key); + }); + + // TODO: Possibly we can use markMoving() instead. But today + // moveOnSlabRelease logic assume that we mark as moving old Item + // and than do copy and replace old Item with the new one in access + // container. Furthermore, Item can be marked as Moving only + // if it is linked to MM container. In our case we mark the new Item + // and update access container before the new Item is ready (content is + // copied). + newItemHdl->markIncomplete(); + + // Inside the access container's lock, this checks if the old item is + // accessible and its refcount is zero. If the item is not accessible, + // there is no point to replace it since it had already been removed + // or in the process of being removed. If the item is in cache but the + // refcount is non-zero, it means user could be attempting to remove + // this item through an API such as remove(ItemHandle). In this case, + // it is unsafe to replace the old item with a new one, so we should + // also abort. + if (!accessContainer_->replaceIf(oldItem, *newItemHdl, + predicate)) { + return {}; + } + + if (config_.moveCb) { + // Execute the move callback. We cannot make any guarantees about the + // consistency of the old item beyond this point, because the callback can + // do more than a simple memcpy() e.g. update external references. If there + // are any remaining handles to the old item, it is the caller's + // responsibility to invalidate them. The move can only fail after this + // statement if the old item has been removed or replaced, in which case it + // should be fine for it to be left in an inconsistent state. + config_.moveCb(oldItem, *newItemHdl, nullptr); + } else { + std::memcpy(newItemHdl->getMemory(), oldItem.getMemory(), + oldItem.getSize()); + } + + // Inside the MM container's lock, this checks if the old item exists to + // make sure that no other thread removed it, and only then replaces it. + if (!replaceInMMContainer(oldItem, *newItemHdl)) { + accessContainer_->remove(*newItemHdl); + return {}; + } + + // Replacing into the MM container was successful, but someone could have + // called insertOrReplace() or remove() before or after the + // replaceInMMContainer() operation, which would invalidate newItemHdl. + if (!newItemHdl->isAccessible()) { + removeFromMMContainer(*newItemHdl); + return {}; + } + + // no one can add or remove chained items at this point + if (oldItem.hasChainedItem()) { + // safe to acquire handle for a moving Item + auto oldHandle = acquire(&oldItem); + XDCHECK_EQ(1u, oldHandle->getRefCount()) << oldHandle->toString(); + XDCHECK(!newItemHdl->hasChainedItem()) << newItemHdl->toString(); + try { + auto l = chainedItemLocks_.lockExclusive(oldItem.getKey()); + transferChainLocked(oldHandle, newItemHdl); + } catch (const std::exception& e) { + // this should never happen because we drained all the handles. + XLOGF(DFATAL, "{}", e.what()); + throw; + } + + XDCHECK(!oldItem.hasChainedItem()); + XDCHECK(newItemHdl->hasChainedItem()); + } + newItemHdl.unmarkNascent(); + resHdl = std::move(newItemHdl); // guard will assign it to ctx under lock + return acquire(&oldItem); +} + template bool CacheAllocator::moveRegularItem(Item& oldItem, WriteHandle& newItemHdl) { @@ -1214,48 +1549,78 @@ bool CacheAllocator::moveChainedItem(ChainedItem& oldItem, template typename CacheAllocator::Item* -CacheAllocator::findEviction(PoolId pid, ClassId cid) { - auto& mmContainer = getMMContainer(pid, cid); +CacheAllocator::findEviction(TierId tid, PoolId pid, ClassId cid) { + auto& mmContainer = getMMContainer(tid, pid, cid); // Keep searching for a candidate until we were able to evict it // or until the search limit has been exhausted unsigned int searchTries = 0; - auto itr = mmContainer.getEvictionIterator(); while ((config_.evictionSearchTries == 0 || - config_.evictionSearchTries > searchTries) && - itr) { + config_.evictionSearchTries > searchTries)) { ++searchTries; (*stats_.evictionAttempts)[pid][cid].inc(); - Item* candidate = itr.get(); + Item* toRecycle = nullptr; + Item* candidate = nullptr; + + mmContainer.withEvictionIterator([this, &candidate, &toRecycle, &searchTries](auto &&itr){ + while ((config_.evictionSearchTries == 0 || + config_.evictionSearchTries > searchTries) && itr) { + ++searchTries; + + auto *toRecycle_ = itr.get(); + auto *candidate_ = toRecycle_->isChainedItem() + ? &toRecycle_->asChainedItem().getParentItem(compressor_) + : toRecycle_; + + // make sure no other thead is evicting the item + if (candidate_->getRefCount() == 0 && candidate_->markMoving()) { + toRecycle = toRecycle_; + candidate = candidate_; + return; + } + + ++itr; + } + }); + + if (!toRecycle) + continue; + + XDCHECK(toRecycle); + XDCHECK(candidate); + // for chained items, the ownership of the parent can change. We try to // evict what we think as parent and see if the eviction of parent // recycles the child we intend to. auto toReleaseHandle = - itr->isChainedItem() - ? advanceIteratorAndTryEvictChainedItem(itr) - : advanceIteratorAndTryEvictRegularItem(mmContainer, itr); + evictNormalItem(*candidate, true /* skipIfTokenInvalid */); + auto ref = candidate->unmarkMoving(); - if (toReleaseHandle) { - if (toReleaseHandle->hasChainedItem()) { + if (toReleaseHandle || ref == 0u) { + if (candidate->hasChainedItem()) { (*stats_.chainedItemEvictions)[pid][cid].inc(); } else { (*stats_.regularItemEvictions)[pid][cid].inc(); } + } else { + if (candidate->hasChainedItem()) { + stats_.evictFailParentAC.inc(); + } else { + stats_.evictFailAC.inc(); + } + } + if (toReleaseHandle) { if (auto eventTracker = getEventTracker()) { eventTracker->record( AllocatorApiEvent::DRAM_EVICT, toReleaseHandle->getKey(), AllocatorApiResult::EVICTED, toReleaseHandle->getSize(), toReleaseHandle->getConfiguredTTL().count()); } - // Invalidate iterator since later on we may use this mmContainer - // again, which cannot be done unless we drop this iterator - itr.destroy(); - // we must be the last handle and for chained items, this will be - // the parent. - XDCHECK(toReleaseHandle.get() == candidate || candidate->isChainedItem()); + XDCHECK(toReleaseHandle.get() == candidate); + XDCHECK(toRecycle == candidate || toRecycle->isChainedItem()); XDCHECK_EQ(1u, toReleaseHandle->getRefCount()); // We manually release the item here because we don't want to @@ -1271,15 +1636,18 @@ CacheAllocator::findEviction(PoolId pid, ClassId cid) { // recycle the candidate. if (ReleaseRes::kRecycled == releaseBackToAllocator(itemToRelease, RemoveContext::kEviction, - /* isNascent */ false, candidate)) { - return candidate; + /* isNascent */ false, toRecycle)) { + return toRecycle; + } + } else if (ref == 0u) { + // it's safe to recycle the item here as there are no more + // references and the item could not been marked as moving + // by other thread since it's detached from MMContainer. + if (ReleaseRes::kRecycled == + releaseBackToAllocator(*candidate, RemoveContext::kEviction, + /* isNascent */ false, toRecycle)) { + return toRecycle; } - } - - // If we destroyed the itr to possibly evict and failed, we restart - // from the beginning again - if (!itr) { - itr.resetToBegin(); } } return nullptr; @@ -1334,140 +1702,51 @@ bool CacheAllocator::shouldWriteToNvmCacheExclusive( } template -typename CacheAllocator::WriteHandle -CacheAllocator::advanceIteratorAndTryEvictRegularItem( - MMContainer& mmContainer, EvictionIterator& itr) { - // we should flush this to nvmcache if it is not already present in nvmcache - // and the item is not expired. - Item& item = *itr; - const bool evictToNvmCache = shouldWriteToNvmCache(item); - - auto token = evictToNvmCache ? nvmCache_->createPutToken(item.getKey()) - : typename NvmCacheT::PutToken{}; - // record the in-flight eviciton. If not, we move on to next item to avoid - // stalling eviction. - if (evictToNvmCache && !token.isValid()) { - ++itr; - stats_.evictFailConcurrentFill.inc(); - return WriteHandle{}; - } - - // If there are other accessors, we should abort. Acquire a handle here since - // if we remove the item from both access containers and mm containers - // below, we will need a handle to ensure proper cleanup in case we end up - // not evicting this item - auto evictHandle = accessContainer_->removeIf(item, &itemEvictionPredicate); - - if (!evictHandle) { - ++itr; - stats_.evictFailAC.inc(); - return evictHandle; - } - - mmContainer.remove(itr); - XDCHECK_EQ(reinterpret_cast(evictHandle.get()), - reinterpret_cast(&item)); - XDCHECK(!evictHandle->isInMMContainer()); - XDCHECK(!evictHandle->isAccessible()); - - // If the item is now marked as moving, that means its corresponding slab is - // being released right now. So, we look for the next item that is eligible - // for eviction. It is safe to destroy the handle here since the moving bit - // is set. Iterator was already advance by the remove call above. - if (evictHandle->isMoving()) { - stats_.evictFailMove.inc(); - return WriteHandle{}; - } - - // Invalidate iterator since later on if we are not evicting this - // item, we may need to rely on the handle we created above to ensure - // proper cleanup if the item's raw refcount has dropped to 0. - // And since this item may be a parent item that has some child items - // in this very same mmContainer, we need to make sure we drop this - // exclusive iterator so we can gain access to it when we're cleaning - // up the child items - itr.destroy(); - - // Ensure that there are no accessors after removing from the access - // container - XDCHECK(evictHandle->getRefCount() == 1); - - if (evictToNvmCache && shouldWriteToNvmCacheExclusive(item)) { - XDCHECK(token.isValid()); - nvmCache_->put(evictHandle, std::move(token)); - } - return evictHandle; +bool CacheAllocator::shouldEvictToNextMemoryTier( + TierId sourceTierId, TierId targetTierId, PoolId pid, Item& item) +{ + return !(config_.disableEvictionToMemory); } template typename CacheAllocator::WriteHandle -CacheAllocator::advanceIteratorAndTryEvictChainedItem( - EvictionIterator& itr) { - XDCHECK(itr->isChainedItem()); - - ChainedItem* candidate = &itr->asChainedItem(); - ++itr; +CacheAllocator::tryEvictToNextMemoryTier( + TierId tid, PoolId pid, Item& item) { + if(item.isChainedItem()) return {}; // TODO: We do not support ChainedItem yet + if(item.isExpired()) { + auto handle = removeIf(item, [](const Item& it) { + return it.getRefCount() == 0; + }); - // The parent could change at any point through transferChain. However, if - // that happens, we would realize that the releaseBackToAllocator return - // kNotRecycled and we would try another chained item, leading to transient - // failure. - auto& parent = candidate->getParentItem(compressor_); - - const bool evictToNvmCache = shouldWriteToNvmCache(parent); - - auto token = evictToNvmCache ? nvmCache_->createPutToken(parent.getKey()) - : typename NvmCacheT::PutToken{}; - - // if token is invalid, return. iterator is already advanced. - if (evictToNvmCache && !token.isValid()) { - stats_.evictFailConcurrentFill.inc(); - return WriteHandle{}; - } - - // check if the parent exists in the hashtable and refcount is drained. - auto parentHandle = - accessContainer_->removeIf(parent, &itemEvictionPredicate); - if (!parentHandle) { - stats_.evictFailParentAC.inc(); - return parentHandle; + if (handle) { return handle; } } - // Invalidate iterator since later on we may use the mmContainer - // associated with this iterator which cannot be done unless we - // drop this iterator - // - // This must be done once we know the parent is not nullptr. - // Since we can very well be the last holder of this parent item, - // which may have a chained item that is linked in this MM container. - itr.destroy(); - - // Ensure we have the correct parent and we're the only user of the - // parent, then free it from access container. Otherwise, we abort - XDCHECK_EQ(reinterpret_cast(&parent), - reinterpret_cast(parentHandle.get())); - XDCHECK_EQ(1u, parent.getRefCount()); - - removeFromMMContainer(*parentHandle); - - XDCHECK(!parent.isInMMContainer()); - XDCHECK(!parent.isAccessible()); + TierId nextTier = tid; + while (++nextTier < getNumTiers()) { // try to evict down to the next memory tiers + if (!shouldEvictToNextMemoryTier(tid, nextTier, pid, item)) + continue; + // allocateInternal might trigger another eviction + auto newItemHdl = allocateInternalTier(nextTier, pid, + item.getKey(), + item.getSize(), + item.getCreationTime(), + item.getExpiryTime()); - // We need to make sure the parent is not marked as moving - // and we're the only holder of the parent item. Safe to destroy the handle - // here since moving bit is set. - if (parentHandle->isMoving()) { - stats_.evictFailParentMove.inc(); - return WriteHandle{}; + if (newItemHdl) { + XDCHECK_EQ(newItemHdl->getSize(), item.getSize()); + return moveRegularItemWithSync(item, newItemHdl, itemMovingPredicate); + } } - if (evictToNvmCache && shouldWriteToNvmCacheExclusive(*parentHandle)) { - XDCHECK(token.isValid()); - XDCHECK(parentHandle->hasChainedItem()); - nvmCache_->put(parentHandle, std::move(token)); - } + return {}; +} - return parentHandle; +template +typename CacheAllocator::WriteHandle +CacheAllocator::tryEvictToNextMemoryTier(Item& item) { + auto tid = getTierId(item); + auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId; + return tryEvictToNextMemoryTier(tid, pid, item); } template @@ -1670,33 +1949,56 @@ void CacheAllocator::invalidateNvm(Item& item) { } } +template +TierId +CacheAllocator::getTierId(const Item& item) const { + return getTierId(item.getMemory()); +} + +template +TierId +CacheAllocator::getTierId(const void* ptr) const { + for (TierId tid = 0; tid < getNumTiers(); tid++) { + if (allocator_[tid]->isMemoryInAllocator(ptr)) + return tid; + } + + throw std::invalid_argument("Item does not belong to any tier!"); +} + template typename CacheAllocator::MMContainer& CacheAllocator::getMMContainer(const Item& item) const noexcept { + const auto tid = getTierId(item); const auto allocInfo = - allocator_->getAllocInfo(static_cast(&item)); - return getMMContainer(allocInfo.poolId, allocInfo.classId); + allocator_[tid]->getAllocInfo(static_cast(&item)); + return getMMContainer(tid, allocInfo.poolId, allocInfo.classId); } template typename CacheAllocator::MMContainer& -CacheAllocator::getMMContainer(PoolId pid, +CacheAllocator::getMMContainer(TierId tid, + PoolId pid, ClassId cid) const noexcept { - XDCHECK_LT(static_cast(pid), mmContainers_.size()); - XDCHECK_LT(static_cast(cid), mmContainers_[pid].size()); - return *mmContainers_[pid][cid]; + XDCHECK_LT(static_cast(tid), mmContainers_.size()); + XDCHECK_LT(static_cast(pid), mmContainers_[tid].size()); + XDCHECK_LT(static_cast(cid), mmContainers_[tid][pid].size()); + return *mmContainers_[tid][pid][cid]; } template MMContainerStat CacheAllocator::getMMContainerStat( - PoolId pid, ClassId cid) const noexcept { - if (static_cast(pid) >= mmContainers_.size()) { + TierId tid, PoolId pid, ClassId cid) const noexcept { + if(static_cast(tid) >= mmContainers_.size()) { return MMContainerStat{}; } - if (static_cast(cid) >= mmContainers_[pid].size()) { + if (static_cast(pid) >= mmContainers_[tid].size()) { return MMContainerStat{}; } - return mmContainers_[pid][cid] ? mmContainers_[pid][cid]->getStats() + if (static_cast(cid) >= mmContainers_[tid][pid].size()) { + return MMContainerStat{}; + } + return mmContainers_[tid][pid][cid] ? mmContainers_[tid][pid][cid]->getStats() : MMContainerStat{}; } @@ -1863,8 +2165,9 @@ void CacheAllocator::markUseful(const ReadHandle& handle, template bool CacheAllocator::recordAccessInMMContainer(Item& item, AccessMode mode) { + const auto tid = getTierId(item); const auto allocInfo = - allocator_->getAllocInfo(static_cast(&item)); + allocator_[tid]->getAllocInfo(static_cast(&item)); (*stats_.cacheHits)[allocInfo.poolId][allocInfo.classId].inc(); // track recently accessed items if needed @@ -1872,14 +2175,15 @@ bool CacheAllocator::recordAccessInMMContainer(Item& item, ring_->trackItem(reinterpret_cast(&item), item.getSize()); } - auto& mmContainer = getMMContainer(allocInfo.poolId, allocInfo.classId); + auto& mmContainer = getMMContainer(tid, allocInfo.poolId, allocInfo.classId); return mmContainer.recordAccess(item, mode); } template uint32_t CacheAllocator::getUsableSize(const Item& item) const { + const auto tid = getTierId(item); const auto allocSize = - allocator_->getAllocInfo(static_cast(&item)).allocSize; + allocator_[tid]->getAllocInfo(static_cast(&item)).allocSize; return item.isChainedItem() ? allocSize - ChainedItem::getRequiredSize(0) : allocSize - Item::getRequiredSize(item.getKey(), 0); @@ -1888,8 +2192,11 @@ uint32_t CacheAllocator::getUsableSize(const Item& item) const { template typename CacheAllocator::ReadHandle CacheAllocator::getSampleItem() { + // TODO: is using random tier a good idea? + auto tid = folly::Random::rand32() % getNumTiers(); + const auto* item = - reinterpret_cast(allocator_->getRandomAlloc()); + reinterpret_cast(allocator_[tid]->getRandomAlloc()); if (!item) { return ReadHandle{}; } @@ -1904,26 +2211,34 @@ CacheAllocator::getSampleItem() { template std::vector CacheAllocator::dumpEvictionIterator( - PoolId pid, ClassId cid, size_t numItems) { + PoolId pid, ClassId cid, size_t numItems) { if (numItems == 0) { return {}; } - if (static_cast(pid) >= mmContainers_.size() || - static_cast(cid) >= mmContainers_[pid].size()) { + // Always evict from the lowest layer. + int tid = getNumTiers() - 1; + + if (static_cast(tid) >= mmContainers_.size() || + static_cast(pid) >= mmContainers_[tid].size() || + static_cast(cid) >= mmContainers_[tid][pid].size()) { throw std::invalid_argument( - folly::sformat("Invalid PoolId: {} and ClassId: {}.", pid, cid)); + folly::sformat("Invalid TierId: {} and PoolId: {} and ClassId: {}.", tid, pid, cid)); } std::vector content; - auto& mm = *mmContainers_[pid][cid]; - auto evictItr = mm.getEvictionIterator(); size_t i = 0; - while (evictItr && i < numItems) { - content.push_back(evictItr->toString()); - ++evictItr; - ++i; + while (i < numItems && tid >= 0) { + auto& mm = *mmContainers_[tid][pid][cid]; + auto evictItr = mm.getEvictionIterator(); + while (evictItr && i < numItems) { + content.push_back(evictItr->toString()); + ++evictItr; + ++i; + } + + --tid; } return content; @@ -2101,19 +2416,40 @@ PoolId CacheAllocator::addPool( std::shared_ptr resizeStrategy, bool ensureProvisionable) { folly::SharedMutex::WriteHolder w(poolsResizeAndRebalanceLock_); - auto pid = allocator_->addPool(name, size, allocSizes, ensureProvisionable); + + PoolId pid = 0; + size_t totalCacheSize = 0; + + for (TierId tid = 0; tid < getNumTiers(); tid++) { + totalCacheSize += allocator_[tid]->getMemorySize(); + } + + for (TierId tid = 0; tid < getNumTiers(); tid++) { + auto tierSizeRatio = + static_cast(allocator_[tid]->getMemorySize()) / totalCacheSize; + size_t tierPoolSize = static_cast(tierSizeRatio * size); + + // TODO: what if we manage to add pool only in one tier? + // we should probably remove that on failure + auto res = allocator_[tid]->addPool( + name, tierPoolSize, allocSizes, ensureProvisionable); + XDCHECK(tid == 0 || res == pid); + pid = res; + } + createMMContainers(pid, std::move(config)); setRebalanceStrategy(pid, std::move(rebalanceStrategy)); setResizeStrategy(pid, std::move(resizeStrategy)); + return pid; } template void CacheAllocator::overridePoolRebalanceStrategy( PoolId pid, std::shared_ptr rebalanceStrategy) { - if (static_cast(pid) >= mmContainers_.size()) { + if (static_cast(pid) >= mmContainers_[0].size()) { throw std::invalid_argument(folly::sformat( - "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[0].size())); } setRebalanceStrategy(pid, std::move(rebalanceStrategy)); } @@ -2121,9 +2457,9 @@ void CacheAllocator::overridePoolRebalanceStrategy( template void CacheAllocator::overridePoolResizeStrategy( PoolId pid, std::shared_ptr resizeStrategy) { - if (static_cast(pid) >= mmContainers_.size()) { + if (static_cast(pid) >= mmContainers_[0].size()) { throw std::invalid_argument(folly::sformat( - "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[0].size())); } setResizeStrategy(pid, std::move(resizeStrategy)); } @@ -2135,14 +2471,14 @@ void CacheAllocator::overridePoolOptimizeStrategy( } template -void CacheAllocator::overridePoolConfig(PoolId pid, +void CacheAllocator::overridePoolConfig(TierId tid, PoolId pid, const MMConfig& config) { - if (static_cast(pid) >= mmContainers_.size()) { + // TODO: add generic tier id checking + if (static_cast(pid) >= mmContainers_[tid].size()) { throw std::invalid_argument(folly::sformat( - "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[tid].size())); } - - auto& pool = allocator_->getPool(pid); + auto& pool = allocator_[tid]->getPool(pid); for (unsigned int cid = 0; cid < pool.getNumClassId(); ++cid) { MMConfig mmConfig = config; mmConfig.addExtraConfig( @@ -2150,29 +2486,39 @@ void CacheAllocator::overridePoolConfig(PoolId pid, ? pool.getAllocationClass(static_cast(cid)) .getAllocsPerSlab() : 0); - DCHECK_NOTNULL(mmContainers_[pid][cid].get()); - mmContainers_[pid][cid]->setConfig(mmConfig); + DCHECK_NOTNULL(mmContainers_[tid][pid][cid].get()); + mmContainers_[tid][pid][cid]->setConfig(mmConfig); } } template void CacheAllocator::createMMContainers(const PoolId pid, MMConfig config) { - auto& pool = allocator_->getPool(pid); + // pools on each layer should have the same number of class id, etc. + // TODO: think about deduplication + auto& pool = allocator_[0]->getPool(pid); + for (unsigned int cid = 0; cid < pool.getNumClassId(); ++cid) { config.addExtraConfig( config_.trackTailHits ? pool.getAllocationClass(static_cast(cid)) .getAllocsPerSlab() : 0); - mmContainers_[pid][cid].reset(new MMContainer(config, compressor_)); + for (TierId tid = 0; tid < getNumTiers(); tid++) { + if constexpr (std::is_same_v || std::is_same_v) { + config.lruInsertionPointSpec = config_.memoryTierConfigs[tid].lruInsertionPointSpec ; + config.markUsefulChance = config_.memoryTierConfigs[tid].markUsefulChance; + } + mmContainers_[tid][pid][cid].reset(new MMContainer(config, compressor_)); + } } } template PoolId CacheAllocator::getPoolId( folly::StringPiece name) const noexcept { - return allocator_->getPoolId(name.str()); + // each tier has the same pools + return allocator_[0]->getPoolId(name.str()); } // The Function returns a consolidated vector of Release Slab @@ -2215,7 +2561,9 @@ std::set CacheAllocator::filterCompactCachePools( template std::set CacheAllocator::getRegularPoolIds() const { folly::SharedMutex::ReadHolder r(poolsResizeAndRebalanceLock_); - return filterCompactCachePools(allocator_->getPoolIds()); + // TODO - get rid of the duplication - right now, each tier + // holds pool objects with mostly the same info + return filterCompactCachePools(allocator_[currentTier()]->getPoolIds()); } template @@ -2240,10 +2588,9 @@ std::set CacheAllocator::getRegularPoolIdsForResize() // getAdvisedMemorySize - then pools may be overLimit even when // all slabs are not allocated. Otherwise, pools may be overLimit // only after all slabs are allocated. - // - return (allocator_->allSlabsAllocated()) || - (allocator_->getAdvisedMemorySize() != 0) - ? filterCompactCachePools(allocator_->getPoolsOverLimit()) + return (allocator_[currentTier()]->allSlabsAllocated()) || + (allocator_[currentTier()]->getAdvisedMemorySize() != 0) + ? filterCompactCachePools(allocator_[currentTier()]->getPoolsOverLimit()) : std::set{}; } @@ -2252,9 +2599,19 @@ const std::string CacheAllocator::getCacheName() const { return config_.cacheName; } +template +size_t CacheAllocator::getPoolSize(PoolId poolId) const { + size_t poolSize = 0; + for (auto& allocator: allocator_) { + const auto& pool = allocator->getPool(poolId); + poolSize += pool.getPoolSize(); + } + return poolSize; +} + template PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { - const auto& pool = allocator_->getPool(poolId); + const auto& pool = allocator_[currentTier()]->getPool(poolId); const auto& allocSizes = pool.getAllocSizes(); auto mpStats = pool.getStats(); const auto& classIds = mpStats.classIds; @@ -2281,14 +2638,14 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { (*stats_.fragmentationSize)[poolId][cid].get(), classHits, (*stats_.chainedItemEvictions)[poolId][cid].get(), (*stats_.regularItemEvictions)[poolId][cid].get(), - getMMContainerStat(poolId, cid)}}); + getMMContainerStat(currentTier(), poolId, cid)}}); totalHits += classHits; } } PoolStats ret; ret.isCompactCache = isCompactCache; - ret.poolName = allocator_->getPoolName(poolId); + ret.poolName = allocator_[currentTier()]->getPoolName(poolId); ret.poolSize = pool.getPoolSize(); ret.poolUsableSize = pool.getPoolUsableSize(); ret.poolAdvisedSize = pool.getPoolAdvisedSize(); @@ -2300,29 +2657,66 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { return ret; } +template +double CacheAllocator::slabsApproxFreePercentage(TierId tid) const +{ + return allocator_[tid]->approxFreeSlabsPercentage(); +} + +template +AllocationClassBaseStat CacheAllocator::getAllocationClassStats( + TierId tid, PoolId pid, ClassId cid) const { + const auto &ac = allocator_[tid]->getPool(pid).getAllocationClass(cid); + + AllocationClassBaseStat stats{}; + stats.allocSize = ac.getAllocSize(); + stats.memorySize = ac.getNumSlabs() * Slab::kSize; + + if (slabsApproxFreePercentage(tid) > 0.0) { + auto totalMemory = MemoryAllocator::getMemorySize(memoryTierSize(tid)); + auto freeMemory = static_cast(totalMemory) * slabsApproxFreePercentage(tid) / 100.0; + + // amount of free memory which has the same ratio to entire free memory as + // this allocation class memory size has to used memory + auto scaledFreeMemory = static_cast(freeMemory * stats.memorySize / totalMemory); + + auto acAllocatedMemory = (100.0 - ac.approxFreePercentage()) / 100.0 * ac.getNumSlabs() * Slab::kSize; + auto acMaxAvailableMemory = ac.getNumSlabs() * Slab::kSize + scaledFreeMemory; + + if (acMaxAvailableMemory == 0) { + stats.approxFreePercent = 100.0; + } else { + stats.approxFreePercent = 100.0 - 100.0 * acAllocatedMemory / acMaxAvailableMemory; + } + } else { + stats.approxFreePercent = ac.approxFreePercentage(); + } + stats.allocLatencyNs = (*stats_.classAllocLatency)[tid][pid][cid]; + + return stats; +} + template PoolEvictionAgeStats CacheAllocator::getPoolEvictionAgeStats( PoolId pid, unsigned int slabProjectionLength) const { PoolEvictionAgeStats stats; - - const auto& pool = allocator_->getPool(pid); + const auto& pool = allocator_[currentTier()]->getPool(pid); const auto& allocSizes = pool.getAllocSizes(); for (ClassId cid = 0; cid < static_cast(allocSizes.size()); ++cid) { - auto& mmContainer = getMMContainer(pid, cid); + auto& mmContainer = getMMContainer(currentTier(), pid, cid); const auto numItemsPerSlab = - allocator_->getPool(pid).getAllocationClass(cid).getAllocsPerSlab(); + allocator_[currentTier()]->getPool(pid).getAllocationClass(cid).getAllocsPerSlab(); const auto projectionLength = numItemsPerSlab * slabProjectionLength; stats.classEvictionAgeStats[cid] = mmContainer.getEvictionAgeStat(projectionLength); } - return stats; } template CacheMetadata CacheAllocator::getCacheMetadata() const noexcept { return CacheMetadata{kCachelibVersion, kCacheRamFormatVersion, - kCacheNvmFormatVersion, config_.size}; + kCacheNvmFormatVersion, config_.getCacheSize()}; } template @@ -2354,7 +2748,7 @@ void CacheAllocator::releaseSlab(PoolId pid, } try { - auto releaseContext = allocator_->startSlabRelease( + auto releaseContext = allocator_[currentTier()]->startSlabRelease( pid, victim, receiver, mode, hint, [this]() -> bool { return shutDownInProgress_; }); @@ -2363,15 +2757,15 @@ void CacheAllocator::releaseSlab(PoolId pid, return; } - releaseSlabImpl(releaseContext); - if (!allocator_->allAllocsFreed(releaseContext)) { + releaseSlabImpl(currentTier(), releaseContext); + if (!allocator_[currentTier()]->allAllocsFreed(releaseContext)) { throw std::runtime_error( folly::sformat("Was not able to free all allocs. PoolId: {}, AC: {}", releaseContext.getPoolId(), releaseContext.getClassId())); } - allocator_->completeSlabRelease(releaseContext); + allocator_[currentTier()]->completeSlabRelease(releaseContext); } catch (const exception::SlabReleaseAborted& e) { stats_.numAbortedSlabReleases.inc(); throw exception::SlabReleaseAborted(folly::sformat( @@ -2382,8 +2776,7 @@ void CacheAllocator::releaseSlab(PoolId pid, } template -SlabReleaseStats CacheAllocator::getSlabReleaseStats() - const noexcept { +SlabReleaseStats CacheAllocator::getSlabReleaseStats() const noexcept { std::lock_guard l(workersMutex_); return SlabReleaseStats{stats_.numActiveSlabReleases.get(), stats_.numReleasedForRebalance.get(), @@ -2401,7 +2794,7 @@ SlabReleaseStats CacheAllocator::getSlabReleaseStats() } template -void CacheAllocator::releaseSlabImpl( +void CacheAllocator::releaseSlabImpl(TierId tid, const SlabReleaseContext& releaseContext) { auto startTime = std::chrono::milliseconds(util::getCurrentTimeMs()); bool releaseStuck = false; @@ -2446,7 +2839,7 @@ void CacheAllocator::releaseSlabImpl( if (!isMoved) { evictForSlabRelease(releaseContext, item, throttler); } - XDCHECK(allocator_->isAllocFreed(releaseContext, alloc)); + XDCHECK(allocator_[tid]->isAllocFreed(releaseContext, alloc)); } } @@ -2526,8 +2919,11 @@ bool CacheAllocator::moveForSlabRelease( ctx.getPoolId(), ctx.getClassId()); }); } - const auto allocInfo = allocator_->getAllocInfo(oldItem.getMemory()); - allocator_->free(&oldItem); + + auto tid = getTierId(oldItem); + + const auto allocInfo = allocator_[tid]->getAllocInfo(oldItem.getMemory()); + allocator_[tid]->free(&oldItem); (*stats_.fragmentationSize)[allocInfo.poolId][allocInfo.classId].sub( util::getFragmentation(*this, oldItem)); @@ -2589,15 +2985,16 @@ CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { } const auto allocInfo = - allocator_->getAllocInfo(static_cast(&oldItem)); + allocator_[getTierId(oldItem)]->getAllocInfo(static_cast(&oldItem)); // Set up the destination for the move. Since oldItem would have the moving // bit set, it won't be picked for eviction. - auto newItemHdl = allocateInternal(allocInfo.poolId, - oldItem.getKey(), - oldItem.getSize(), - oldItem.getCreationTime(), - oldItem.getExpiryTime()); + auto newItemHdl = allocateInternalTier(getTierId(oldItem), + allocInfo.poolId, + oldItem.getKey(), + oldItem.getSize(), + oldItem.getCreationTime(), + oldItem.getExpiryTime()); if (!newItemHdl) { return {}; } @@ -2672,13 +3069,13 @@ void CacheAllocator::evictForSlabRelease( auto owningHandle = item.isChainedItem() ? evictChainedItemForSlabRelease(item.asChainedItem()) - : evictNormalItemForSlabRelease(item); + : evictNormalItem(item); // we managed to evict the corresponding owner of the item and have the // last handle for the owner. if (owningHandle) { const auto allocInfo = - allocator_->getAllocInfo(static_cast(&item)); + allocator_[getTierId(item)]->getAllocInfo(static_cast(&item)); if (owningHandle->hasChainedItem()) { (*stats_.chainedItemEvictions)[allocInfo.poolId][allocInfo.classId] .inc(); @@ -2705,7 +3102,7 @@ void CacheAllocator::evictForSlabRelease( if (shutDownInProgress_) { item.unmarkMoving(); - allocator_->abortSlabRelease(ctx); + allocator_[getTierId(item)]->abortSlabRelease(ctx); throw exception::SlabReleaseAborted( folly::sformat("Slab Release aborted while trying to evict" " Item: {} Pool: {}, Class: {}.", @@ -2729,32 +3126,37 @@ void CacheAllocator::evictForSlabRelease( template typename CacheAllocator::WriteHandle -CacheAllocator::evictNormalItemForSlabRelease(Item& item) { +CacheAllocator::evictNormalItem(Item& item, + bool skipIfTokenInvalid) { XDCHECK(item.isMoving()); if (item.isOnlyMoving()) { return WriteHandle{}; } + auto evictHandle = tryEvictToNextMemoryTier(item); + if(evictHandle) return evictHandle; + auto predicate = [](const Item& it) { return it.getRefCount() == 0; }; const bool evictToNvmCache = shouldWriteToNvmCache(item); auto token = evictToNvmCache ? nvmCache_->createPutToken(item.getKey()) : typename NvmCacheT::PutToken{}; + if (skipIfTokenInvalid && evictToNvmCache && !token.isValid()) { + stats_.evictFailConcurrentFill.inc(); + return WriteHandle{}; + } + // We remove the item from both access and mm containers. It doesn't matter // if someone else calls remove on the item at this moment, the item cannot // be freed as long as we have the moving bit set. - auto handle = accessContainer_->removeIf(item, std::move(predicate)); - + auto handle = removeIf(item, std::move(predicate)); if (!handle) { return handle; } - XDCHECK_EQ(reinterpret_cast(handle.get()), - reinterpret_cast(&item)); XDCHECK_EQ(1u, handle->getRefCount()); - removeFromMMContainer(item); // now that we are the only handle and we actually removed something from // the RAM cache, we enqueue it to nvmcache. @@ -2866,6 +3268,21 @@ CacheAllocator::evictChainedItemForSlabRelease(ChainedItem& child) { return parentHandle; } +template +template +typename CacheAllocator::WriteHandle +CacheAllocator::removeIf(Item& item, Fn&& predicate) { + auto handle = accessContainer_->removeIf(item, std::forward(predicate)); + + if (handle) { + XDCHECK_EQ(reinterpret_cast(handle.get()), + reinterpret_cast(&item)); + removeFromMMContainer(item); + } + + return handle; +} + template bool CacheAllocator::removeIfExpired(const ReadHandle& handle) { if (!handle) { @@ -2874,14 +3291,7 @@ bool CacheAllocator::removeIfExpired(const ReadHandle& handle) { // We remove the item from both access and mm containers. // We want to make sure the caller is the only one holding the handle. - auto removedHandle = - accessContainer_->removeIf(*(handle.getInternal()), itemExpiryPredicate); - if (removedHandle) { - removeFromMMContainer(*(handle.getInternal())); - return true; - } - - return false; + return (bool)removeIf(*(handle.getInternal()), itemExpiryPredicate); } template @@ -2900,6 +3310,7 @@ bool CacheAllocator::markMovingForSlabRelease( // At first, we assume this item was already freed bool itemFreed = true; bool markedMoving = false; + TierId tid = getTierId(alloc); const auto fn = [&markedMoving, &itemFreed](void* memory) { // Since this callback is executed, the item is not yet freed itemFreed = false; @@ -2911,7 +3322,7 @@ bool CacheAllocator::markMovingForSlabRelease( auto startTime = util::getCurrentTimeSec(); while (true) { - allocator_->processAllocForRelease(ctx, alloc, fn); + allocator_[tid]->processAllocForRelease(ctx, alloc, fn); // If item is already freed we give up trying to mark the item moving // and return false, otherwise if marked as moving, we return true. @@ -2927,7 +3338,7 @@ bool CacheAllocator::markMovingForSlabRelease( if (shutDownInProgress_) { XDCHECK(!static_cast(alloc)->isMoving()); - allocator_->abortSlabRelease(ctx); + allocator_[tid]->abortSlabRelease(ctx); throw exception::SlabReleaseAborted( folly::sformat("Slab Release aborted while still trying to mark" " as moving for Item: {}. Pool: {}, Class: {}.", @@ -2950,12 +3361,15 @@ template CCacheT* CacheAllocator::addCompactCache(folly::StringPiece name, size_t size, Args&&... args) { + if (getNumTiers() != 1) + throw std::runtime_error("TODO: compact cache for multi-tier Cache not supported."); + if (!config_.isCompactCacheEnabled()) { throw std::logic_error("Compact cache is not enabled"); } folly::SharedMutex::WriteHolder lock(compactCachePoolsLock_); - auto poolId = allocator_->addPool(name, size, {Slab::kSize}); + auto poolId = allocator_[0]->addPool(name, size, {Slab::kSize}); isCompactCachePool_[poolId] = true; auto ptr = std::make_unique( @@ -3064,12 +3478,15 @@ folly::IOBufQueue CacheAllocator::saveStateToIOBuf() { *metadata_.numChainedChildItems() = stats_.numChainedChildItems.get(); *metadata_.numAbortedSlabReleases() = stats_.numAbortedSlabReleases.get(); + // TODO: implement serialization for multiple tiers auto serializeMMContainers = [](MMContainers& mmContainers) { MMSerializationTypeContainer state; - for (unsigned int i = 0; i < mmContainers.size(); ++i) { + for (unsigned int i = 0; i < 1 /* TODO: */ ; ++i) { for (unsigned int j = 0; j < mmContainers[i].size(); ++j) { - if (mmContainers[i][j]) { - state.pools_ref()[i][j] = mmContainers[i][j]->saveState(); + for (unsigned int k = 0; k < mmContainers[i][j].size(); ++k) { + if (mmContainers[i][j][k]) { + state.pools_ref()[j][k] = mmContainers[i][j][k]->saveState(); + } } } } @@ -3079,7 +3496,8 @@ folly::IOBufQueue CacheAllocator::saveStateToIOBuf() { serializeMMContainers(mmContainers_); AccessSerializationType accessContainerState = accessContainer_->saveState(); - MemoryAllocator::SerializationType allocatorState = allocator_->saveState(); + // TODO: foreach allocator + MemoryAllocator::SerializationType allocatorState = allocator_[0]->saveState(); CCacheManager::SerializationType ccState = compactCacheManager_->saveState(); AccessSerializationType chainedItemAccessContainerState = @@ -3141,6 +3559,8 @@ CacheAllocator::shutDown() { (shmShutDownStatus == ShmShutDownRes::kSuccess); shmManager_.reset(); + // TODO: save per-tier state + if (shmShutDownSucceeded) { if (!nvmShutDownStatusOpt || *nvmShutDownStatusOpt) return ShutDownStatus::kSuccess; @@ -3189,8 +3609,11 @@ void CacheAllocator::saveRamCache() { std::unique_ptr ioBuf = serializedBuf.move(); ioBuf->coalesce(); - void* infoAddr = - shmManager_->createShm(detail::kShmInfoName, ioBuf->length()).addr; + ShmSegmentOpts opts; + opts.typeOpts = PosixSysVSegmentOpts(config_.isUsingPosixShm()); + + void* infoAddr = shmManager_->createShm(detail::kShmInfoName, ioBuf->length(), + nullptr, opts).addr; Serializer serializer(reinterpret_cast(infoAddr), reinterpret_cast(infoAddr) + ioBuf->length()); serializer.writeToBuffer(std::move(ioBuf)); @@ -3204,22 +3627,26 @@ CacheAllocator::deserializeMMContainers( const auto container = deserializer.deserialize(); - MMContainers mmContainers; + /* TODO: right now, we create empty containers because deserialization + * only works for a single (topmost) tier. */ + MMContainers mmContainers{getNumTiers()}; for (auto& kvPool : *container.pools_ref()) { auto i = static_cast(kvPool.first); auto& pool = getPool(i); for (auto& kv : kvPool.second) { auto j = static_cast(kv.first); - MMContainerPtr ptr = - std::make_unique(kv.second, - compressor); - auto config = ptr->getConfig(); - config.addExtraConfig(config_.trackTailHits - ? pool.getAllocationClass(j).getAllocsPerSlab() - : 0); - ptr->setConfig(config); - mmContainers[i][j] = std::move(ptr); + for (TierId tid = 0; tid < getNumTiers(); tid++) { + MMContainerPtr ptr = + std::make_unique(kv.second, + compressor); + auto config = ptr->getConfig(); + config.addExtraConfig(config_.trackTailHits + ? pool.getAllocationClass(j).getAllocsPerSlab() + : 0); + ptr->setConfig(config); + mmContainers[tid][i][j] = std::move(ptr); + } } } // We need to drop the unevictableMMContainer in the desierializer. @@ -3370,10 +3797,13 @@ GlobalCacheStats CacheAllocator::getGlobalCacheStats() const { template CacheMemoryStats CacheAllocator::getCacheMemoryStats() const { - const auto totalCacheSize = allocator_->getMemorySize(); + size_t totalCacheSize = 0; + for(auto& allocator: allocator_) { + totalCacheSize += allocator->getMemorySize(); + } auto addSize = [this](size_t a, PoolId pid) { - return a + allocator_->getPool(pid).getPoolSize(); + return a + allocator_[currentTier()]->getPool(pid).getPoolSize(); }; const auto regularPoolIds = getRegularPoolIds(); const auto ccCachePoolIds = getCCachePoolIds(); @@ -3382,15 +3812,20 @@ CacheMemoryStats CacheAllocator::getCacheMemoryStats() const { size_t compactCacheSize = std::accumulate( ccCachePoolIds.begin(), ccCachePoolIds.end(), 0ULL, addSize); + std::vector slabsApproxFreePercentages; + for (TierId tid = 0; tid < getNumTiers(); ++tid) + slabsApproxFreePercentages.push_back(slabsApproxFreePercentage(tid)); + return CacheMemoryStats{totalCacheSize, regularCacheSize, compactCacheSize, - allocator_->getAdvisedMemorySize(), + allocator_[currentTier()]->getAdvisedMemorySize(), memMonitor_ ? memMonitor_->getMaxAdvisePct() : 0, - allocator_->getUnreservedMemorySize(), + allocator_[currentTier()]->getUnreservedMemorySize(), nvmCache_ ? nvmCache_->getSize() : 0, util::getMemAvailable(), - util::getRSSBytes()}; + util::getRSSBytes(), + slabsApproxFreePercentages}; } template @@ -3529,12 +3964,14 @@ bool CacheAllocator::stopReaper(std::chrono::seconds timeout) { template bool CacheAllocator::cleanupStrayShmSegments( - const std::string& cacheDir, bool posix) { + const std::string& cacheDir, bool posix /*TODO(SHM_FILE): const std::vector& config */) { if (util::getStatIfExists(cacheDir, nullptr) && util::isDir(cacheDir)) { try { // cache dir exists. clean up only if there are no other processes // attached. if another process was attached, the following would fail. ShmManager::cleanup(cacheDir, posix); + + // TODO: cleanup per-tier state } catch (const std::exception& e) { XLOGF(ERR, "Error cleaning up {}. Exception: ", cacheDir, e.what()); return false; @@ -3544,10 +3981,17 @@ bool CacheAllocator::cleanupStrayShmSegments( // Any other concurrent process can not be attached to the segments or // even if it does, we want to mark it for destruction. ShmManager::removeByName(cacheDir, detail::kShmInfoName, posix); - ShmManager::removeByName(cacheDir, detail::kShmCacheName, posix); + ShmManager::removeByName(cacheDir, detail::kShmCacheName + + std::to_string(0 /* TODO: per tier */), posix); ShmManager::removeByName(cacheDir, detail::kShmHashTableName, posix); ShmManager::removeByName(cacheDir, detail::kShmChainedItemHashTableName, posix); + + // TODO(SHM_FILE): try to nuke segments of differente types (which require + // extra info) + // for (auto &tier : config) { + // ShmManager::removeByName(cacheDir, tierShmName, config_.memoryTiers[i].opts); + // } } return true; } @@ -3558,14 +4002,16 @@ uint64_t CacheAllocator::getItemPtrAsOffset(const void* ptr) { // the two differ (e.g. Mac OS 12) - causing templating instantiation // errors downstream. + auto tid = getTierId(ptr); + // if this succeeeds, the address is valid within the cache. - allocator_->getAllocInfo(ptr); + allocator_[tid]->getAllocInfo(ptr); if (!isOnShm_ || !shmManager_) { throw std::invalid_argument("Shared memory not used"); } - const auto& shm = shmManager_->getShmByName(detail::kShmCacheName); + const auto& shm = shmManager_->getShmByName(detail::kShmCacheName + std::to_string(tid)); return reinterpret_cast(ptr) - reinterpret_cast(shm.getCurrentMapping().addr); diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index 0d8eb8fb1b..fab13f5feb 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include @@ -753,7 +755,7 @@ class CacheAllocator : public CacheBase { // @param config new config for the pool // // @throw std::invalid_argument if the poolId is invalid - void overridePoolConfig(PoolId pid, const MMConfig& config); + void overridePoolConfig(TierId tid, PoolId pid, const MMConfig& config); // update an existing pool's rebalance strategy // @@ -794,8 +796,9 @@ class CacheAllocator : public CacheBase { // @return true if the operation succeeded. false if the size of the pool is // smaller than _bytes_ // @throw std::invalid_argument if the poolId is invalid. + // TODO: should call shrinkPool for specific tier? bool shrinkPool(PoolId pid, size_t bytes) { - return allocator_->shrinkPool(pid, bytes); + return allocator_[currentTier()]->shrinkPool(pid, bytes); } // grow an existing pool by _bytes_. This will fail if there is no @@ -804,8 +807,9 @@ class CacheAllocator : public CacheBase { // @return true if the pool was grown. false if the necessary number of // bytes were not available. // @throw std::invalid_argument if the poolId is invalid. + // TODO: should call growPool for specific tier? bool growPool(PoolId pid, size_t bytes) { - return allocator_->growPool(pid, bytes); + return allocator_[currentTier()]->growPool(pid, bytes); } // move bytes from one pool to another. The source pool should be at least @@ -818,7 +822,7 @@ class CacheAllocator : public CacheBase { // correct size to do the transfer. // @throw std::invalid_argument if src or dest is invalid pool bool resizePools(PoolId src, PoolId dest, size_t bytes) override { - return allocator_->resizePools(src, dest, bytes); + return allocator_[currentTier()]->resizePools(src, dest, bytes); } // Add a new compact cache with given name and size @@ -1023,12 +1027,13 @@ class CacheAllocator : public CacheBase { // @throw std::invalid_argument if the memory does not belong to this // cache allocator AllocInfo getAllocInfo(const void* memory) const { - return allocator_->getAllocInfo(memory); + return allocator_[getTierId(memory)]->getAllocInfo(memory); } // return the ids for the set of existing pools in this cache. std::set getPoolIds() const override final { - return allocator_->getPoolIds(); + // all tiers have the same pool ids. TODO: deduplicate + return allocator_[0]->getPoolIds(); } // return a list of pool ids that are backing compact caches. This includes @@ -1040,18 +1045,18 @@ class CacheAllocator : public CacheBase { // return the pool with speicified id. const MemoryPool& getPool(PoolId pid) const override final { - return allocator_->getPool(pid); + return allocator_[currentTier()]->getPool(pid); } // calculate the number of slabs to be advised/reclaimed in each pool PoolAdviseReclaimData calcNumSlabsToAdviseReclaim() override final { auto regularPoolIds = getRegularPoolIds(); - return allocator_->calcNumSlabsToAdviseReclaim(regularPoolIds); + return allocator_[currentTier()]->calcNumSlabsToAdviseReclaim(regularPoolIds); } // update number of slabs to advise in the cache void updateNumSlabsToAdvise(int32_t numSlabsToAdvise) override final { - allocator_->updateNumSlabsToAdvise(numSlabsToAdvise); + allocator_[currentTier()]->updateNumSlabsToAdvise(numSlabsToAdvise); } // returns a valid PoolId corresponding to the name or kInvalidPoolId if the @@ -1060,7 +1065,8 @@ class CacheAllocator : public CacheBase { // returns the pool's name by its poolId. std::string getPoolName(PoolId poolId) const { - return allocator_->getPoolName(poolId); + // all tiers have the same pool names. + return allocator_[0]->getPoolName(poolId); } // get stats related to all kinds of slab release events. @@ -1106,6 +1112,9 @@ class CacheAllocator : public CacheBase { // whether it is object-cache bool isObjectCache() const override final { return false; } + // combined pool size for all memory tiers + size_t getPoolSize(PoolId pid) const; + // pool stats by pool id PoolStats getPoolStats(PoolId pid) const override final; @@ -1122,6 +1131,10 @@ class CacheAllocator : public CacheBase { // return cache's memory usage stats CacheMemoryStats getCacheMemoryStats() const override final; + // return basic stats for Allocation Class + AllocationClassBaseStat getAllocationClassStats(TierId tid, PoolId pid, ClassId cid) + const override final; + // return the nvm cache stats map std::unordered_map getNvmCacheStatsMap() const override final; @@ -1220,7 +1233,8 @@ class CacheAllocator : public CacheBase { // returns true if there was no error in trying to cleanup the segment // because another process was attached. False if the user tried to clean up // and the cache was actually attached. - static bool cleanupStrayShmSegments(const std::string& cacheDir, bool posix); + static bool cleanupStrayShmSegments(const std::string& cacheDir, bool posix + /*TODO: const std::vector& config = {} */); // gives a relative offset to a pointer within the cache. uint64_t getItemPtrAsOffset(const void* ptr); @@ -1232,7 +1246,8 @@ class CacheAllocator : public CacheBase { sizeof(typename RefcountWithFlags::Value) + sizeof(uint32_t) + sizeof(uint32_t) + sizeof(KAllocation)) == sizeof(Item), "vtable overhead"); - static_assert(32 == sizeof(Item), "item overhead is 32 bytes"); + // XXX: this will fail due to CompressedPtr change + // static_assert(32 == sizeof(Item), "item overhead is 32 bytes"); // make sure there is no overhead in ChainedItem on top of a regular Item static_assert(sizeof(Item) == sizeof(ChainedItem), @@ -1255,6 +1270,8 @@ class CacheAllocator : public CacheBase { #pragma GCC diagnostic pop private: + double slabsApproxFreePercentage(TierId tid) const; + // wrapper around Item's refcount and active handle tracking FOLLY_ALWAYS_INLINE void incRef(Item& it); FOLLY_ALWAYS_INLINE RefcountWithFlags::Value decRef(Item& it); @@ -1270,6 +1287,11 @@ class CacheAllocator : public CacheBase { // @param types the type of the memory used // @param config the configuration for the whole cache allocator CacheAllocator(InitMemType types, Config config); + + TierId getTargetTierForItem(PoolId pid, typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime); // This is the last step in item release. We also use this for the eviction // scenario where we have to do everything, but not release the allocation @@ -1326,11 +1348,14 @@ class CacheAllocator : public CacheBase { using MMContainerPtr = std::unique_ptr; using MMContainers = - std::array, - MemoryPoolManager::kMaxPools>; + std::vector, + MemoryPoolManager::kMaxPools>>; void createMMContainers(const PoolId pid, MMConfig config); + TierId getTierId(const Item& item) const; + TierId getTierId(const void* ptr) const; + // acquire the MMContainer corresponding to the the Item's class and pool. // // @return pointer to the MMContainer. @@ -1338,12 +1363,12 @@ class CacheAllocator : public CacheBase { // allocation from the memory allocator. MMContainer& getMMContainer(const Item& item) const noexcept; - MMContainer& getMMContainer(PoolId pid, ClassId cid) const noexcept; + MMContainer& getMMContainer(TierId tid, PoolId pid, ClassId cid) const noexcept; // Get stats of the specified pid and cid. // If such mmcontainer is not valid (pool id or cid out of bound) // or the mmcontainer is not initialized, return an empty stat. - MMContainerStat getMMContainerStat(PoolId pid, ClassId cid) const noexcept; + MMContainerStat getMMContainerStat(TierId tid, PoolId pid, ClassId cid) const noexcept; // create a new cache allocation. The allocation can be initialized // appropriately and made accessible through insert or insertOrReplace. @@ -1375,6 +1400,17 @@ class CacheAllocator : public CacheBase { uint32_t creationTime, uint32_t expiryTime); + // create a new cache allocation on specific memory tier. + // For description see allocateInternal. + // + // @param tid id a memory tier + WriteHandle allocateInternalTier(TierId tid, + PoolId id, + Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime); + // Allocate a chained item // // The resulting chained item does not have a parent item and @@ -1460,6 +1496,16 @@ class CacheAllocator : public CacheBase { // not exist. FOLLY_ALWAYS_INLINE WriteHandle findFastImpl(Key key, AccessMode mode); + // Moves a regular item to a different memory tier. + // + // @param oldItem Reference to the item being moved + // @param newItemHdl Reference to the handle of the new item being moved into + // + // @return true If the move was completed, and the containers were updated + // successfully. + template + WriteHandle moveRegularItemWithSync(Item& oldItem, WriteHandle& newItemHdl, P&& predicate); + // Moves a regular item to a different slab. This should only be used during // slab release after the item's moving bit has been set. The user supplied // callback is responsible for copying the contents and fixing the semantics @@ -1545,6 +1591,10 @@ class CacheAllocator : public CacheBase { // false if the item is not in MMContainer bool removeFromMMContainer(Item& item); + using EvictionIterator = typename MMContainer::Iterator; + + WriteHandle acquire(EvictionIterator& it) { return acquire(it.get()); } + // Replaces an item in the MMContainer with another item, at the same // position. // @@ -1555,6 +1605,8 @@ class CacheAllocator : public CacheBase { // destination item did not exist in the container, or if the // source item already existed. bool replaceInMMContainer(Item& oldItem, Item& newItem); + bool replaceInMMContainer(Item* oldItem, Item& newItem); + bool replaceInMMContainer(EvictionIterator& oldItemIt, Item& newItem); // Replaces an item in the MMContainer with another item, at the same // position. Or, if the two chained items belong to two different MM @@ -1611,28 +1663,30 @@ class CacheAllocator : public CacheBase { // @param pid the id of the pool to look for evictions inside // @param cid the id of the class to look for evictions inside // @return An evicted item or nullptr if there is no suitable candidate. - Item* findEviction(PoolId pid, ClassId cid); - - using EvictionIterator = typename MMContainer::Iterator; + Item* findEviction(TierId tid, PoolId pid, ClassId cid); - // Advance the current iterator and try to evict a regular item + // Try to move the item down to the next memory tier // - // @param mmContainer the container to look for evictions. - // @param itr iterator holding the item + // @param tid current tier ID of the item + // @param pid the pool ID the item belong to. + // @param item the item to evict // - // @return valid handle to regular item on success. This will be the last - // handle to the item. On failure an empty handle. - WriteHandle advanceIteratorAndTryEvictRegularItem(MMContainer& mmContainer, - EvictionIterator& itr); + // @return valid handle to the item. This will be the last + // handle to the item. On failure an empty handle. + WriteHandle tryEvictToNextMemoryTier(TierId tid, PoolId pid, Item& item); - // Advance the current iterator and try to evict a chained item - // Iterator may also be reset during the course of this function + // Try to move the item down to the next memory tier // - // @param itr iterator holding the item + // @param item the item to evict // - // @return valid handle to the parent item on success. This will be the last - // handle to the item - WriteHandle advanceIteratorAndTryEvictChainedItem(EvictionIterator& itr); + // @return valid handle to the item. This will be the last + // handle to the item. On failure an empty handle. + WriteHandle tryEvictToNextMemoryTier(Item& item); + + bool shouldEvictToNextMemoryTier(TierId sourceTierId, + TierId targetTierId, PoolId pid, Item& item); + + size_t memoryTierSize(TierId tid) const; // Deserializer CacheAllocatorMetadata and verify the version // @@ -1646,7 +1700,7 @@ class CacheAllocator : public CacheBase { const typename Item::PtrCompressor& compressor); unsigned int reclaimSlabs(PoolId id, size_t numSlabs) final { - return allocator_->reclaimSlabsAndGrow(id, numSlabs); + return allocator_[currentTier()]->reclaimSlabsAndGrow(id, numSlabs); } FOLLY_ALWAYS_INLINE EventTracker* getEventTracker() const { @@ -1705,7 +1759,7 @@ class CacheAllocator : public CacheBase { const void* hint = nullptr) final; // @param releaseContext slab release context - void releaseSlabImpl(const SlabReleaseContext& releaseContext); + void releaseSlabImpl(TierId tid, const SlabReleaseContext& releaseContext); // @return true when successfully marked as moving, // fasle when this item has already been freed @@ -1751,7 +1805,7 @@ class CacheAllocator : public CacheBase { // // @return last handle for corresponding to item on success. empty handle on // failure. caller can retry if needed. - WriteHandle evictNormalItemForSlabRelease(Item& item); + WriteHandle evictNormalItem(Item& item, bool skipIfTokenInvalid = false); // Helper function to evict a child item for slab release // As a side effect, the parent item is also evicted @@ -1760,6 +1814,12 @@ class CacheAllocator : public CacheBase { // handle on failure. caller can retry. WriteHandle evictChainedItemForSlabRelease(ChainedItem& item); + // Helper function to remove a item if predicates is true. + // + // @return last handle to the item on success. empty handle on failure. + template + WriteHandle removeIf(Item& item, Fn&& predicate); + // Helper function to remove a item if expired. // // @return true if it item expire and removed successfully. @@ -1777,7 +1837,7 @@ class CacheAllocator : public CacheBase { // primitives. So we consciously exempt ourselves here from TSAN data race // detection. folly::annotate_ignore_thread_sanitizer_guard g(__FILE__, __LINE__); - auto slabsSkipped = allocator_->forEachAllocation(std::forward(f)); + auto slabsSkipped = allocator_[currentTier()]->forEachAllocation(std::forward(f)); stats().numSkippedSlabReleases.add(slabsSkipped); } @@ -1821,9 +1881,11 @@ class CacheAllocator : public CacheBase { std::unique_ptr& worker, std::chrono::seconds timeout = std::chrono::seconds{0}); - std::unique_ptr createNewMemoryAllocator(); - std::unique_ptr restoreMemoryAllocator(); - std::unique_ptr restoreCCacheManager(); + ShmSegmentOpts createShmCacheOpts(TierId tid); + + std::unique_ptr createNewMemoryAllocator(TierId tid); + std::unique_ptr restoreMemoryAllocator(TierId tid); + std::unique_ptr restoreCCacheManager(TierId tid); PoolIds filterCompactCachePools(const PoolIds& poolIds) const; @@ -1843,7 +1905,7 @@ class CacheAllocator : public CacheBase { } typename Item::PtrCompressor createPtrCompressor() const { - return allocator_->createPtrCompressor(); + return typename Item::PtrCompressor(allocator_); } // helper utility to throttle and optionally log. @@ -1866,16 +1928,22 @@ class CacheAllocator : public CacheBase { // @param type the type of initialization // @return nullptr if the type is invalid - // @return pointer to memory allocator + // @return vector of pointers to memory allocator // @throw std::runtime_error if type is invalid - std::unique_ptr initAllocator(InitMemType type); + std::vector> initAllocator(InitMemType type); + + std::vector> createPrivateAllocator(); + std::vector> createAllocators(); + std::vector> restoreAllocators(); + // @param type the type of initialization // @return nullptr if the type is invalid // @return pointer to access container // @throw std::runtime_error if type is invalid std::unique_ptr initAccessContainer(InitMemType type, const std::string name, - AccessConfig config); + AccessConfig config, + bool usePosixShm); std::optional saveNvmCache(); void saveRamCache(); @@ -1884,10 +1952,6 @@ class CacheAllocator : public CacheBase { return item.getRefCount() == 0; } - static bool itemEvictionPredicate(const Item& item) { - return item.getRefCount() == 0 && !item.isMoving(); - } - static bool itemExpiryPredicate(const Item& item) { return item.getRefCount() == 1 && item.isExpired(); } @@ -1934,6 +1998,95 @@ class CacheAllocator : public CacheBase { // BEGIN private members + TierId currentTier() const { + // TODO: every function which calls this method should be refactored. + // We should go case by case and either make such function work on + // all tiers or expose separate parameter to describe the tier ID. + return 0; + } + + unsigned getNumTiers() const { + return memoryTierConfigs.size(); + } + + bool addWaitContextForMovingItem( + folly::StringPiece key, std::shared_ptr> waiter); + + class MoveCtx { + public: + MoveCtx() {} + + ~MoveCtx() { + // prevent any further enqueue to waiters + // Note: we don't need to hold locks since no one can enqueue + // after this point. + wakeUpWaiters(); + } + + // record the item handle. Upon destruction we will wake up the waiters + // and pass a clone of the handle to the callBack. By default we pass + // a null handle + void setItemHandle(WriteHandle _it) { it = std::move(_it); } + + // enqueue a waiter into the waiter list + // @param waiter WaitContext + void addWaiter(std::shared_ptr> waiter) { + XDCHECK(waiter); + waiters.push_back(std::move(waiter)); + } + + private: + // notify all pending waiters that are waiting for the fetch. + void wakeUpWaiters() { + bool refcountOverflowed = false; + for (auto& w : waiters) { + // If refcount overflowed earlier, then we will return miss to + // all subsequent waitors. + if (refcountOverflowed) { + w->set(WriteHandle{}); + continue; + } + + try { + w->set(it.clone()); + } catch (const exception::RefcountOverflow&) { + // We'll return a miss to the user's pending read, + // so we should enqueue a delete via NvmCache. + // TODO: cache.remove(it); + refcountOverflowed = true; + } + } + } + + WriteHandle it; // will be set when Context is being filled + std::vector>> waiters; // list of + // waiters + }; + using MoveMap = + folly::F14ValueMap, + folly::HeterogeneousAccessHash>; + + static size_t getShardForKey(folly::StringPiece key) { + return folly::Hash()(key) % kShards; + } + + MoveMap& getMoveMapForShard(size_t shard) { + return movesMap_[shard].movesMap_; + } + + MoveMap& getMoveMap(folly::StringPiece key) { + return getMoveMapForShard(getShardForKey(key)); + } + + std::unique_lock getMoveLockForShard(size_t shard) { + return std::unique_lock(moveLock_[shard].moveLock_); + } + + std::unique_lock getMoveLock(folly::StringPiece key) { + return getMoveLockForShard(getShardForKey(key)); + } + // Whether the memory allocator for this cache allocator was created on shared // memory. The hash table, chained item hash table etc is also created on // shared memory except for temporary shared memory mode when they're created @@ -1942,6 +2095,8 @@ class CacheAllocator : public CacheBase { const Config config_{}; + const typename Config::MemoryTierConfigs memoryTierConfigs; + // Manages the temporary shared memory segment for memory allocator that // is not persisted when cache process exits. std::unique_ptr tempShm_; @@ -1959,9 +2114,10 @@ class CacheAllocator : public CacheBase { const MMConfig mmConfig_{}; // the memory allocator for allocating out of the available memory. - std::unique_ptr allocator_; + std::vector> allocator_; // compact cache allocator manager + // TODO: per tier? std::unique_ptr compactCacheManager_; // compact cache instances reside here when user "add" or "attach" compact @@ -2023,6 +2179,22 @@ class CacheAllocator : public CacheBase { // poolResizer_, poolOptimizer_, memMonitor_, reaper_ mutable std::mutex workersMutex_; + static constexpr size_t kShards = 8192; // TODO: need to define right value + + struct MovesMapShard { + alignas(folly::hardware_destructive_interference_size) MoveMap movesMap_; + }; + + struct MoveLock { + alignas(folly::hardware_destructive_interference_size) std::mutex moveLock_; + }; + + // a map of all pending moves + std::vector movesMap_; + + // a map of move locks for each shard + std::vector moveLock_; + // time when the ram cache was first created const uint32_t cacheCreationTime_{0}; diff --git a/cachelib/allocator/CacheAllocatorConfig.h b/cachelib/allocator/CacheAllocatorConfig.h index b6c0fbbc92..bef485de67 100644 --- a/cachelib/allocator/CacheAllocatorConfig.h +++ b/cachelib/allocator/CacheAllocatorConfig.h @@ -194,12 +194,14 @@ class CacheAllocatorConfig { // This allows cache to be persisted across restarts. One example use case is // to preserve the cache when releasing a new version of your service. Refer // to our user guide for how to set up cache persistence. + // TODO: get rid of baseAddr or if set make sure all mapping are adjacent? + // We can also make baseAddr a per-tier configuration CacheAllocatorConfig& enableCachePersistence(std::string directory, void* baseAddr = nullptr); - // uses posix shm segments instead of the default sys-v shm segments. - // @throw std::invalid_argument if called without enabling - // cachePersistence() + // Uses posix shm segments instead of the default sys-v shm + // segments. @throw std::invalid_argument if called without enabling + // cachePersistence(). CacheAllocatorConfig& usePosixForShm(); // Configures cache memory tiers. Each tier represents a cache region inside @@ -207,10 +209,13 @@ class CacheAllocatorConfig { // Accepts vector of MemoryTierCacheConfig. Each vector element describes // configuration for a single memory cache tier. Tier sizes are specified as // ratios, the number of parts of total cache size each tier would occupy. + // @throw std::invalid_argument if: + // - the size of configs is 0 + // - the size of configs is greater than kMaxCacheMemoryTiers CacheAllocatorConfig& configureMemoryTiers(const MemoryTierConfigs& configs); // Return reference to MemoryTierCacheConfigs. - const MemoryTierConfigs& getMemoryTierConfigs(); + const MemoryTierConfigs& getMemoryTierConfigs() const; // This turns on a background worker that periodically scans through the // access container and look for expired items and remove them. @@ -386,8 +391,7 @@ class CacheAllocatorConfig { std::map serialize() const; // The max number of memory cache tiers - // TODO: increase this number when multi-tier configs are enabled - inline static const size_t kMaxCacheMemoryTiers = 1; + inline static const size_t kMaxCacheMemoryTiers = 2; // Cache name for users to indentify their own cache. std::string cacheName{""}; @@ -599,6 +603,14 @@ class CacheAllocatorConfig { // If true, we will delay worker start until user explicitly calls // CacheAllocator::startCacheWorkers() bool delayCacheWorkersStart{false}; + bool disableEvictionToMemory{false}; + + double minAcAllocationWatermark{0.0}; //if % free in AC is <= then try to allocate in next tier + double maxAcAllocationWatermark{0.0}; //if % free in AC is >= then allocate in default target tier + double acTopTierEvictionWatermark{0.0}; // evict from 1st tier if % free is < watermark + uint64_t sizeThresholdPolicy{0}; // only allow items < threshold in top tier + double defaultTierChancePercentage{50.0}; //randomly allocate items among top and bottom tiers + uint64_t forceAllocationTier{UINT64_MAX}; //force allocations to happen in this tier friend CacheT; @@ -894,7 +906,7 @@ CacheAllocatorConfig& CacheAllocatorConfig::configureMemoryTiers( template const typename CacheAllocatorConfig::MemoryTierConfigs& -CacheAllocatorConfig::getMemoryTierConfigs() { +CacheAllocatorConfig::getMemoryTierConfigs() const { return memoryTierConfigs; } @@ -1114,7 +1126,7 @@ std::map CacheAllocatorConfig::serialize() const { configMap["size"] = std::to_string(size); configMap["cacheDir"] = cacheDir; - configMap["posixShm"] = usePosixShm ? "set" : "empty"; + configMap["posixShm"] = isUsingPosixShm() ? "set" : "empty"; configMap["defaultAllocSizes"] = ""; // Stringify std::set diff --git a/cachelib/allocator/CacheItem-inl.h b/cachelib/allocator/CacheItem-inl.h index a1c2456af5..db0bbe7ca8 100644 --- a/cachelib/allocator/CacheItem-inl.h +++ b/cachelib/allocator/CacheItem-inl.h @@ -219,8 +219,8 @@ bool CacheItem::markMoving() noexcept { } template -void CacheItem::unmarkMoving() noexcept { - ref_.unmarkMoving(); +RefcountWithFlags::Value CacheItem::unmarkMoving() noexcept { + return ref_.unmarkMoving(); } template @@ -263,6 +263,21 @@ bool CacheItem::isNvmEvicted() const noexcept { return ref_.isNvmEvicted(); } +template +void CacheItem::markIncomplete() noexcept { + ref_.markIncomplete(); +} + +template +void CacheItem::unmarkIncomplete() noexcept { + ref_.unmarkIncomplete(); +} + +template +bool CacheItem::isIncomplete() const noexcept { + return ref_.isIncomplete(); +} + template void CacheItem::markIsChainedItem() noexcept { XDCHECK(!hasChainedItem()); diff --git a/cachelib/allocator/CacheItem.h b/cachelib/allocator/CacheItem.h index 87c8b8a19e..61b374720e 100644 --- a/cachelib/allocator/CacheItem.h +++ b/cachelib/allocator/CacheItem.h @@ -141,6 +141,7 @@ class CACHELIB_PACKED_ATTR CacheItem { * to be mapped to different addresses on shared memory. */ using CompressedPtr = facebook::cachelib::CompressedPtr; + using SingleTierPtrCompressor = MemoryAllocator::SingleTierPtrCompressor; using PtrCompressor = MemoryAllocator::PtrCompressor; // Get the required size for a cache item given the size of memory @@ -238,6 +239,14 @@ class CACHELIB_PACKED_ATTR CacheItem { void unmarkNvmEvicted() noexcept; bool isNvmEvicted() const noexcept; + /** + * Marks that the item is migrating between memory tiers and + * not ready for access now. Accessing thread should wait. + */ + void markIncomplete() noexcept; + void unmarkIncomplete() noexcept; + bool isIncomplete() const noexcept; + /** * Function to set the timestamp for when to expire an item * @@ -357,7 +366,7 @@ class CACHELIB_PACKED_ATTR CacheItem { * Unmarking moving does not depend on `isInMMContainer` */ bool markMoving() noexcept; - void unmarkMoving() noexcept; + RefcountWithFlags::Value unmarkMoving() noexcept; bool isMoving() const noexcept; bool isOnlyMoving() const noexcept; diff --git a/cachelib/allocator/CacheStats.cpp b/cachelib/allocator/CacheStats.cpp index b6a2d1a030..5ce7ad9c92 100644 --- a/cachelib/allocator/CacheStats.cpp +++ b/cachelib/allocator/CacheStats.cpp @@ -44,6 +44,8 @@ void Stats::init() { initToZero(*fragmentationSize); initToZero(*chainedItemEvictions); initToZero(*regularItemEvictions); + + classAllocLatency = std::make_unique(); } template @@ -51,7 +53,7 @@ struct SizeVerify {}; void Stats::populateGlobalCacheStats(GlobalCacheStats& ret) const { #ifndef SKIP_SIZE_VERIFY - SizeVerify a = SizeVerify<16160>{}; + SizeVerify a = SizeVerify<16176>{}; std::ignore = a; #endif ret.numCacheGets = numCacheGets.get(); diff --git a/cachelib/allocator/CacheStats.h b/cachelib/allocator/CacheStats.h index 87ff44908e..edd1d8a4cb 100644 --- a/cachelib/allocator/CacheStats.h +++ b/cachelib/allocator/CacheStats.h @@ -25,6 +25,7 @@ #include "cachelib/allocator/memory/Slab.h" #include "cachelib/common/FastStats.h" #include "cachelib/common/PercentileStats.h" +#include "cachelib/common/RollingStats.h" #include "cachelib/common/Time.h" namespace facebook { @@ -95,6 +96,20 @@ struct MMContainerStat { uint64_t numTailAccesses; }; +struct AllocationClassBaseStat { + // size of allocation class + size_t allocSize{0}; + + // size of memory assigned to this allocation class + size_t memorySize{0}; + + // percent of free memory in this class + double approxFreePercent{0.0}; + + // Rolling allocation latency (in ns) + util::RollingStats allocLatencyNs; +}; + // cache related stats for a given allocation class. struct CacheStat { // allocation size for this container. @@ -545,6 +560,9 @@ struct CacheMemoryStats { // rss size of the process size_t memRssSize{0}; + + // percentage of free slabs + std::vector slabsApproxFreePercentages{0.0}; }; // Stats for compact cache diff --git a/cachelib/allocator/CacheStatsInternal.h b/cachelib/allocator/CacheStatsInternal.h index 014c8f6d42..50a70c2c22 100644 --- a/cachelib/allocator/CacheStatsInternal.h +++ b/cachelib/allocator/CacheStatsInternal.h @@ -21,6 +21,7 @@ #include "cachelib/allocator/Cache.h" #include "cachelib/allocator/memory/MemoryAllocator.h" #include "cachelib/common/AtomicCounter.h" +#include "cachelib/common/RollingStats.h" namespace facebook { namespace cachelib { @@ -226,6 +227,14 @@ struct Stats { std::unique_ptr chainedItemEvictions{}; std::unique_ptr regularItemEvictions{}; + using PerTierPoolClassRollingStats = std::array< + std::array, + MemoryPoolManager::kMaxPools>, + CacheBase::kMaxTiers>; + + // rolling latency tracking for every alloc class in every pool + std::unique_ptr classAllocLatency{}; + // Eviction failures due to parent cannot be removed from access container AtomicCounter evictFailParentAC{0}; diff --git a/cachelib/allocator/Handle.h b/cachelib/allocator/Handle.h index bd2eee39cd..ce455a0bca 100644 --- a/cachelib/allocator/Handle.h +++ b/cachelib/allocator/Handle.h @@ -402,6 +402,12 @@ struct ReadHandleImpl { } } + protected: + friend class ReadHandleImpl; + // Method used only by ReadHandleImpl ctor + void discard() { + it_.store(nullptr, std::memory_order_relaxed); + } private: // we are waiting on Item* to be set to a value. One of the valid values is // nullptr. So choose something that we dont expect to indicate a ptr @@ -481,7 +487,15 @@ struct ReadHandleImpl { // Handle which has the item already FOLLY_ALWAYS_INLINE ReadHandleImpl(Item* it, CacheT& alloc) noexcept - : alloc_(&alloc), it_(it) {} + : alloc_(&alloc), it_(it) { + if (it_ && it_->isIncomplete()) { + waitContext_ = std::make_shared(alloc); + if (!alloc_->addWaitContextForMovingItem(it->getKey(), waitContext_)) { + waitContext_->discard(); + waitContext_.reset(); + } + } + } // handle that has a wait context allocated. Used for async handles // In this case, the it_ will be filled in asynchronously and mulitple diff --git a/cachelib/allocator/MM2Q-inl.h b/cachelib/allocator/MM2Q-inl.h index 1d2482d45b..fb3be8551b 100644 --- a/cachelib/allocator/MM2Q-inl.h +++ b/cachelib/allocator/MM2Q-inl.h @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + namespace facebook { namespace cachelib { @@ -104,6 +106,10 @@ bool MM2Q::Container::recordAccess(T& node, return false; } + // TODO: % 100 is not very accurate + if (config_.markUsefulChance < 100.0 && folly::Random::rand32() % 100 >= config_.markUsefulChance) + return false; + return lruMutex_->lock_combine(func); } return false; @@ -223,15 +229,32 @@ void MM2Q::Container::rebalance() noexcept { template T::*HookPtr> bool MM2Q::Container::add(T& node) noexcept { const auto currTime = static_cast