diff --git a/cachelib/allocator/Cache.h b/cachelib/allocator/Cache.h
index eab7d01184..5d2e53fe22 100644
--- a/cachelib/allocator/Cache.h
+++ b/cachelib/allocator/Cache.h
@@ -85,6 +85,9 @@ class CacheBase {
   CacheBase(CacheBase&&) = default;
   CacheBase& operator=(CacheBase&&) = default;
 
+  // TODO: come up with some reasonable number
+  static constexpr unsigned kMaxTiers = 2;
+
   // Get a string referring to the cache name for this cache
   virtual const std::string getCacheName() const = 0;
 
@@ -102,6 +105,9 @@ class CacheBase {
   // @param poolId the pool id
   virtual PoolStats getPoolStats(PoolId poolId) const = 0;
 
+  virtual AllocationClassBaseStat getAllocationClassStats(
+      TierId, PoolId pid, ClassId cid) const = 0;
+
   // @param poolId the pool id
   virtual AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId poolId) const = 0;
 
diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h
index d14bcfa789..5328e151aa 100644
--- a/cachelib/allocator/CacheAllocator-inl.h
+++ b/cachelib/allocator/CacheAllocator-inl.h
@@ -324,7 +324,9 @@ CacheAllocator<CacheTrait>::allocateInternal(PoolId pid,
   const auto requiredSize = Item::getRequiredSize(key, size);
 
   // the allocation class in our memory allocator.
-  const auto cid = allocator_->getAllocationClassId(pid, requiredSize);
+  const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize);
+  util::RollingLatencyTracker rollTracker{
+      (*stats_.classAllocLatency)[tid][pid][cid]};
 
   (*stats_.allocAttempts)[pid][cid].inc();
 
@@ -402,6 +404,11 @@ CacheAllocator<CacheTrait>::allocateChainedItemInternal(
   const auto pid = allocator_->getAllocInfo(parent->getMemory()).poolId;
   const auto cid = allocator_->getAllocationClassId(pid, requiredSize);
 
+  util::RollingLatencyTracker rollTracker{
+      (*stats_.classAllocLatency)[tid][pid][cid]};
+
+  // TODO: per-tier? Right now stats_ are not used in any public periodic
+  // worker
   (*stats_.allocAttempts)[pid][cid].inc();
 
   void* memory = allocator_->allocate(pid, requiredSize);
@@ -2220,6 +2227,49 @@ PoolStats CacheAllocator<CacheTrait>::getPoolStats(PoolId poolId) const {
   return ret;
 }
 
+template <typename CacheTrait>
+double CacheAllocator<CacheTrait>::slabsApproxFreePercentage(TierId tid) const {
+  return allocator_[tid]->approxFreeSlabsPercentage();
+}
+
+template <typename CacheTrait>
+AllocationClassBaseStat CacheAllocator<CacheTrait>::getAllocationClassStats(
+    TierId tid, PoolId pid, ClassId cid) const {
+  const auto& ac = allocator_[tid]->getPool(pid).getAllocationClass(cid);
+
+  AllocationClassBaseStat stats{};
+  stats.allocSize = ac.getAllocSize();
+  stats.memorySize = ac.getNumSlabs() * Slab::kSize;
+
+  if (slabsApproxFreePercentage(tid) > 0.0) {
+    auto totalMemory = MemoryAllocator::getMemorySize(memoryTierSize(tid));
+    auto freeMemory = static_cast<double>(totalMemory) *
+                      slabsApproxFreePercentage(tid) / 100.0;
+
+    // amount of free memory which has the same ratio to entire free memory as
+    // this allocation class memory size has to used memory
+    auto scaledFreeMemory =
+        static_cast<size_t>(freeMemory * stats.memorySize / totalMemory);
+
+    auto acAllocatedMemory = (100.0 - ac.approxFreePercentage()) / 100.0 *
+                             ac.getNumSlabs() * Slab::kSize;
+    auto acMaxAvailableMemory =
+        ac.getNumSlabs() * Slab::kSize + scaledFreeMemory;
+
+    if (acMaxAvailableMemory == 0) {
+      stats.approxFreePercent = 100.0;
+    } else {
+      stats.approxFreePercent =
+          100.0 - 100.0 * acAllocatedMemory / acMaxAvailableMemory;
+    }
+  } else {
+    stats.approxFreePercent = ac.approxFreePercentage();
+  }
+  stats.allocLatencyNs = (*stats_.classAllocLatency)[tid][pid][cid];
+
+  return stats;
+}
+
 template <typename CacheTrait>
 PoolEvictionAgeStats CacheAllocator<CacheTrait>::getPoolEvictionAgeStats(
     PoolId pid, unsigned int slabProjectionLength) const {
diff --git a/cachelib/allocator/CacheStats.cpp b/cachelib/allocator/CacheStats.cpp
index 2e848e34a8..1dfa4283ce 100644
--- a/cachelib/allocator/CacheStats.cpp
+++ b/cachelib/allocator/CacheStats.cpp
@@ -44,6 +44,8 @@ void Stats::init() {
   initToZero(*fragmentationSize);
   initToZero(*chainedItemEvictions);
   initToZero(*regularItemEvictions);
+
+  classAllocLatency = std::make_unique<PerTierPoolClassRollingStats>();
 }
 
 template <typename T>
diff --git a/cachelib/allocator/CacheStats.h b/cachelib/allocator/CacheStats.h
index be0a9da5cb..a98b61dd50 100644
--- a/cachelib/allocator/CacheStats.h
+++ b/cachelib/allocator/CacheStats.h
@@ -25,6 +25,7 @@
 #include "cachelib/allocator/memory/Slab.h"
 #include "cachelib/common/FastStats.h"
 #include "cachelib/common/PercentileStats.h"
+#include "cachelib/common/RollingStats.h"
 #include "cachelib/common/Time.h"
 
 namespace facebook {
 namespace cachelib {
@@ -95,6 +96,20 @@ struct MMContainerStat {
   uint64_t numTailAccesses;
 };
 
+struct AllocationClassBaseStat {
+  // size of allocation class
+  size_t allocSize{0};
+
+  // size of memory assigned to this allocation class
+  size_t memorySize{0};
+
+  // percent of free memory in this class
+  double approxFreePercent{0.0};
+
+  // Rolling allocation latency (in ns)
+  util::RollingStats allocLatencyNs;
+};
+
 // cache related stats for a given allocation class.
 struct CacheStat {
   // allocation size for this container.
diff --git a/cachelib/allocator/CacheStatsInternal.h b/cachelib/allocator/CacheStatsInternal.h
index b2a5f8c469..19a15fbbd4 100644
--- a/cachelib/allocator/CacheStatsInternal.h
+++ b/cachelib/allocator/CacheStatsInternal.h
@@ -21,6 +21,7 @@
 #include "cachelib/allocator/Cache.h"
 #include "cachelib/allocator/memory/MemoryAllocator.h"
 #include "cachelib/common/AtomicCounter.h"
+#include "cachelib/common/RollingStats.h"
 
 namespace facebook {
 namespace cachelib {
@@ -229,6 +230,14 @@ struct Stats {
   std::unique_ptr<PerPoolClassAtomicCounters> chainedItemEvictions{};
   std::unique_ptr<PerPoolClassAtomicCounters> regularItemEvictions{};
 
+  using PerTierPoolClassRollingStats = std::array<
+      std::array<std::array<util::RollingStats, MemoryAllocator::kMaxClasses>,
+                 MemoryPoolManager::kMaxPools>,
+      CacheBase::kMaxTiers>;
+
+  // rolling latency tracking for every alloc class in every pool
+  std::unique_ptr<PerTierPoolClassRollingStats> classAllocLatency{};
+
   // Eviction failures due to parent cannot be removed from access container
   AtomicCounter evictFailParentAC{0};
 
diff --git a/cachelib/cachebench/cache/CacheStats.h b/cachelib/cachebench/cache/CacheStats.h
index d6c9e53584..6e24d52498 100644
--- a/cachelib/cachebench/cache/CacheStats.h
+++ b/cachelib/cachebench/cache/CacheStats.h
@@ -100,6 +100,11 @@ struct Stats {
   uint64_t invalidDestructorCount{0};
   int64_t unDestructedItemCount{0};
 
+  std::map<TierId, std::map<PoolId, std::map<ClassId, AllocationClassBaseStat>>>
+      allocationClassStats;
+
+  std::vector<double> slabsApproxFreePercentages;
+
   // populate the counters related to nvm usage. Cache implementation can decide
   // what to populate since not all of those are interesting when running
   // cachebench.
@@ -131,6 +136,56 @@ struct Stats {
         << std::endl;
     }
 
+    if (FLAGS_report_memory_usage_stats != "") {
+      for (TierId tid = 0; tid < slabsApproxFreePercentages.size(); tid++) {
+        out << folly::sformat("tid{:2} free slabs : {:.2f}%", tid,
+                              slabsApproxFreePercentages[tid])
+            << std::endl;
+      }
+
+      auto formatMemory = [&](size_t bytes) -> std::tuple<std::string, double> {
+        if (FLAGS_report_memory_usage_stats == "raw") {
+          return {"B", bytes};
+        }
+
+        constexpr double KB = 1024.0;
+        constexpr double MB = 1024.0 * 1024;
+        constexpr double GB = 1024.0 * 1024 * 1024;
+
+        if (bytes >= GB) {
+          return {"GB", static_cast<double>(bytes) / GB};
+        } else if (bytes >= MB) {
+          return {"MB", static_cast<double>(bytes) / MB};
+        } else if (bytes >= KB) {
+          return {"KB", static_cast<double>(bytes) / KB};
+        } else {
+          return {"B", bytes};
+        }
+      };
+
+      auto foreachAC = [&](auto cb) {
+        for (auto& tidStats : allocationClassStats) {
+          for (auto& pidStat : tidStats.second) {
+            for (auto& cidStat : pidStat.second) {
+              cb(tidStats.first, pidStat.first, cidStat.first, cidStat.second);
+            }
+          }
+        }
+      };
+
+      foreachAC([&](auto tid, auto pid, auto cid, auto stats) {
+        auto [allocSizeSuffix, allocSize] = formatMemory(stats.allocSize);
+        auto [memorySizeSuffix, memorySize] = formatMemory(stats.memorySize);
+        out << folly::sformat(
+                   "tid{:2} pid{:2} cid{:4} {:8.2f}{} memorySize:{:8.2f}{} "
+                   "free:{:4.2f}% rollingAvgAllocLatency:{:8.2f}ns",
+                   tid, pid, cid, allocSize, allocSizeSuffix, memorySize,
+                   memorySizeSuffix, stats.approxFreePercent,
+                   stats.allocLatencyNs.estimate())
+            << std::endl;
+      });
+    }
+
     if (numCacheGets > 0) {
       out << folly::sformat("Cache Gets : {:,}", numCacheGets) << std::endl;
       out << folly::sformat("Hit Ratio : {:6.2f}%", overallHitRatio)
diff --git a/cachelib/common/RollingStats.h b/cachelib/common/RollingStats.h
new file mode 100644
index 0000000000..4d179681ad
--- /dev/null
+++ b/cachelib/common/RollingStats.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <chrono>
+#include <limits>
+
+#include "cachelib/common/Utils.h"
+
+namespace facebook {
+namespace cachelib {
+namespace util {
+
+class RollingStats {
+ public:
+  // track latency by taking the value of duration directly.
+  void trackValue(double value) {
+    // This is a highly unlikely scenario where cnt_ reaches its numerical
+    // limit; skip updating the rolling average in that case.
+    if (cnt_ == std::numeric_limits<uint64_t>::max()) {
+      cnt_ = 0;
+      return;
+    }
+    auto ratio = static_cast<double>(cnt_) / (cnt_ + 1);
+    avg_ *= ratio;
+    ++cnt_;
+    avg_ += value / cnt_;
+  }
+
+  // Return the rolling average.
+  double estimate() { return avg_; }
+
+ private:
+  double avg_{0};
+  uint64_t cnt_{0};
+};
+
+class RollingLatencyTracker {
+ public:
+  explicit RollingLatencyTracker(RollingStats& stats)
+      : stats_(&stats), begin_(std::chrono::steady_clock::now()) {}
+  RollingLatencyTracker() {}
+  ~RollingLatencyTracker() {
+    if (stats_) {
+      auto tp = std::chrono::steady_clock::now();
+      auto diffNanos =
+          std::chrono::duration_cast<std::chrono::nanoseconds>(tp - begin_)
+              .count();
+      stats_->trackValue(static_cast<double>(diffNanos));
+    }
+  }
+
+  RollingLatencyTracker(const RollingLatencyTracker&) = delete;
+  RollingLatencyTracker& operator=(const RollingLatencyTracker&) = delete;
+
+  RollingLatencyTracker(RollingLatencyTracker&& rhs) noexcept
+      : stats_(rhs.stats_), begin_(rhs.begin_) {
+    rhs.stats_ = nullptr;
+  }
+
+  RollingLatencyTracker& operator=(RollingLatencyTracker&& rhs) noexcept {
+    if (this != &rhs) {
+      this->~RollingLatencyTracker();
+      new (this) RollingLatencyTracker(std::move(rhs));
+    }
+    return *this;
+  }
+
+ private:
+  RollingStats* stats_{nullptr};
+  std::chrono::time_point<std::chrono::steady_clock> begin_;
+};
+} // namespace util
+} // namespace cachelib
+} // namespace facebook
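
For reference, RollingStats::trackValue keeps a running arithmetic mean without storing individual samples: the current average is rescaled by n / (n + 1) and the new sample contributes value / (n + 1). Below is a minimal standalone sketch of that same update rule (plain C++, not part of the patch; all names are illustrative), verifying it matches the plain arithmetic mean:

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>
#include <vector>

// Illustrative re-implementation of the rolling-average update used by
// util::RollingStats::trackValue(): rescale the old mean by n / (n + 1),
// then add the new sample's share value / (n + 1).
int main() {
  double avg = 0.0;
  uint64_t cnt = 0;
  std::vector<double> samples{120.0, 80.0, 100.0, 300.0}; // e.g. latencies in ns

  double sum = 0.0;
  for (double v : samples) {
    double ratio = static_cast<double>(cnt) / (cnt + 1);
    avg *= ratio;   // old mean now carries weight n / (n + 1)
    ++cnt;
    avg += v / cnt; // new sample carries weight 1 / (n + 1)
    sum += v;
  }

  // The incremental estimate equals the arithmetic mean of all samples.
  assert(std::fabs(avg - sum / samples.size()) < 1e-9);
  return 0;
}
```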