diff --git a/.gitignore b/.gitignore
index f2424a220..e2d24c97b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -103,4 +103,3 @@ __pycache__/
 include/csv2/
 debug.*
-*.log
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 30b431e50..cb5a05e2f 100644
--- a/Makefile
+++ b/Makefile
@@ -14,6 +14,10 @@ ifeq ($(origin PYTHON), undefined)
 endif
 export PYTHON

+# EXTRA_CXXFLAGS += -I$(HOME)/local/include -gdwarf-4 -fsanitize=address
+EXTRA_CXXFLAGS += -I$(HOME)/local/include -gdwarf-4 -DGFLAGS
+LDFLAGS += -L$(HOME)/local/lib -lsocket++ -lgflags
+
 CLEAN_FILES = # deliberately empty, so we can append below.
 CFLAGS += ${EXTRA_CFLAGS}
 CXXFLAGS += ${EXTRA_CXXFLAGS}
@@ -112,6 +116,8 @@ ifneq ($(findstring rocksdbjava, $(MAKECMDGOALS)),)
 endif
 endif

+# DEBUG_LEVEL=1
+DEBUG_LEVEL=0
 $(info $$DEBUG_LEVEL is ${DEBUG_LEVEL})

 # Lite build flag.
@@ -1310,6 +1316,11 @@ $(STATIC_LIBRARY): $(LIB_OBJECTS)
	$(AM_V_AR)rm -f $@ $(SHARED1) $(SHARED2) $(SHARED3) $(SHARED4)
	$(AM_V_at)$(AR) $(ARFLAGS) $@ $(LIB_OBJECTS)

+# Add a target to build bloom_test with ART_PLUS defined
+bloom_test_plus: util/bloom_test.cc $(LIBRARY) $(GTEST)
+	$(AM_V_CCLD)$(CXX) $(CXXFLAGS) -DART_PLUS -c util/bloom_test.cc -o $(OBJ_DIR)/util/bloom_test_plus.o
+	$(AM_V_CCLD)$(CXX) -o bloom_test_plus $(OBJ_DIR)/util/bloom_test_plus.o $(GTEST) $(LIBRARY) $(EXEC_LDFLAGS) $(LDFLAGS) $(COVERAGEFLAGS)
+
 $(STATIC_TEST_LIBRARY): $(TEST_OBJECTS)
	$(AM_V_AR)rm -f $@ $(SHARED_TEST_LIBRARY)
	$(AM_V_at)$(AR) $(ARFLAGS) $@ $^
diff --git a/YCSB/.gitignore b/YCSB/.gitignore
index 50f9d154b..056f2b01b 100644
--- a/YCSB/.gitignore
+++ b/YCSB/.gitignore
@@ -8,3 +8,7 @@ tags
 compile_commands.json
 .clangd/
 .cache/
+
+test_logs/
+compile.log
+*.log
\ No newline at end of file
diff --git a/YCSB/Makefile b/YCSB/Makefile
index 10ba2d280..f15bbed47 100644
--- a/YCSB/Makefile
+++ b/YCSB/Makefile
@@ -9,16 +9,19 @@

 #---------------------build config-------------------------

+CXXFLAGS += -I$(HOME)/local/include -gdwarf-4
+LDFLAGS += -lstdc++
+LDFLAGS += -L$(HOME)/local/lib -lsocket++
+
 DEBUG_BUILD ?= 0
-EXTRA_CXXFLAGS ?= -I../include -I../include/rocksdb
-EXTRA_LDFLAGS ?= -L../ -lpmem -ldl
+# EXTRA_CXXFLAGS += -I../include -I../include/rocksdb -fsanitize=address
+EXTRA_CXXFLAGS += -I../include -I../include/rocksdb
+EXTRA_LDFLAGS += -L../ -lpmem -ldl

 BIND_ROCKSDB ?= 1
 BIND_LEVELDB ?= 0
 BIND_LMDB ?= 0

-EXTRA_LDFLAGS += -lstdc++
-EXTRA_LDFLAGS += -lsocket++
 # EXTRA_LDFLAGS += -lpython3.12
 # EXTRA_CXXFLAGS += -I$(PYTHON_INCLUDE_PATH)
 # EXTRA_CXXFLAGS += -L$(PYTHON_LIBRARY_PATH)
diff --git a/YCSB/batch_test.sh b/YCSB/batch_test.sh
new file mode 100755
index 000000000..8faf77130
--- /dev/null
+++ b/YCSB/batch_test.sh
@@ -0,0 +1,408 @@
+#!/bin/bash
+
+# YCSB batch test script
+# Author: auto-generated
+# Purpose: run a series of YCSB tests with different workload and properties configurations
+
+# ===========================================
+# Global configuration variables
+# ===========================================
+
+# Default configuration
+DEFAULT_YCSB_PATH="./ycsb"
+DEFAULT_DB_TYPE="rocksdb"
+DEFAULT_PROPERTIES_FILE="rocksdb/rocksdb.properties"
+DEFAULT_THREAD_COUNT=8
+
+# Database and NVM path configuration
+DB_PATH="/mnt/nvme0n1/guoteng/walsmtest/tmp/db_nvm_l0"
+NVM_PATH="/mnt/pmem0.7/guoteng/nodememory"
+
+# Logging configuration
+LOG_DIR="./test_logs"
+TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
+
+# ===========================================
+# Utility functions
+# ===========================================
+
+# Print an info message with a timestamp
+log_info() {
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO: $1"
+}
+
+# Print an error message
+log_error() {
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: $1" >&2
+}
+
+# Print a warning message
+log_warn() {
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] 
WARN: $1"
+}
+
+# ===========================================
+# Resource cleanup functions
+# ===========================================
+
+# Clean up database and NVM storage resources
+cleanup_resources() {
+    log_info "Starting resource cleanup..."
+
+    local cleanup_success=true
+
+    # Clean the database directory
+    if [ -d "${DB_PATH}" ]; then
+        log_info "Cleaning database directory: ${DB_PATH}"
+        if [ "$(ls -A "${DB_PATH}" 2>/dev/null)" ]; then
+            rm -rf "${DB_PATH}"/*
+            if [ $? -eq 0 ]; then
+                log_info "Database directory cleaned"
+            else
+                log_error "Failed to clean database directory"
+                cleanup_success=false
+            fi
+        else
+            log_info "Database directory is empty, nothing to clean"
+        fi
+    else
+        log_info "Database directory does not exist: ${DB_PATH} (expected)"
+    fi
+
+    # Clean the NVM path
+    if [ -f "${NVM_PATH}" ] || [ -d "${NVM_PATH}" ]; then
+        log_info "Cleaning NVM path: ${NVM_PATH}"
+        rm -rf "${NVM_PATH}"
+        if [ $? -eq 0 ]; then
+            log_info "NVM path cleaned"
+        else
+            log_error "Failed to clean NVM path"
+            cleanup_success=false
+        fi
+    else
+        log_info "NVM path does not exist: ${NVM_PATH} (expected)"
+    fi
+
+    if [ "$cleanup_success" = true ]; then
+        log_info "Resource cleanup finished"
+        return 0
+    else
+        log_error "Errors occurred during resource cleanup"
+        return 1
+    fi
+}
+
+# ===========================================
+# YCSB test execution functions
+# ===========================================
+
+# Run a single YCSB test
+run_single_ycsb_test() {
+    local workload="$1"
+    local properties_file="$2"
+    local additional_params="$3"
+    local test_name="$4"
+
+    log_info "Starting YCSB test: ${test_name}"
+    log_info "  - Workload: ${workload}"
+    log_info "  - Properties: ${properties_file}"
+    log_info "  - Thread Count: ${DEFAULT_THREAD_COUNT}"
+
+    # Check that the workload file exists
+    if [ ! -f "${workload}" ]; then
+        log_error "Workload file does not exist: ${workload}"
+        return 1
+    fi
+
+    # Check that the properties file exists
+    if [ ! -f "${properties_file}" ]; then
+        log_error "Properties file does not exist: ${properties_file}"
+        return 1
+    fi
+
+    # Create the log directory
+    mkdir -p "${LOG_DIR}"
+
+    # Generate the log file name
+    local log_file="${LOG_DIR}/${test_name}_${TIMESTAMP}.log"
+
+    # Build the YCSB command
+    local ycsb_cmd="${DEFAULT_YCSB_PATH} -load -run -db ${DEFAULT_DB_TYPE} -P ${workload} -P ${properties_file} -p threadcount=${DEFAULT_THREAD_COUNT}"
+
+    # Append extra parameters
+    if [ -n "${additional_params}" ]; then
+        ycsb_cmd="${ycsb_cmd} ${additional_params}"
+    fi
+
+    # Enable status reporting
+    ycsb_cmd="${ycsb_cmd} -s"
+
+    log_info "Running command: ${ycsb_cmd}"
+
+    # Run the YCSB test and capture its log
+    echo "Start time: $(date)" > "${log_file}"
+    echo "Command: ${ycsb_cmd}" >> "${log_file}"
+    echo "======================================" >> "${log_file}"
+
+    eval "${ycsb_cmd}" 2>&1 | tee -a "${log_file}"
+    local exit_code=${PIPESTATUS[0]}
+
+    echo "======================================" >> "${log_file}"
+    echo "End time: $(date)" >> "${log_file}"
+    echo "Exit code: ${exit_code}" >> "${log_file}"
+
+    if [ ${exit_code} -eq 0 ]; then
+        log_info "Test finished: ${test_name} (log: ${log_file})"
+        return 0
+    else
+        log_error "Test failed: ${test_name} (exit code: ${exit_code})"
+        return 1
+    fi
+}
+
+# ===========================================
+# Batch test functions
+# ===========================================
+
+# Run batch tests
+run_batch_tests() {
+    local config_file="$1"
+
+    if [ ! -f "${config_file}" ]; then
+        log_error "Config file does not exist: ${config_file}"
+        return 1
+    fi
+
+    log_info "Starting batch tests, config file: ${config_file}"
+
+    local test_count=0
+    local success_count=0
+    local failed_tests=()
+
+    # Read the config file and run each test
+    while IFS='|' read -r test_name workload properties_file additional_params; do
+        # Skip comment and empty lines
+        [[ ${test_name} =~ ^#.*$ ]] && continue
+        [[ -z ${test_name} ]] && continue
+
+        test_count=$((test_count + 1))
+
+        log_info "Running test ${test_count}: ${test_name}"
+
+        # Clean up resources
+        cleanup_resources
+        if [ $? 
-ne 0 ]; then
+            log_error "Resource cleanup failed, skipping test: ${test_name}"
+            failed_tests+=("${test_name}")
+            continue
+        fi
+
+        # Determine the properties file path
+        local actual_properties_file
+        if [ -n "${properties_file}" ] && [ "${properties_file}" != " " ]; then
+            actual_properties_file="${properties_file}"
+        else
+            actual_properties_file="${DEFAULT_PROPERTIES_FILE}"
+        fi
+
+        # Run the test
+        run_single_ycsb_test "${workload}" "${actual_properties_file}" "${additional_params}" "${test_name}"
+        if [ $? -eq 0 ]; then
+            success_count=$((success_count + 1))
+            log_info "Test succeeded: ${test_name}"
+        else
+            failed_tests+=("${test_name}")
+            log_error "Test failed: ${test_name}"
+        fi
+
+        log_info "Test ${test_count} finished: ${test_name}"
+        echo "----------------------------------------"
+
+    done < "${config_file}"
+
+    # Print the test result summary
+    log_info "Batch testing finished"
+    log_info "Total tests: ${test_count}"
+    log_info "Successful tests: ${success_count}"
+    log_info "Failed tests: $((test_count - success_count))"
+
+    if [ ${#failed_tests[@]} -gt 0 ]; then
+        log_warn "Failed tests:"
+        for failed_test in "${failed_tests[@]}"; do
+            log_warn "  - ${failed_test}"
+        done
+    fi
+
+    return 0
+}
+
+# ===========================================
+# Main entry point
+# ===========================================
+
+# Show help information
+show_help() {
+    cat << EOF
+YCSB batch test script usage:
+
+Usage:
+    $0 [options] [arguments]
+
+Options:
+    -h, --help         Show this help message
+    -c, --cleanup      Only perform resource cleanup
+    -s, --single       Run a single test
+    -b, --batch        Run batch tests
+    -l, --list         List available workload files
+
+Single test arguments:
+    -w, --workload     Workload file (required)
+    -P, --properties   Properties file (optional, default: ${DEFAULT_PROPERTIES_FILE})
+    -p, --params       Extra parameters (optional)
+    -n, --name         Test name (optional)
+
+Batch test arguments:
+    -f, --config-file  Batch test config file (required)
+
+Config file format (fields separated by |):
+    test name|workload path|properties file path (optional)|extra parameters (optional)
+
+Examples:
+    # Run a single test
+    $0 -s -w workloads/workloada -n test1
+
+    # Run a single test with a specific properties file
+    $0 -s -w workloads/workloada -P custom.properties -n test1
+
+    # Run batch tests
+    $0 -b -f batch_config.txt
+
+    # Only clean up resources
+    $0 -c
+
+EOF
+}
+
+# List available workload files
+list_workloads() {
+    log_info "Available workload files:"
+    find workloads/ -name "workload*" -type f | sort | while read -r workload; do
+        echo "  - ${workload}"
+    done
+}
+
+# Main function
+main() {
+    local action=""
+    local workload=""
+    local properties_file=""
+    local additional_params=""
+    local test_name=""
+    local config_file=""
+
+    # Parse command-line arguments
+    while [[ $# -gt 0 ]]; do
+        case $1 in
+            -h|--help)
+                show_help
+                exit 0
+                ;;
+            -c|--cleanup)
+                action="cleanup"
+                shift
+                ;;
+            -s|--single)
+                action="single"
+                shift
+                ;;
+            -b|--batch)
+                action="batch"
+                shift
+                ;;
+            -l|--list)
+                list_workloads
+                exit 0
+                ;;
+            -w|--workload)
+                workload="$2"
+                shift 2
+                ;;
+            -P|--properties)
+                properties_file="$2"
+                shift 2
+                ;;
+            -p|--params)
+                additional_params="$2"
+                shift 2
+                ;;
+            -n|--name)
+                test_name="$2"
+                shift 2
+                ;;
+            -f|--config-file)
+                config_file="$2"
+                shift 2
+                ;;
+            *)
+                log_error "Unknown argument: $1"
+                show_help
+                exit 1
+                ;;
+        esac
+    done
+
+    # Check the YCSB executable
+    if [ ! -f "${DEFAULT_YCSB_PATH}" ]; then
+        log_error "YCSB executable does not exist: ${DEFAULT_YCSB_PATH}"
+        exit 1
+    fi
+
+    # Dispatch based on the selected action
+    case "${action}" in
+        cleanup)
+            cleanup_resources
+            ;;
+        single)
+            if [ -z "${workload}" ]; then
+                log_error "A workload file is required for a single test"
+                show_help
+                exit 1
+            fi
+
+            if [ -z "${test_name}" ]; then
+                test_name="single_test_$(basename ${workload})"
+            fi
+
+            # Determine the properties file path
+            if [ -z "${properties_file}" ]; then
+                properties_file="${DEFAULT_PROPERTIES_FILE}"
+            fi
+
+            # Clean up resources
+            cleanup_resources
+
+            # Run the test
+            run_single_ycsb_test "${workload}" "${properties_file}" "${additional_params}" "${test_name}"
+            local result=$? 
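+            # Propagate the single test's exit status to the caller so
+            # wrapper scripts and CI jobs can detect a failed run.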
+
+            exit ${result}
+            ;;
+        batch)
+            if [ -z "${config_file}" ]; then
+                log_error "A config file is required for batch tests"
+                show_help
+                exit 1
+            fi
+
+            run_batch_tests "${config_file}"
+            ;;
+        *)
+            log_error "An action must be specified (-c, -s, -b, -l)"
+            show_help
+            exit 1
+            ;;
+    esac
+}
+
+# Run the main function
+main "$@"
diff --git a/YCSB/build.sh b/YCSB/build.sh
new file mode 100755
index 000000000..f7c979322
--- /dev/null
+++ b/YCSB/build.sh
@@ -0,0 +1,12 @@
+cd ..
+rm -rf ../log/*
+make clean
+make static_lib -j32
+
+# mv librocksdb_debug.a librocksdb.a
+
+cd YCSB
+make clean && make -j4
+
+rm -rf /mnt/nvme0n1/guoteng/walsmtest/tmp/db_nvm_l0
+rm -rf /mnt/pmem0.7/guoteng/nodememory
diff --git a/YCSB/buildall.sh b/YCSB/buildall.sh
new file mode 100755
index 000000000..1b684ca01
--- /dev/null
+++ b/YCSB/buildall.sh
@@ -0,0 +1,7 @@
+cd ..
+# make clean
+make static_lib -j32
+
+cd YCSB
+make clean && make DEBUG_BUILD=1
+
diff --git a/YCSB/core/core_workload.cc b/YCSB/core/core_workload.cc
index a559d5509..693a9eb6a 100644
--- a/YCSB/core/core_workload.cc
+++ b/YCSB/core/core_workload.cc
@@ -68,6 +68,7 @@ const string CoreWorkload::REQUEST_DISTRIBUTION_PROPERTY = "requestdistribution"
 const string CoreWorkload::REQUEST_DISTRIBUTION_DEFAULT = "uniform";

 const string CoreWorkload::ZERO_PADDING_PROPERTY = "zeropadding";
+// const string CoreWorkload::ZERO_PADDING_DEFAULT = "96";
 const string CoreWorkload::ZERO_PADDING_DEFAULT = "1";

 const string CoreWorkload::MIN_SCAN_LENGTH_PROPERTY = "minscanlength";
diff --git a/YCSB/rocksdb/rocksdb.properties b/YCSB/rocksdb/rocksdb.properties
index d4b7f9774..a33f6f67a 100644
--- a/YCSB/rocksdb/rocksdb.properties
+++ b/YCSB/rocksdb/rocksdb.properties
@@ -1,29 +1,33 @@
-rocksdb.dbname=/mnt/walsm/tmp/tmp_data/db_test_art
-rocksdb.nvm_path=/mnt/walsm/node_memory
+rocksdb.dbname=/mnt/nvme0n1/guoteng/walsmtest/tmp/gt_test
+rocksdb.nvm_path=/mnt/pmem0.8/guoteng/nodememory
 rocksdb.format=single
-rocksdb.destroy=false
+# rocksdb.destroy=false

 # Load options from file
 #rocksdb.optionsfile=rocksdb/options.ini

 # Below options are ignored if options file is used
-rocksdb.compression=no
-rocksdb.max_background_jobs=2
-rocksdb.target_file_size_base=67108864
-rocksdb.target_file_size_multiplier=1
-rocksdb.max_bytes_for_level_base=268435456
-rocksdb.write_buffer_size=67108864
+# rocksdb.compression=no
+rocksdb.max_background_jobs=4
+# rocksdb.target_file_size_base=67108864
+# rocksdb.target_file_size_multiplier=1
+# rocksdb.max_bytes_for_level_base=268435456
+# rocksdb.write_buffer_size=67108864
 rocksdb.max_open_files=-1
-rocksdb.max_write_buffer_number=2
+# rocksdb.max_write_buffer_number=2
 rocksdb.use_direct_io_for_flush_compaction=true
 rocksdb.use_direct_reads=true
-rocksdb.allow_mmap_writes=false
-rocksdb.allow_mmap_reads=false
-rocksdb.cache_size=8388608
+# rocksdb.allow_mmap_writes=false
+# rocksdb.allow_mmap_reads=false
+rocksdb.cache_size=100663296
 rocksdb.compressed_cache_size=0
-rocksdb.bloom_bits=0
+rocksdb.bloom_bits=2

 # set total_threads to 32, see rocksdb_db.cc
 rocksdb.increase_parallelism=true
 # rocksdb.optimize_level_style_compaction=true
 rocksdb.optimize_universal_style_compaction=true
+
+rocksdb.block_size=4096
+rocksdb.metadata_size=4096
+rocksdb.max_subcompactions=1
\ No newline at end of file
diff --git a/YCSB/rocksdb/rocksdb_cacheio.properties b/YCSB/rocksdb/rocksdb_cacheio.properties
new file mode 100644
index 000000000..fbca92902
--- /dev/null
+++ b/YCSB/rocksdb/rocksdb_cacheio.properties
@@ -0,0 +1,33 @@
+rocksdb.dbname=/mnt/nvme0n1/guoteng/walsmtest/tmp/db_nvm_l0 
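+# Assumed layout: dbname is the NVMe-backed directory holding the on-disk
+# LSM-tree, while nvm_path points at the persistent-memory (pmem) area used
+# for NVM node memory.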
+rocksdb.nvm_path=/mnt/pmem0.7/guoteng/nodememory +rocksdb.format=single +# rocksdb.destroy=false + +# Load options from file +#rocksdb.optionsfile=rocksdb/options.ini + +# Below options are ignored if options file is used +# rocksdb.compression=no +rocksdb.max_background_jobs=4 +# rocksdb.target_file_size_base=67108864 +# rocksdb.target_file_size_multiplier=1 +# rocksdb.max_bytes_for_level_base=268435456 +# rocksdb.write_buffer_size=67108864 +rocksdb.max_open_files=-1 +# rocksdb.max_write_buffer_number=2 +# rocksdb.use_direct_io_for_flush_compaction=true +# rocksdb.use_direct_reads=true +# rocksdb.allow_mmap_writes=false +# rocksdb.allow_mmap_reads=false +rocksdb.cache_size=134217728 +rocksdb.compressed_cache_size=0 +rocksdb.bloom_bits=2 + +# set total_threads to 32, see rocksdb_db.cc +rocksdb.increase_parallelism=true +# rocksdb.optimize_level_style_compaction=true +rocksdb.optimize_universal_style_compaction=true + +rocksdb.block_size=4096 +rocksdb.metadata_size=8192 +rocksdb.max_subcompactions=4 \ No newline at end of file diff --git a/YCSB/rocksdb/rocksdb_db.cc b/YCSB/rocksdb/rocksdb_db.cc index 4f8dda0cd..212ad3f79 100644 --- a/YCSB/rocksdb/rocksdb_db.cc +++ b/YCSB/rocksdb/rocksdb_db.cc @@ -18,7 +18,15 @@ #include #include #include +#include #include +#include +#include +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +#include namespace { const std::string PROP_NAME = "rocksdb.dbname"; @@ -111,6 +119,15 @@ namespace { const std::string PROP_FS_URI = "rocksdb.fs_uri"; const std::string PROP_FS_URI_DEFAULT = ""; + const std::string PROP_BLOCK_SIZE = "rocksdb.block_size"; + const std::string PROP_BLOCK_SIZE_DEFAULT = "0"; + + const std::string PROP_METADATA_SIZE = "rocksdb.metadata_size"; + const std::string PROP_METADATA_SIZE_DEFAULT = "0"; + + const std::string PROP_MAX_SUBCOMPACTION = "rocksdb.max_subcompactions"; + const std::string PROP_MAX_SUBCOMPACTION_DEFAULT = "0"; + static std::shared_ptr env_guard; static std::shared_ptr block_cache; static std::shared_ptr block_cache_compressed; @@ -121,6 +138,7 @@ namespace ycsbc { rocksdb::DB *RocksdbDB::db_ = nullptr; int RocksdbDB::ref_cnt_ = 0; std::mutex RocksdbDB::mu_; +rocksdb::Options opt; void RocksdbDB::Init() { // merge operator disabled by default due to link error @@ -198,7 +216,6 @@ void RocksdbDB::Init() { throw utils::Exception("RocksDB db path is missing"); } - rocksdb::Options opt; opt.create_if_missing = true; opt.nvm_path = nvm_path; std::vector cf_descs; @@ -230,6 +247,8 @@ void RocksdbDB::Cleanup() { if (--ref_cnt_) { return; } + std::cout << "Statistics: " << opt.statistics->ToString() << std::endl; + sleep(5); // sleep 5 seconds to wait for final reports delete db_; } @@ -309,6 +328,10 @@ void RocksdbDB::GetOptions(const utils::Properties &props, rocksdb::Options *opt if (val != 0) { opt->max_open_files = val; } + val = std::stoi(props.GetProperty(PROP_MAX_SUBCOMPACTION, PROP_MAX_SUBCOMPACTION_DEFAULT)); + if (val != 0) { + opt->max_subcompactions = val; + } val = std::stoi(props.GetProperty(PROP_L0_COMPACTION_TRIGGER, PROP_L0_COMPACTION_TRIGGER_DEFAULT)); if (val != 0) { @@ -337,6 +360,21 @@ void RocksdbDB::GetOptions(const utils::Properties &props, rocksdb::Options *opt } rocksdb::BlockBasedTableOptions table_options; + table_options.pin_top_level_index_and_filter = true; + table_options.pin_l0_filter_and_index_blocks_in_cache = false; + table_options.cache_index_and_filter_blocks_with_high_priority = true; + table_options.index_type = 
rocksdb::BlockBasedTableOptions::kTwoLevelIndexSearch; + table_options.partition_filters = true; + table_options.cache_index_and_filter_blocks = true; + table_options.index_shortening = rocksdb::BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + size_t block_size = std::stoul(props.GetProperty(PROP_BLOCK_SIZE, PROP_BLOCK_SIZE_DEFAULT)); + if (block_size > 0) { + table_options.block_size = block_size; + } + size_t metadata_block_size = std::stoul(props.GetProperty(PROP_METADATA_SIZE, PROP_METADATA_SIZE_DEFAULT)); + if (metadata_block_size > 0) { + table_options.metadata_block_size = metadata_block_size; + } size_t cache_size = std::stoul(props.GetProperty(PROP_CACHE_SIZE, PROP_CACHE_SIZE_DEFAULT)); if (cache_size > 0) { block_cache = rocksdb::NewLRUCache(cache_size); @@ -348,13 +386,16 @@ void RocksdbDB::GetOptions(const utils::Properties &props, rocksdb::Options *opt block_cache_compressed = rocksdb::NewLRUCache(cache_size); table_options.block_cache_compressed = rocksdb::NewLRUCache(compressed_cache_size); } - int bloom_bits = std::stoul(props.GetProperty(PROP_BLOOM_BITS, PROP_BLOOM_BITS_DEFAULT)); + int bloom_bits = std::stoul(props.GetProperty(PROP_BLOOM_BITS, PROP_BLOOM_BITS_DEFAULT)); if (bloom_bits > 0) { table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(bloom_bits)); } opt->table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_options)); if (props.GetProperty(PROP_INCREASE_PARALLELISM, PROP_INCREASE_PARALLELISM_DEFAULT) == "true") { + // unlimit the thread count of compactions and flushes. let it depend on total thread: 32 + opt->max_background_compactions = -1; + opt->max_background_flushes = -1; opt->IncreaseParallelism(32); } if (props.GetProperty(PROP_OPTIMIZE_LEVELCOMP, PROP_OPTIMIZE_LEVELCOMP_DEFAULT) == "true") { @@ -363,6 +404,7 @@ void RocksdbDB::GetOptions(const utils::Properties &props, rocksdb::Options *opt if (props.GetProperty(PROP_OPTIMIZE_UNIVERSALCOMP, PROP_OPTIMIZE_UNIVERSALCOMP_DEFAULT) == "true") { opt->OptimizeUniversalStyleCompaction(); } + opt->statistics = rocksdb::CreateDBStatistics(); } } @@ -431,12 +473,6 @@ DB::Status RocksdbDB::ReadSingle(const std::string &table, const std::string &ke std::vector &result) { std::string data; rocksdb::Status s = db_->Get(rocksdb::ReadOptions(), key, &data); - #ifdef GEN_WORKLOAD - std::fstream f; - f.open("../workload/workload", std::ios::out | std::ios::app); - f << key < &values) { - /* std::string data; rocksdb::Status s = db_->Get(rocksdb::ReadOptions(), key, &data); if (s.IsNotFound()) { @@ -505,9 +540,6 @@ DB::Status RocksdbDB::UpdateSingle(const std::string &table, const std::string & throw utils::Exception(std::string("RocksDB Put: ") + s.ToString()); } return kOK; - */ - // use insert, not read-modify-write - return InsertSingle(table, key, values); } DB::Status RocksdbDB::MergeSingle(const std::string &table, const std::string &key, diff --git a/YCSB/rocksdb/rocksdb_directio.properties b/YCSB/rocksdb/rocksdb_directio.properties new file mode 100644 index 000000000..a38b865c4 --- /dev/null +++ b/YCSB/rocksdb/rocksdb_directio.properties @@ -0,0 +1,33 @@ +rocksdb.dbname=/mnt/nvme0n1/guoteng/walsmtest/tmp/db_nvm_l0 +rocksdb.nvm_path=/mnt/pmem0.7/guoteng/nodememory +rocksdb.format=single +# rocksdb.destroy=false + +# Load options from file +#rocksdb.optionsfile=rocksdb/options.ini + +# Below options are ignored if options file is used +# rocksdb.compression=no +rocksdb.max_background_jobs=4 +# rocksdb.target_file_size_base=67108864 +# rocksdb.target_file_size_multiplier=1 +# 
rocksdb.max_bytes_for_level_base=268435456 +# rocksdb.write_buffer_size=67108864 +rocksdb.max_open_files=-1 +# rocksdb.max_write_buffer_number=2 +rocksdb.use_direct_io_for_flush_compaction=true +rocksdb.use_direct_reads=true +# rocksdb.allow_mmap_writes=false +# rocksdb.allow_mmap_reads=false +rocksdb.cache_size=134217728 +rocksdb.compressed_cache_size=0 +rocksdb.bloom_bits=2 + +# set total_threads to 32, see rocksdb_db.cc +rocksdb.increase_parallelism=true +# rocksdb.optimize_level_style_compaction=true +rocksdb.optimize_universal_style_compaction=true + +rocksdb.block_size=4096 +rocksdb.metadata_size=8192 +rocksdb.max_subcompactions=4 \ No newline at end of file diff --git a/YCSB/test.sh b/YCSB/test.sh new file mode 100644 index 000000000..2116bcfed --- /dev/null +++ b/YCSB/test.sh @@ -0,0 +1,15 @@ +cd .. +rm -rf log/* +make clean +make static_lib -j32 + +#mv librocksdb_debug.a librocksdb.a + +cd YCSB +make clean && make -j4 + +rm -rf /mnt/nvme0n1/guoteng/walsmtest/tmp/gt_test +rm -rf /mnt/pmem0.8/guoteng/nodememory + +#gdb --args ./ycsb -load -run -db rocksdb -P workloads/workloadt -P rocksdb/rocksdb.properties -p threadcount=8 -s +./ycsb -load -run -db rocksdb -P workloads/workloadt -P rocksdb/rocksdb.properties -p threadcount=8 -p sleepafterload=60 -s diff --git a/YCSB/workloads/search_0p.spec b/YCSB/workloads/search_0p.spec new file mode 100644 index 000000000..624bd1646 --- /dev/null +++ b/YCSB/workloads/search_0p.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0 +updateproportion=1.0 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/search_100p.spec b/YCSB/workloads/search_100p.spec new file mode 100644 index 000000000..3403b9c5e --- /dev/null +++ b/YCSB/workloads/search_100p.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=1.0 +updateproportion=0 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/search_25p.spec b/YCSB/workloads/search_25p.spec new file mode 100644 index 000000000..d2a8dd2a6 --- /dev/null +++ b/YCSB/workloads/search_25p.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.25 +updateproportion=0.75 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/search_50p.spec b/YCSB/workloads/search_50p.spec new file mode 100644 index 000000000..a53b084dd --- /dev/null +++ b/YCSB/workloads/search_50p.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.50 +updateproportion=0.50 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/search_75p.spec b/YCSB/workloads/search_75p.spec new file mode 100644 index 000000000..f3ff43f48 --- /dev/null +++ b/YCSB/workloads/search_75p.spec @@ 
-0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.75 +updateproportion=0.25 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/value_16kb.spec b/YCSB/workloads/value_16kb.spec new file mode 100644 index 000000000..158825ec5 --- /dev/null +++ b/YCSB/workloads/value_16kb.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=1600 + +recordcount=5000000 +operationcount=20000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.5 +updateproportion=0.5 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/value_1kb.spec b/YCSB/workloads/value_1kb.spec new file mode 100644 index 000000000..8daafe878 --- /dev/null +++ b/YCSB/workloads/value_1kb.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.5 +updateproportion=0.5 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/value_256b.spec b/YCSB/workloads/value_256b.spec new file mode 100644 index 000000000..98decf924 --- /dev/null +++ b/YCSB/workloads/value_256b.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=25 + +recordcount=320000000 +operationcount=1280000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.5 +updateproportion=0.5 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/value_4kb.spec b/YCSB/workloads/value_4kb.spec new file mode 100644 index 000000000..7f916ccdf --- /dev/null +++ b/YCSB/workloads/value_4kb.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=400 + +recordcount=20000000 +operationcount=80000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.5 +updateproportion=0.5 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/workload_A.spec b/YCSB/workloads/workload_A.spec new file mode 100644 index 000000000..8daafe878 --- /dev/null +++ b/YCSB/workloads/workload_A.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.5 +updateproportion=0.5 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/workload_B.spec b/YCSB/workloads/workload_B.spec new file mode 100644 index 000000000..b3df53ae4 --- /dev/null +++ b/YCSB/workloads/workload_B.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.95 +updateproportion=0.05 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/workload_C.spec 
b/YCSB/workloads/workload_C.spec new file mode 100644 index 000000000..dd7d41a08 --- /dev/null +++ b/YCSB/workloads/workload_C.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=1 +updateproportion=0 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/workload_D.spec b/YCSB/workloads/workload_D.spec new file mode 100644 index 000000000..e7c4e6986 --- /dev/null +++ b/YCSB/workloads/workload_D.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.95 +updateproportion=0 +scanproportion=0 +insertproportion=0.05 + +requestdistribution=latest +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/workload_E.spec b/YCSB/workloads/workload_E.spec new file mode 100644 index 000000000..26c9fd0e3 --- /dev/null +++ b/YCSB/workloads/workload_E.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0 +updateproportion=0 +scanproportion=0.95 +insertproportion=0.05 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/workload_F.spec b/YCSB/workloads/workload_F.spec new file mode 100644 index 000000000..81a922969 --- /dev/null +++ b/YCSB/workloads/workload_F.spec @@ -0,0 +1,18 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.5 +updateproportion=0 +scanproportion=0 +insertproportion=0 +readmodifywriteproportion=0.5 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/workloadt b/YCSB/workloads/workloadt index a69512474..efcc59b5d 100644 --- a/YCSB/workloads/workloadt +++ b/YCSB/workloads/workloadt @@ -2,8 +2,8 @@ # Workload T: For Debug -recordcount=5000000 -operationcount=2200000 +recordcount=80000000 +operationcount=80000000 workload=com.yahoo.ycsb.workloads.CoreWorkload readallfields=true diff --git a/YCSB/workloads/zipfian_0p.spec b/YCSB/workloads/zipfian_0p.spec new file mode 100644 index 000000000..5780ccf0f --- /dev/null +++ b/YCSB/workloads/zipfian_0p.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.50 +updateproportion=0.50 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/zipfian_105p.spec b/YCSB/workloads/zipfian_105p.spec new file mode 100644 index 000000000..1bca10d31 --- /dev/null +++ b/YCSB/workloads/zipfian_105p.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.50 +updateproportion=0.50 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=1.05 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/zipfian_110p.spec 
b/YCSB/workloads/zipfian_110p.spec new file mode 100644 index 000000000..d544be44c --- /dev/null +++ b/YCSB/workloads/zipfian_110p.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.50 +updateproportion=0.50 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=1.1 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/zipfian_70p.spec b/YCSB/workloads/zipfian_70p.spec new file mode 100644 index 000000000..3211f81c7 --- /dev/null +++ b/YCSB/workloads/zipfian_70p.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.50 +updateproportion=0.50 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.70 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/zipfian_90p.spec b/YCSB/workloads/zipfian_90p.spec new file mode 100644 index 000000000..f40478267 --- /dev/null +++ b/YCSB/workloads/zipfian_90p.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.50 +updateproportion=0.50 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.90 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/zipfian_95p.spec b/YCSB/workloads/zipfian_95p.spec new file mode 100644 index 000000000..b4292293d --- /dev/null +++ b/YCSB/workloads/zipfian_95p.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.50 +updateproportion=0.50 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.95 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/zipfian_98p.spec b/YCSB/workloads/zipfian_98p.spec new file mode 100644 index 000000000..a53b084dd --- /dev/null +++ b/YCSB/workloads/zipfian_98p.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.50 +updateproportion=0.50 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/db/art/art_metric.h b/db/art/art_metric.h new file mode 100644 index 000000000..a71d6a029 --- /dev/null +++ b/db/art/art_metric.h @@ -0,0 +1,133 @@ +// +// Created by Guo Teng. 
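+// The counters below track flush/compaction bytes written and 4KB blocks
+// read, so the WaLSM evaluation can report I/O volume from plain stdout
+// logs instead of an external profiler.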
+//

+#pragma once
+#include <cstdint>
+#include <string>
+#include <iostream>
+#include <algorithm>
+
+namespace ROCKSDB_NAMESPACE {
+
+// record evaluation metric for WaLSM paper
+constexpr bool EVALUATE_METRIC = true;
+
+// FlushMetric and SSDWriteMetric added together give the total Write Data Size
+
+// metric of WaLSM flush write evaluation
+// writes from the NVM MemTable to L0
+// struct NVMWriteMetric {
+//   uint64_t WriteSsdDataBytes = 0;
+//   uint64_t LastPrintedBytes = 0;

+//   std::string getMetric() {
+//     double WriteSSdDataGB = WriteSsdDataBytes * 1.0 / (1024 * 1024 * 1024);
+//     std::string s = "Write Metric analysis:";
+//     s = s + " From NVM - " + std::to_string(WriteSSdDataGB) + " GB";
+//     return s;
+//   }

+//   void printMetric() {
+//     if (WriteSsdDataBytes - LastPrintedBytes > 1024 * 1024 * 1024) {
+//       LastPrintedBytes = WriteSsdDataBytes;
+//       std::cout << getMetric() << std::endl;
+//     }
+//   }

+//   void updateMetric(uint64_t add_bytes) {
+//     if (EVALUATE_METRIC) {
+//       WriteSsdDataBytes += add_bytes;
+//       printMetric();
+//     }
+//   }
+// };
+
+// metric of WaLSM compaction write evaluation
+// writes from L0-L1, L1-L2, ... compactions
+struct SSDWriteMetric {
+  uint64_t WriteSsdDataBytes = 0;
+  uint64_t LastPrintedBytes = 0;
+
+  std::string getMetric() {
+    double WriteSSdDataGB = WriteSsdDataBytes * 1.0 / (1024 * 1024 * 1024);
+    std::string s = "Write Metric analysis:";
+    s = s + " From SSD - " + std::to_string(WriteSSdDataGB) + " GB";
+    return s;
+  }
+
+  void printMetric() {
+    if (WriteSsdDataBytes - LastPrintedBytes > 1024 * 1024 * 1024) {
+      LastPrintedBytes = WriteSsdDataBytes;
+      std::cout << getMetric() << std::endl;
+    }
+  }
+
+  void updateMetric(uint64_t add_bytes) {
+    if (EVALUATE_METRIC) {
+      WriteSsdDataBytes += add_bytes;
+      printMetric();
+    }
+  }
+};
+
+// metric of WaLSM flush evaluation
+// size of the KV data flushed from NVM to SSD
+struct FlushMetric {
+  uint64_t FlushSsdDataBytes = 0;
+  uint64_t LastPrintedBytes = 0;
+
+  std::string getMetric() {
+    double FlushSSdDataGB = FlushSsdDataBytes * 1.0 / (1024 * 1024 * 1024);
+    std::string s = "Flush Metric analysis:";
+    s = s + " - " + std::to_string(FlushSSdDataGB) + " GB";
+    return s;
+  }
+
+  void printMetric() {
+    if (FlushSsdDataBytes - LastPrintedBytes > 512 * 1024 * 1024) {
+      LastPrintedBytes = FlushSsdDataBytes;
+      std::cout << getMetric() << std::endl;
+    }
+  }
+
+  void updateMetric(uint64_t add_bytes) {
+    if (EVALUATE_METRIC) {
+      FlushSsdDataBytes += add_bytes;
+      printMetric();
+    }
+  }
+};
+
+// metric of WaLSM read evaluation
+// number of physical file blocks read; one block is 4KB
+struct ReadMetric {
+  uint64_t ReadSsdBlocksCnt = 0;
+  uint64_t LastPrintedCount = 0;
+
+  std::string getMetric() {
+    std::string s = "Read Metric analysis:";
+    s = s + " - " + std::to_string(ReadSsdBlocksCnt) + " Blocks";
+    return s;
+  }
+
+  void printMetric() {
+    if (ReadSsdBlocksCnt - LastPrintedCount > 1000 * 1000) {
+      LastPrintedCount = ReadSsdBlocksCnt;
+      std::cout << getMetric() << std::endl;
+    }
+  }
+
+  void updateMetric(uint64_t offset_start, uint64_t offset_end) {
+    uint64_t block_start = offset_start / 4096;
+    uint64_t block_end = offset_end / 4096;
+
+    if (EVALUATE_METRIC) {
+      // count the 4KB blocks spanned by [offset_start, offset_end); at least one
+      ReadSsdBlocksCnt += std::max(block_end - block_start, uint64_t(1));
+      printMetric();
+    }
+  }
+};
+
+
+} // namespace rocksdb
\ No newline at end of file
diff --git a/db/art/clf_model.cc b/db/art/clf_model.cc
index 2c8224979..333e00f1c 100644
--- a/db/art/clf_model.cc
+++ b/db/art/clf_model.cc
@@ -7,15 +7,9 @@
 #include
 #include
 #include
+#include "port/likely.h"

 namespace ROCKSDB_NAMESPACE {
-
-uint16_t ClfModel::feature_num_;
-std::string ClfModel::dataset_name_;
-std::string 
ClfModel::dataset_path_; -std::string ClfModel::host_, ClfModel::port_; -size_t ClfModel::buffer_size_; - void ClfModel::write_debug_dataset() { assert(feature_num_ > 0); // ready for writer @@ -125,13 +119,17 @@ void ClfModel::write_real_dataset(std::vector>& datas, std void ClfModel::write_dataset(std::vector>& datas, std::vector& tags, std::vector& get_cnts) { assert(feature_num_ > 0); - if (datas.empty()) { - write_debug_dataset(); - // dataset_cnt_ += 1; - return; - } + if (UNLIKELY(datas.empty())) return; + assert(datas.size() > 0); + // if (datas.empty()) { + // assert(false); // we have to write dataset + // write_debug_dataset(); + // // dataset_cnt_ += 1; + // return; + // } assert(feature_num_ % 2 != 0); // features num: 2r + 1 + assert(feature_num_ >= 3); write_real_dataset(datas, tags, get_cnts); // dataset_cnt_ += 1; @@ -139,9 +137,10 @@ void ClfModel::write_dataset(std::vector>& datas, std::vec } void ClfModel::make_train(std::vector>& datas, std::vector& tags, std::vector& get_cnts) { - assert(feature_num_ > 0); + assert(feature_num_ > 0); // model is ready write_dataset(datas, tags, get_cnts); + // // TODO: avoid python model training // already write dataset // send msg to LightGBM server, let server read dataset and train new model libsocket::inet_stream sock(host_, port_, LIBSOCKET_IPv4); @@ -153,6 +152,7 @@ void ClfModel::make_train(std::vector>& datas, std::vector sock << message; sock >> recv_buffer; // wait for training end // will destroy sock when leaving this func scope + std::cout << "[MODEL] model training end, message: " << recv_buffer << std::endl; } void ClfModel::make_predict_samples(std::vector>& datas) { @@ -194,7 +194,7 @@ void ClfModel::make_real_predict(std::vector>& datas, std: libsocket::inet_stream sock(host_, port_, LIBSOCKET_IPv4); std::string message, recv_buffer; for (std::vector& data : datas) { - if (!data.empty()) { + if (LIKELY(!data.empty())) { prepare_data(data); message.clear(); recv_buffer.clear(); @@ -220,10 +220,12 @@ void ClfModel::make_real_predict(std::vector>& datas, std: void ClfModel::make_predict(std::vector>& datas, std::vector& preds) { preds.clear(); + if (UNLIKELY(datas.empty())) return; + assert(datas.size() > 0); // datas empty means we are debuging class ClfModel - if (datas.empty()) { - make_predict_samples(datas); - } + // if (datas.empty()) { + // make_predict_samples(datas); + // } // only write pred result to vector preds, and return nothing make_real_predict(datas, preds); return; diff --git a/db/art/clf_model.h b/db/art/clf_model.h index def2bf820..ab9087e26 100644 --- a/db/art/clf_model.h +++ b/db/art/clf_model.h @@ -7,6 +7,7 @@ #include #include #include "macros.h" +#include "port/likely.h" // dataset data point format: // every data point accounts for one segment @@ -29,17 +30,23 @@ namespace ROCKSDB_NAMESPACE { struct RangeRatePair; +struct RangeHeatPair; class ClfModel; bool RangeRatePairLessorComparor(const RangeRatePair& pair_1, const RangeRatePair& pair_2); bool RangeRatePairGreatorComparor(const RangeRatePair& pair_1, const RangeRatePair& pair_2); +bool RangeHeatPairLessorComparor(const RangeHeatPair& pair_1, const RangeHeatPair& pair_2); +bool RangeHeatPairGreatorComparor(const RangeHeatPair& pair_1, const RangeHeatPair& pair_2); + struct RangeRatePair { uint32_t range_id; double rate_in_segment; - RangeRatePair(const uint32_t& id, const double& rate) { - range_id = id; rate_in_segment = rate; - } +}; + +struct RangeHeatPair { + double rate_in_segment; + double heat_value; }; inline bool 
RangeRatePairLessorComparor(const RangeRatePair& pair_1, const RangeRatePair& pair_2) { @@ -50,13 +57,21 @@ inline bool RangeRatePairGreatorComparor(const RangeRatePair& pair_1, const Rang return pair_1.rate_in_segment > pair_2.rate_in_segment; } +inline bool RangeHeatPairLessorComparor(const RangeHeatPair& pair_1, const RangeHeatPair& pair_2) { + return pair_1.heat_value < pair_2.heat_value; +} + +inline bool RangeHeatPairGreatorComparor(const RangeHeatPair& pair_1, const RangeHeatPair& pair_2) { + return pair_1.heat_value > pair_2.heat_value; +} + class ClfModel { private: - static uint16_t feature_num_; // model input features num - static std::string dataset_name_; // dataset csv file name - static std::string dataset_path_; // path to save dataset csv file - static std::string host_, port_; // lightgbm server connection - static size_t buffer_size_; // socket receive buffer max size + uint16_t feature_num_; // model input features num + std::string dataset_name_; // dataset csv file name + std::string dataset_path_; // path to save dataset csv file + std::string host_, port_; // lightgbm server connection + size_t buffer_size_; // socket receive buffer max size public: // init member vars ClfModel() { @@ -78,7 +93,7 @@ class ClfModel { // feature num = level feature num (1) + 2 * num of key ranges // we set features_num_ to largest feature num void make_ready(std::vector& features_nums) { - if (features_nums.empty()) { + if (UNLIKELY(features_nums.empty())) { feature_num_ = 41; // debug feature num, see ../lgb_server files } else { // we may limit feature_num_ because of the socket transmit size limit is 1024 bytes @@ -88,7 +103,7 @@ class ClfModel { feature_num_ = MAX_FEATURES_NUM; } } - + assert(feature_num_ == MAX_FEATURES_NUM); // std::cout << "[DEBUG] ClfModel ready, feature_num_: " << feature_num_ << std::endl; } diff --git a/db/art/compactor.h b/db/art/compactor.h index 76d884096..b066f9a77 100644 --- a/db/art/compactor.h +++ b/db/art/compactor.h @@ -11,6 +11,7 @@ #include #include #include +#include "table/block_based/filter_block.h" #include #include #include @@ -45,6 +46,8 @@ struct SingleCompactionJob { std::vector keys_in_node; autovector* compacted_indexes; + SegmentBuilderResult segment_builder_result; + void Reset() { candidates.clear(); candidates_removed.clear(); diff --git a/db/art/filter_cache.cc b/db/art/filter_cache.cc index 731f12ccb..871d24078 100644 --- a/db/art/filter_cache.cc +++ b/db/art/filter_cache.cc @@ -1,97 +1,140 @@ #include "filter_cache.h" #include +#include +#include +#include +#include +#include +#include +#include "table/block_based/parsed_full_filter_block.h" +#include "filter_cache_entry.h" +#include "port/likely.h" namespace ROCKSDB_NAMESPACE { -FilterCache FilterCacheManager::filter_cache_; -HeatBuckets FilterCacheManager::heat_buckets_; -ClfModel FilterCacheManager::clf_model_; -GreedyAlgo FilterCacheManager::greedy_algo_; -FilterCacheHeapManager FilterCacheManager::heap_manager_; -uint32_t FilterCacheManager::get_cnt_; -uint32_t FilterCacheManager::period_cnt_; -uint32_t FilterCacheManager::last_long_period_; -uint32_t FilterCacheManager::last_short_period_; -std::mutex FilterCacheManager::update_mutex_; -bool FilterCacheManager::train_signal_; -std::map FilterCacheManager::last_count_recorder_; -std::map FilterCacheManager::current_count_recorder_; -std::mutex FilterCacheManager::count_mutex_; -bool FilterCacheManager::is_ready_; - -bool FilterCache::check_key(const uint32_t& segment_id, const std::string& key) { +std::vector> 
FilterCache::get_filter_blocks(const uint32_t segment_id) { auto it = filter_cache_.find(segment_id); - if (it == filter_cache_.end()) { + if (UNLIKELY(it == filter_cache_.end())) { // not in cache, that means we havent insert segment FilterCacheItem info into cache // actually, we start inserting after every segment becomes available - return true; - } else { - return (it->second).check_key(key); + // we return a empty vector here + return {}; + } + + return it->second.get_filter_blocks(); +} + +void FilterCache::init_segment(uint32_t segment_id, const BlockBasedTable* table, const std::vector& block_handles) { + // filter_cache_[segment_id] = FilterCacheEntry(segment_id, table, this, block_handles); + + if (LIKELY(table != nullptr && block_handles.size() == MAX_UNITS_NUM)) { + filter_cache_.emplace(std::piecewise_construct, std::make_tuple(segment_id), std::make_tuple(segment_id, table, this, block_handles)); } } void FilterCache::enable_for_segments(std::unordered_map& segment_units_num_recorder, const bool& is_forced, - std::set& level_0_segment_ids, std::set& failed_segment_ids) { + std::set& new_level_0_segment_ids, std::set& failed_segment_ids) { failed_segment_ids.clear(); filter_cache_mutex_.lock(); + // uint32_t enable_non_l0_count = 0, enable_l0_count = 0, fail_count = 0; + // std::cout << "level 0 filter usage before enable: " << level_0_used_space_size_ << std::endl; + // std::cout << "non level 0 filter usage before enable: " << used_space_size_ << std::endl; for (auto it = segment_units_num_recorder.begin(); it != segment_units_num_recorder.end(); it ++) { const uint32_t segment_id = it->first; const uint16_t units_num = it->second; auto cache_it = filter_cache_.find(segment_id); - bool is_level_0 = level_0_segment_ids.count(segment_id); + bool is_level_0 = new_level_0_segment_ids.count(segment_id); if (cache_it != filter_cache_.end()) { // filter units cached + const uint32_t old_size = (cache_it->second).approximate_size(); + assert(old_size >= 0); // should not cache it before if (is_forced || is_level_0 || !is_full()) { - const uint32_t old_size = (cache_it->second).approximate_size(); (cache_it->second).enable_units(units_num); - used_space_size_ = used_space_size_ - old_size + (cache_it->second).approximate_size(); if (is_level_0) { level_0_used_space_size_ = level_0_used_space_size_ - old_size + (cache_it->second).approximate_size(); + // enable_l0_count++; + // std::cout << "enable " << int((cache_it->second).approximate_size()) - int(old_size) + // << " bits for l0 segment " << segment_id << ", units num: " << units_num << std::endl; + } + else { + used_space_size_ = used_space_size_ - old_size + (cache_it->second).approximate_size(); + // enable_non_l0_count++; + // std::cout << "enable " << int((cache_it->second).approximate_size()) - int(old_size) + // << " bits for non l0 segment " << segment_id << ", units num: " << units_num << std::endl; } } else { failed_segment_ids.insert(segment_id); + // fail_count++; + assert(new_level_0_segment_ids.count(segment_id) == 0); + // std::cout << "failed to enable filters for segment " << segment_id << std::endl; } } else { + // already call FIlterCache::init_segment, + // so new segment already inserted into filter cache, but no filter units cached + // filter units not cached // now cache it - if (is_forced || is_level_0 || !is_full()) { - FilterCacheItem cache_item(units_num); - filter_cache_.insert(std::make_pair(segment_id, cache_item)); - used_space_size_ = used_space_size_ + cache_item.approximate_size(); - if 
(is_level_0) { - level_0_used_space_size_ = level_0_used_space_size_ + cache_item.approximate_size(); - } - } else { - failed_segment_ids.insert(segment_id); - } + // if (is_forced || is_level_0 || !is_full()) { + // FilterCacheEntry cache_item(units_num); + // filter_cache_.insert(std::make_pair(segment_id, cache_item)); + // used_space_size_ = used_space_size_ + cache_item.approximate_size(); + // if (is_level_0) { + // level_0_used_space_size_ = level_0_used_space_size_ + cache_item.approximate_size(); + // } + // } else { + // failed_segment_ids.insert(segment_id); + // } + + // all segments to be enabled must have been inited + // std::cout << "filter handle not exist, segment id: " << segment_id << std::endl; + // assert(false); } } + // std::cout << "enable l0 count: " << enable_l0_count << ", enable non l0 count: " << enable_non_l0_count << ", fail count: " << fail_count << std::endl; + // std::cout << "level 0 filter usage after enable: " << level_0_used_space_size_ << std::endl; + // std::cout << "non level 0 filter usage after enable: " << used_space_size_ << std::endl; + // assert(enable_l0_count == new_level_0_segment_ids.size()); + // assert(enable_l0_count + enable_non_l0_count + fail_count == segment_units_num_recorder.size()); filter_cache_mutex_.unlock(); } -void FilterCache::update_for_segments(std::unordered_map& segment_units_num_recorder, const bool& is_forced, - std::set& level_0_segment_ids, std::set& failed_segment_ids) { +void FilterCache::update_for_segments(std::unordered_map& segment_units_num_recorder, + std::set& old_level_0_segment_ids, std::set& failed_segment_ids) { + assert(false); // only used in move_segment, but it is disallowed, so this func never used. + exit(0); + // because no new segments is generated, no reason to increase the usage of filter cache + bool is_forced = true; + failed_segment_ids.clear(); filter_cache_mutex_.lock(); for (auto it = segment_units_num_recorder.begin(); it != segment_units_num_recorder.end(); it ++) { const uint32_t segment_id = it->first; const uint16_t units_num = it->second; auto cache_it = filter_cache_.find(segment_id); - bool is_level_0 = level_0_segment_ids.count(segment_id); + bool is_level_0 = old_level_0_segment_ids.count(segment_id); if (cache_it != filter_cache_.end()) { + const uint32_t old_size = (cache_it->second).approximate_size(); // filter units cached if (is_forced || is_level_0 || !is_full()) { - const uint32_t old_size = (cache_it->second).approximate_size(); (cache_it->second).enable_units(units_num); - used_space_size_ = used_space_size_ - old_size + (cache_it->second).approximate_size(); if (is_level_0) { - level_0_used_space_size_ = level_0_used_space_size_ - old_size + (cache_it->second).approximate_size(); + assert(old_size > 0); // should already cache filter for level 0. + level_0_used_space_size_ -= old_size; + used_space_size_ += (cache_it->second).approximate_size(); + } else { + used_space_size_ = used_space_size_ - old_size + (cache_it->second).approximate_size(); } } else { + // never reach this statement, because is_forced is always true + assert(false); failed_segment_ids.insert(segment_id); } } else { // filter units not cached // do nothing!!! 
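+      // A miss here is harmless: this segment never had filter units
+      // cached, so there is no cache-space accounting to adjust.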
+ + // all segments to be enabled must have been inited + // std::cout << "error segment_id: " << segment_id << std::endl; + // assert(false); } } filter_cache_mutex_.unlock(); @@ -105,25 +148,38 @@ bool FilterCache::is_ready() { return double(used_space_size_) / double(cache_size_) >= READY_RATE; } -void FilterCache::release_for_segments(std::vector& segment_ids, std::set& level_0_segment_ids) { +void FilterCache::release_for_segments(std::vector& segment_ids, std::set& old_level_0_segment_ids) { std::sort(segment_ids.begin(), segment_ids.end()); // delete key-value pair in filter_cache_ filter_cache_mutex_.lock(); auto it = filter_cache_.begin(); size_t idx = 0; + // uint32_t release_non_l0_count = 0, release_l0_count = 0; + // std::cout << "level 0 filter usage before release: " << level_0_used_space_size_ << std::endl; + // std::cout << "non level 0 filter usage before release: " << used_space_size_ << std::endl; while (it != filter_cache_.end() && idx < segment_ids.size()) { if (it->first < segment_ids[idx]) { it ++; } else if (it->first > segment_ids[idx]) { idx ++; } else { - used_space_size_ = used_space_size_ - (it->second).approximate_size(); - if (level_0_segment_ids.count(it->first)) { + if (old_level_0_segment_ids.count(it->first)) { level_0_used_space_size_ = level_0_used_space_size_ - (it->second).approximate_size(); + // release_l0_count++; + // std::cout << "free " << (it->second).approximate_size() << " bits of level 0 segment " << it->first << std::endl; + } else { + used_space_size_ = used_space_size_ - (it->second).approximate_size(); + // release_non_l0_count++; + // std::cout << "free " << (it->second).approximate_size() << " bits of non level 0 segment " << it->first << std::endl; } - it = filter_cache_.erase(it); + it = filter_cache_.erase(it); idx++; } } + // assert(release_non_l0_count + release_l0_count == segment_ids.size()); + // assert(release_l0_count == old_level_0_segment_ids.size()); + // std::cout << "release l0 count: " << release_l0_count << ", release non l0 count: " << release_non_l0_count << std::endl; + // std::cout << "level 0 filter usage after release: " << level_0_used_space_size_ << std::endl; + // std::cout << "non level 0 filter usage after release: " << used_space_size_ << std::endl; filter_cache_mutex_.unlock(); } @@ -143,39 +199,58 @@ bool FilterCacheManager::make_heat_buckets_ready(const std::string& key, } void FilterCacheManager::hit_heat_buckets(const std::string& key) { - if (heat_buckets_.is_ready()) { + bool signal = false; + if (LIKELY(heat_buckets_.is_ready())) { get_cnt_ += 1; - if (get_cnt_ >= PERIOD_COUNT) { - heat_buckets_.hit(key, true); + heat_buckets_.hit(key, signal); // if one period end, return true signal + if (signal) { + period_mutex_.WriteLock(); get_cnt_ = 0; period_cnt_ += 1; - } else { - heat_buckets_.hit(key, false); + // std::cout << "get cnt updated, current period cnt: " << period_cnt_ << std::endl; + period_mutex_.WriteUnlock(); } } - if (period_cnt_ - last_long_period_ >= TRAIN_PERIODS) { - update_mutex_.lock(); +} - if (period_cnt_ - last_long_period_ >= TRAIN_PERIODS) { - last_long_period_ = period_cnt_; - update_count_recorder(); - train_signal_ = true; - } +void FilterCacheManager::do_periods_work() { + bool need_retrain = false; - update_mutex_.unlock(); + // called by a background thread, never need to lock + // update_mutex_.lock(); + if (period_cnt_ - last_long_period_ >= TRAIN_PERIODS) { + // std::cout << "period_cnt_: " << period_cnt_ << std::endl; + // std::cout << "last_long_period_: " << 
last_long_period_ << std::endl; + last_long_period_ = period_cnt_; + update_count_recorder(); + // debug_count_recorder(); + std::map recent_count_recorder; + std::vector empty_needed_segment_ids; + estimate_recent_counts(recent_count_recorder, empty_needed_segment_ids); + assert(recent_count_recorder.size() > 0); + // std::cout << "long period end, sync visit cnt." << std::endl; + heap_manager_.sync_visit_cnt(recent_count_recorder); + train_signal_ = true; + need_retrain = true; } if (period_cnt_ - last_short_period_ >= 1) { - update_mutex_.lock(); - - if (period_cnt_ - last_short_period_ >= 1) { - last_short_period_ = period_cnt_; - std::map estimate_count_recorder; - estimate_counts_for_all(estimate_count_recorder); - heap_manager_.sync_visit_cnt(estimate_count_recorder); + last_short_period_ = period_cnt_; + // if already updated, do not update again + if (!need_retrain) { + // std::cout << "period_cnt_: " << period_cnt_ << std::endl; + // std::cout << "last_short_period_: " << last_short_period_ << std::endl; + // debug_count_recorder(); + std::map recent_count_recorder; + std::vector empty_needed_segment_ids; + estimate_recent_counts(recent_count_recorder, empty_needed_segment_ids); + assert(recent_count_recorder.size() > 0); + // std::cout << "short period end, sync visit cnt." << std::endl; + heap_manager_.sync_visit_cnt(recent_count_recorder); } - - update_mutex_.unlock(); } + + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + // update_mutex_.unlock(); } bool FilterCacheManager::make_clf_model_ready(std::vector& features_nums) { @@ -183,126 +258,281 @@ bool FilterCacheManager::make_clf_model_ready(std::vector& features_nu return clf_model_.is_ready(); } -bool FilterCacheManager::check_key(const uint32_t& segment_id, const std::string& key) { +std::vector> FilterCacheManager::get_filter_blocks(uint32_t segment_id) { // move hit_count_recorder to a background thread // hit_count_recorder(segment_id); // one get opt will cause query to many segments. 
// so one get opt only call one hit_heat_buckets, but call many hit_count_recorder - return filter_cache_.check_key(segment_id, key); + return filter_cache_.get_filter_blocks(segment_id); +} + +void FilterCacheManager::init_segment(uint32_t segment_id, const BlockBasedTable* table, const std::vector& block_handles) { + filter_cache_.init_segment(segment_id, table, block_handles); } -void FilterCacheManager::hit_count_recorder(const uint32_t& segment_id) { - count_mutex_.lock(); +void FilterCacheManager::hit_count_recorder(uint32_t segment_id) { + count_mutex_.ReadLock(); auto it = current_count_recorder_.find(segment_id); if (it == current_count_recorder_.end()) { // segment havent been visited, need to insert count - current_count_recorder_.insert(std::make_pair(segment_id, 1)); + // current_count_recorder_.insert(std::make_pair(segment_id, 1)); + // do nothing, wait for insertion } else { // segment have been visited, only update count it->second = it->second + 1; } - count_mutex_.unlock(); + count_mutex_.ReadUnlock(); } void FilterCacheManager::update_count_recorder() { - count_mutex_.lock(); + count_mutex_.WriteLock(); last_count_recorder_.clear(); - last_count_recorder_.insert(current_count_recorder_.begin(), current_count_recorder_.end()); + // last_count_recorder_.insert(current_count_recorder_.begin(), current_count_recorder_.end()); + std::copy(current_count_recorder_.begin(), current_count_recorder_.end(), + std::inserter(last_count_recorder_, last_count_recorder_.begin())); + assert(last_count_recorder_.size() == current_count_recorder_.size()); for (auto it = current_count_recorder_.begin(); it != current_count_recorder_.end(); it++) { it->second = 0; } - count_mutex_.unlock(); + count_mutex_.WriteUnlock(); +} + +void FilterCacheManager::debug_count_recorder() { + uint32_t get_cnt = 0; + + count_mutex_.ReadLock(); + + std::cout << "last_count_recorder: " << std::endl; + for (auto it = last_count_recorder_.begin(); it != last_count_recorder_.end(); it++) { + std::cout << it->first << ": " << it->second << std::endl; + } + std::cout << "current_count_recorder: " << std::endl; + for (auto it = current_count_recorder_.begin(); it != current_count_recorder_.end(); it++) { + get_cnt += it->second; + std::cout << it->first << ": " << it->second << std::endl; + } + std::cout << "get_cnt: " << get_cnt << std::endl; + + count_mutex_.ReadUnlock(); } void FilterCacheManager::inherit_count_recorder(std::vector& merged_segment_ids, std::vector& new_segment_ids, const uint32_t& level_0_base_count, std::map>& inherit_infos_recorder) { - count_mutex_.lock(); + count_mutex_.WriteLock(); + // copy last count and current count of merged segments std::map merged_last_count_recorder, merged_current_count_recorder; // cache merged segment count temporarily + // std::cout << std::endl << std::endl; + // std::cout << "merged segments id: "; for (uint32_t& merged_segment_id : merged_segment_ids) { merged_last_count_recorder.insert(std::make_pair(merged_segment_id, last_count_recorder_[merged_segment_id])); last_count_recorder_.erase(merged_segment_id); merged_current_count_recorder.insert(std::make_pair(merged_segment_id, current_count_recorder_[merged_segment_id])); current_count_recorder_.erase(merged_segment_id); + + // std::cout << merged_segment_id << " "; + assert(last_count_recorder_.find(merged_segment_id) == last_count_recorder_.end()); + assert(current_count_recorder_.find(merged_segment_id) == current_count_recorder_.end()); } + // std::cout << std::endl; + // std::cout << "merged segments 
size: " << merged_segment_ids.size() << std::endl; + // std::cout << "new segments size: " << new_segment_ids.size() << std::endl; + // std::cout << "inherit_infos_recorder size: " << inherit_infos_recorder.size() << std::endl; + + // init last count and current count of new segments based on inherit method (that not on Level 0) std::map new_last_count_recorder, new_current_count_recorder; for (auto infos_it = inherit_infos_recorder.begin(); infos_it != inherit_infos_recorder.end(); infos_it ++) { double last_count = 0, current_count = 0; + double weight_sum = 0; + // std::cout << "child segment: " << infos_it->first << std::endl; std::unordered_map& info = infos_it->second; for (auto info_it = info.begin(); info_it != info.end(); info_it ++) { - last_count = last_count + INHERIT_REMAIN_FACTOR * (merged_last_count_recorder[info_it->first] * info_it->second); - current_count = current_count + INHERIT_REMAIN_FACTOR * (merged_current_count_recorder[info_it->first] * info_it->second); + last_count += INHERIT_REMAIN_FACTOR * (merged_last_count_recorder[info_it->first] * info_it->second); + current_count += INHERIT_REMAIN_FACTOR * (merged_current_count_recorder[info_it->first] * info_it->second); + weight_sum += info_it->second; + // std::cout << "parent segment: " << info_it->first << " weight: " << info_it->second + // << " last count: " << merged_last_count_recorder[info_it->first] + // << " current count: " << merged_current_count_recorder[info_it->first] << std::endl; + assert(merged_last_count_recorder.find(info_it->first) != merged_last_count_recorder.end()); + assert(merged_current_count_recorder.find(info_it->first) != merged_current_count_recorder.end()); } + + // std::cout << "temp last count: " << uint32_t(last_count) << " temp currrent count: " << uint32_t(current_count) << std::endl; + // assert(weight_sum > 0.90); + // weight sum should be 1.0, we multiple the inherited count by (1.0 / weight_sum) + // assert(weight_sum > 0.98 && weight_sum < 1.02); // weight_sum approximately equals to 1.0 + last_count *= (1.0 / weight_sum); current_count *= (1.0 / weight_sum); // actually weight_sum always equals to 1.0 + // std::cout << "weight sum: " << weight_sum << " final last count: " << uint32_t(last_count) << " final currrent count: " << uint32_t(current_count) << std::endl; new_last_count_recorder.insert(std::make_pair(infos_it->first, uint32_t(last_count))); new_current_count_recorder.insert(std::make_pair(infos_it->first, uint32_t(current_count))); } + assert(inherit_infos_recorder.size() == new_last_count_recorder.size()); + assert(inherit_infos_recorder.size() == new_current_count_recorder.size()); + assert(inherit_infos_recorder.size() <= new_segment_ids.size()); + + // uint32_t last_insert_num = 0, current_insert_num = 0; + // uint32_t last_update_num = 0, current_update_num = 0; + // uint32_t last_check_num = 0, current_check_num = 0; + + // insert last count and current count of new segments for (uint32_t& new_segment_id : new_segment_ids) { + // insert last count auto last_it = last_count_recorder_.find(new_segment_id); uint32_t new_last_count = level_0_base_count; // level 0 segments init + // if true, this means new segment not on level 0, also means this segments are inherited from some segments if (new_last_count_recorder.count(new_segment_id) > 0) { new_last_count = new_last_count_recorder[new_segment_id]; + // last_check_num ++; } if (last_it != last_count_recorder_.end()) { last_it->second = last_it->second + new_last_count; + // last_update_num ++; } else { 
-void FilterCacheManager::estimate_counts_for_all(std::map& approximate_counts_recorder) {
+void FilterCacheManager::estimate_recent_counts(std::map& approximate_counts_recorder, const std::vector& needed_segment_ids) {
     const uint32_t long_period_total_count = TRAIN_PERIODS * PERIOD_COUNT;
     uint32_t current_long_period_count = PERIOD_COUNT * (period_cnt_ % TRAIN_PERIODS) + get_cnt_;
     double current_long_period_rate = std::min(double(current_long_period_count) / double(long_period_total_count), 1.0);
-    approximate_counts_recorder.clear();
-    approximate_counts_recorder.insert(current_count_recorder_.begin(), current_count_recorder_.end());
-    auto approx_it = approximate_counts_recorder.begin();
-    auto last_it = last_count_recorder_.begin();
-    while (approx_it != approximate_counts_recorder.end() && last_it != last_count_recorder_.end()) {
-        if (approx_it->first > last_it->first) {
-            last_it ++;
-        } else if(approx_it->first < last_it->first) {
-            approx_it ++;
-        } else {
-            approx_it->second = approx_it->second + uint32_t((1 - current_long_period_rate) * last_it->second);
+    if (needed_segment_ids.empty()) {
+        count_mutex_.ReadLock();
+        approximate_counts_recorder.clear();
+        // approximate_counts_recorder.insert(current_count_recorder_.begin(), current_count_recorder_.end());
+        std::copy(current_count_recorder_.begin(), current_count_recorder_.end(),
+                  std::inserter(approximate_counts_recorder, approximate_counts_recorder.begin()));
+        assert(approximate_counts_recorder.size() == current_count_recorder_.size());
+        auto approx_it = approximate_counts_recorder.begin();
+        auto last_it = last_count_recorder_.begin();
+        // std::cout << "estimate all segments' recent frequency." << std::endl;
+        while (approx_it != approximate_counts_recorder.end() && last_it != last_count_recorder_.end()) {
+            if (approx_it->first > last_it->first) {
+                last_it ++;
+            } else if(approx_it->first < last_it->first) {
+                approx_it ++;
+            } else {
+                uint32_t recent_result = approx_it->second + uint32_t((1 - current_long_period_rate) * last_it->second);
+                // if (current_long_period_rate > 0) {
+                //     std::cout << "current rate: " << current_long_period_rate
+                //               << ", current count: " << approx_it->second
+                //               << ", last count: " << last_it->second
+                //               << ", final recent count: " << recent_result << std::endl;
+                // }
+                approx_it->second = recent_result;
+                assert(approximate_counts_recorder[approx_it->first] == recent_result);
+                if (uint32_t((1 - current_long_period_rate) * last_it->second) > 0)
+                    assert(current_count_recorder_[approx_it->first] != recent_result);
+                approx_it ++;
+                last_it ++;
+            }
+        }
+        count_mutex_.ReadUnlock();
+    } else {
+        count_mutex_.ReadLock();
+        approximate_counts_recorder.clear();
+        for (uint32_t segment_id : needed_segment_ids) {
+            approximate_counts_recorder.insert(std::make_pair(segment_id, current_count_recorder_[segment_id]));
+        }
+        assert(approximate_counts_recorder.size() == needed_segment_ids.size());
+        auto approx_it = approximate_counts_recorder.begin();
+        // std::cout << "estimate some segments' recent frequency." << std::endl;
+        while (approx_it != approximate_counts_recorder.end()) {
+            uint32_t recent_result = approx_it->second +
+                uint32_t((1 - current_long_period_rate) * last_count_recorder_[approx_it->first]);
+            // if (current_long_period_rate > 0) {
+            //     std::cout << "current rate: " << current_long_period_rate
+            //               << ", current count: " << approx_it->second
+            //               << ", last count: " << last_count_recorder_[approx_it->first]
+            //               << ", final recent count: " << recent_result << std::endl;
+            // }
+            approx_it->second = recent_result;
+            assert(approximate_counts_recorder[approx_it->first] == recent_result);
+            if (uint32_t((1 - current_long_period_rate) * last_count_recorder_[approx_it->first]) > 0)
+                assert(current_count_recorder_[approx_it->first] != recent_result);
             approx_it ++;
         }
+        count_mutex_.ReadUnlock();
     }
-    // return nothing, already write result to approximate_counts_recorder
 }
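estimate_recent_counts() blends the two counters by how far the current long period has progressed: early in a period the last period's counts dominate, and their weight decays linearly to zero as the period completes. A worked example under assumed constants (TRAIN_PERIODS and PERIOD_COUNT values below are illustrative):

    #include <cstdint>

    // rate   = min((PERIOD_COUNT * (period_cnt_ % TRAIN_PERIODS) + get_cnt_)
    //              / double(TRAIN_PERIODS * PERIOD_COUNT), 1.0)
    // recent = current + uint32_t((1 - rate) * last)
    // Assuming TRAIN_PERIODS = 4 and PERIOD_COUNT = 10000: halfway through the
    // long period rate = 0.5, so a segment with current = 300 and last = 1000
    // is estimated at 300 + uint32_t(0.5 * 1000) = 800 recent gets.
    uint32_t recent_estimate(uint32_t current, uint32_t last, double rate) {
        return current + uint32_t((1.0 - rate) * last);
    }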
-void FilterCacheManager::try_retrain_model(std::map& level_recorder,
+bool FilterCacheManager::try_retrain_model(std::map& level_recorder,
                                            std::map>& segment_ranges_recorder,
                                            std::map& unit_size_recorder) {
     // we should guarantee these 3 external recorder share the same keys set
     // we need to do this job outside FilterCacheManager
     assert(level_recorder.size() == segment_ranges_recorder.size());
     // assert(level_recorder.size() == unit_size_recorder.size());
+    // should not train when loading, train_signal_ only true when starting YCSB run.
     if (train_signal_ == false) {
-        return;
+        return false;
     }
+    // auto level_it_0 = level_recorder.begin();
+    // while (level_it_0 != level_recorder.end()) {
+    //     if (last_count_recorder_.find(level_it_0->first) == last_count_recorder_.end()) continue;
+    //     uint32_t cnt = last_count_recorder_[level_it_0->first];
+    //     std::cout << level_it_0->first << " : " << cnt << ", level: " << level_it_0->second << std::endl;
+    //     level_it_0++;
+    // }
+
+    // recheck whether each segment includes at least one key range.
+ // auto ranges_it = segment_ranges_recorder.begin(); + // while(ranges_it != segment_ranges_recorder.end()) + // { + // // std::cout << "segment " << ranges_it->first + // // << " ranges num : " << (ranges_it->second).size() << std::endl; + // assert((ranges_it->second).size() > 0); + + // double rate_sum = 0; + // for (RangeRatePair& pair : ranges_it->second) { + // rate_sum += pair.rate_in_segment; + // } + // assert(rate_sum <= 1.02 && rate_sum >= 0.98); + + // ranges_it++; + // } + // solve programming problem std::map label_recorder; std::map algo_infos; @@ -322,24 +552,64 @@ void FilterCacheManager::try_retrain_model(std::map& level_r greedy_algo_.solve(algo_infos, label_recorder, filter_cache_.cache_size_except_level_0()); */ assert(unit_size_recorder.size() == 0); - auto get_cnt_it = last_count_recorder_.begin(); - while (get_cnt_it != last_count_recorder_.end()) { + + std::map last_count_recorder_copy; + count_mutex_.ReadLock(); + last_count_recorder_copy = last_count_recorder_; + count_mutex_.ReadUnlock(); + + auto get_cnt_it = last_count_recorder_copy.begin(); + while (get_cnt_it != last_count_recorder_copy.end()) { // unit_size_recorder always empty, so we only use DEFAULT_UNIT_SIZE - algo_infos.insert(std::make_pair(get_cnt_it->first, SegmentAlgoInfo(get_cnt_it->second, DEFAULT_UNIT_SIZE))); + // exclude level 0 segments + if (level_recorder[get_cnt_it->first] > 0) { + algo_infos.insert(std::make_pair(get_cnt_it->first, SegmentAlgoInfo(get_cnt_it->second, DEFAULT_UNIT_SIZE))); + } get_cnt_it ++; } + assert(algo_infos.size() > 0); + if (UNLIKELY(algo_infos.empty())) return false; + std::cout << "[ALGO] algo_infos size: " << algo_infos.size() << std::endl; greedy_algo_.solve(algo_infos, label_recorder, filter_cache_.cache_size_except_level_0()); + std::cout << "[ALGO] stage 1: recorder size (exclude level 0): " << label_recorder.size() << std::endl; + assert(algo_infos.size() == label_recorder.size()); + // // need to verify solutions + // greedy_algo_.verify(algo_infos, label_recorder, filter_cache_.cache_size_except_level_0() / 256); + + // assert(level_recorder.size() == segment_ranges_recorder.size()); + // should make these two recorders share the same segment ids + auto level_it_1 = level_recorder.begin(); + auto range_it_1 = segment_ranges_recorder.begin(); + while (level_it_1 != level_recorder.end() + && range_it_1 != segment_ranges_recorder.end()) + { + if (level_it_1->first < range_it_1->first) { + level_it_1 = level_recorder.erase(level_it_1); + } else if (level_it_1->first > range_it_1->first) { + range_it_1 = segment_ranges_recorder.erase(range_it_1); + } else { + level_it_1++; range_it_1++; + } + } + while (level_it_1 != level_recorder.end()) { + level_it_1 = level_recorder.erase(level_it_1); + } + while (range_it_1 != segment_ranges_recorder.end()) { + range_it_1 = segment_ranges_recorder.erase(range_it_1); + } + assert(level_recorder.size() == segment_ranges_recorder.size()); - // programming problem may include some merged segments, we need to ignore them + // level_recorder and segment_ranges_recorder may include some merged segments, we need to ignore them auto old_level_it = level_recorder.begin(); auto old_range_it = segment_ranges_recorder.begin(); auto old_label_it = label_recorder.begin(); while (old_level_it != level_recorder.end() && old_range_it != segment_ranges_recorder.end() && old_label_it != label_recorder.end()) { + // std::cout << "debug : " << old_level_it->first << " : " << old_range_it->first << std::endl; assert(old_level_it->first == 
old_range_it->first); if (old_level_it->first < old_label_it->first) { - old_level_it = level_recorder.erase(old_label_it); + old_level_it = level_recorder.erase(old_level_it); old_range_it = segment_ranges_recorder.erase(old_range_it); } else if (old_level_it->first > old_label_it->first) { old_label_it = label_recorder.erase(old_label_it); @@ -349,27 +619,43 @@ void FilterCacheManager::try_retrain_model(std::map& level_r old_label_it ++; } } + // if some different elements remain in recorder's tail, we need to erase them while (old_level_it != level_recorder.end() && old_range_it != segment_ranges_recorder.end()) { assert(old_level_it->first == old_range_it->first); - old_level_it = level_recorder.erase(old_label_it); + old_level_it = level_recorder.erase(old_level_it); old_range_it = segment_ranges_recorder.erase(old_range_it); } while (old_label_it != label_recorder.end()) { old_label_it = label_recorder.erase(old_label_it); } + // recheck whether these 3 recorder have same size + assert(level_recorder.size() == segment_ranges_recorder.size()); + assert(level_recorder.size() == label_recorder.size()); + // auto check_level_it_2 = level_recorder.begin(); + // auto check_label_it_2 = label_recorder.begin(); + // while (check_level_it_2 != level_recorder.end() + // && check_label_it_2 != label_recorder.end()) + // { + // assert(check_level_it_2->first == check_label_it_2->first); + // check_level_it_2++; check_label_it_2++; + // } + std::cout << "[ALGO] stage 2: recorder size (exclude level 0): " << label_recorder.size() << std::endl; + std::vector buckets = heat_buckets_.buckets(); std::vector> datas; std::vector labels; std::vector get_cnts; - auto level_it = level_recorder.begin(); // key range id start with 0 + std::cout << "[ALGO] stage 3: current count recorder size (include level 0): " << last_count_recorder_copy.size() << std::endl; + // remember key range id starts with 0 + auto level_it = level_recorder.begin(); auto range_it = segment_ranges_recorder.begin(); - auto count_it = last_count_recorder_.begin(); + auto count_it = last_count_recorder_copy.begin(); auto label_it = label_recorder.begin(); while (level_it != level_recorder.end() && range_it != segment_ranges_recorder.end() && - count_it != last_count_recorder_.end() && label_it != label_recorder.end()) { + count_it != last_count_recorder_copy.end() && label_it != label_recorder.end()) { assert(level_it->first == range_it->first); assert(level_it->first == label_it->first); if (count_it->first < level_it->first) { @@ -379,19 +665,42 @@ void FilterCacheManager::try_retrain_model(std::map& level_r range_it ++; label_it ++; } else { - if (level_it->second > 0) { + // only train with non level 0 data + if (LIKELY(level_it->second > 0)) { // add data row std::vector data; - std::sort((range_it->second).begin(), (range_it->second).end(), RangeRatePairGreatorComparor); - data.emplace_back(level_it->second); + std::vector heat_pairs; + // double rate_sum = 0; for (RangeRatePair& pair : range_it->second) { + // rate_sum += pair.rate_in_segment; + + RangeHeatPair heat_pair; assert(pair.range_id >= 0 && pair.range_id < buckets.size()); - data.emplace_back(uint32_t(RATE_SIGNIFICANT_DIGITS_FACTOR * pair.rate_in_segment)); - data.emplace_back(uint32_t(HOTNESS_SIGNIFICANT_DIGITS_FACTOR * buckets[pair.range_id].hotness_)); + heat_pair.rate_in_segment = pair.rate_in_segment; + heat_pair.heat_value = buckets[pair.range_id].hotness_; + heat_pairs.emplace_back(heat_pair); } + // assert(rate_sum >= 0.98 && rate_sum <= 1.02); + 
assert(heat_pairs.size() == (range_it->second).size()); + + std::sort(heat_pairs.begin(), heat_pairs.end(), RangeHeatPairGreatorComparor); + for (size_t i = 0; i < heat_pairs.size() - 1; i ++) { + assert(heat_pairs[i].heat_value >= heat_pairs[i+1].heat_value); + } + + data.emplace_back(level_it->second); + for (RangeHeatPair& heat_pair : heat_pairs) { + data.emplace_back(uint32_t(RATE_SIGNIFICANT_DIGITS_FACTOR * heat_pair.rate_in_segment)); + data.emplace_back(uint32_t(HOTNESS_SIGNIFICANT_DIGITS_FACTOR * heat_pair.heat_value)); + } + // std::cout << "[DEBUG] segment " << level_it->first << " data features num: " << data.size() << std::endl; + assert(data.size() >= 3 && data.size() % 2 == 1); + assert((range_it->second).size() * 2 + 1 == data.size()); + assert(data[0] > 0); datas.emplace_back(data); // add label row labels.emplace_back(label_it->second); + assert(label_it->second <= MAX_UNITS_NUM); // add get cnt row get_cnts.emplace_back(count_it->second); } @@ -405,15 +714,17 @@ void FilterCacheManager::try_retrain_model(std::map& level_r // check three vectors have same length assert(datas.size() == labels.size()); assert(get_cnts.size() == labels.size()); + std::cout << "[ALGO] stage 3: training labels size (exclude level 0): " << labels.size() << std::endl; clf_model_.make_train(datas, labels, get_cnts); train_signal_ = false; + + return true; } void FilterCacheManager::update_cache_and_heap(std::map& level_recorder, std::map>& segment_ranges_recorder) { - assert(level_recorder.size() == segment_ranges_recorder.size()); std::vector segment_ids; std::vector> datas; std::vector preds; @@ -421,9 +732,28 @@ void FilterCacheManager::update_cache_and_heap(std::map& lev std::map current_units_num_limit_recorder; std::vector buckets = heat_buckets_.buckets(); + // check whether level 0 segments exist? + // assert(level_recorder.size() == segment_ranges_recorder.size()); + // auto level_it_1 = level_recorder.begin(); + // auto range_it_1 = segment_ranges_recorder.begin(); + // while (level_it_1 != level_recorder.end() + // && range_it_1 != segment_ranges_recorder.end()) { + // assert(level_it_1->first == range_it_1->first); + // assert(level_it_1->second > 0); + // level_it_1++; + // range_it_1++; + // } + // check whether level 0 segments exist? 
+ // level_it_1 = level_recorder.begin(); + // while (level_it_1 != level_recorder.end()) { + // assert(level_it_1->second > 0); + // level_it_1++; + // } + // build data rows into datas auto level_it = level_recorder.begin(); auto range_it = segment_ranges_recorder.begin(); + assert(level_recorder.size() == segment_ranges_recorder.size()); while (level_it != level_recorder.end() && range_it != segment_ranges_recorder.end()) { if (level_it->first < range_it->first) { level_it ++; @@ -431,17 +761,39 @@ void FilterCacheManager::update_cache_and_heap(std::map& lev range_it ++; } else { assert(level_it->first == range_it->first); - - if (level_it->second > 0) { + assert(level_it->second > 0); + if (LIKELY(level_it->second > 0)) { + segment_ids.emplace_back(level_it->first); // add data row std::vector data; - std::sort((range_it->second).begin(), (range_it->second).end(), RangeRatePairGreatorComparor); - data.emplace_back(level_it->second); + std::vector heat_pairs; + // double rate_sum = 0; for (RangeRatePair& pair : range_it->second) { + // rate_sum += pair.rate_in_segment; + + RangeHeatPair heat_pair; assert(pair.range_id >= 0 && pair.range_id < buckets.size()); - data.emplace_back(uint32_t(RATE_SIGNIFICANT_DIGITS_FACTOR * pair.rate_in_segment)); - data.emplace_back(uint32_t(HOTNESS_SIGNIFICANT_DIGITS_FACTOR * buckets[pair.range_id].hotness_)); + heat_pair.rate_in_segment = pair.rate_in_segment; + heat_pair.heat_value = buckets[pair.range_id].hotness_; + heat_pairs.emplace_back(heat_pair); + } + // assert(rate_sum >= 0.98 && rate_sum <= 1.02); + assert(heat_pairs.size() == (range_it->second).size()); + + std::sort(heat_pairs.begin(), heat_pairs.end(), RangeHeatPairGreatorComparor); + for (size_t i = 0; i < heat_pairs.size() - 1; i ++) { + assert(heat_pairs[i].heat_value >= heat_pairs[i+1].heat_value); + } + + data.emplace_back(level_it->second); + for (RangeHeatPair& heat_pair : heat_pairs) { + data.emplace_back(uint32_t(RATE_SIGNIFICANT_DIGITS_FACTOR * heat_pair.rate_in_segment)); + data.emplace_back(uint32_t(HOTNESS_SIGNIFICANT_DIGITS_FACTOR * heat_pair.heat_value)); } + // std::cout << "[DEBUG] segment " << level_it->first << " data features num: " << data.size() << std::endl; + assert(data.size() >= 3 && data.size() % 2 == 1); + assert((range_it->second).size() * 2 + 1 == data.size()); + assert(data[0] > 0); datas.emplace_back(data); } @@ -454,11 +806,15 @@ void FilterCacheManager::update_cache_and_heap(std::map& lev clf_model_.make_predict(datas, preds); assert(segment_ids.size() == preds.size()); size_t idx = 0; + // std::cout << std::endl << "sync units num limit" << std::endl; while (idx < segment_ids.size() && idx < preds.size()) { segment_units_num_recorder.insert(std::make_pair(segment_ids[idx], preds[idx])); current_units_num_limit_recorder.insert(std::make_pair(segment_ids[idx], preds[idx])); + // std::cout << "segment id: " << segment_ids[idx] << ", units limit: " << preds[idx] << std::endl; idx = idx + 1; } + assert(segment_units_num_recorder.size() == current_units_num_limit_recorder.size()); + assert(segment_ids.size() == segment_units_num_recorder.size()); // update filter cache helper heaps heap_manager_.sync_units_num_limit(current_units_num_limit_recorder); @@ -466,14 +822,9 @@ void FilterCacheManager::update_cache_and_heap(std::map& lev // update filter cache std::set empty_level_0_segment_ids; // no level 0 segment in heaps and model data, dont worry std::set empty_failed_segment_ids; - filter_cache_.update_for_segments(segment_units_num_recorder, true, 
empty_level_0_segment_ids, empty_failed_segment_ids); -} + filter_cache_.enable_for_segments(segment_units_num_recorder, true, empty_level_0_segment_ids, empty_failed_segment_ids); + assert(empty_failed_segment_ids.empty()); -void FilterCacheManager::remove_segments(std::vector& segment_ids, std::set& level_0_segment_ids) { - // update filter cache helper heaps - heap_manager_.batch_delete(segment_ids); - // update filter cache map - filter_cache_.release_for_segments(segment_ids, level_0_segment_ids); } bool FilterCacheManager::adjust_cache_and_heap() { @@ -500,14 +851,16 @@ bool FilterCacheManager::adjust_cache_and_heap() { std::set empty_failed_segment_ids; // force to update segments' filter units group, so dont worry for cache space segment_units_num_recorder.insert(std::make_pair(result.enable_segment_id, result.enable_segment_next_units_num)); segment_units_num_recorder.insert(std::make_pair(result.disable_segment_id, result.disable_segment_next_units_num)); - filter_cache_.update_for_segments(segment_units_num_recorder, true, empty_level_0_segment_ids, empty_failed_segment_ids); - } + filter_cache_.enable_for_segments(segment_units_num_recorder, true, empty_level_0_segment_ids, empty_failed_segment_ids); + assert(empty_failed_segment_ids.empty()); + } + // std::this_thread::sleep_for(std::chrono::milliseconds(10)); return can_adjust; } void FilterCacheManager::insert_segments(std::vector& merged_segment_ids, std::vector& new_segment_ids, std::map>& inherit_infos_recorder, - std::map& level_recorder, const uint32_t& level_0_base_count, + std::map& new_level_recorder, const uint32_t& level_0_base_count, std::map>& segment_ranges_recorder) { std::unordered_map segment_units_num_recorder; std::map approximate_counts_recorder; @@ -518,50 +871,111 @@ void FilterCacheManager::insert_segments(std::vector& merged_segment_i std::sort(merged_segment_ids.begin(), merged_segment_ids.end()); std::sort(new_segment_ids.begin(), new_segment_ids.end()); - // pick up merged or new level 0 segments - // assume level_recorder keys set equals to merged_segment_ids + new_segment_ids - assert(new_segment_ids.size() == 0 || merged_segment_ids.size() + new_segment_ids.size() == level_recorder.size()); - auto level_it = level_recorder.begin(); - size_t merged_idx = 0, new_idx = 0; - while (level_it != level_recorder.end()) { - if (merged_idx < merged_segment_ids.size() && level_it->first == merged_segment_ids[merged_idx]) { - if (level_it->second == 0) { - old_level_0_segment_ids.insert(level_it->first); - } - merged_idx ++; - } else if (new_idx < new_segment_ids.size() && level_it->first == new_segment_ids[new_idx]) { - if (level_it->second == 0) { - new_level_0_segment_ids.insert(level_it->first); - segment_units_num_recorder.insert(std::make_pair(level_it->first, MAX_UNITS_NUM)); - } else { - // not a level 0 segment, set default units num - segment_units_num_recorder.insert(std::make_pair(level_it->first, DEFAULT_UNITS_NUM)); - } - new_idx ++; - } - level_it ++; + assert(new_segment_ids.size() == new_level_recorder.size()); + assert(new_segment_ids.size() == segment_ranges_recorder.size()); + assert(new_segment_ids.size() >= inherit_infos_recorder.size()); + // uint32_t new_l0_count = 0, new_non_l0_count = 0; + // size_t cached_l0_count = cached_level_0_segment_ids_.size(); + assert(DEFAULT_UNITS_NUM <= MAX_UNITS_NUM && DEFAULT_UNITS_NUM >= MIN_UNITS_NUM); + for (auto& item : new_level_recorder) { + auto segment_id = item.first; + auto level = item.second; + if (level == 0) { + 
new_level_0_segment_ids.insert(segment_id); + cached_level_0_segment_ids_.insert(segment_id); // update current cached level 0 segments + segment_units_num_recorder.insert(std::make_pair(segment_id, MAX_UNITS_NUM)); + // new_l0_count++; + } else { + segment_units_num_recorder.insert(std::make_pair(segment_id, DEFAULT_UNITS_NUM)); + // new_non_l0_count++; + } + } + // cached_l0_count += new_l0_count; + // assert(new_l0_count + new_non_l0_count == new_segment_ids.size()); + assert(segment_units_num_recorder.size() == new_segment_ids.size()); + + // // print new segment ids + // std::cout << std::endl; + // std::cout << "new level-0 segment id: "; + // for (uint32_t segment_id : new_level_0_segment_ids) { + // std::cout << segment_id << " "; + // } + // std::cout << std::endl; + // std::cout << "new non level-0 segment id: "; + // for (uint32_t segment_id : new_segment_ids) { + // if (new_level_0_segment_ids.count(segment_id) == 0) { + // std::cout << segment_id << " "; + // } + // } + // std::cout << std::endl; + + // collect old segments id on level 0 + // uint32_t old_l0_count = 0, old_non_l0_count = 0; + // std::cout << "merged segment ids: " << std::endl; + for (uint32_t& merged_segment_id : merged_segment_ids) { + if (cached_level_0_segment_ids_.count(merged_segment_id) > 0) { + old_level_0_segment_ids.insert(merged_segment_id); + cached_level_0_segment_ids_.erase(merged_segment_id); + // old_l0_count++; + } else { + // old_non_l0_count++; + // std::cout << merged_segment_id << " "; + } } + // std::cout << std::endl; + + // cached_l0_count -= old_l0_count; + // assert(cached_l0_count == cached_level_0_segment_ids_.size()); + // assert(old_l0_count + old_non_l0_count == merged_segment_ids.size()); + + // // print merged segment ids + // std::cout << "merged level-0 segment id: "; + // for (uint32_t segment_id : old_level_0_segment_ids) { + // std::cout << segment_id << " "; + // } + // std::cout << std::endl; + // std::cout << "merged non level-0 segment id: "; + // for (uint32_t segment_id : merged_segment_ids) { + // if (old_level_0_segment_ids.count(segment_id) == 0) { + // std::cout << segment_id << " "; + // } + // } + // std::cout << std::endl; if (!is_ready_) { // if is_ready_ is false, no need to enable two-heaps adjustment, remember to update is_ready_ in the end // remove merged segments' units in filter cache and nodes in filter heaps - heap_manager_.batch_delete(merged_segment_ids); + std::vector merged_segment_ids_except_l0; + for (uint32_t &segment_id : merged_segment_ids) { + if (old_level_0_segment_ids.count(segment_id) == 0) { + merged_segment_ids_except_l0.emplace_back(segment_id); + } + } + heap_manager_.batch_delete(merged_segment_ids_except_l0); + // std::cout << merged_segment_ids_except_l0.size() << " " << merged_segment_ids.size() << std::endl; filter_cache_.release_for_segments(merged_segment_ids, old_level_0_segment_ids); // inherit merged segments' counts to new segments' counts // ensure that new segments that are not in inherit_infos_recorder keys set are only level 0 segments + // this function will remove moved segments from last_count_recorder_ and current_count_recorder_ inherit_count_recorder(merged_segment_ids, new_segment_ids, level_0_base_count, inherit_infos_recorder); - estimate_counts_for_all(approximate_counts_recorder); + + std::vector needed_segment_ids; + for (uint32_t segment_id : new_segment_ids) { + needed_segment_ids.emplace_back(segment_id); + } + estimate_recent_counts(approximate_counts_recorder, needed_segment_ids); + 
assert(approximate_counts_recorder.size() > 0); // insert units into filter cache filter_cache_.enable_for_segments(segment_units_num_recorder, false, new_level_0_segment_ids, failed_segment_ids); // insert nodes into filter heaps for (uint32_t& new_segment_id : new_segment_ids) { - if (new_level_0_segment_ids.count(new_segment_id)) { + if (new_level_0_segment_ids.count(new_segment_id) > 0) { // no need to insert level 0 segment nodes into heap continue; - } else if (failed_segment_ids.count(new_segment_id)) { + } else if (failed_segment_ids.count(new_segment_id) > 0) { // failed to insert filter units uint16_t units_num = segment_units_num_recorder[new_segment_id]; new_segment_items.emplace_back(FilterCacheHeapItem(new_segment_id, approximate_counts_recorder[new_segment_id], @@ -573,6 +987,7 @@ void FilterCacheManager::insert_segments(std::vector& merged_segment_i units_num, 0, units_num)); } } + assert(new_segment_items.size() + new_level_0_segment_ids.size() == new_segment_ids.size()); heap_manager_.batch_upsert(new_segment_items); // remember to update is_ready_ @@ -582,32 +997,70 @@ void FilterCacheManager::insert_segments(std::vector& merged_segment_i } else { // is_ready_ is true, then we will not update is_ready_, that means is_ready_ will be always true // remove merged segments' units in filter cache and nodes in filter heaps - heap_manager_.batch_delete(merged_segment_ids); + std::vector merged_segment_ids_except_l0; + for (uint32_t &segment_id : merged_segment_ids) { + if (old_level_0_segment_ids.count(segment_id) == 0) { + merged_segment_ids_except_l0.emplace_back(segment_id); + } + } + heap_manager_.batch_delete(merged_segment_ids_except_l0); + // std::cout << merged_segment_ids_except_l0.size() << " " << merged_segment_ids.size() << std::endl; filter_cache_.release_for_segments(merged_segment_ids, old_level_0_segment_ids); // inherit merged segments' counts to new segments' counts // ensure that new segments that are not in inherit_infos_recorder keys set are only level 0 segments + // this function will remove moved segments from last_count_recorder_ and current_count_recorder_ inherit_count_recorder(merged_segment_ids, new_segment_ids, level_0_base_count, inherit_infos_recorder); - estimate_counts_for_all(approximate_counts_recorder); + + std::vector needed_segment_ids; + for (uint32_t segment_id : new_segment_ids) { + needed_segment_ids.emplace_back(segment_id); + } + estimate_recent_counts(approximate_counts_recorder, needed_segment_ids); + assert(approximate_counts_recorder.size() > 0); // predict units num for new non level 0 segments and update segment_units_num_recorder std::vector> pred_datas; std::vector pred_segment_ids; std::vector pred_results; for (uint32_t& new_segment_id : new_segment_ids) { - if (new_level_0_segment_ids.count(new_segment_id)) { + if (new_level_0_segment_ids.count(new_segment_id) > 0) { // no need to predict for level 0 segments continue; } else { pred_segment_ids.emplace_back(new_segment_id); + auto range_it = segment_ranges_recorder.find(new_segment_id); + assert(range_it != segment_ranges_recorder.end()); std::vector pred_data; - pred_data.emplace_back(level_recorder[new_segment_id]); - for (RangeRatePair& pair : segment_ranges_recorder[new_segment_id]) { + std::vector heat_pairs; + // double rate_sum = 0; + for (RangeRatePair& pair : range_it->second) { + // rate_sum += pair.rate_in_segment; + + RangeHeatPair heat_pair; assert(pair.range_id >= 0 && pair.range_id < buckets.size()); - 
pred_data.emplace_back(uint32_t(RATE_SIGNIFICANT_DIGITS_FACTOR * pair.rate_in_segment)); - pred_data.emplace_back(uint32_t(HOTNESS_SIGNIFICANT_DIGITS_FACTOR * buckets[pair.range_id].hotness_)); + heat_pair.rate_in_segment = pair.rate_in_segment; + heat_pair.heat_value = buckets[pair.range_id].hotness_; + heat_pairs.emplace_back(heat_pair); + } + // assert(rate_sum >= 0.98 && rate_sum <= 1.02); + assert(heat_pairs.size() == (range_it->second).size()); + + std::sort(heat_pairs.begin(), heat_pairs.end(), RangeHeatPairGreatorComparor); + for (size_t i = 0; i < heat_pairs.size() - 1; i ++) { + assert(heat_pairs[i].heat_value >= heat_pairs[i+1].heat_value); + } + + pred_data.emplace_back(new_level_recorder[new_segment_id]); + for (RangeHeatPair& heat_pair : heat_pairs) { + pred_data.emplace_back(uint32_t(RATE_SIGNIFICANT_DIGITS_FACTOR * heat_pair.rate_in_segment)); + pred_data.emplace_back(uint32_t(HOTNESS_SIGNIFICANT_DIGITS_FACTOR * heat_pair.heat_value)); } + // std::cout << "[DEBUG] segment " << level_it->first << " data features num: " << data.size() << std::endl; + assert(pred_data.size() >= 3 && pred_data.size() % 2 == 1); + assert((range_it->second).size() * 2 + 1 == pred_data.size()); + assert(pred_data[0] > 0); pred_datas.emplace_back(pred_data); } } @@ -617,18 +1070,22 @@ void FilterCacheManager::insert_segments(std::vector& merged_segment_i size_t pred_idx = 0; while (pred_idx < pred_segment_ids.size() && pred_idx < pred_results.size()) { segment_units_num_recorder[pred_segment_ids[pred_idx]] = pred_results[pred_idx]; + assert(new_level_0_segment_ids.count(pred_segment_ids[pred_idx]) == 0); + assert(pred_results[pred_idx] >= MIN_UNITS_NUM && pred_results[pred_idx] <= MAX_UNITS_NUM); pred_idx = pred_idx + 1; } + // std::cout << "insert predict " << pred_results.size() << " segments" << std::endl; + assert(pred_results.size() + new_level_0_segment_ids.size() == new_segment_ids.size()); // insert units into filter cache filter_cache_.enable_for_segments(segment_units_num_recorder, false, new_level_0_segment_ids, failed_segment_ids); // insert nodes into filter heaps for (uint32_t& new_segment_id : new_segment_ids) { - if (new_level_0_segment_ids.count(new_segment_id)) { + if (new_level_0_segment_ids.count(new_segment_id) > 0) { // no need to insert level 0 segment nodes into heap continue; - } else if (failed_segment_ids.count(new_segment_id)) { + } else if (failed_segment_ids.count(new_segment_id) > 0) { // failed to insert filter units uint16_t units_num = segment_units_num_recorder[new_segment_id]; new_segment_items.emplace_back(FilterCacheHeapItem(new_segment_id, approximate_counts_recorder[new_segment_id], @@ -640,27 +1097,25 @@ void FilterCacheManager::insert_segments(std::vector& merged_segment_i units_num, 0, units_num)); } } + assert(new_segment_items.size() + new_level_0_segment_ids.size() == new_segment_ids.size()); heap_manager_.batch_upsert(new_segment_items); } } -void FilterCacheManager::delete_segments(std::vector& merged_segment_ids, std::map& level_recorder) { +void FilterCacheManager::delete_segments(std::vector& merged_segment_ids) { + assert(false); + exit(1); std::set old_level_0_segment_ids; - std::sort(merged_segment_ids.begin(), merged_segment_ids.end()); - // level_recorder is a copy of global level_recorder - assert(merged_segment_ids.size() == level_recorder.size()); - auto level_it = level_recorder.begin(); - size_t merged_idx = 0; - while (level_it != level_recorder.end()) { - assert(merged_idx < merged_segment_ids.size() && level_it->first == 
merged_segment_ids[merged_idx]); - if (merged_idx < merged_segment_ids.size() && level_it->first == merged_segment_ids[merged_idx]) { - if (level_it->second == 0) { - old_level_0_segment_ids.insert(level_it->first); - } - merged_idx ++; + // collect old segments id on level 0 + for (uint32_t& merged_segment_id : merged_segment_ids) { + if (cached_level_0_segment_ids_.count(merged_segment_id)) { + old_level_0_segment_ids.insert(merged_segment_id); + cached_level_0_segment_ids_.erase(merged_segment_id); } - level_it ++; + // remove merged segments' count + last_count_recorder_.erase(merged_segment_id); + current_count_recorder_.erase(merged_segment_id); } if (!is_ready_) { @@ -685,6 +1140,8 @@ void FilterCacheManager::move_segments(std::vector& moved_segment_ids, std::map& old_level_recorder, std::map& move_level_recorder, std::map>& move_segment_ranges_recorder) { + assert(false); + exit(1); std::unordered_map segment_units_num_recorder; std::map approximate_counts_recorder; std::vector new_segment_items; @@ -698,13 +1155,10 @@ void FilterCacheManager::move_segments(std::vector& moved_segment_ids, assert(moved_segment_ids.size() == move_level_recorder.size()); assert(moved_segment_ids.size() == move_segment_ranges_recorder.size()); auto level_it = old_level_recorder.begin(); - size_t moved_idx = 0, new_idx = 0; + size_t moved_idx = 0; while (level_it != old_level_recorder.end()) { assert(moved_idx < moved_segment_ids.size() && level_it->first == moved_segment_ids[moved_idx]); if (moved_idx < moved_segment_ids.size() && level_it->first == moved_segment_ids[moved_idx]) { - if (level_it->second == 0) { - old_level_0_segment_ids.insert(level_it->first); - } segment_units_num_recorder.insert(std::make_pair(level_it->first, DEFAULT_UNITS_NUM)); // actually, we cannot move segments to level 0 in trivial move compaction (only flushing do this). 
moved_idx ++; @@ -712,27 +1166,39 @@ void FilterCacheManager::move_segments(std::vector& moved_segment_ids, level_it ++; } + // collect old segments id on level 0 + for (uint32_t moved_segment_id : moved_segment_ids) { + if (cached_level_0_segment_ids_.count(moved_segment_id)) { + old_level_0_segment_ids.insert(moved_segment_id); + cached_level_0_segment_ids_.erase(moved_segment_id); + } + } + if (!is_ready_) { // firstly, delete moved segments heap_manager_.batch_delete(moved_segment_ids); - filter_cache_.release_for_segments(moved_segment_ids, old_level_0_segment_ids); // inherit these segments' count - for (uint32_t& segment_id : moved_segment_ids) { - auto last_it = last_count_recorder_.find(segment_id); - auto current_it = current_count_recorder_.find(segment_id); - if (last_it != last_count_recorder_.end()) { - last_it->second = INHERIT_REMAIN_FACTOR * (last_it->second); - } - if (current_it != current_count_recorder_.end()) { - current_it->second = INHERIT_REMAIN_FACTOR * (current_it->second); - } + // for (uint32_t& segment_id : moved_segment_ids) { + // auto last_it = last_count_recorder_.find(segment_id); + // auto current_it = current_count_recorder_.find(segment_id); + // if (last_it != last_count_recorder_.end()) { + // last_it->second = INHERIT_REMAIN_FACTOR * (last_it->second); + // } + // if (current_it != current_count_recorder_.end()) { + // current_it->second = INHERIT_REMAIN_FACTOR * (current_it->second); + // } + // } + std::vector needed_segment_ids; + for (uint32_t segment_id : moved_segment_ids) { + needed_segment_ids.emplace_back(segment_id); } - estimate_counts_for_all(approximate_counts_recorder); + estimate_recent_counts(approximate_counts_recorder, needed_segment_ids); + assert(approximate_counts_recorder.size() > 0); - // insert units into filter cache - std::set empty_new_level_0_segment_ids, empty_failed_segment_ids; - filter_cache_.enable_for_segments(segment_units_num_recorder, true, empty_new_level_0_segment_ids, empty_failed_segment_ids); + // modify units into filter cache + std::set empty_failed_segment_ids; + filter_cache_.update_for_segments(segment_units_num_recorder, old_level_0_segment_ids, empty_failed_segment_ids); // insert nodes into filter heaps for (uint32_t& segment_id : moved_segment_ids) { @@ -750,36 +1216,65 @@ void FilterCacheManager::move_segments(std::vector& moved_segment_ids, } else { // firstly, delete moved segments heap_manager_.batch_delete(moved_segment_ids); - filter_cache_.release_for_segments(moved_segment_ids, old_level_0_segment_ids); // inherit these segments' count - for (uint32_t& segment_id : moved_segment_ids) { - auto last_it = last_count_recorder_.find(segment_id); - auto current_it = current_count_recorder_.find(segment_id); - if (last_it != last_count_recorder_.end()) { - last_it->second = INHERIT_REMAIN_FACTOR * (last_it->second); - } - if (current_it != current_count_recorder_.end()) { - current_it->second = INHERIT_REMAIN_FACTOR * (current_it->second); - } + // for (uint32_t& segment_id : moved_segment_ids) { + // auto last_it = last_count_recorder_.find(segment_id); + // auto current_it = current_count_recorder_.find(segment_id); + // if (last_it != last_count_recorder_.end()) { + // last_it->second = INHERIT_REMAIN_FACTOR * (last_it->second); + // } + // if (current_it != current_count_recorder_.end()) { + // current_it->second = INHERIT_REMAIN_FACTOR * (current_it->second); + // } + // } + std::vector needed_segment_ids; + for (uint32_t segment_id : moved_segment_ids) { + 
needed_segment_ids.emplace_back(segment_id); } - estimate_counts_for_all(approximate_counts_recorder); + estimate_recent_counts(approximate_counts_recorder, needed_segment_ids); + assert(approximate_counts_recorder.size() > 0); // predict units num for new non level 0 segments and update segment_units_num_recorder std::vector> pred_datas; std::vector pred_segment_ids; std::vector pred_results; - for (uint32_t& segment_id : moved_segment_ids) { - assert(move_level_recorder[segment_id] > 0); - pred_segment_ids.emplace_back(segment_id); + for (uint32_t moved_segment_id : moved_segment_ids) { + assert(move_level_recorder[moved_segment_id] > 0); + pred_segment_ids.emplace_back(moved_segment_id); + auto range_it = move_segment_ranges_recorder.find(moved_segment_id); + assert(range_it != move_segment_ranges_recorder.end()); std::vector pred_data; - pred_data.emplace_back(move_level_recorder[segment_id]); - for (RangeRatePair& pair : move_segment_ranges_recorder[segment_id]) { + std::vector heat_pairs; + double rate_sum = 0; + for (RangeRatePair& pair : range_it->second) { + rate_sum += pair.rate_in_segment; + + RangeHeatPair heat_pair; assert(pair.range_id >= 0 && pair.range_id < buckets.size()); - pred_data.emplace_back(uint32_t(RATE_SIGNIFICANT_DIGITS_FACTOR * pair.rate_in_segment)); - pred_data.emplace_back(uint32_t(HOTNESS_SIGNIFICANT_DIGITS_FACTOR * buckets[pair.range_id].hotness_)); + heat_pair.rate_in_segment = pair.rate_in_segment; + heat_pair.heat_value = buckets[pair.range_id].hotness_; + heat_pairs.emplace_back(heat_pair); + } + assert(rate_sum >= 0.98 && rate_sum <= 1.02); + assert(heat_pairs.size() == (range_it->second).size()); + + std::sort(heat_pairs.begin(), heat_pairs.end(), RangeHeatPairGreatorComparor); + for (size_t i = 0; i < heat_pairs.size() - 1; i ++) { + assert(heat_pairs[i].heat_value >= heat_pairs[i+1].heat_value); } + + pred_data.emplace_back(move_level_recorder[moved_segment_id]); + for (RangeHeatPair& heat_pair : heat_pairs) { + pred_data.emplace_back(uint32_t(RATE_SIGNIFICANT_DIGITS_FACTOR * heat_pair.rate_in_segment)); + pred_data.emplace_back(uint32_t(HOTNESS_SIGNIFICANT_DIGITS_FACTOR * heat_pair.heat_value)); + } + // std::cout << "[DEBUG] segment " << level_it->first << " data features num: " << data.size() << std::endl; + assert(pred_data.size() >= 3 && pred_data.size() % 2 == 1); + assert((range_it->second).size() * 2 + 1 == pred_data.size()); + assert(pred_data[0] > 0); + pred_datas.emplace_back(pred_data); } assert(pred_datas.size() == pred_segment_ids.size()); clf_model_.make_predict(pred_datas, pred_results); @@ -789,13 +1284,14 @@ void FilterCacheManager::move_segments(std::vector& moved_segment_ids, segment_units_num_recorder[pred_segment_ids[pred_idx]] = pred_results[pred_idx]; pred_idx = pred_idx + 1; } + assert(pred_results.size() == moved_segment_ids.size()); - // insert units into filter cache - std::set empty_new_level_0_segment_ids, empty_failed_segment_ids; - filter_cache_.enable_for_segments(segment_units_num_recorder, true, empty_new_level_0_segment_ids, empty_failed_segment_ids); + // modify units into filter cache + std::set empty_failed_segment_ids; + filter_cache_.update_for_segments(segment_units_num_recorder, old_level_0_segment_ids, empty_failed_segment_ids); // insert nodes into filter heaps - for (uint32_t& segment_id : moved_segment_ids) { + for (uint32_t segment_id : moved_segment_ids) { assert(move_level_recorder[segment_id] > 0); uint16_t units_num = segment_units_num_recorder[segment_id]; 
 new_segment_items.emplace_back(FilterCacheHeapItem(segment_id, approximate_counts_recorder[segment_id],
@@ -805,4 +1301,65 @@ void FilterCacheManager::move_segments(std::vector& moved_segment_ids,
     }
 }
+
+  const char* FilterCache::Name() const { return "FilterCache"; }
+
+  // overrides rocksdb::Cache but does nothing
+  Status FilterCache::Insert(const Slice& key, void* value, size_t charge,
+                             void (*deleter)(const Slice& key, void* value),
+                             Handle** handle,
+                             Priority priority) {
+      assert(false);
+      return Status::OK();
+  }
+
+  // overrides rocksdb::Cache but does nothing
+  Cache::Handle* FilterCache::Lookup(const Slice& key, Statistics* stats) {
+      assert(false);
+      return nullptr;
+  }
+
+  // overrides rocksdb::Cache but does nothing
+  bool FilterCache::Ref(Handle* handle) { return false; }
+
+  // used by CachableEntry
+  bool FilterCache::Release(Cache::Handle* handle, bool force_erase) { return false; }
+
+  // overrides rocksdb::Cache but does nothing
+  void* FilterCache::Value(Cache::Handle* handle) { assert(false); return nullptr; }
+
+  // overrides rocksdb::Cache but does nothing
+  void FilterCache::Erase(const Slice& key) { assert(false); }
+  // overrides rocksdb::Cache but does nothing
+  uint64_t FilterCache::NewId() { assert(false); return 0; }
+
+  // overrides rocksdb::Cache but does nothing
+  void FilterCache::SetCapacity(size_t capacity) { assert(false); }
+
+  // overrides rocksdb::Cache but does nothing
+  void FilterCache::SetStrictCapacityLimit(bool strict_capacity_limit) { assert(false); }
+
+  // overrides rocksdb::Cache but does nothing
+  bool FilterCache::HasStrictCapacityLimit() const { assert(false); return false; }
+
+  // overrides rocksdb::Cache but does nothing
+  size_t FilterCache::GetCapacity() const { assert(false); return 0; }
+
+  // overrides rocksdb::Cache but does nothing
+  size_t FilterCache::GetUsage() const { assert(false); return 0; }
+
+  // overrides rocksdb::Cache but does nothing
+  size_t FilterCache::GetUsage(Handle* handle) const { assert(false); return 0; }
+
+  // overrides rocksdb::Cache but does nothing
+  size_t FilterCache::GetPinnedUsage() const { assert(false); return 0; }
+
+  // overrides rocksdb::Cache but does nothing
+  size_t FilterCache::GetCharge(Handle* handle) const { assert(false); return 0; }
+
+  // overrides rocksdb::Cache but does nothing
+  void FilterCache::ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                                           bool thread_safe) { assert(false); }
+
+  // overrides rocksdb::Cache but does nothing
+  void FilterCache::EraseUnRefEntries() { assert(false); }
 }
\ No newline at end of file
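Of the overrides above, only Ref() and Release() return quietly instead of trapping: the CachableEntry objects handed out by get_filter_blocks() release their handle on destruction, so those two calls are reachable at runtime while everything else asserts. A small sketch of that contract (the function and parameter names here are illustrative, not part of the patch):

    #include <cassert>

    // Reachable surface of the Cache shim:
    //   Ref(h)                  -> false  (no reference counting here)
    //   Release(h, force_erase) -> false  (never erases; filter units are
    //                                      owned and evicted by FilterCache)
    // Every other override asserts in debug builds, so any unexpected caller
    // of the generic Cache interface is caught immediately, and is a no-op
    // in release builds.
    void sanity_check_shim(FilterCache& shim, Cache::Handle* handle) {
        assert(!shim.Ref(handle));
        assert(!shim.Release(handle, /*force_erase=*/false));
    }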
diff --git a/db/art/filter_cache.h b/db/art/filter_cache.h
index 5578e58f7..45ab8d5a2 100644
--- a/db/art/filter_cache.h
+++ b/db/art/filter_cache.h
@@ -1,5 +1,7 @@
 #pragma once
+#include
+#include
 #include
 #include
 #include
@@ -8,17 +10,21 @@
 #include
 #include
 #include
+#include "db/art/filter_cache_entry.h"
+#include "db/version_edit.h"
 #include "macros.h"
 #include "greedy_algo.h"
 #include "clf_model.h"
 #include "heat_buckets.h"
 #include "filter_cache_heap.h"
-#include "filter_cache_item.h"
+#include "rocksdb/cache.h"
+#include "table/block_based/cachable_entry.h"
+#include "table/block_based/parsed_full_filter_block.h"

 namespace ROCKSDB_NAMESPACE {

-class FilterCache; class FilterCacheManager;
+class FilterCacheManager;
+class BlockBasedTable;

 // FilterCache main component is a STL Map, key -- segment id, value -- Structure of Filter Units ( called FilterCacheItem)
 // its main job is auto enable/disable filter units for one segment, and check whether one key exists in enabled units
@@ -28,9 +34,9 @@ class FilterCacheManager;
 // 3. check whether filter cache is approximately full
 // 4. check whether ready to train first model
 // 5. release FilterCacheItem of these merged (outdated) segments
-class FilterCache {
+class FilterCache : public Cache {
   private:
-    std::map filter_cache_;
+    std::map filter_cache_;
     uint32_t used_space_size_;
     uint32_t level_0_used_space_size_;
     uint32_t cache_size_; // max size of cache
@@ -39,24 +45,29 @@ class FilterCache {
   public:
     FilterCache() { filter_cache_.clear(); cache_size_ = CACHE_SPACE_SIZE; used_space_size_ = 0; level_0_used_space_size_ = 0; }
-    ~FilterCache() { /* do nothing */ }
+    ~FilterCache() override { /* do nothing */ }

     // other levels total cache size
-    uint32_t cache_size_except_level_0() { return cache_size_ * FULL_RATE - level_0_used_space_size_; }
+    // assume level 0 segments' filter never use filter cache space
+    uint32_t cache_size_except_level_0() { return cache_size_ * FULL_RATE; }
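With level-0 filters pinned and accounted in level_0_used_space_size_, the budget exposed to the greedy solver is now a constant fraction of the cache rather than a moving target. Numerically, with illustrative sizes (not the configured CACHE_SPACE_SIZE / FULL_RATE):

    #include <cstdint>

    // Old: budget = cache_size_ * FULL_RATE - level_0_used_space_size_
    //      (shrinks every time a flush adds level-0 filters)
    // New: budget = cache_size_ * FULL_RATE   (constant)
    // e.g. a 128 MiB filter cache with FULL_RATE = 0.95 always hands the
    // solver 0.95 * 128 MiB, even while level-0 usage fluctuates.
    uint64_t solver_budget(uint64_t cache_size_bytes, double full_rate) {
        return uint64_t(cache_size_bytes * full_rate);
    }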
-    // check whether one given key exist in one segment
-    bool check_key(const uint32_t& segment_id, const std::string& key);
+    // get all cached filter blocks of one segment
+    std::vector> get_filter_blocks(const uint32_t segment_id);

     // enable / disable units for a batch of segments (one segment may not exist in FilterCache)
     // if enabled units num exceed given units num, it will disable units
     void enable_for_segments(std::unordered_map& segment_units_num_recorder, const bool& is_forced,
-                             std::set& level_0_segment_ids, std::set& failed_segment_ids);
+                             std::set& new_level_0_segment_ids, std::set& failed_segment_ids);

     // the only difference from enable_for_segments is:
-    // this func dont insert any filter units for segments that dont exist in cache, but enable_for_segments unc does
-    void update_for_segments(std::unordered_map& segment_units_num_recorder, const bool& is_forced,
-                             std::set& level_0_segment_ids, std::set& failed_segment_ids);
-
+    // this is designed for moved compaction
+    // if one segment moved from L0 to L1, we update L0 cached filter usage and filter cache usage
+    // we do not remove filter handle of these segments, because these segments' ids are still valid
+    void update_for_segments(std::unordered_map& segment_units_num_recorder,
+                             std::set& old_level_0_segment_ids, std::set& failed_segment_ids);
+
+    // should be called in/after compaction, before any filter adjustment operation that may affect given segment
+    void init_segment(uint32_t segment_id, const BlockBasedTable* table, const std::vector& block_handles);

     // check whether filter cache is approximately full
     // actually, we will leave (1-FULL_RATE) * cache_size_ space for emergency usage
     bool is_full();
@@ -65,7 +76,75 @@
     bool is_ready();

     // release filter units of merged segments
-    void release_for_segments(std::vector& segment_ids, std::set& level_0_segment_ids);
+    void release_for_segments(std::vector& segment_ids, std::set& old_level_0_segment_ids);
+
+    // The type of the Cache
+    virtual const char* Name() const override;
+
+    // overrides rocksdb::Cache but does nothing
+    Status Insert(const Slice& key, void* value, size_t charge,
+                  void (*deleter)(const Slice& key, void* value),
+                  Handle** handle = nullptr,
+                  Priority priority = Priority::LOW) override;
+
+    // overrides rocksdb::Cache but does nothing
+    Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override;
+
+    // overrides rocksdb::Cache but does nothing
+    bool Ref(Handle* handle) override;
+
+    /**
+     * Release a mapping returned by a previous Lookup(). A released entry might
+     * still remain in cache in case it is later looked up by others. If
+     * force_erase is set then it also erase it from the cache if there is no
+     * other reference to it. Erasing it should call the deleter function that
+     * was provided when the
+     * entry was inserted.
+     *
+     * Returns true if the entry was also erased.
+     */
+    // REQUIRES: handle must not have been released yet.
+    // REQUIRES: handle must have been returned by a method on *this.
+    bool Release(Handle* handle, bool force_erase = false) override;
+
+    // overrides rocksdb::Cache but does nothing
+    void* Value(Handle* handle) override;

+    // overrides rocksdb::Cache but does nothing
+    void Erase(const Slice& key) override;
+    // overrides rocksdb::Cache but does nothing
+    uint64_t NewId() override;
+
+    // overrides rocksdb::Cache but does nothing
+    void SetCapacity(size_t capacity) override;
+
+    // overrides rocksdb::Cache but does nothing
+    void SetStrictCapacityLimit(bool strict_capacity_limit) override;
+
+    // overrides rocksdb::Cache but does nothing
+    bool HasStrictCapacityLimit() const override;
+
+    // overrides rocksdb::Cache but does nothing
+    size_t GetCapacity() const override;
+
+    // overrides rocksdb::Cache but does nothing
+    size_t GetUsage() const override;
+
+    // overrides rocksdb::Cache but does nothing
+    size_t GetUsage(Handle* handle) const override;
+
+    // overrides rocksdb::Cache but does nothing
+    size_t GetPinnedUsage() const override;
+
+    // overrides rocksdb::Cache but does nothing
+    size_t GetCharge(Handle* handle) const override;
+
+    // overrides rocksdb::Cache but does nothing
+    void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                                bool thread_safe) override;
+
+    // overrides rocksdb::Cache but does nothing
+    void EraseUnRefEntries() override;
 };
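The class above only stores and hands back filter partitions; the membership test itself moves to the caller. A sketch of how a read path might consult a segment's enabled units, assuming get_filter_blocks() returns std::vector<CachableEntry<ParsedFullFilterBlock>> (consistent with the new cachable_entry.h and parsed_full_filter_block.h includes) and that, as in ElasticBF-style multi-unit filters, a key counts as possibly present only if every enabled unit may-match:

    // A key can be declared absent as soon as ANY enabled unit rejects it.
    bool segment_may_contain(FilterCacheManager& mgr, uint32_t segment_id,
                             const Slice& user_key) {
        auto blocks = mgr.get_filter_blocks(segment_id);
        if (blocks.empty()) return true;  // nothing cached: cannot rule the key out
        for (auto& entry : blocks) {
            FilterBitsReader* reader = entry.GetValue()->filter_bits_reader();
            if (!reader->MayMatch(user_key)) return false;  // definitely absent
        }
        return true;
    }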
// FilterCacheManager is combined of these components: @@ -87,25 +166,29 @@ class FilterCacheManager { private: // TODO: mutex can be optimized or use a message queue or a thread pool to reduce the time cost of the mutex - static FilterCache filter_cache_; - static HeatBuckets heat_buckets_; - static ClfModel clf_model_; - static GreedyAlgo greedy_algo_; - static FilterCacheHeapManager heap_manager_; - static uint32_t get_cnt_; // record get cnt in current period, when exceeding PERIOD_COUNT, start next period - static uint32_t period_cnt_; // record period cnt, if period_cnt_ - last_train_period_ >= TRAIN_PERIODS, start to evaluate or retrain ClfModel - static uint32_t last_long_period_; // record last short period cnt of last long period - static uint32_t last_short_period_; // helper var for update job when one short period ends - static std::mutex update_mutex_; // guarantee counts records only updated once - static bool train_signal_; // if true, try to retrain model. we call one background thread to monitor this flag and retrain - static std::map last_count_recorder_; // get cnt recorder of segments in last long period - static std::map current_count_recorder_; // get cnt recorder of segments in current long period - static std::mutex count_mutex_; // guarentee last_count_recorder and current_count_recorder treated orderedly - static bool is_ready_; // check whether ready to use adaptive filter assignment + std::set cached_level_0_segment_ids_; + FilterCache filter_cache_; + HeatBuckets heat_buckets_; + ClfModel clf_model_; + GreedyAlgo greedy_algo_; + FilterCacheHeapManager heap_manager_; + uint32_t get_cnt_; // record get cnt in current period, when exceeding PERIOD_COUNT, start next period + uint32_t period_cnt_; // record period cnt, if period_cnt_ - last_train_period_ >= TRAIN_PERIODS, start to evaluate or retrain ClfModel + uint32_t last_long_period_; // record last short period cnt of last long period + uint32_t last_short_period_; // helper var for update job when one short period ends + mutable port::RWMutex period_mutex_; // guarantee heat buckets, get_cnt_ and period_cnt_ are updated in order + // std::mutex update_mutex_; // guarantee counts records only updated once + bool train_signal_; // if true, try to retrain model. we call one background thread to monitor this flag and retrain + std::map last_count_recorder_; // get cnt recorder of segments in last long period + std::map current_count_recorder_; // get cnt recorder of segments in current long period + mutable port::RWMutex count_mutex_; // guarantee last_count_recorder and current_count_recorder are handled in order + bool is_ready_; // check whether ready to use adaptive filter assignment + std::map segment_in_file; // map segment_id to SST file + std::atomic cfd_; // In WaLSM+, we only support one column family public: FilterCacheManager() { get_cnt_ = 0; last_long_period_ = 0; last_short_period_ = 0; train_signal_ = false; } - ~FilterCacheManager(); + ~FilterCacheManager() {} // one background thread monitors this func; if it returns true, call try_retrain_model at once, wait for training to end, and call update_cache_and_heap bool need_retrain() { return train_signal_; } @@ -122,24 +205,27 @@ class FilterCacheManager { // normal bloom filter units query, can we put hit_count_recorder outside this func? this will make the get op faster // will be called by a get operation, this will block the get operation // remember to call hit_count_recorder in a background thread - bool check_key(const uint32_t& segment_id, const std::string& key); + std::vector> get_filter_blocks(const uint32_t segment_id); // add 1 to get cnt of specified segment in current long period // will be called when calling check_key // remember to move this func to a single background thread aside check_key // because this func shouldn't block get operations - void hit_count_recorder(const uint32_t& segment_id); + void hit_count_recorder(uint32_t segment_id); // copy counts to last_count_recorder and reset counts of current_count_recorder void update_count_recorder();
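// [editor's sketch -- not part of the patch] How the counters above are meant
// to interact with update_count_recorder() and the periods loop (declared
// below as do_periods_work()): PERIOD_COUNT gets close one short period, and
// TRAIN_PERIODS short periods close one long period, per the comments above.
// The free function below restates that bookkeeping over explicit state with
// stand-in callbacks, since the real member logic is not shown in this patch
// (assumes <cstdint> and <functional>):
struct PeriodStateSketch {
  uint32_t period_cnt = 0;         // finished short periods
  uint32_t last_short_period = 0;  // last short period already processed
  uint32_t last_long_period = 0;   // last long period already processed
  bool train_signal = false;       // polled by the model-training thread
};
inline void periods_pass_sketch(PeriodStateSketch& s, uint32_t train_periods,
                                const std::function<void()>& sync_heaps,
                                const std::function<void()>& roll_counters) {
  if (s.period_cnt > s.last_short_period) {
    sync_heaps();  // estimate recent per-segment counts, then sync the heaps
    s.last_short_period = s.period_cnt;
  }
  if (s.period_cnt - s.last_long_period >= train_periods) {
    roll_counters();        // update_count_recorder(): copy current into last
    s.train_signal = true;  // need_retrain() now returns true
    s.last_long_period = s.period_cnt;
  }
}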
+ // when debugging, we need to print out counters of each segment. + void debug_count_recorder(); + + // inherit counts of merged segments to counts of new segments and remove counts of merged segments // inherit_infos_recorder: { {new segment 1: [{old segment 1: inherit rate 1}, {old segment 2: inherit rate 2}, ...]}, ...} void inherit_count_recorder(std::vector& merged_segment_ids, std::vector& new_segment_ids, const uint32_t& level_0_base_count, std::map>& inherit_infos_recorder); // estimate approximate get cnts for every alive segment - void estimate_counts_for_all(std::map& approximate_counts_recorder); + void estimate_recent_counts(std::map& approximate_counts_recorder, const std::vector& needed_segment_ids); // noticed that at the beginning, heat buckets need to sample put keys to init themselves before heat buckets start to work // segment_info_recorder is an external variable that records every alive segment's min key and max key @@ -168,6 +254,11 @@ class FilterCacheManager { // we should use one background thread to call this func in every get operation void hit_heat_buckets(const std::string& key); + // when one short period ends, we estimate the recent access counter of each segment, then update heaps + // when one long period ends, we reset counters and send a training signal. then the classifier will be evaluated and retrained. + // we leave this function to one single thread; it executes forever and never returns. + void do_periods_work(); + // if one long period ends, we need to check the effectiveness of the model. // if the model doesn't work well in the current workload, we retrain this model // 1. use greedy algorithm to solve filter units allocation problem (receive ideal enabled units num for every current segments) @@ -183,7 +274,7 @@ class FilterCacheManager { // we ignore all level 0 segments !!! 3 recorders keys set should be the same ------ all alive segments' ids (except level 0) // because of the time cost of writing csv file, we need to do this func with a background thread // need real benchmark data to debug this func - void try_retrain_model(std::map& level_recorder, + bool try_retrain_model(std::map& level_recorder, std::map>& segment_ranges_recorder, std::map& unit_size_recorder); @@ -200,15 +291,6 @@ class FilterCacheManager { void update_cache_and_heap(std::map& level_recorder, std::map>& segment_ranges_recorder); - // remove merged segments' filter units in the filter cache - // also remove related items in FilterCacheHeap - // segment_ids: [level_1_segment_1, level_0_segment_1, ...] - // level_0_segment_ids: [level_0_segment_1, ...] 
- // should be called by one background thread - // this func will be called by insert_segments - // you can also call this func alone after segments are merged (not suggested) - void remove_segments(std::vector& segment_ids, std::set& level_0_segment_ids); - // insert new segments into cache // all level 0 segments must enable all filter units // if is_ready_ is not true, set default filter units num (except level 0), insert into filter_cache_ and heaps @@ -232,7 +314,7 @@ class FilterCacheManager { // when old segments are merged into some new segments, call this func in one background thread void insert_segments(std::vector& merged_segment_ids, std::vector& new_segment_ids, std::map>& inherit_infos_recorder, - std::map& level_recorder, const uint32_t& level_0_base_count, + std::map& new_level_recorder, const uint32_t& level_0_base_count, std::map>& segment_ranges_recorder); // in func insert_segments above, we will also remove merged segments, this work well for normal compaction and flush @@ -241,7 +323,7 @@ class FilterCacheManager { // this func only delete merged segments // we only need argument merged_segment_ids (all merged segments' ids) // and level_recorder which only include merged segments' level - void delete_segments(std::vector& merged_segment_ids, std::map& level_recorder); + void delete_segments(std::vector& merged_segment_ids); // move segments to another level, used for trivial move compaction void move_segments(std::vector& moved_segment_ids, @@ -261,6 +343,18 @@ class FilterCacheManager { std::vector& range_seperators() { return heat_buckets_.seperators(); } + + inline void update_cfd(ColumnFamilyData* cfd) { + ColumnFamilyData* expected = nullptr; + cfd_.compare_exchange_strong(expected, cfd, std::memory_order_release); + } + + inline ColumnFamilyData* get_cfd() { + return cfd_.load(std::memory_order_acquire); + } + + // should be called in/after compaction, before any filter adjustment operation that may affect given segment + void init_segment(uint32_t segment_id, const BlockBasedTable* table, const std::vector& block_handles); }; } \ No newline at end of file diff --git a/db/art/filter_cache_client.cc b/db/art/filter_cache_client.cc index 72a483235..f6a23e5b1 100644 --- a/db/art/filter_cache_client.cc +++ b/db/art/filter_cache_client.cc @@ -1,155 +1,244 @@ #include "filter_cache_client.h" +#include +#include +#include +#include "db/art/macros.h" +#include "table/block_based/parsed_full_filter_block.h" +#include "db/art/global_filter_cache_context.h" namespace ROCKSDB_NAMESPACE { -task_thread_pool::task_thread_pool FilterCacheClient::pool_{FILTER_CACHE_THREADS_NUM}; -FilterCacheManager FilterCacheClient::filter_cache_manager_; -bool FilterCacheClient::heat_buckets_ready_; - -void FilterCacheClient::do_prepare_heat_buckets(const std::string& key, std::unordered_map>* const segment_info_recorder) { +void FilterCacheClient::do_prepare_heat_buckets(const std::string& key, std::unordered_map>* segment_info_recorder) { filter_cache_manager_.make_heat_buckets_ready(key, *segment_info_recorder); } -bool FilterCacheClient::prepare_heat_buckets(const std::string& key, std::unordered_map>* const segment_info_recorder) { +bool FilterCacheClient::prepare_heat_buckets(const std::string& key, std::unordered_map>* segment_info_recorder) { heat_buckets_ready_ = filter_cache_manager_.heat_buckets_ready(); if (!heat_buckets_ready_) { // if heat_buckets_ready_ false assert(segment_info_recorder->size() == 0); // should always empty heat_buckets_ready_ = 
filter_cache_manager_.heat_buckets_ready(); if (!heat_buckets_ready_) { - pool_.submit_detach(do_prepare_heat_buckets, key, segment_info_recorder); + // will leak memory if we pass a ref + // pool_.submit_detach([this, &key, segment_info_recorder]() { + do_prepare_heat_buckets(key, segment_info_recorder); + // }); heat_buckets_ready_ = filter_cache_manager_.heat_buckets_ready(); } } return heat_buckets_ready_; } -void FilterCacheClient::do_retrain_or_keep_model(std::vector* const features_nums_except_level_0, - std::map* const level_recorder, - std::map>* const segment_ranges_recorder, - std::map* const unit_size_recorder) { +void FilterCacheClient::do_retrain_or_keep_model(std::vector* features_nums_except_level_0, + const std::map* level_recorder, + const std::map>* segment_ranges_recorder, + const std::map* unit_size_recorder) { std::map level_copy; std::map> segment_ranges_copy; std::map unit_size_copy; + bool clf_ready = false; + bool clf_train = false; // if true, then clf model evaluated or retrained + assert(READY_RATE <= FULL_RATE && READY_RATE >= 0); // if this func monitors the signal in the background, how can it receive the latest arguments? via input pointers! - while (!filter_cache_manager_.heat_buckets_ready()); - while (!filter_cache_manager_.ready_work()); // wait for manager ready + while (!filter_cache_manager_.heat_buckets_ready()) + std::this_thread::sleep_for(std::chrono::milliseconds(100)); assert(filter_cache_manager_.heat_buckets_ready()); // must guarantee that heat buckets are ready before we make the filter cache manager ready + std::cout << "[MODEL] heat buckets are ready." << std::endl; + while (!filter_cache_manager_.ready_work()) + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait for manager ready + assert(filter_cache_manager_.ready_work()); + std::cout << "[MODEL] filter cache is ready." << std::endl; // actually we will load data before we test, so we can ensure that heat buckets are ready first - filter_cache_manager_.make_clf_model_ready(*features_nums_except_level_0); + std::cout << "[MODEL] model feature number: " << (*features_nums_except_level_0)[0] << std::endl; + assert((*features_nums_except_level_0)[0] == MAX_FEATURES_NUM); + clf_ready = filter_cache_manager_.make_clf_model_ready(*features_nums_except_level_0); + assert(clf_ready); // lock and copy recorders - global_recorder_mutex_.lock(); + global_filter_cache_recorders_mutex.lock(); level_copy = *level_recorder; segment_ranges_copy = *segment_ranges_recorder; unit_size_copy = *unit_size_recorder; - global_recorder_mutex_.unlock(); - // train first time, before that, there is no model left - filter_cache_manager_.try_retrain_model(level_copy, segment_ranges_copy, unit_size_copy); - filter_cache_manager_.update_cache_and_heap(level_copy, segment_ranges_copy); + global_filter_cache_recorders_mutex.unlock(); + assert(level_copy.size() == segment_ranges_copy.size()); + std::cout << "[MODEL] level recorder size (include level 0): " << level_copy.size() << std::endl; + std::cout << "[MODEL] range recorder size (include level 0): " << segment_ranges_copy.size() << std::endl; + // train first time, before that, there is no model left. + // Note: if we reach here while YCSB is loading, we don't train a model; we will train the first model when one long period ends. + clf_train = filter_cache_manager_.try_retrain_model(level_copy, segment_ranges_copy, unit_size_copy); + if (UNLIKELY(clf_train)) { + assert(false); // only train the first model when the YCSB load ends. 
+ std::cout << "[MODEL] we retrain a new model, thus we update filter cache and heaps" << std::endl; + std::cout << "[MODEL] level recorder size (exclude level 0): " << level_copy.size() << std::endl; + std::cout << "[MODEL] range recorder size (exclude level 0): " << segment_ranges_copy.size() << std::endl; + filter_cache_manager_.update_cache_and_heap(level_copy, segment_ranges_copy); + } // retrain in long periods while (true) { + bool adjusted = false; // in one long period - while (!filter_cache_manager_.need_retrain()); // wait for long period end + while (!filter_cache_manager_.need_retrain()) { + // std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait for long period end + adjusted = filter_cache_manager_.adjust_cache_and_heap(); + // if (adjusted) std::cout << "[ADJUST] filter cache adjustment!" << std::endl; + } + assert(filter_cache_manager_.need_retrain()); // lock and copy recorders - global_recorder_mutex_.lock(); + global_filter_cache_recorders_mutex.lock(); level_copy = *level_recorder; segment_ranges_copy = *segment_ranges_recorder; unit_size_copy = *unit_size_recorder; - global_recorder_mutex_.unlock(); + global_filter_cache_recorders_mutex.unlock(); + assert(level_copy.size() == segment_ranges_copy.size()); + std::cout << "[MODEL] level recorder size (include level 0): " << level_copy.size() << std::endl; + std::cout << "[MODEL] range recorder size (include level 0): " << segment_ranges_copy.size() << std::endl; // train first time, before that, there is no model left - filter_cache_manager_.try_retrain_model(level_copy, segment_ranges_copy, unit_size_copy); - filter_cache_manager_.update_cache_and_heap(level_copy, segment_ranges_copy); + clf_train = filter_cache_manager_.try_retrain_model(level_copy, segment_ranges_copy, unit_size_copy); + if (LIKELY(clf_train)) { + std::cout << "[MODEL] we retrain a new model, thus we update filter cache and heaps" << std::endl; + std::cout << "[MODEL] level recorder size (exclude level 0): " << level_copy.size() << std::endl; + std::cout << "[MODEL] range recorder size (exclude level 0): " << segment_ranges_copy.size() << std::endl; + filter_cache_manager_.update_cache_and_heap(level_copy, segment_ranges_copy); + } } // this loop never end } -void FilterCacheClient::retrain_or_keep_model(std::vector* const features_nums_except_level_0, - std::map* const level_recorder, - std::map>* const segment_ranges_recorder, - std::map* const unit_size_recorder) { - pool_.submit_detach(do_retrain_or_keep_model, features_nums_except_level_0, level_recorder, segment_ranges_recorder, unit_size_recorder); +void FilterCacheClient::retrain_or_keep_model(std::vector* features_nums_except_level_0, + const std::map* level_recorder, + const std::map>* segment_ranges_recorder, + const std::map* unit_size_recorder) { + pool_.submit_detach([this, features_nums_except_level_0, level_recorder, segment_ranges_recorder, unit_size_recorder]() { + do_retrain_or_keep_model(features_nums_except_level_0, level_recorder, segment_ranges_recorder, unit_size_recorder); + }); // if first model training not end, python lgb_model server still return default units num // then retrain model when every long period end. 
if the model still works well, keep this model instead // no need to return any value } -void FilterCacheClient::do_hit_count_recorder(const uint32_t& segment_id) { +// // TODO: make it an atomic operation rather than a mutex + threading +void FilterCacheClient::do_hit_count_recorder(uint32_t segment_id) { filter_cache_manager_.hit_count_recorder(segment_id); } -bool FilterCacheClient::check_key(const uint32_t& segment_id, const std::string& key) { - bool result = filter_cache_manager_.check_key(segment_id, key); - pool_.submit_detach(do_hit_count_recorder, segment_id); - return result; +std::vector> FilterCacheClient::get_filter_blocks(uint32_t segment_id) { + // pool_.submit_detach([this, segment_id]() { + // do_hit_count_recorder(segment_id); + // }); + do_hit_count_recorder(segment_id); + return filter_cache_manager_.get_filter_blocks(segment_id); } void FilterCacheClient::do_hit_heat_buckets(const std::string& key) { filter_cache_manager_.hit_heat_buckets(key); } -void FilterCacheClient::get_updating_work(const std::string& key) { - pool_.submit_detach(do_hit_heat_buckets, key); +void FilterCacheClient::hit_heat_buckets(const std::string& key) { + // pool_.submit_detach([this, key]() { + // do_hit_heat_buckets(key); + // }); + do_hit_heat_buckets(key); } -void FilterCacheClient::do_make_adjustment() { +void FilterCacheClient::do_periods_work() { while (true) { - // never stop making heap adjustment - filter_cache_manager_.adjust_cache_and_heap(); + filter_cache_manager_.do_periods_work(); } } -void FilterCacheClient::make_adjustment() { - pool_.submit_detach(do_make_adjustment); +void FilterCacheClient::periods_work() { + pool_.submit_detach([this]() { + do_periods_work(); + }); } +// void FilterCacheClient::do_make_adjustment() { +// assert(false); +// while (true) { +// // never stop making heap adjustment +// filter_cache_manager_.adjust_cache_and_heap(); +// } +// } + +// void FilterCacheClient::make_adjustment() { +// assert(false); +// pool_.submit_detach([this]() { +// do_make_adjustment(); +// }); +// } + void FilterCacheClient::do_batch_insert_segments(std::vector& merged_segment_ids, std::vector& new_segment_ids, std::map>& inherit_infos_recorder, - std::map& level_recorder, const uint32_t& level_0_base_count, + std::map& new_level_recorder, uint32_t level_0_base_count, std::map>& segment_ranges_recorder) { filter_cache_manager_.insert_segments(merged_segment_ids, new_segment_ids, inherit_infos_recorder, - level_recorder, level_0_base_count, segment_ranges_recorder); + new_level_recorder, level_0_base_count, segment_ranges_recorder); } void FilterCacheClient::batch_insert_segments(std::vector merged_segment_ids, std::vector new_segment_ids, std::map> inherit_infos_recorder, - std::map level_recorder, const uint32_t& level_0_base_count, + std::map new_level_recorder, uint32_t level_0_base_count, std::map> segment_ranges_recorder) { - assert(merged_segment_ids.size() > 0 && new_segment_ids.size() > 0); - assert(new_segment_ids.size() == inherit_infos_recorder.size()); - assert(merged_segment_ids.size() + new_segment_ids.size() == level_recorder.size()); + assert(new_segment_ids.size() == new_level_recorder.size()); assert(new_segment_ids.size() == segment_ranges_recorder.size()); + assert(new_segment_ids.size() > 0); if (level_0_base_count == 0) { - pool_.submit_detach(do_batch_insert_segments, merged_segment_ids, new_segment_ids, inherit_infos_recorder, level_recorder, INIT_LEVEL_0_COUNT, segment_ranges_recorder); + pool_.submit_detach([this, merged_segment_ids, new_segment_ids, 
inherit_infos_recorder, new_level_recorder, segment_ranges_recorder]() mutable { + do_batch_insert_segments(merged_segment_ids, new_segment_ids, inherit_infos_recorder, new_level_recorder, INIT_LEVEL_0_COUNT, segment_ranges_recorder); + }); } else { - pool_.submit_detach(do_batch_insert_segments, merged_segment_ids, new_segment_ids, inherit_infos_recorder, level_recorder, level_0_base_count, segment_ranges_recorder); + pool_.submit_detach([this, merged_segment_ids, new_segment_ids, inherit_infos_recorder, new_level_recorder, level_0_base_count, segment_ranges_recorder]() mutable { + do_batch_insert_segments(merged_segment_ids, new_segment_ids, inherit_infos_recorder, new_level_recorder, level_0_base_count, segment_ranges_recorder); + }); } } -void FilterCacheClient::do_batch_delete_segments(std::vector& merged_segment_ids, std::map& level_recorder) { - filter_cache_manager_.delete_segments(merged_segment_ids, level_recorder); +void FilterCacheClient::update_cfd_ptr_if_needed(ColumnFamilyData* cfd) { + filter_cache_manager_.update_cfd(cfd); +} + +void FilterCacheClient::do_batch_delete_segments(std::vector& merged_segment_ids) { + assert(false); + exit(1); + filter_cache_manager_.delete_segments(merged_segment_ids); } -void FilterCacheClient::batch_delete_segments(std::vector merged_segment_ids, std::map level_recorder) { - assert(merged_segment_ids.size() == level_recorder.size()); - pool_.submit_detach(do_batch_delete_segments, merged_segment_ids, level_recorder); +// disallowed in WaLSM+ +void FilterCacheClient::batch_delete_segments(std::vector merged_segment_ids) { + assert(false); + exit(1); + pool_.submit_detach([this, merged_segment_ids]() mutable { + do_batch_delete_segments(merged_segment_ids); + }); } void FilterCacheClient::do_batch_move_segments(std::vector& moved_segment_ids, std::map& old_level_recorder, std::map& move_level_recorder, std::map>& move_segment_ranges_recorder) { + assert(false); + exit(1); filter_cache_manager_.move_segments(moved_segment_ids, old_level_recorder, move_level_recorder, move_segment_ranges_recorder); } +// disallowed in WaLSM+ void FilterCacheClient::batch_move_segments(std::vector moved_segment_ids, std::map old_level_recorder, std::map move_level_recorder, std::map> move_segment_ranges_recorder) { + assert(false); + exit(1); assert(moved_segment_ids.size() == move_level_recorder.size()); assert(moved_segment_ids.size() == move_segment_ranges_recorder.size()); - pool_.submit_detach(do_batch_move_segments, moved_segment_ids, old_level_recorder, move_level_recorder, move_segment_ranges_recorder); + pool_.submit_detach([this, &moved_segment_ids, &old_level_recorder, &move_level_recorder, &move_segment_ranges_recorder]() { + do_batch_move_segments(moved_segment_ids, old_level_recorder, move_level_recorder, move_segment_ranges_recorder); + }); +} + +void FilterCacheClient::init_segment(uint32_t segment_id, const BlockBasedTable* table, const std::vector& block_handles) { + assert(block_handles.size() > 0); + filter_cache_manager_.init_segment(segment_id, table, block_handles); } } \ No newline at end of file diff --git a/db/art/filter_cache_client.h b/db/art/filter_cache_client.h index ff9414071..dbe3717f3 100644 --- a/db/art/filter_cache_client.h +++ b/db/art/filter_cache_client.h @@ -5,55 +5,60 @@ #include #include "macros.h" #include "filter_cache.h" +#include "table/block_based/parsed_full_filter_block.h" +#include "table/format.h" namespace ROCKSDB_NAMESPACE { -// global mutex to control global level recorder, ... 
-static std::mutex global_recorder_mutex_; - class FilterCacheClient; +class FilterCacheManager; +class ParsedFullFilterBlock; class FilterCacheClient { private: - static task_thread_pool::task_thread_pool pool_; - static FilterCacheManager filter_cache_manager_; + task_thread_pool::task_thread_pool pool_{FILTER_CACHE_THREADS_NUM}; + FilterCacheManager filter_cache_manager_; // we need heat_buckets_ready_ to become true before filter_cache_ready_ // In YCSB benchmark, we first load data (insert key-value pairs) then may try get operation // so we can guarantee that heat_buckets_ready_ become true before filter_cache_ready_ - static bool heat_buckets_ready_; // the same as FilterCacheManager.heat_buckets_.is_ready() + bool heat_buckets_ready_; // the same as FilterCacheManager.heat_buckets_.is_ready() // background thread part of prepare_heat_buckets - static void do_prepare_heat_buckets(const std::string& key, std::unordered_map>* const segment_info_recorder); + void do_prepare_heat_buckets(const std::string& key, std::unordered_map>* segment_info_recorder); // background thread part of retrain_or_keep_model - static void do_retrain_or_keep_model(std::vector* const features_nums_except_level_0, - std::map* const level_recorder, - std::map>* const segment_ranges_recorder, - std::map* const unit_size_recorder); + void do_retrain_or_keep_model(std::vector* features_nums_except_level_0, + const std::map* level_recorder, + const std::map>* segment_ranges_recorder, + const std::map* unit_size_recorder); // background thread part of check_key - static void do_hit_count_recorder(const uint32_t& segment_id); + void do_hit_count_recorder(uint32_t segment_id); - // background thread part of get_updating_work - static void do_hit_heat_buckets(const std::string& key); + // background thread part of hit_heat_buckets + void do_hit_heat_buckets(const std::string& key); - // background thread part of make_adjustment - static void do_make_adjustment(); + // // background thread part of make_adjustment + // void do_make_adjustment(); // background thread part of batch_insert_segments - static void do_batch_insert_segments(std::vector& merged_segment_ids, std::vector& new_segment_ids, + void do_batch_insert_segments(std::vector& merged_segment_ids, std::vector& new_segment_ids, std::map>& inherit_infos_recorder, - std::map& level_recorder, const uint32_t& level_0_base_count, + std::map& new_level_recorder, uint32_t level_0_base_count, std::map>& segment_ranges_recorder); // background thread part of batch_delete_segments - void do_batch_delete_segments(std::vector& merged_segment_ids, std::map& level_recorder); + void do_batch_delete_segments(std::vector& merged_segment_ids); // background thread part of batch_move_segments void do_batch_move_segments(std::vector& moved_segment_ids, std::map& old_level_recorder, std::map& move_level_recorder, std::map>& move_segment_ranges_recorder); + + // background thread part of periods_work; + void do_periods_work(); + public: FilterCacheClient() { heat_buckets_ready_ = false; @@ -79,35 +84,44 @@ class FilterCacheClient { // please ensure that 3 recorders need to keep the same segments set, or error will occur in train func // you can use mutex in compaction and flushing to guarantee this // then when every long period end, try to retrain a new model or keep last model - void retrain_or_keep_model(std::vector* const features_nums_except_level_0, - std::map* const level_recorder, - std::map>* const segment_ranges_recorder, - std::map* const unit_size_recorder); + void 
retrain_or_keep_model(std::vector* features_nums_except_level_0, + const std::map* level_recorder, + const std::map>* segment_ranges_recorder, + const std::map* unit_size_recorder); // corresponding to FilterCacheManager work: check_key and hit_count_recorder // return FilterCacheManager.check_key() and leave hit_count_recorder to the background - bool check_key(const uint32_t& segment_id, const std::string& key); + std::vector> get_filter_blocks(uint32_t segment_id); // every db get operation needs one hit_heat_buckets - void get_updating_work(const std::string& key); + void hit_heat_buckets(const std::string& key); + + // keep track of period count, update access counters and retrain the classifier model + void periods_work(); - // heap based adjustment - void make_adjustment(); + // // heap based adjustment + // void make_adjustment(); // batch insert segments into filter cache manager, will also delete merged segments void batch_insert_segments(std::vector merged_segment_ids, std::vector new_segment_ids, std::map> inherit_infos_recorder, - std::map level_recorder, const uint32_t& level_0_base_count, + std::map new_level_recorder, uint32_t level_0_base_count, std::map> segment_ranges_recorder); + + // In WaLSM+, we only support one column family, so we just save the cfd ptr here + void update_cfd_ptr_if_needed(ColumnFamilyData* cfd); // batch delete segments from filter cache manager - void batch_delete_segments(std::vector merged_segment_ids, std::map level_recorder); + void batch_delete_segments(std::vector merged_segment_ids); // batch of moving segments to one level void batch_move_segments(std::vector moved_segment_ids, std::map old_level_recorder, std::map move_level_recorder, std::map> move_segment_ranges_recorder); + + + void init_segment(uint32_t segment_id, const BlockBasedTable* table, const std::vector& block_handles); }; } diff --git a/db/art/filter_cache_entry.cc b/db/art/filter_cache_entry.cc new file mode 100644 index 000000000..bd5c8e85b --- /dev/null +++ b/db/art/filter_cache_entry.cc @@ -0,0 +1,157 @@ +#include "filter_cache_entry.h" + +#include +#include +#include +#include +#include +#include + +#include "db/table_cache.h" +#include "rocksdb/options.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/cachable_entry.h" +#include "table/block_based/filter_block.h" +#include "table/block_based/parsed_full_filter_block.h" +#include "table/format.h" +#include "table/table_reader.h" + +namespace ROCKSDB_NAMESPACE { + +// constructor; member variables can be initialized here +// TODO pass right parameters +FilterCacheEntry::FilterCacheEntry( + const uint32_t segment_id, const BlockBasedTable* table, + FilterCache* filter_cache, const std::vector& block_handles) { + segment_id_ = segment_id; + table_ = table; + filter_cache_ = filter_cache; + loaded_units_num_ = 0; + + // fill block_handles from the input vector, or fill with null handles + assert(block_handles.size() == MAX_UNITS_NUM); + block_handles_.fill(BlockHandle::NullBlockHandle()); + cache_handles_.fill(nullptr); + for (size_t i = 0; i < block_handles.size(); i++) { + block_handles_[i] = block_handles[i]; + } + + // load units into memory, then only modify loaded_units_num_ + prefetch_units(); +} + +// clean up member variables to avoid memory leaks; anything allocated with new may need to be released here +FilterCacheEntry::~FilterCacheEntry() {} + +size_t FilterCacheEntry::approximate_size() { + uint32_t sum = 0; + // for (size_t i = 0; i < loaded_units_num_; i++) { + // if (cache_handles_[i] == nullptr) { + // continue; + // } + // sum += cache_handles_[i]->value_->ApproximateMemoryUsage(); + // } + 
// sum *= 8; // convert to bits + sum += DEFAULT_UNIT_SIZE * loaded_units_num_; + return sum; +} + +std::vector> +FilterCacheEntry::get_filter_blocks() { + uint32_t units_num = loaded_units_num_; + + rwlock.ReadLock(); + std::vector> result; + + result.reserve(units_num); + for (size_t i = 0; i < units_num; i++) { + if (UNLIKELY(cache_handles_[i] == nullptr)) { + result.emplace_back(nullptr, nullptr, nullptr, false); + result[i].Reset(); + continue; + } + result.emplace_back(cache_handles_[i]->value_.get(), filter_cache_, + cache_handles_[i].get(), false); + } + rwlock.ReadUnlock(); + return result; +} + +void FilterCacheEntry::enable_units(uint32_t target_unit_num) { + if (target_unit_num > MAX_UNITS_NUM) { + target_unit_num = MAX_UNITS_NUM; + } + + // std::cout << "segment id: " << segment_id_ << ", enable units num from " << loaded_units_num_ << " to " << target_unit_num << std::endl; + + rwlock.WriteLock(); + loaded_units_num_ = target_unit_num; + rwlock.WriteUnlock(); + + // static std::atomic target_unit_num_5_counter{0}; + // static std::atomic target_unit_num_12_counter{0}; + // if (target_unit_num == 5) { + // target_unit_num_5_counter.fetch_add(1); + // } else if (target_unit_num == 12) { + // target_unit_num_12_counter.fetch_add(1); + // } + // std::cout << "target_unit_num_5_counter: " << target_unit_num_5_counter.load() << ", target_unit_num_12_counter: " << target_unit_num_12_counter.load() << std::endl; +} + +void FilterCacheEntry::prefetch_units() { + uint32_t target_unit_num = MAX_UNITS_NUM; + uint32_t prefetch_success_num = 0; + + rwlock.WriteLock(); + const ReadOptions read_options; + for (uint32_t i = 0; i < target_unit_num; i++) { + // do nothing for null block handle + if (block_handles_[i] == BlockHandle::NullBlockHandle()) { + continue; + } + CachableEntry block_entry; + Status s = table_->RetrieveBlock( + nullptr, read_options, block_handles_[i], + UncompressionDict::GetEmptyDict(), &block_entry, BlockType::kFilter, + nullptr, nullptr, + /* for_compaction */ false, /* use_cache */ false); + + + if (s.ok()) { + prefetch_success_num++; + } + + // do nothing if no data retrieved + if (!s.ok()) { + std::cout << "failed to retrieve filter data, segment id: " << segment_id_ + << std::endl; + cache_handles_[i].reset(); + units_[i].reset(); + break; + } + + units_[i] = + std::shared_ptr(block_entry.ReleaseValue()); + cache_handles_[i] = + // std::make_shared(units_[i], filter_cache_); + std::shared_ptr( + new FilterCacheDataHandle(units_[i], filter_cache_)); + } + rwlock.WriteUnlock(); + + // std::cout << "segment id: " << segment_id_ << ", prefetch success num: " << prefetch_success_num << std::endl; + // std::cout << "used bytes:" ; + // for (uint32_t i = 0; i < target_unit_num; i++) { + // if (cache_handles_[i] == nullptr) { + // std::cout << "null "; + // continue; + // } + // std::cout << cache_handles_[i]->value_->ApproximateMemoryUsage() << " "; + // } + // std::cout << std::endl; +} + +FilterCacheEntry::FilterCacheDataHandle::FilterCacheDataHandle( + DataPtr value, FilterCache* cache) + : value_(value), cache_(cache) {} +} // namespace ROCKSDB_NAMESPACE \ No newline at end of file diff --git a/db/art/filter_cache_entry.h b/db/art/filter_cache_entry.h new file mode 100644 index 000000000..0778c9883 --- /dev/null +++ b/db/art/filter_cache_entry.h @@ -0,0 +1,83 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "macros.h" +#include "port/port_posix.h" +#include "table/block_based/cachable_entry.h" +#include 
"table/block_based/parsed_full_filter_block.h" +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { +class TableCache; // forward declaration +class BlockBasedTable; +class FilterCache; + +// 先在filter +// cache里为每个segment默认启用总bits-per-key=8,随着写入的segment的增加, +// 一旦已经占用了filter cache最大容量的一定阈值(如80%), +// 就利用GreedyAlgo计算规划问题,并进行模型训练 一旦filter +// cache已满,就进入filter cache的double +// heap调整,我们只需将新的segment用模型进行预测 +// 将新segment的node插入到两个heap里,在后台启动一个线程,自行调整两个堆,并不断返回调整的结果 +// 得到结果后,我们可以立即对filter +// units的启用情况进行调节,也可以先保存后面批量调整 具体见文档 + +// 注意加上一些必要的英文注释 +// filter cache主要为一个map, key是segment id(uint32_t), +// value就为FilterCacheItem类 成员函数需要在filter_cache_item.cc里定义 + +// TODO: how to get block_handles? +class FilterCacheEntry { + using DataPtr = std::shared_ptr; + + private: + const BlockBasedTable* table_; + FilterCache* filter_cache_; + + struct FilterCacheDataHandle : public Cache::Handle { + DataPtr value_; + FilterCache* cache_; + + FilterCacheDataHandle(DataPtr value, FilterCache* cache); + }; + using HandlePtr = std::shared_ptr; + + uint32_t segment_id_; + uint32_t loaded_units_num_; + std::array units_{}; + std::array block_handles_{}; + std::array cache_handles_{}; + mutable port::RWMutex rwlock; + + public: + // 构造函数,可以初始化成员变量 + // TODO pass right parameters + FilterCacheEntry(const uint32_t segment_id, const BlockBasedTable* table, + FilterCache* filter_cache, const std::vector& block_handles); + + // 清理成员变量,避免内存泄漏,如果new了空间,就可能需要在这里清理 + ~FilterCacheEntry(); + + // 占用的内存空间,这里估计总共使用的filter units占用的空间就行了 + // 注意,返回的空间大小为占用的bits数量,不是bytes数量 + size_t approximate_size(); + + // 根据目前已经启用的units数,启用或禁用filter units + // 输入需要启用的units数,决定启用、禁用还是不处理 + // units_num : [MIN_UNITS_NUM, MAX_UNITS_NUM] + void enable_units(uint32_t units_num); + + void prefetch_units(); + + // 获取缓存的 filter Block + std::vector> get_filter_blocks(); +}; +} // namespace ROCKSDB_NAMESPACE \ No newline at end of file diff --git a/db/art/filter_cache_heap.cc b/db/art/filter_cache_heap.cc index fd2afb48c..35cefb59f 100644 --- a/db/art/filter_cache_heap.cc +++ b/db/art/filter_cache_heap.cc @@ -1,20 +1,16 @@ #include "filter_cache_heap.h" #include #include +#include "port/likely.h" namespace ROCKSDB_NAMESPACE { -FilterCacheHeap FilterCacheHeapManager::benefit_heap_; -FilterCacheHeap FilterCacheHeapManager::cost_heap_; -std::map FilterCacheHeapManager::heap_visit_cnt_recorder_; -std::map FilterCacheHeapManager::units_num_limit_recorder_; -std::mutex FilterCacheHeapManager::manager_mutex_; FilterCacheHeapNode FilterCacheHeap::heap_top() { // need lock heap, or we may retrive outdated node // heap_mutex_.lock(); - if (!heap_.empty()) { + if (LIKELY(!heap_.empty())) { return heap_[0]; } else { return nullptr; @@ -23,6 +19,46 @@ FilterCacheHeapNode FilterCacheHeap::heap_top() { // heap_mutex_.unlock(); } +void FilterCacheHeap::heap_check(bool max_heap) { + // need lock heap, or we may retrive outdated node + // heap_mutex_.lock(); + + if (LIKELY(!heap_.empty())) { + if (max_heap) { + for (auto &node : heap_) { + assert(node->benefit_or_cost <= heap_[0]->benefit_or_cost); + } + } else { + for (auto &node : heap_) { + assert(node->benefit_or_cost >= heap_[0]->benefit_or_cost); + } + } + } + + // heap_mutex_.unlock(); +} + +void FilterCacheHeap::heap_print(std::vector& needed_segment_ids, const bool should_exist) { + // heap_mutex_.lock(); + + for (uint32_t &segment_id : needed_segment_ids) { + auto it = heap_index_.find(segment_id); + if (should_exist) + assert(it != heap_index_.end()); + if (UNLIKELY(it == heap_index_.end())) continue; + 
std::cout << "segment id: " << it->second->segment_id + << ", visit cnt: " << it->second->approx_visit_cnt + << ", benefit/cost: " << it->second->benefit_or_cost + << ", current units: " << it->second->current_units_num + << ", units limit: " << it->second->units_num_limit + << std::endl; + assert(it->second->is_alive); + assert(it->second->current_units_num <= it->second->units_num_limit); + } + + // heap_mutex_.unlock(); +} + /* void FilterCacheHeap::pop() { // heap_mutex_.lock(); @@ -94,6 +130,7 @@ void FilterCacheHeap::push(FilterCacheHeapNode& node) { void FilterCacheHeap::batch_query(std::vector& segment_ids, std::vector& return_nodes) { // heap_mutex_.lock(); + // uint32_t return_count = 0; return_nodes.clear(); for (uint32_t& segment_id : segment_ids) { auto it = heap_index_.find(segment_id); @@ -102,9 +139,11 @@ void FilterCacheHeap::batch_query(std::vector& segment_ids, std::vecto // so we should return null when query a merged segment id if (it != heap_index_.end() && (it->second)->is_alive == true) { return_node = it->second; // node exists in heap_index_ and segment alive + // return_count++; } return_nodes.emplace_back(return_node); } + // assert(segment_ids.size() == return_count); // heap_mutex_.unlock(); } @@ -120,14 +159,27 @@ void FilterCacheHeap::batch_upsert(std::vector& nodes) { // exist in heap_index_ and heap_ // we may query nodes from this heap, and update var in nodes, then upsert original nodes // check it->second != node to make sure that we won't free a refered sapce + assert(it->second != node); + assert(it->second->segment_id == segment_id); if (it->second != node) { *(it->second) = *(node); // only copy content, this will update content of node in heap_index_ and heap_ delete node; // remember to free unnecessary space! 
} + // bool found = false; + // for (auto &node : heap_) { + // found = found || (heap_index_[segment_id]->segment_id == node->segment_id); + // if (found) { + // assert(node == heap_index_[segment_id]); + // break; + // } + // } + // assert(found); // should found } else { // not exist in heap_index_ and heap_ heap_index_.insert(std::make_pair(segment_id, node)); // insert into heap_index_ heap_.emplace_back(node); // push into heap_ + assert(heap_index_[segment_id] == heap_[heap_.size()-1]); + assert(heap_index_[segment_id]->segment_id == segment_id); } } @@ -140,16 +192,34 @@ void FilterCacheHeap::batch_upsert(std::vector& nodes) { void FilterCacheHeap::batch_delete(std::vector& segment_ids) { // heap_mutex_.lock(); + // uint32_t delete_count = 0; + // uint32_t size_before = heap_.size(); + // we guarantee that if one node not exist in heap_index_, it must not exist in heap for (uint32_t& segment_id : segment_ids) { auto it = heap_index_.find(segment_id); - if (it == heap_index_.end()) { + if (UNLIKELY(it == heap_index_.end())) { // not exist in heap_index_ and heap_ // do nothing + // for (auto &node : heap_) { + // assert(node->segment_id != segment_id); + // } } else { // exist in heap_index_ and heap_ // set is_alive to false and delete after that it->second->is_alive = false; + assert(heap_index_[segment_id]->is_alive == false); + // bool found = false; + // for (auto &node : heap_) { + // found = found || (heap_index_[segment_id]->segment_id == node->segment_id); + // if (found) { + // assert(node == heap_index_[segment_id]); + // assert(node->is_alive == false); + // break; + // } + // } + // assert(found); // should found + // delete_count++; } } @@ -172,6 +242,11 @@ void FilterCacheHeap::batch_delete(std::vector& segment_ids) { } } + // check already deleted? + // for (uint32_t &segment_id : segment_ids) { + // assert(heap_index_.find(segment_id) == heap_index_.end()); + // } + // assert(heap_.size() + delete_count == size_before); // delete done, need to rebuild heap_ rebuild_heap(); @@ -181,20 +256,47 @@ void FilterCacheHeap::batch_delete(std::vector& segment_ids) { void FilterCacheHeapManager::batch_delete(std::vector& segment_ids) { manager_mutex_.lock(); + // std::set segment_ids_set; for (uint32_t& segment_id : segment_ids) { auto cnt_it = heap_visit_cnt_recorder_.find(segment_id); auto limit_it = units_num_limit_recorder_.find(segment_id); - if (cnt_it != heap_visit_cnt_recorder_.end()) { + // assert((cnt_it != heap_visit_cnt_recorder_.end() && limit_it != units_num_limit_recorder_.end()) + // || (cnt_it == heap_visit_cnt_recorder_.end() && limit_it == units_num_limit_recorder_.end())); + if (LIKELY(cnt_it != heap_visit_cnt_recorder_.end())) { heap_visit_cnt_recorder_.erase(segment_id); } - if (limit_it != units_num_limit_recorder_.end()) { + if (LIKELY(limit_it != units_num_limit_recorder_.end())) { units_num_limit_recorder_.erase(segment_id); } + // segment_ids_set.insert(segment_id); } + // assert(segment_ids_set.size() == segment_ids.size()); // all segment ids should be unique + + // // print before deletion + // if (segment_ids_set.size() > 0) { + // std::cout << std::endl; + // std::cout << "before deletion, print benefit heap items: " << std::endl; + // benefit_heap_.heap_print(segment_ids, false); + // std::cout << "before deletion, print cost heap items: " << std::endl; + // cost_heap_.heap_print(segment_ids, false); + // std::cout << std::endl; + // } benefit_heap_.batch_delete(segment_ids); cost_heap_.batch_delete(segment_ids); + // check whether it is heap? 
+ // benefit_heap_.heap_check(true); + // cost_heap_.heap_check(false); + // for (uint32_t &segment_id : segment_ids) { + // assert(heap_visit_cnt_recorder_.find(segment_id) == heap_visit_cnt_recorder_.end()); + // assert(units_num_limit_recorder_.find(segment_id) == units_num_limit_recorder_.end()); + // } + + assert(benefit_heap_.heap_size() == heap_visit_cnt_recorder_.size()); + assert(benefit_heap_.heap_size() == units_num_limit_recorder_.size()); + assert(benefit_heap_.heap_size() == cost_heap_.heap_size()); + manager_mutex_.unlock(); } @@ -202,11 +304,15 @@ void FilterCacheHeapManager::batch_upsert(std::vector& item manager_mutex_.lock(); std::vector benefit_nodes, cost_nodes; + // std::set segment_ids_set; for (FilterCacheHeapItem& item : items) { assert(item.current_units_num >= MIN_UNITS_NUM); assert(item.current_units_num <= item.units_num_limit); + assert(item.units_num_limit <= MAX_UNITS_NUM); double benefit = StandardBenefitWithMaxBound(item.approx_visit_cnt, item.current_units_num, item.units_num_limit); double cost = StandardCostWithMinBound(item.approx_visit_cnt, item.current_units_num, MIN_UNITS_NUM); + // if (item.units_num_limit == item.current_units_num) assert(benefit == 0); + // if (item.current_units_num == MIN_UNITS_NUM) assert(cost == __DBL_MAX__); // item meets at least one conditions // so that item always upsert into heap // if item.approx_visit_cnt = 0, still push into heap @@ -234,20 +340,18 @@ void FilterCacheHeapManager::batch_upsert(std::vector& item } */ - if (item.current_units_num <= item.units_num_limit) { - cost_nodes.emplace_back(new FilterCacheHeapItem(item.segment_id, - item.approx_visit_cnt, - item.current_units_num, - cost, - item.units_num_limit) - ); - benefit_nodes.emplace_back(new FilterCacheHeapItem(item.segment_id, - item.approx_visit_cnt, - item.current_units_num, - benefit, - item.units_num_limit) - ); - } + cost_nodes.emplace_back(new FilterCacheHeapItem(item.segment_id, + item.approx_visit_cnt, + item.current_units_num, + cost, + item.units_num_limit) + ); + benefit_nodes.emplace_back(new FilterCacheHeapItem(item.segment_id, + item.approx_visit_cnt, + item.current_units_num, + benefit, + item.units_num_limit) + ); // update visit cnt, we need to keep recorder visit cnt and heap visit cnt the same const uint32_t segment_id = item.segment_id; @@ -265,12 +369,42 @@ void FilterCacheHeapManager::batch_upsert(std::vector& item } else { units_num_limit_recorder_.insert(std::make_pair(segment_id, units_limit)); } + // segment_ids_set.insert(segment_id); + assert(heap_visit_cnt_recorder_[segment_id] == visit_cnt); + assert(units_num_limit_recorder_[segment_id] == units_limit); } + // assert(segment_ids_set.size() == items.size()); // upsert nodes into heaps benefit_heap_.batch_upsert(benefit_nodes); cost_heap_.batch_upsert(cost_nodes); + // std::vector segment_ids; + // std::copy(segment_ids_set.begin(), segment_ids_set.end(), std::back_inserter(segment_ids)); + // assert(segment_ids.size() == segment_ids_set.size()); + + // // print after upsertion + // if (segment_ids_set.size() > 0) { + // std::cout << std::endl; + // std::cout << "after upsertion, print benefit heap items: " << std::endl; + // benefit_heap_.heap_print(segment_ids, true); + // std::cout << "after upsertion, print cost heap items: " << std::endl; + // cost_heap_.heap_print(segment_ids, true); + // std::cout << std::endl; + // } + + // check whether is heap? 
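// [editor's sketch -- not part of the patch] StandardBenefitWithMaxBound and
// StandardCostWithMinBound are not shown in this diff; the stand-ins below
// only encode the boundary behaviour the commented asserts above rely on:
// zero benefit once a segment already sits at its units limit, and a DBL_MAX
// cost once it sits at the minimum. The linear visit-count score is a
// placeholder, not the real formula (assumes <cfloat>, <cstdint>):
inline double benefit_with_max_bound_sketch(uint32_t visit_cnt, uint16_t units,
                                            uint16_t units_limit) {
  if (units >= units_limit) return 0.0;    // nothing left to enable
  return static_cast<double>(visit_cnt);   // placeholder monotone score
}
inline double cost_with_min_bound_sketch(uint32_t visit_cnt, uint16_t units,
                                         uint16_t min_units) {
  if (units <= min_units) return DBL_MAX;  // never disable below the floor
  return static_cast<double>(visit_cnt);   // placeholder monotone score
}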
+ // benefit_heap_.heap_check(true); + // cost_heap_.heap_check(false); + // for (uint32_t &segment_id : segment_ids) { + // assert(heap_visit_cnt_recorder_.find(segment_id) != heap_visit_cnt_recorder_.end()); + // assert(units_num_limit_recorder_.find(segment_id) != units_num_limit_recorder_.end()); + // } + + assert(benefit_heap_.heap_size() == heap_visit_cnt_recorder_.size()); + assert(benefit_heap_.heap_size() == units_num_limit_recorder_.size()); + assert(benefit_heap_.heap_size() == cost_heap_.heap_size()); + manager_mutex_.unlock(); } @@ -280,12 +414,13 @@ bool FilterCacheHeapManager::try_modify(FilterCacheModifyResult& result) { FilterCacheHeapNode benefit_node = benefit_heap_.heap_top(); FilterCacheHeapNode cost_node = cost_heap_.heap_top(); // if benefit heap or cost heap empty, no need to modify - if (benefit_node == nullptr || cost_node == nullptr) { + if (UNLIKELY(benefit_node == nullptr || cost_node == nullptr)) { manager_mutex_.unlock(); // remember to unlock, or we will cause deadlock return false; } - if (benefit_node->is_alive == false || cost_node->is_alive == false) { + if (UNLIKELY(benefit_node->is_alive == false || cost_node->is_alive == false)) { + // std::cout << "one node is not alive, stop modification." << std::endl; manager_mutex_.unlock(); // remember to unlock, or we will cause deadlock return false; } @@ -293,7 +428,10 @@ bool FilterCacheHeapManager::try_modify(FilterCacheModifyResult& result) { const double benefit = benefit_node->benefit_or_cost; const double cost = cost_node->benefit_or_cost; // if benefit of enable one unit <= cost of disable one unit, no need to modify - if (benefit <= cost) { + if (benefit - cost < double(PURE_BENEFIT_BOUND)) { + // std::cout << std::endl; + // std::cout << "failed to modify, because benefit of modification do not hit threshold." << std::endl; + // std::cout << "benefit: " << benefit << ", cost: " << cost << std::endl; manager_mutex_.unlock(); // remember to unlock, or we will cause deadlock return false; } @@ -301,15 +439,29 @@ bool FilterCacheHeapManager::try_modify(FilterCacheModifyResult& result) { const uint32_t benefit_segment_id = benefit_node->segment_id; const uint32_t cost_segment_id = cost_node->segment_id; // if we will enable and disable one unit of the same segment, ignore it - if (benefit_segment_id == cost_segment_id) { + if (UNLIKELY(benefit_segment_id == cost_segment_id)) { + // std::cout << "cannot modify the same segment!" << std::endl; manager_mutex_.unlock(); // remember to unlock, or we will cause deadlock return false; } + if (UNLIKELY(heap_visit_cnt_recorder_.find(benefit_segment_id) == heap_visit_cnt_recorder_.end() + || heap_visit_cnt_recorder_.find(cost_segment_id) == heap_visit_cnt_recorder_.end())) { + // std::cout << "target segment merged, stop modification." << std::endl; + manager_mutex_.unlock(); + return false; + } + if (UNLIKELY(units_num_limit_recorder_.find(benefit_segment_id) == units_num_limit_recorder_.end() + || units_num_limit_recorder_.find(cost_segment_id) == units_num_limit_recorder_.end())) { + // std::cout << "target segment merged, stop modification." 
<< std::endl; + manager_mutex_.unlock(); + return false; + } // FilterCacheHeapItem(const uint32_t& id, const uint32_t& cnt, const uint16_t& units, const double& heap_value) // we can try filter unit modification, reminded that this modification will modify units num of two segments // so we need to upsert new nodes of these two segments into benefit heap and cost heap std::vector new_benefit_nodes, new_cost_nodes; + // std::vector segment_ids; /* if (benefit_node->current_units_num + 1 < benefit_node->units_num_limit) { @@ -399,11 +551,21 @@ bool FilterCacheHeapManager::try_modify(FilterCacheModifyResult& result) { cost_node->units_num_limit ) ); - // already make ready for upsert - benefit_heap_.batch_upsert(new_benefit_nodes); - cost_heap_.batch_upsert(new_cost_nodes); - // write result + // segment_ids.emplace_back(benefit_node->segment_id); + // segment_ids.emplace_back(cost_node->segment_id); + + // // print nodes + // std::cout << std::endl; + // std::cout << "before modification, print nodes." << std::endl; + // std::cout << "benefit nodes: " << std::endl; + // benefit_heap_.heap_print(segment_ids, true); + // std::cout << "cost nodes: " << std::endl; + // cost_heap_.heap_print(segment_ids, true); + + // write result before real upsert, + // noticed that batch_upsert will also modify content of these nodes, + // so we need to save contents right now result.enable_segment_id = benefit_node->segment_id; result.disable_segment_id = cost_node->segment_id; result.enable_segment_units_num = benefit_node->current_units_num; @@ -413,6 +575,37 @@ bool FilterCacheHeapManager::try_modify(FilterCacheModifyResult& result) { result.enable_benefit = benefit; result.disable_cost = cost; + // std::cout << std::endl; + // std::cout << "enable one unit for segment " << result.enable_segment_id + // << ", disable one unit for segment " << result.disable_segment_id + // << ", enable from " << result.enable_segment_units_num << " to " << result.enable_segment_next_units_num + // << ", disable from " << result.disable_segment_units_num << " to " << result.disable_segment_next_units_num + // << ", benefit: " << result.enable_benefit << ", cost: " << result.disable_cost << std::endl; + + // already make ready for upsert + benefit_heap_.batch_upsert(new_benefit_nodes); + cost_heap_.batch_upsert(new_cost_nodes); + + // // print nodes + // std::cout << std::endl; + // std::cout << "after modification, print nodes." << std::endl; + // std::cout << "benefit nodes: " << std::endl; + // benefit_heap_.heap_print(segment_ids, true); + // std::cout << "cost nodes: " << std::endl; + // cost_heap_.heap_print(segment_ids, true); + + // check whether is heap? 
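// [editor's sketch -- not part of the patch] The gate that try_modify()
// applies above before transferring one filter unit, reduced to its core:
// both heap tops must exist, be alive, and belong to different segments, and
// the benefit of enabling one unit must beat the cost of disabling one by at
// least PURE_BENEFIT_BOUND (stand-in types; assumes <cstdint>):
struct HeapTopSketch {
  uint32_t segment_id;
  double benefit_or_cost;
  bool is_alive;
};
inline bool should_modify_sketch(const HeapTopSketch* benefit_top,
                                 const HeapTopSketch* cost_top,
                                 double pure_benefit_bound) {
  if (benefit_top == nullptr || cost_top == nullptr) return false;
  if (!benefit_top->is_alive || !cost_top->is_alive) return false;
  if (benefit_top->segment_id == cost_top->segment_id) return false;
  return benefit_top->benefit_or_cost - cost_top->benefit_or_cost >=
         pure_benefit_bound;  // net gain must clear the threshold
}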
+ // benefit_heap_.heap_check(true); + // cost_heap_.heap_check(false); + // for (uint32_t &segment_id : segment_ids) { + // assert(heap_visit_cnt_recorder_.find(segment_id) != heap_visit_cnt_recorder_.end()); + // assert(units_num_limit_recorder_.find(segment_id) != units_num_limit_recorder_.end()); + // } + + assert(benefit_heap_.heap_size() == heap_visit_cnt_recorder_.size()); + assert(benefit_heap_.heap_size() == units_num_limit_recorder_.size()); + assert(benefit_heap_.heap_size() == cost_heap_.heap_size()); + // return nothing, result already written into var result manager_mutex_.unlock(); @@ -420,34 +613,49 @@ bool FilterCacheHeapManager::try_modify(FilterCacheModifyResult& result) { return true; } -void FilterCacheHeapManager::sync_visit_cnt(std::map& current_visit_cnt_recorder) { +void FilterCacheHeapManager::sync_visit_cnt(std::map& recent_visit_cnt_recorder) { manager_mutex_.lock(); std::vector sync_nodes; std::vector sync_segment_ids; auto heap_it = heap_visit_cnt_recorder_.begin(); - auto current_it = current_visit_cnt_recorder.begin(); + auto recent_it = recent_visit_cnt_recorder.begin(); while (heap_it != heap_visit_cnt_recorder_.end() && - current_it != current_visit_cnt_recorder.end()) { - if (heap_it->first < current_it->first) { + recent_it != recent_visit_cnt_recorder.end()) { + if (heap_it->first < recent_it->first) { heap_it ++; - } else if (heap_it->first > current_it->first) { - current_it ++; + } else if (heap_it->first > recent_it->first) { + recent_it ++; } else { - // heap_it->first == current_it->first - assert(heap_it->first == current_it->first); + // heap_it->first == recent_it->first + assert(heap_it->first == recent_it->first); int64_t old_visit_cnt = heap_it->second; - int64_t cur_visit_cnt = current_it->second; - if (std::abs(cur_visit_cnt-old_visit_cnt) > VISIT_CNT_UPDATE_BOUND) { - heap_it->second = current_it->second; // remember to update heap visit cnt recorder - sync_segment_ids.emplace_back(current_it->first); + int64_t rec_visit_cnt = recent_it->second; + if (std::abs(rec_visit_cnt-old_visit_cnt) > VISIT_CNT_UPDATE_BOUND) { + heap_it->second = recent_it->second; // remember to update heap visit cnt recorder + sync_segment_ids.emplace_back(recent_it->first); + // std::cout << "segment " << heap_it->first << " cnt diff is " << std::abs(rec_visit_cnt-old_visit_cnt) + // << ", bigger than " << VISIT_CNT_UPDATE_BOUND << ", start to sync." << std::endl; + } + else { + // std::cout << "segment " << heap_it->first << " cnt diff is " << std::abs(rec_visit_cnt-old_visit_cnt) + // << ", smaller than / euqal to " << VISIT_CNT_UPDATE_BOUND << ", do not sync." 
<< std::endl; } - // heap_it ++; - current_it ++; + heap_it ++; + recent_it ++; } } + // // print sync nodes before + // std::cout << std::endl; + // std::cout << "before sync visit count, print sync nodes: " << std::endl; + // std::cout << "benefit heap:" << std::endl; + // benefit_heap_.heap_print(sync_segment_ids, true); + // std::cout << "cost heap:" << std::endl; + // cost_heap_.heap_print(sync_segment_ids, true); + // std::cout << std::endl; + // query nodes in heap std::vector sync_benefit_nodes, sync_cost_nodes; benefit_heap_.batch_query(sync_segment_ids, sync_benefit_nodes); @@ -455,16 +663,18 @@ void FilterCacheHeapManager::sync_visit_cnt(std::map& curren // update visit cnt and benefit/cost in these nodes for (FilterCacheHeapNode& sync_benefit_node : sync_benefit_nodes) { - if (sync_benefit_node != nullptr) { - sync_benefit_node->approx_visit_cnt = current_visit_cnt_recorder[sync_benefit_node->segment_id]; + assert(sync_benefit_node != nullptr); + if (LIKELY(sync_benefit_node != nullptr)) { + sync_benefit_node->approx_visit_cnt = recent_visit_cnt_recorder[sync_benefit_node->segment_id]; sync_benefit_node->benefit_or_cost = StandardBenefitWithMaxBound(sync_benefit_node->approx_visit_cnt, sync_benefit_node->current_units_num, sync_benefit_node->units_num_limit); } } for (FilterCacheHeapNode& sync_cost_node : sync_cost_nodes) { - if (sync_cost_node != nullptr) { - sync_cost_node->approx_visit_cnt = current_visit_cnt_recorder[sync_cost_node->segment_id]; + assert(sync_cost_node != nullptr); + if (LIKELY(sync_cost_node != nullptr)) { + sync_cost_node->approx_visit_cnt = recent_visit_cnt_recorder[sync_cost_node->segment_id]; sync_cost_node->benefit_or_cost = StandardCostWithMinBound(sync_cost_node->approx_visit_cnt, sync_cost_node->current_units_num, MIN_UNITS_NUM); @@ -474,7 +684,6 @@ void FilterCacheHeapManager::sync_visit_cnt(std::map& curren // upsert nodes into benefit heap and cost heap // benefit_heap_.batch_upsert(sync_benefit_nodes); // cost_heap_.batch_upsert(sync_cost_nodes); - // notice that we already updated these nodes in heap, we only need to rebuild heap // but heap.upsert include the step of checking whether these segments already in heap @@ -482,6 +691,28 @@ void FilterCacheHeapManager::sync_visit_cnt(std::map& curren benefit_heap_.rebuild_heap(); cost_heap_.rebuild_heap(); + // check whether is heap? 
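// [editor's sketch -- not part of the patch] The hysteresis rule that
// sync_visit_cnt() applies above: a heap node's visit count is refreshed (and
// the heaps rebuilt) only when the recent estimate drifts more than
// VISIT_CNT_UPDATE_BOUND away from the stored value, which bounds how often
// rebuild_heap() has to run (assumes <cstdint>, <cstdlib>):
inline bool needs_sync_sketch(int64_t heap_cnt, int64_t recent_cnt,
                              int64_t update_bound) {
  return std::abs(recent_cnt - heap_cnt) > update_bound;
}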
+ // benefit_heap_.heap_check(true); + // cost_heap_.heap_check(false); + // for (uint32_t &segment_id : sync_segment_ids) { + // assert(heap_visit_cnt_recorder_.find(segment_id) != heap_visit_cnt_recorder_.end()); + // assert(heap_visit_cnt_recorder_[segment_id] == recent_visit_cnt_recorder[segment_id]); + // assert(units_num_limit_recorder_.find(segment_id) != units_num_limit_recorder_.end()); + // } + + assert(benefit_heap_.heap_size() == heap_visit_cnt_recorder_.size()); + assert(benefit_heap_.heap_size() == units_num_limit_recorder_.size()); + assert(benefit_heap_.heap_size() == cost_heap_.heap_size()); + + // // print sync nodes after + // std::cout << std::endl; + // std::cout << "after sync visit count, print sync nodes: " << std::endl; + // std::cout << "benefit heap:" << std::endl; + // benefit_heap_.heap_print(sync_segment_ids, true); + // std::cout << "cost heap:" << std::endl; + // cost_heap_.heap_print(sync_segment_ids, true); + // std::cout << std::endl; + manager_mutex_.unlock(); } @@ -502,15 +733,27 @@ void FilterCacheHeapManager::sync_units_num_limit(std::map& } else { // origin_it->first == current_it->first assert(origin_it->first == current_it->first); - assert(current_it->second <= MAX_UNITS_NUM); + assert(current_it->second <= MAX_UNITS_NUM && current_it->second >= MIN_UNITS_NUM); if (origin_it->second != current_it->second) { origin_it->second = current_it->second; sync_segment_ids.emplace_back(current_it->first); } + uint32_t segment_id = origin_it->first; + assert(units_num_limit_recorder_[segment_id] == current_it->second); + origin_it ++; current_it ++; } } + // // print sync nodes before + // std::cout << std::endl; + // std::cout << "before sync units limit, print sync nodes: " << std::endl; + // std::cout << "benefit heap:" << std::endl; + // benefit_heap_.heap_print(sync_segment_ids, true); + // std::cout << "cost heap:" << std::endl; + // cost_heap_.heap_print(sync_segment_ids, true); + // std::cout << std::endl; + // query nodes in heap std::vector sync_benefit_nodes, sync_cost_nodes; benefit_heap_.batch_query(sync_segment_ids, sync_benefit_nodes); @@ -518,20 +761,24 @@ void FilterCacheHeapManager::sync_units_num_limit(std::map& // update units num limit, units num and benefit/cost in these nodes for (FilterCacheHeapNode& sync_benefit_node : sync_benefit_nodes) { - if (sync_benefit_node != nullptr) { + assert(sync_benefit_node != nullptr); + if (LIKELY(sync_benefit_node != nullptr)) { sync_benefit_node->units_num_limit = current_units_num_limit_recorder[sync_benefit_node->segment_id]; sync_benefit_node->current_units_num = std::min(sync_benefit_node->units_num_limit, sync_benefit_node->current_units_num); + assert(sync_benefit_node->units_num_limit >= sync_benefit_node->current_units_num); sync_benefit_node->benefit_or_cost = StandardBenefitWithMaxBound(sync_benefit_node->approx_visit_cnt, sync_benefit_node->current_units_num, sync_benefit_node->units_num_limit); } } for (FilterCacheHeapNode& sync_cost_node : sync_cost_nodes) { - if (sync_cost_node != nullptr) { + assert(sync_cost_node != nullptr); + if (LIKELY(sync_cost_node != nullptr)) { sync_cost_node->units_num_limit = current_units_num_limit_recorder[sync_cost_node->segment_id]; sync_cost_node->current_units_num = std::min(sync_cost_node->units_num_limit, sync_cost_node->current_units_num); + assert(sync_cost_node->units_num_limit >= sync_cost_node->current_units_num); sync_cost_node->benefit_or_cost = StandardCostWithMinBound(sync_cost_node->approx_visit_cnt, sync_cost_node->current_units_num, 
MIN_UNITS_NUM); @@ -549,488 +796,509 @@ void FilterCacheHeapManager::sync_units_num_limit(std::map& benefit_heap_.rebuild_heap(); cost_heap_.rebuild_heap(); + // check whether is heap? + // benefit_heap_.heap_check(true); + // cost_heap_.heap_check(false); + // for (uint32_t &segment_id : sync_segment_ids) { + // assert(heap_visit_cnt_recorder_.find(segment_id) != heap_visit_cnt_recorder_.end()); + // assert(units_num_limit_recorder_.find(segment_id) != units_num_limit_recorder_.end()); + // } + + assert(benefit_heap_.heap_size() == heap_visit_cnt_recorder_.size()); + assert(benefit_heap_.heap_size() == units_num_limit_recorder_.size()); + assert(benefit_heap_.heap_size() == cost_heap_.heap_size()); + + // // print sync nodes after + // std::cout << std::endl; + // std::cout << "after sync units limit, print sync nodes: " << std::endl; + // std::cout << "benefit heap:" << std::endl; + // benefit_heap_.heap_print(sync_segment_ids, true); + // std::cout << "cost heap:" << std::endl; + // cost_heap_.heap_print(sync_segment_ids, true); + // std::cout << std::endl; + manager_mutex_.unlock(); } -void FilterCacheHeapManager::debug() { - std::vector items; - std::vector segment_ids; - std::map current_visit_cnt_recorder; - std::map current_units_num_limit_recorder; - std::map b_heap_index; - std::vector b_heap; - std::map c_heap_index; - std::vector c_heap; - std::fstream f_heap; - f_heap.open("/pg_wal/ycc/heap.log", std::ios::out | std::ios::app); - // FilterCacheHeapItem(const uint32_t& id, const uint32_t& cnt, const uint16_t& units, - // const double& heap_value, const uint16_t& limit) - // 1. try to insert some new data - f_heap << "[DEBUG] debug step 1 : batch insert" << std::endl << std::endl; - for (uint32_t id = 0; id < 70; id++) { - items.emplace_back(id % 70, (id % 70) * 10, (id % 70) / 10, 0, MAX_UNITS_NUM); - } - batch_upsert(items); - benefit_heap_.heap_index(b_heap_index); - benefit_heap_.heap(b_heap); - cost_heap_.heap_index(c_heap_index); - cost_heap_.heap(c_heap); - f_heap << "[DEBUG] step1 b_heap_index : " << std::endl; - for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step1 b_heap : " << std::endl; - for (FilterCacheHeapNode& node : b_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step1 c_heap_index : " << std::endl; - for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step1 c_heap : " << std::endl; - for (FilterCacheHeapNode& node : c_heap) { - 
f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step1 visit_cnt_recorder : " << std::endl; - for (auto it = heap_visit_cnt_recorder_.begin(); - it != heap_visit_cnt_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - f_heap << "[DEBUG] step1 units_limit_recorder : " << std::endl; - for (auto it = units_num_limit_recorder_.begin(); - it != units_num_limit_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - - // 2. try to update old data - f_heap << std::endl << std::endl<< "[DEBUG] debug step 2 : batch update (using upsert)" << std::endl << std::endl; - items.clear(); - for (uint32_t id = 0; id < 70; id++) { - items.emplace_back(id % 70, (id % 70) * std::pow(10, (id % 70) / 10), (id % 70) / 10, 0, MAX_UNITS_NUM); - } - batch_upsert(items); - benefit_heap_.heap_index(b_heap_index); - benefit_heap_.heap(b_heap); - cost_heap_.heap_index(c_heap_index); - cost_heap_.heap(c_heap); - f_heap << "[DEBUG] step2 b_heap_index : " << std::endl; - for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step2 b_heap : " << std::endl; - for (FilterCacheHeapNode& node : b_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step2 c_heap_index : " << std::endl; - for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step2 c_heap : " << std::endl; - for (FilterCacheHeapNode& node : c_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step2 visit_cnt_recorder : " << std::endl; - for (auto it = heap_visit_cnt_recorder_.begin(); - it != heap_visit_cnt_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - f_heap << "[DEBUG] step2 units_limit_recorder : " << std::endl; - for (auto it = units_num_limit_recorder_.begin(); - it != units_num_limit_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << 
std::endl; - } - - // 3. try to delete some data - f_heap << std::endl << std::endl<< "[DEBUG] debug step 3 : batch delete" << std::endl << std::endl; - items.clear(); - segment_ids.clear(); - for (uint32_t i = 0; i < 10; i++) { - segment_ids.emplace_back(i); - } - for (uint32_t i = 60; i < 100; i++) { - segment_ids.emplace_back(i); - } - batch_delete(segment_ids); - benefit_heap_.heap_index(b_heap_index); - benefit_heap_.heap(b_heap); - cost_heap_.heap_index(c_heap_index); - cost_heap_.heap(c_heap); - f_heap << "[DEBUG] step3 b_heap_index : " << std::endl; - for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step3 b_heap : " << std::endl; - for (FilterCacheHeapNode& node : b_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step3 c_heap_index : " << std::endl; - for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step3 c_heap : " << std::endl; - for (FilterCacheHeapNode& node : c_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step3 visit_cnt_recorder : " << std::endl; - for (auto it = heap_visit_cnt_recorder_.begin(); - it != heap_visit_cnt_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - f_heap << "[DEBUG] step3 units_limit_recorder : " << std::endl; - for (auto it = units_num_limit_recorder_.begin(); - it != units_num_limit_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - - // 4. 
try to sync visit cnt - f_heap << std::endl << std::endl<< "[DEBUG] debug step 4 : sync visit cnt " << std::endl << std::endl; - for (uint32_t id = 0; id < 40; id++) { - if (id % 2 == 0) { - current_visit_cnt_recorder.insert(std::make_pair(id, (id % 70) * std::pow(10, (id % 70) / 10) + 101010)); - } - } - for (uint32_t id = 40; id < 60; id++) { - current_visit_cnt_recorder.insert(std::make_pair(id, (id % 70) * std::pow(10, (id % 70) / 10) + 101010)); - } - sync_visit_cnt(current_visit_cnt_recorder); - benefit_heap_.heap_index(b_heap_index); - benefit_heap_.heap(b_heap); - cost_heap_.heap_index(c_heap_index); - cost_heap_.heap(c_heap); - f_heap << "[DEBUG] step4 b_heap_index : " << std::endl; - for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step4 b_heap : " << std::endl; - for (FilterCacheHeapNode& node : b_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step4 c_heap_index : " << std::endl; - for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step4 c_heap : " << std::endl; - for (FilterCacheHeapNode& node : c_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step4 visit_cnt_recorder : " << std::endl; - for (auto it = heap_visit_cnt_recorder_.begin(); - it != heap_visit_cnt_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - f_heap << "[DEBUG] step4 units_limit_recorder : " << std::endl; - for (auto it = units_num_limit_recorder_.begin(); - it != units_num_limit_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - - // 5. 
try to decrease units limit - f_heap << std::endl << std::endl<< "[DEBUG] debug step 5 : decrease units limit " << std::endl << std::endl; - for (uint32_t id = 0; id < 40; id++) { - if (id % 2 == 0) { - current_units_num_limit_recorder.insert(std::make_pair(id, 0)); - } else { - current_units_num_limit_recorder.insert(std::make_pair(id, 1)); - } - } - for (uint32_t id = 40; id < 50; id++) { - current_units_num_limit_recorder.insert(std::make_pair(id, 3)); - } - for (uint32_t id = 50; id < 70; id++) { - current_units_num_limit_recorder.insert(std::make_pair(id, 5)); - } - sync_units_num_limit(current_units_num_limit_recorder); - benefit_heap_.heap_index(b_heap_index); - benefit_heap_.heap(b_heap); - cost_heap_.heap_index(c_heap_index); - cost_heap_.heap(c_heap); - f_heap << "[DEBUG] step5 b_heap_index : " << std::endl; - for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step5 b_heap : " << std::endl; - for (FilterCacheHeapNode& node : b_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step5 c_heap_index : " << std::endl; - for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step5 c_heap : " << std::endl; - for (FilterCacheHeapNode& node : c_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step5 visit_cnt_recorder : " << std::endl; - for (auto it = heap_visit_cnt_recorder_.begin(); - it != heap_visit_cnt_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - f_heap << "[DEBUG] step5 units_limit_recorder : " << std::endl; - for (auto it = units_num_limit_recorder_.begin(); - it != units_num_limit_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - - // 6. 
try to increase units limit - f_heap << std::endl << std::endl<< "[DEBUG] debug step 6 : increase units limit " << std::endl << std::endl; - for (uint32_t id = 0; id < 40; id++) { - if (id % 2 == 0) { - current_units_num_limit_recorder[id] = 3; - } else { - current_units_num_limit_recorder[id] = 4; - } - } - for (uint32_t id = 40; id < 50; id++) { - current_units_num_limit_recorder[id] = 5; - } - for (uint32_t id = 50; id < 70; id++) { - current_units_num_limit_recorder[id] = 6; - } - sync_units_num_limit(current_units_num_limit_recorder); - benefit_heap_.heap_index(b_heap_index); - benefit_heap_.heap(b_heap); - cost_heap_.heap_index(c_heap_index); - cost_heap_.heap(c_heap); - f_heap << "[DEBUG] step6 b_heap_index : " << std::endl; - for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step6 b_heap : " << std::endl; - for (FilterCacheHeapNode& node : b_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step6 c_heap_index : " << std::endl; - for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step6 c_heap : " << std::endl; - for (FilterCacheHeapNode& node : c_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step6 visit_cnt_recorder : " << std::endl; - for (auto it = heap_visit_cnt_recorder_.begin(); - it != heap_visit_cnt_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - f_heap << "[DEBUG] step6 units_limit_recorder : " << std::endl; - for (auto it = units_num_limit_recorder_.begin(); - it != units_num_limit_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - - // 7. 
try to loop modification - f_heap << std::endl << std::endl<< "[DEBUG] debug step 7 : loop try_modify " << std::endl << std::endl; - f_heap << "[DEBUG] step7 loop start : " << std::endl; - FilterCacheModifyResult result; - while (try_modify(result)) { - f_heap << "enable segment -> " << "id : " << result.enable_segment_id; - f_heap << " , prev units num : " << result.enable_segment_units_num; - f_heap << " , benefit : " << result.enable_benefit << std::endl; - f_heap << "disable segment -> " << "id : " << result.disable_segment_id; - f_heap << " , prev units num : " << result.disable_segment_units_num; - f_heap << " , cost : " << result.disable_cost << std::endl; - } - // write final indexs and heaps - benefit_heap_.heap_index(b_heap_index); - benefit_heap_.heap(b_heap); - cost_heap_.heap_index(c_heap_index); - cost_heap_.heap(c_heap); - f_heap << "[DEBUG] step7 b_heap_index : " << std::endl; - for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step7 b_heap : " << std::endl; - for (FilterCacheHeapNode& node : b_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step7 c_heap_index : " << std::endl; - for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step7 c_heap : " << std::endl; - for (FilterCacheHeapNode& node : c_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step7 visit_cnt_recorder : " << std::endl; - for (auto it = heap_visit_cnt_recorder_.begin(); - it != heap_visit_cnt_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - f_heap << "[DEBUG] step7 units_limit_recorder : " << std::endl; - for (auto it = units_num_limit_recorder_.begin(); - it != units_num_limit_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - - f_heap.close(); -} +// void FilterCacheHeapManager::debug() { +// std::vector items; +// std::vector segment_ids; +// std::map current_visit_cnt_recorder; +// std::map current_units_num_limit_recorder; +// std::map b_heap_index; +// std::vector b_heap; +// std::map c_heap_index; +// std::vector c_heap; +// std::fstream f_heap; +// 
f_heap.open("/home/guoteng_20241228_135/WaLSM+/log/heap.log", std::ios::out | std::ios::app); +// // FilterCacheHeapItem(const uint32_t& id, const uint32_t& cnt, const uint16_t& units, +// // const double& heap_value, const uint16_t& limit) +// // 1. try to insert some new data +// f_heap << "[DEBUG] debug step 1 : batch insert" << std::endl << std::endl; +// for (uint32_t id = 0; id < 70; id++) { +// items.emplace_back(id % 70, (id % 70) * 10, (id % 70) / 10, 0, MAX_UNITS_NUM); +// } +// batch_upsert(items); +// benefit_heap_.heap_index(b_heap_index); +// benefit_heap_.heap(b_heap); +// cost_heap_.heap_index(c_heap_index); +// cost_heap_.heap(c_heap); +// f_heap << "[DEBUG] step1 b_heap_index : " << std::endl; +// for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step1 b_heap : " << std::endl; +// for (FilterCacheHeapNode& node : b_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step1 c_heap_index : " << std::endl; +// for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step1 c_heap : " << std::endl; +// for (FilterCacheHeapNode& node : c_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step1 visit_cnt_recorder : " << std::endl; +// for (auto it = heap_visit_cnt_recorder_.begin(); +// it != heap_visit_cnt_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } +// f_heap << "[DEBUG] step1 units_limit_recorder : " << std::endl; +// for (auto it = units_num_limit_recorder_.begin(); +// it != units_num_limit_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } + +// // 2. 
try to update old data +// f_heap << std::endl << std::endl<< "[DEBUG] debug step 2 : batch update (using upsert)" << std::endl << std::endl; +// items.clear(); +// for (uint32_t id = 0; id < 70; id++) { +// items.emplace_back(id % 70, (id % 70) * std::pow(10, (id % 70) / 10), (id % 70) / 10, 0, MAX_UNITS_NUM); +// } +// batch_upsert(items); +// benefit_heap_.heap_index(b_heap_index); +// benefit_heap_.heap(b_heap); +// cost_heap_.heap_index(c_heap_index); +// cost_heap_.heap(c_heap); +// f_heap << "[DEBUG] step2 b_heap_index : " << std::endl; +// for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step2 b_heap : " << std::endl; +// for (FilterCacheHeapNode& node : b_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step2 c_heap_index : " << std::endl; +// for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step2 c_heap : " << std::endl; +// for (FilterCacheHeapNode& node : c_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step2 visit_cnt_recorder : " << std::endl; +// for (auto it = heap_visit_cnt_recorder_.begin(); +// it != heap_visit_cnt_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } +// f_heap << "[DEBUG] step2 units_limit_recorder : " << std::endl; +// for (auto it = units_num_limit_recorder_.begin(); +// it != units_num_limit_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } + +// // 3. 
try to delete some data +// f_heap << std::endl << std::endl<< "[DEBUG] debug step 3 : batch delete" << std::endl << std::endl; +// items.clear(); +// segment_ids.clear(); +// for (uint32_t i = 0; i < 10; i++) { +// segment_ids.emplace_back(i); +// } +// for (uint32_t i = 60; i < 100; i++) { +// segment_ids.emplace_back(i); +// } +// batch_delete(segment_ids); +// benefit_heap_.heap_index(b_heap_index); +// benefit_heap_.heap(b_heap); +// cost_heap_.heap_index(c_heap_index); +// cost_heap_.heap(c_heap); +// f_heap << "[DEBUG] step3 b_heap_index : " << std::endl; +// for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step3 b_heap : " << std::endl; +// for (FilterCacheHeapNode& node : b_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step3 c_heap_index : " << std::endl; +// for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step3 c_heap : " << std::endl; +// for (FilterCacheHeapNode& node : c_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step3 visit_cnt_recorder : " << std::endl; +// for (auto it = heap_visit_cnt_recorder_.begin(); +// it != heap_visit_cnt_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } +// f_heap << "[DEBUG] step3 units_limit_recorder : " << std::endl; +// for (auto it = units_num_limit_recorder_.begin(); +// it != units_num_limit_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } + +// // 4. 
try to sync visit cnt +// f_heap << std::endl << std::endl<< "[DEBUG] debug step 4 : sync visit cnt " << std::endl << std::endl; +// for (uint32_t id = 0; id < 40; id++) { +// if (id % 2 == 0) { +// current_visit_cnt_recorder.insert(std::make_pair(id, (id % 70) * std::pow(10, (id % 70) / 10) + 101010)); +// } +// } +// for (uint32_t id = 40; id < 60; id++) { +// current_visit_cnt_recorder.insert(std::make_pair(id, (id % 70) * std::pow(10, (id % 70) / 10) + 101010)); +// } +// sync_visit_cnt(current_visit_cnt_recorder); +// benefit_heap_.heap_index(b_heap_index); +// benefit_heap_.heap(b_heap); +// cost_heap_.heap_index(c_heap_index); +// cost_heap_.heap(c_heap); +// f_heap << "[DEBUG] step4 b_heap_index : " << std::endl; +// for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step4 b_heap : " << std::endl; +// for (FilterCacheHeapNode& node : b_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step4 c_heap_index : " << std::endl; +// for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step4 c_heap : " << std::endl; +// for (FilterCacheHeapNode& node : c_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step4 visit_cnt_recorder : " << std::endl; +// for (auto it = heap_visit_cnt_recorder_.begin(); +// it != heap_visit_cnt_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } +// f_heap << "[DEBUG] step4 units_limit_recorder : " << std::endl; +// for (auto it = units_num_limit_recorder_.begin(); +// it != units_num_limit_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } + +// // 5. 
try to decrease units limit +// f_heap << std::endl << std::endl<< "[DEBUG] debug step 5 : decrease units limit " << std::endl << std::endl; +// for (uint32_t id = 0; id < 40; id++) { +// if (id % 2 == 0) { +// current_units_num_limit_recorder.insert(std::make_pair(id, 0)); +// } else { +// current_units_num_limit_recorder.insert(std::make_pair(id, 1)); +// } +// } +// for (uint32_t id = 40; id < 50; id++) { +// current_units_num_limit_recorder.insert(std::make_pair(id, 3)); +// } +// for (uint32_t id = 50; id < 70; id++) { +// current_units_num_limit_recorder.insert(std::make_pair(id, 5)); +// } +// sync_units_num_limit(current_units_num_limit_recorder); +// benefit_heap_.heap_index(b_heap_index); +// benefit_heap_.heap(b_heap); +// cost_heap_.heap_index(c_heap_index); +// cost_heap_.heap(c_heap); +// f_heap << "[DEBUG] step5 b_heap_index : " << std::endl; +// for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step5 b_heap : " << std::endl; +// for (FilterCacheHeapNode& node : b_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step5 c_heap_index : " << std::endl; +// for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step5 c_heap : " << std::endl; +// for (FilterCacheHeapNode& node : c_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step5 visit_cnt_recorder : " << std::endl; +// for (auto it = heap_visit_cnt_recorder_.begin(); +// it != heap_visit_cnt_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } +// f_heap << "[DEBUG] step5 units_limit_recorder : " << std::endl; +// for (auto it = units_num_limit_recorder_.begin(); +// it != units_num_limit_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } + +// // 6. 
try to increase units limit +// f_heap << std::endl << std::endl<< "[DEBUG] debug step 6 : increase units limit " << std::endl << std::endl; +// for (uint32_t id = 0; id < 40; id++) { +// if (id % 2 == 0) { +// current_units_num_limit_recorder[id] = 3; +// } else { +// current_units_num_limit_recorder[id] = 4; +// } +// } +// for (uint32_t id = 40; id < 50; id++) { +// current_units_num_limit_recorder[id] = 5; +// } +// for (uint32_t id = 50; id < 70; id++) { +// current_units_num_limit_recorder[id] = 6; +// } +// sync_units_num_limit(current_units_num_limit_recorder); +// benefit_heap_.heap_index(b_heap_index); +// benefit_heap_.heap(b_heap); +// cost_heap_.heap_index(c_heap_index); +// cost_heap_.heap(c_heap); +// f_heap << "[DEBUG] step6 b_heap_index : " << std::endl; +// for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step6 b_heap : " << std::endl; +// for (FilterCacheHeapNode& node : b_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step6 c_heap_index : " << std::endl; +// for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step6 c_heap : " << std::endl; +// for (FilterCacheHeapNode& node : c_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step6 visit_cnt_recorder : " << std::endl; +// for (auto it = heap_visit_cnt_recorder_.begin(); +// it != heap_visit_cnt_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } +// f_heap << "[DEBUG] step6 units_limit_recorder : " << std::endl; +// for (auto it = units_num_limit_recorder_.begin(); +// it != units_num_limit_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } + +// // 7. 
try to loop modification +// f_heap << std::endl << std::endl<< "[DEBUG] debug step 7 : loop try_modify " << std::endl << std::endl; +// f_heap << "[DEBUG] step7 loop start : " << std::endl; +// FilterCacheModifyResult result; +// while (try_modify(result)) { +// f_heap << "enable segment -> " << "id : " << result.enable_segment_id; +// f_heap << " , prev units num : " << result.enable_segment_units_num; +// f_heap << " , benefit : " << result.enable_benefit << std::endl; +// f_heap << "disable segment -> " << "id : " << result.disable_segment_id; +// f_heap << " , prev units num : " << result.disable_segment_units_num; +// f_heap << " , cost : " << result.disable_cost << std::endl; +// } +// // write final indexs and heaps +// benefit_heap_.heap_index(b_heap_index); +// benefit_heap_.heap(b_heap); +// cost_heap_.heap_index(c_heap_index); +// cost_heap_.heap(c_heap); +// f_heap << "[DEBUG] step7 b_heap_index : " << std::endl; +// for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step7 b_heap : " << std::endl; +// for (FilterCacheHeapNode& node : b_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step7 c_heap_index : " << std::endl; +// for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step7 c_heap : " << std::endl; +// for (FilterCacheHeapNode& node : c_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step7 visit_cnt_recorder : " << std::endl; +// for (auto it = heap_visit_cnt_recorder_.begin(); +// it != heap_visit_cnt_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } +// f_heap << "[DEBUG] step7 units_limit_recorder : " << std::endl; +// for (auto it = units_num_limit_recorder_.begin(); +// it != units_num_limit_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } + +// f_heap.close(); +// } } \ No newline at end of file diff --git a/db/art/filter_cache_heap.h b/db/art/filter_cache_heap.h index ca9aea1aa..ab299ca9e 100644 --- a/db/art/filter_cache_heap.h +++ b/db/art/filter_cache_heap.h @@ -4,6 +4,7 @@ #include #include 
#include +#include #include #include #include @@ -182,6 +183,14 @@ class FilterCacheHeap { // return heap top FilterCacheHeapNode heap_top(); + // check heap_top's benefit/cost is smaller/greater than other items + void heap_check(bool max_heap); + + // print heap items of selected segments' ids + void heap_print(std::vector& needed_segment_ids, const bool should_exist); + + size_t heap_size() { assert(heap_.size() == heap_index_.size()); return heap_.size(); } + // pop one node with deleting node from heap_index_ // void pop(); @@ -206,12 +215,14 @@ class FilterCacheHeap { // only used in debug !!! void heap_index(std::map& heap_index) { + assert(false); heap_index.clear(); heap_index.insert(heap_index_.begin(), heap_index_.end()); } // only used in debug !!! void heap(std::vector& heap) { + assert(false); heap.clear(); heap.assign(heap_.begin(), heap_.end()); } @@ -219,18 +230,18 @@ class FilterCacheHeap { class FilterCacheHeapManager { private: - static FilterCacheHeap benefit_heap_; - static FilterCacheHeap cost_heap_; + FilterCacheHeap benefit_heap_; + FilterCacheHeap cost_heap_; // set heap node visit cnt = c_1, real estimated visit cnt = c_2 // we only update c_1 when | c_1 - c_2 | >= VISIT_CNT_UPDATE_BOUND // update c_1 means we need to update this recorder and heap // heap_visit_cnt_recorder: map // when filter cache call delete, this recorder will automately delete these merged segment ids // when filter cache call upsert, this recorder will automately upsert these segment ids - static std::map heap_visit_cnt_recorder_; - static std::map units_num_limit_recorder_; + std::map heap_visit_cnt_recorder_; + std::map units_num_limit_recorder_; // TODO: mutex can be optimized - static std::mutex manager_mutex_; + std::mutex manager_mutex_; public: FilterCacheHeapManager() { @@ -252,7 +263,7 @@ class FilterCacheHeapManager { // sync visit cnt in heap and real estimated visit cnt // reminded that we will not insert or delete nodes in this method // we only update these nodes that already exist in two heaps - void sync_visit_cnt(std::map& current_visit_cnt_recorder); + void sync_visit_cnt(std::map& recent_visit_cnt_recorder); // try to read benefit_heap top and cost_heap top, then judge whether we need to modify units num in filter cache // return true when we can modify units num of several segments, return false when we cannot @@ -268,9 +279,9 @@ class FilterCacheHeapManager { // because we need to keep heap visit cnt and recorder visit cnt the same void batch_upsert(std::vector& items); - // 1. try debug batch insert - // 2. try debug batch update(use batch_upsert) - void debug(); + // // 1. try debug batch insert + // // 2. 
try debug batch update(use batch_upsert)
+ // void debug();
 };

 }
diff --git a/db/art/filter_cache_item.cc b/db/art/filter_cache_item.cc
deleted file mode 100644
index 6f5cb1163..000000000
--- a/db/art/filter_cache_item.cc
+++ /dev/null
@@ -1,5 +0,0 @@
-#include "filter_cache_item.h"
-
-namespace ROCKSDB_NAMESPACE {
-
-} \ No newline at end of file
diff --git a/db/art/filter_cache_item.h b/db/art/filter_cache_item.h
deleted file mode 100644
index 8a591b88c..000000000
--- a/db/art/filter_cache_item.h
+++ /dev/null
@@ -1,53 +0,0 @@
-#pragma once
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include "macros.h"
-
-namespace ROCKSDB_NAMESPACE {
-
-// By default every segment starts with a total bits-per-key = 8 in the filter cache; as written segments accumulate,
-// once a certain threshold of the filter cache's max capacity is occupied (e.g. 80%), run GreedyAlgo on the planning problem and train the model.
-// Once the filter cache is full, switch to the filter cache's double-heap adjustment: a new segment only needs a model prediction,
-// then its node is inserted into the two heaps; a background thread keeps adjusting the two heaps on its own and continuously returns the adjustment results.
-// Given a result, we can either adjust the enabled filter units immediately, or save it and apply the adjustments in batches later.
-// See the design doc for details.
-
-
-// Remember to add the necessary English comments.
-// The filter cache is essentially a map whose key is the segment id (uint32_t) and whose value is this FilterCacheItem class.
-// Member functions should be defined in filter_cache_item.cc.
-class FilterCacheItem {
-private:
-    // Define the necessary member variables here and keep them private where possible.
-    // They can store the handle, the segment id, etc.
-    // STL classes such as vector and map are allowed.
-    // Do we need a mutex so that enabling/disabling filter units cannot conflict with checking keys against the units?
-public:
-    // Constructor; may initialize the member variables.
-    FilterCacheItem(const uint32_t& segment_id);
-
-    // Clean up member variables to avoid memory leaks; anything new-ed may need to be released here.
-    ~FilterCacheItem();
-
-    // Occupied memory: estimating the total space taken by the filter units in use is enough.
-    // Note that the returned size is the number of bits occupied, not bytes.
-    uint32_t approximate_size();
-
-    // Enable or disable filter units based on how many units are currently enabled:
-    // given the target number of units, decide whether to enable, disable, or do nothing.
-    // units_num : [MIN_UNITS_NUM, MAX_UNITS_NUM]
-    void enable_units(const uint32_t& units_num);
-
-    // Given a key, judge whether it may exist: check the units one by one starting from the first,
-    // and stop as soon as one unit reports the key absent.
-    // Return true only if every unit reports the key present, otherwise return false.
-    // If zero units are enabled, return true by default.
-    bool check_key(const std::string& key);
-};
-
-} \ No newline at end of file
diff --git a/db/art/global_filter_cache_context.cc b/db/art/global_filter_cache_context.cc
new file mode 100644
index 000000000..207a81757
--- /dev/null
+++ b/db/art/global_filter_cache_context.cc
@@ -0,0 +1,66 @@
+#include
+#include
+#include
+#include
+
+#include "db/art/filter_cache_client.h"
+
+namespace ROCKSDB_NAMESPACE {
+// TODO: add necessary filter cache info structures
+rocksdb::FilterCacheClient
+    global_filter_cache;  // already contains FilterCacheManager
+
+// TODO: mutex for updating these recorders below
+// will be locked when updating these recorders below, and unlocked after
+// updating ends.
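+// a minimal usage sketch (segment_id / level are illustrative names only):
+//   {
+//     std::lock_guard<std::mutex> guard(global_filter_cache_recorders_mutex);
+//     global_level_recorder[segment_id] = level;  // update under the lock
+//   }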
+// Should not be used when you just want to call filtercache's function +std::mutex global_filter_cache_recorders_mutex; + +// these global recorders need to be latest after every flush or compaction: +// std::map* level_recorder_ +// std::map>* segment_ranges_recorder_ +// std::map* unit_size_recorder_ +// you may need filter_cache_.range_seperators() to receive key range seperators +// exactly, if key k < seperators[i+1] and key k >= seperators[i], then key k +// hit key range i HeatBuckets::locate(const std::string& key) will tell you how +// to binary search corresponding key range for one key + +// segment_info_recorder save every segments' min key and max key +// but we only need to pass empty segment_info_recorder now +// TODO: it should contain all levels segments' min key and max key, then pass +// to filter cache client, but not used now this recorder will help decide the +// key ranges' num, but it dont work in current work you can try to modify macro +// APPROXIMATE_BUCKETS_NUM to decide the key ranges' num +std::unordered_map> + global_segment_info_recorder; + +// record every alive segments' level +// TODO: need to be latest all the time +std::map global_level_recorder; + +// record features num of every segments +// we choose max features num to define model feature num +// if you want to use a default features num, set MAX_FEATURES_NUM to non-zero +// value then do not insert any entry into this vector later +// TODO: we dont use this vector, so we set MAX_FEATURES_NUM to non-zero value +std::vector global_features_nums_except_level_0; + +// should be based level 0 visit cnt in a total long period +// simply we set level_0_base_count to 0, and use macro INIT_LEVEL_0_COUNT +// we can set this macro to ( PERIOD_COUNT * TRAIN_PERIODS ) * ( level 0 sorted +// runs num ) / ( max level 0 segments num ) +// TODO: modify INIT_LEVEL_0_COUNT to proper value +uint32_t global_level_0_base_count; + +// record interacting ranges and their rates of alive segments +// TODO: should be latest all the time +std::map> global_segment_ranges_recorder; + +// every segment's filter unit size is the same +// this recorder should hold all alive segment +// simply, you can also use default macro DEFAULT_UNIT_SIZE for all segments, +// just leave this recorder empty +// TODO: modify DEFAULT_UNIT_SIZE +std::map global_unit_size_recorder; + +} // namespace ROCKSDB_NAMESPACE \ No newline at end of file diff --git a/db/art/global_filter_cache_context.h b/db/art/global_filter_cache_context.h new file mode 100644 index 000000000..762bbfef9 --- /dev/null +++ b/db/art/global_filter_cache_context.h @@ -0,0 +1,68 @@ + +#include +#include +#include +#include +#include + +#include "db/art/filter_cache_client.h" + +namespace ROCKSDB_NAMESPACE { +// TODO: add necessary filter cache info structures +extern rocksdb::FilterCacheClient + global_filter_cache; // already contain FilterCacheManager + +// TODO: mutex for updating these recorders below +// will be locked when updating these recorders below, and unlock after +// updating ends +extern std::mutex global_filter_cache_recorders_mutex; + +// these global recorders need to be latest after every flush or compaction: +// std::map* level_recorder_ +// std::map>* segment_ranges_recorder_ +// std::map* unit_size_recorder_ +// you may need filter_cache_.range_seperators() to receive key range seperators +// exactly, if key k < seperators[i+1] and key k >= seperators[i], then key k +// hit key range i HeatBuckets::locate(const std::string& key) will tell you how +// to 
binary search corresponding key range for one key + +// segment_info_recorder save every segments' min key and max key +// but we only need to pass empty segment_info_recorder now +// TODO: it should contain all levels segments' min key and max key, then pass +// to filter cache client, but not used now this recorder will help decide the +// key ranges' num, but it dont work in current work you can try to modify macro +// APPROXIMATE_BUCKETS_NUM to decide the key ranges' num +extern std::unordered_map> + global_segment_info_recorder; + +// record every alive segments' level +// TODO: need to be latest all the time +extern std::map global_level_recorder; + +// record features num of every segments +// we choose max features num to define model feature num +// if you want to use a default features num, set MAX_FEATURES_NUM to non-zero +// value then do not insert any entry into this vector later +// TODO: we dont use this vector, so we set MAX_FEATURES_NUM to non-zero value +extern std::vector global_features_nums_except_level_0; + +// should be based level 0 visit cnt in a total long period +// simply we set level_0_base_count to 0, and use macro INIT_LEVEL_0_COUNT +// we can set this macro to ( PERIOD_COUNT * TRAIN_PERIODS ) * ( level 0 sorted +// runs num ) / ( max level 0 segments num ) +// TODO: modify INIT_LEVEL_0_COUNT to proper value +extern uint32_t global_level_0_base_count; + +// record interacting ranges and their rates of alive segments +// TODO: should be latest all the time +extern std::map> + global_segment_ranges_recorder; + +// every segment's filter unit size is the same +// this recorder should hold all alive segment +// simply, you can also use default macro DEFAULT_UNIT_SIZE for all segments, +// just leave this recorder empty +// TODO: modify DEFAULT_UNIT_SIZE +extern std::map global_unit_size_recorder; + +} // namespace ROCKSDB_NAMESPACE \ No newline at end of file diff --git a/db/art/greedy_algo.cc b/db/art/greedy_algo.cc index 9aff8beed..bf2157316 100644 --- a/db/art/greedy_algo.cc +++ b/db/art/greedy_algo.cc @@ -27,9 +27,9 @@ void GreedyAlgo::solve(std::map& segment_algo_infos, segment_algo_helper_heap.end(), CompareSegmentAlgoHelper); - std::fstream f_algo; - f_algo.open("/pg_wal/ycc/algo.log", std::ios::out | std::ios::app); - f_algo << "[DEBUG] start to record algo : " << std::endl; + // std::fstream f_algo; + // f_algo.open("/home/guoteng_20241228_135/WaLSM+/log/algo.log", std::ios::out | std::ios::app); + // f_algo << "[DEBUG] start to record algo : " << std::endl; // current used space size (bits) of filter cache uint32_t current_cache_size = 0; @@ -44,11 +44,9 @@ void GreedyAlgo::solve(std::map& segment_algo_infos, SegmentAlgoHelper segment_algo_helper_top = segment_algo_helper_heap[size-1]; // check whether free space (in filter cache) is enough uint32_t size_needed = segment_algo_helper_top.size_per_unit; - // if not enough, remove this segment helper from heap - // that means we will not consider this segment any longer + // if not enough, exit. we allocate the same size to all units. 
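+ // (every filter unit has the same size, so once the top candidate's next
+ //  unit no longer fits, no other candidate's unit can fit either and it is
+ //  safe to stop)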
if (current_cache_size + size_needed > cache_size) {
-     segment_algo_helper_heap.pop_back();
-     continue;
+     break;
 }
 // SegmentAlgoHelper(const uint32_t& id, const uint32_t& cnt, const uint32_t& size, const uint16_t& units)
 SegmentAlgoHelper segment_algo_helper_needed(segment_algo_helper_top.segment_id,
@@ -56,20 +54,21 @@ void GreedyAlgo::solve(std::map& segment_algo_infos,
                        segment_algo_helper_top.size_per_unit,
                        segment_algo_helper_top.units_num + 1);
 // update enabled units
- // noticed that if one segment visit cnt == 0, it still enable one unit
+ // note that if a segment's visit cnt == 0, it enables zero units
 // so check the visit cnt before updating algo_solution
 if (segment_algo_helper_needed.visit_cnt > 0) {
     algo_solution[segment_algo_helper_needed.segment_id] = segment_algo_helper_needed.units_num;
     current_cache_size += size_needed;
-     f_algo << "[DEBUG] segment " << segment_algo_helper_needed.segment_id
-            << " : " << segment_algo_helper_needed.units_num - 1 << " -> "
-            << segment_algo_helper_needed.units_num << " , cache space left : "
-            << cache_size - current_cache_size << " , recv benefit : "
-            << segment_algo_helper_top.enable_benifit << " , next benefit : "
-            << segment_algo_helper_needed.enable_benifit << std::endl;
+     // f_algo << "[DEBUG] segment " << segment_algo_helper_needed.segment_id
+     //        << " : " << segment_algo_helper_needed.units_num - 1 << " -> "
+     //        << segment_algo_helper_needed.units_num << " , cache space left : "
+     //        << cache_size - current_cache_size << " , recv benefit : "
+     //        << segment_algo_helper_top.enable_benifit << " , next benefit : "
+     //        << segment_algo_helper_needed.enable_benifit << " , visit count: "
+     //        << segment_algo_helper_needed.visit_cnt << std::endl;
 }
 assert(algo_solution[segment_algo_helper_needed.segment_id] <= MAX_UNITS_NUM);
- // enable benefit == 0 means units_num == MAX_UNITS_NUM
+ // enable benefit == 0 means units_num == MAX_UNITS_NUM or its visit cnt == 0
 // that means we cannot enable one more unit for this segment; all its units are already enabled
 if (segment_algo_helper_needed.enable_benifit == 0) {
     // assert(segment_algo_helper_needed.units_num >= MAX_UNITS_NUM);
@@ -83,9 +82,72 @@ void GreedyAlgo::solve(std::map& segment_algo_infos,
                        CompareSegmentAlgoHelper);
 }

- f_algo << std::endl;
- f_algo.close();
+ // f_algo << std::endl;
+ // f_algo.close();

 // return nothing, all results should be written into algo_solution
 }

-} \ No newline at end of file
+void GreedyAlgo::verify(std::map& segment_algo_infos,
+                        std::map& algo_solution, const uint32_t& cache_size) {
+    assert(!segment_algo_infos.empty());
+    assert(algo_solution.size() == segment_algo_infos.size());
+
+    std::fstream f_algo;
+    f_algo.open("/home/guoteng_20241228_135/WaLSM+/log/algo.log", std::ios::out | std::ios::app);
+    f_algo << "[DEBUG] start to verify algo : " << std::endl;
+
+    f_algo << "[DEBUG] segment_algo_infos size : " << segment_algo_infos.size() << std::endl;
+    f_algo << "[DEBUG] algo_solution size : " << algo_solution.size() << std::endl;
+    f_algo << "[DEBUG] cache size : " << cache_size << std::endl;
+    assert(segment_algo_infos.size() == algo_solution.size());
+
+    std::map min_cnt_recorder, max_cnt_recorder;
+    for (uint16_t i = 0; i <= MAX_UNITS_NUM; i++) {
+        min_cnt_recorder[i] = 0xFFFFFFFFU; max_cnt_recorder[i] = 0;
+    }
+
+    // recheck that we already computed for all segments in segment_algo_infos
+    auto infos_it = segment_algo_infos.begin();
+    auto solution_it = algo_solution.begin();
+    std::vector segment_ids;
+    uint32_t current_cache_size = 0;
+    double ideal_cost = 0;
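+    // ideal_cost sums visit_cnt * fpr^units_num over all segments (see
+    // StandardCostForDebug), i.e. the expected number of false-positive
+    // I/Os under this solution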
+ while (infos_it != segment_algo_infos.end() && solution_it != algo_solution.end()) { + assert(infos_it->first == solution_it->first); + segment_ids.emplace_back(infos_it->first); + current_cache_size += (infos_it->second.size_per_unit * solution_it->second); + f_algo << "[DEBUG] segment " << infos_it->first << " , visit cnt : " + << infos_it->second.visit_cnt << " , size of each unit : " + << infos_it->second.size_per_unit << " , units num : " + << solution_it->second << std::endl; + min_cnt_recorder[solution_it->second] = std::min(min_cnt_recorder[solution_it->second], infos_it->second.visit_cnt); + max_cnt_recorder[solution_it->second] = std::max(max_cnt_recorder[solution_it->second], infos_it->second.visit_cnt); + ideal_cost += StandardCostForDebug(infos_it->second.visit_cnt, solution_it->second); + infos_it++; solution_it++; + } + assert(current_cache_size <= cache_size); + f_algo << "[DEBUG] current cache size : " << current_cache_size << std::endl; + for (uint16_t i = 0; i <= MAX_UNITS_NUM; i++) { + f_algo << "[DEBUG] " << i << " units, min cnt: " << min_cnt_recorder[i] << ", max cnt: " << max_cnt_recorder[i] << std::endl; + } + f_algo << "[DEBUG] ideal I/O cost : " << ideal_cost << std::endl; + + // if visit cnt of segment i > visit cnt of segment j, then segment i should enable more units than segment j + const size_t segment_size = segment_ids.size(); + for (size_t i = 0; i < segment_size; i++) { + for (size_t j = i + 1; j < segment_size; j++) { + const uint32_t segment_id_i = segment_ids[i]; + const uint32_t segment_id_j = segment_ids[j]; + if ((segment_algo_infos.find(segment_id_i)->second).visit_cnt >= (segment_algo_infos.find(segment_id_j)->second).visit_cnt) { + assert(algo_solution[segment_id_i] >= algo_solution[segment_id_j]); + } else { + assert(algo_solution[segment_id_i] <= algo_solution[segment_id_j]); + } + } + } + + f_algo << std::endl; + f_algo.close(); +} + +} // namespace ROCKSDB_NAMESPACE \ No newline at end of file diff --git a/db/art/greedy_algo.h b/db/art/greedy_algo.h index a1d03acc6..81a9488fb 100644 --- a/db/art/greedy_algo.h +++ b/db/art/greedy_algo.h @@ -15,7 +15,7 @@ struct SegmentAlgoHelper; class GreedyAlgo; inline double StandardBenefit(const uint32_t& visit_cnt, const uint16_t& units_num); -inline double StandardCost(const uint32_t& visit_cnt, const uint16_t& units_num); +inline double StandardCostForDebug(const uint32_t& visit_cnt, const uint16_t& units_num); inline bool CompareSegmentAlgoHelper(const SegmentAlgoHelper& helper_1, const SegmentAlgoHelper& helper_2); // contain visit counter of every segment in last long period @@ -81,7 +81,7 @@ inline double StandardBenefit(const uint32_t& visit_cnt, const uint16_t& units_n return benefit; } -inline double StandardCost(const uint32_t& visit_cnt, const uint16_t& units_num) { +inline double StandardCostForDebug(const uint32_t& visit_cnt, const uint16_t& units_num) { int bits_per_key = BITS_PER_KEY_PER_UNIT; // We intentionally round down to reduce probing cost a little bit int num_probes = static_cast<int>(bits_per_key * 0.69); // 0.69 =~ ln(2) @@ -91,22 +91,9 @@ inline double StandardCost(const uint32_t& visit_cnt, const uint16_t& units_num) // compute false positive rate of one filter unit double rate_per_unit = std::pow(1.0 - std::exp(-double(num_probes) / double(bits_per_key)), num_probes); - if (units_num <= MIN_UNITS_NUM) { - return __DBL_MAX__; - } - - uint16_t next_units_num = units_num - 1; double rate = std::pow(rate_per_unit, units_num); - double next_rate = std::pow(rate_per_unit, next_units_num); - double cost = double(visit_cnt) * (next_rate - rate); - /* - std::cout << "visit_cnt : " << visit_cnt - << " , rate : " << rate - << " , next_rate : " << next_rate - << " .
rate_per_unit : " << rate_per_unit - << std::endl; - */ + double cost = double(visit_cnt) * rate; assert(cost >= 0); return cost; } @@ -125,6 +112,14 @@ class GreedyAlgo { // so make sure that only called by one thread void solve(std::map& segment_algo_infos, std::map& algo_solution, const uint32_t& cache_size); + + // simple check results + // noticed that if segment a visit_cnt >= segment b visit_cnt + // then segment a units_num >= segment b units_num + // and check whether usage exceeds cache size + void verify(std::map& segment_algo_infos, + std::map& algo_solution, const uint32_t& cache_size); + // full debug process of GreedyAlgo, not thread-secured // so make sure that only called by one thread void debug(std::map& algo_solution, const uint32_t& cache_size) { diff --git a/db/art/heat_buckets.cc b/db/art/heat_buckets.cc index 5b83dff2a..b99f5bcfb 100644 --- a/db/art/heat_buckets.cc +++ b/db/art/heat_buckets.cc @@ -3,16 +3,6 @@ #include namespace ROCKSDB_NAMESPACE { -std::vector HeatBuckets::seperators_; -std::vector HeatBuckets::buckets_; -uint32_t HeatBuckets::current_cnt_; // current get count in this period -std::vector> HeatBuckets::mutex_ptrs_; -std::mutex HeatBuckets::cnt_mutex_; -std::mutex HeatBuckets::sample_mutex_; -bool HeatBuckets::is_ready_; // identify whether HeatBuckets ready for hit -SamplesPool HeatBuckets::samples_; -bool HeatBuckets::updated_; // prevent from updating hotness more than once in a short time - Bucket::Bucket() { hit_cnt_ = 0; @@ -44,10 +34,8 @@ HeatBuckets::HeatBuckets() { seperators_.resize(0); buckets_.resize(0); current_cnt_ = 0; - mutex_ptrs_.resize(0); is_ready_ = false; samples_.clear(); - updated_ = false; } HeatBuckets::~HeatBuckets() { @@ -66,23 +54,24 @@ void HeatBuckets::debug() { } void HeatBuckets::update() { - // mark already updated, after current_cnt_ more than PERIOD_COUNT / MAGIC_FACTOR, updated_ will be reset to false; - // we need guarantee that in one period (one constant time span), db gets are much larger than PERIOD_COUNT / MAGIC_FACTOR; - // usually in server, exec get requests PERIOD_COUNT / MAGIC_FACTOR times only account for a very very short time. 
- updated_ = true; - - assert(mutex_ptrs_.size() == buckets_.size()); - for (size_t i=0; i<mutex_ptrs_.size(); i++) { - mutex_ptrs_[i]->lock(); + uint32_t current_cnt = 0; + + // remember to reset current_cnt_ counter + if (current_cnt_ < PERIOD_COUNT) return; + hit_mutex_.WriteLock(); + if (current_cnt_ >= PERIOD_COUNT) { + current_cnt = current_cnt_; + current_cnt_ = 0; } + hit_mutex_.WriteUnlock(); + + if (current_cnt == 0) return; + // debug(); // TODO: use multiple threads to update hotness of all buckets for (size_t i=0; i<buckets_.size(); i++) { - mutex_ptrs_[i]->unlock(); + buckets_[i].update(BUCKETS_ALPHA, current_cnt); } - // remember to reset current_cnt_ counter - current_cnt_ = 0; } uint32_t HeatBuckets::locate(const std::string& key) { @@ -102,7 +91,7 @@ uint32_t HeatBuckets::locate(const std::string& key) { return left; } -void HeatBuckets::hit(const std::string& key, const bool& signal) { +void HeatBuckets::hit(const std::string& key, bool& signal) { assert(is_ready_); // use binary search to find index i, making seperators_[i] <= key and seperators_[i+1] > key // reminding we have set border guard, so don't worry about out of bounds error @@ -128,39 +117,26 @@ // std::cout << "debug mutex_ptrs_ size : " << mutex_ptrs_.size() << std::endl; // std::cout << "debug period_cnt_ : " << period_cnt_ << std::endl; // std::cout << "debug alpha_ : " << alpha_ << std::endl; - assert(buckets_.size() == mutex_ptrs_.size()); assert(idx >= 0 && idx < buckets_.size()); assert(seperators_[idx] <= key && key < seperators_[idx+1]); - mutex_ptrs_[idx]->lock(); + hit_mutex_.ReadLock(); buckets_[idx].hit(); // mutex only permits one write op to one bucket - mutex_ptrs_[idx]->unlock(); - - cnt_mutex_.lock(); current_cnt_ += 1; + if (current_cnt_ >= PERIOD_COUNT) { + signal = true; + } + hit_mutex_.ReadUnlock(); - // use updated_ to prevent from updating hotness in a very short time span (due to multi-threads operation) - if (signal && !updated_) { - // debug(); + if (signal) { update(); } - cnt_mutex_.unlock(); - - // remember to reset updated_ to false - if (updated_ && current_cnt_ >= PERIOD_COUNT / MAGIC_FACTOR) { - updated_ = false; - } } SamplesPool::SamplesPool() { samples_cnt_ = 0; pool_.resize(0); filter_.clear(); - - // because put opt will input duplicated keys, we need to guarantee SAMPLES_MAXCNT much larger than SAMPLES_LIMIT - // however std::set only remain deduplicated keys - // to collect good samples for previous put keys, we need a larger SAMPLES_MAXCNT - assert(SAMPLES_MAXCNT >= MAGIC_FACTOR * SAMPLES_LIMIT); } void SamplesPool::clear() { @@ -207,13 +183,14 @@ void SamplesPool::sample(const std::string& key) { } void SamplesPool::prepare() { - std::string key_min = "user"; // defined min key for YCSB - std::string key_max = pool_[pool_.size()-1] + pool_[pool_.size()-1]; if (!is_ready()) { return; } sort(pool_.begin(), pool_.end()); // add border guard + std::string key_min = "user"; // defined min key for YCSB + // std::string key_max = pool_[pool_.size()-1] + pool_[pool_.size()-1]; + std::string key_max = "user" + std::string(512, '9'); // this key must exceed every key of the requests pool_.emplace(pool_.begin(), key_min); pool_.emplace_back(key_max); } @@ -259,9 +236,9 @@ uint32_t SamplesPool::determine_k(std::vector>& segment uint32_t k = pool_.size() - 2; // if segments is empty, use default k to debug if (segments.empty()) { - k = (pool_.size() - 2) / DEFAULT_BUCKETS_NUM; + k = (pool_.size() - 2) / APPROXIMATE_BUCKETS_NUM; } - assert(k > 1); + assert(k >= 1); for (auto& segment : segments) {
assert(segment.size() == 2); assert(segment[0] < segment[1]); @@ -295,22 +272,23 @@ void HeatBuckets::init(std::vector>& segments) { samples_.divide(k, seperators_); // std::cout << "[DEBUG] show key ranges below: " << std::endl; + // for (size_t i=0; i(new std::mutex())); - } - assert(mutex_ptrs_.size() == buckets_.size()); assert(seperators_.size() == buckets_.size()+1); is_ready_ = true; @@ -318,5 +296,7 @@ void HeatBuckets::init(std::vector>& segments) { // debug // std::cout << "[DEBUG] heat buckets size: " << buckets_.size() << std::endl; // std::cout << "[DEBUG] key ranges init" << std::endl; + std::cout << "[RANGE] seperators_ size : " << seperators_.size() << std::endl; + std::cout << "[RANGE] buckets_ size : " << buckets_.size() << std::endl; } } \ No newline at end of file diff --git a/db/art/heat_buckets.h b/db/art/heat_buckets.h index 68a8277fe..41adfd683 100644 --- a/db/art/heat_buckets.h +++ b/db/art/heat_buckets.h @@ -8,6 +8,7 @@ #include #include #include +#include "port/port_posix.h" namespace ROCKSDB_NAMESPACE { @@ -28,6 +29,36 @@ class Bucket { void hit(); }; +class SamplesPool { +private: + std::vector pool_; // using set to guarantee only store deduplicated samples + std::set filter_; // used to check whether new key already exist in pool + uint32_t samples_cnt_; // current sample tries num, need to update after every try +public: + SamplesPool(); + + ~SamplesPool() { return; } + + void clear(); + + // we can modify SAMPLES_MAXCNT to control the moment that starts init heat buckets + bool is_ready() { return samples_cnt_ >= SAMPLES_MAXCNT; } + bool is_full() { return pool_.size() >= SAMPLES_LIMIT; } + bool is_sampled(const std::string& key) { return filter_.count(key) > 0; } + + void sample(const std::string& key); + + void prepare(); + + // need call prepare() before + // generate seperators + void divide(const uint32_t& k, std::vector& dst); + + // determine k based on low-level segments' key range + uint32_t determine_k(std::vector>& segments); + uint32_t locate(const std::string& key); // helper func when determine k +}; + /* first sample put keys using reservoir sampling. 
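The SamplesPool declared in this header collects its pool with reservoir sampling, as the comment above says: the first keys fill the pool up to its limit, and each later key replaces a random slot with probability limit / keys-seen. A self-contained illustration of that policy (class and member names here are illustrative; the real pool additionally deduplicates keys through filter_ and caps tries at SAMPLES_MAXCNT):

#include <cstdint>
#include <random>
#include <string>
#include <vector>

// Keep a uniform random sample of at most `limit` keys from an unbounded stream.
class ReservoirSampler {
 public:
  explicit ReservoirSampler(size_t limit) : limit_(limit), rng_(std::random_device{}()) {}

  void Offer(const std::string& key) {
    ++seen_;
    if (pool_.size() < limit_) {  // pool not full yet: always keep the key
      pool_.push_back(key);
      return;
    }
    // otherwise keep it with probability limit_ / seen_, evicting a random slot
    std::uniform_int_distribution<uint64_t> dist(0, seen_ - 1);
    uint64_t slot = dist(rng_);
    if (slot < limit_) pool_[slot] = key;
  }

  const std::vector<std::string>& pool() const { return pool_; }

 private:
  size_t limit_;
  uint64_t seen_ = 0;
  std::vector<std::string> pool_;
  std::mt19937_64 rng_;
};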
@@ -42,15 +73,13 @@ class Bucket { class HeatBuckets { private: // TODO: mutex can be optimized - static std::vector<std::string> seperators_; - static std::vector<Bucket> buckets_; - static uint32_t current_cnt_; // current get count in this period - static std::vector<std::shared_ptr<std::mutex>> mutex_ptrs_; - static std::mutex cnt_mutex_; - static std::mutex sample_mutex_; - static bool is_ready_; // identify whether HeatBuckets ready for hit - static SamplesPool samples_; - static bool updated_; + std::vector<std::string> seperators_; + std::vector<Bucket> buckets_; + uint32_t current_cnt_; // current get count in this period + mutable port::RWMutex hit_mutex_; + std::mutex sample_mutex_; + bool is_ready_; // identify whether HeatBuckets ready for hit + SamplesPool samples_; public: HeatBuckets(); @@ -67,39 +96,9 @@ class HeatBuckets { void init(std::vector<std::vector<std::string>>& segments); // if sample enough keys, ready to init heatbuckets void update(); // update hotness value of all buckets - void hit(const std::string& key, const bool& signal); // one key only hit one bucket (also mean only hit one key range) + void hit(const std::string& key, bool& signal); // one key only hits one bucket (also means it only hits one key range) // if signal is true, update hotness void debug(); // output debug message in standard output }; -class SamplesPool { -private: - std::vector<std::string> pool_; // using set to guarantee only store deduplicated samples - std::set<std::string> filter_; // used to check whether new key already exists in pool - uint32_t samples_cnt_; // current sample tries num, need to update after every try -public: - SamplesPool(); - - ~SamplesPool() { return; } - - void clear(); - - // we can modify SAMPLES_MAXCNT to control the moment that starts init heat buckets - bool is_ready() { return samples_cnt_ >= SAMPLES_MAXCNT; } - bool is_full() { return pool_.size() >= SAMPLES_LIMIT; } - bool is_sampled(const std::string& key) { return filter_.count(key) > 0; } - - void sample(const std::string& key); - - void prepare(); - - // need call prepare() before - // generate seperators - void divide(const uint32_t& k, std::vector<std::string>& dst); - - // determine k based on low-level segments' key range - uint32_t determine_k(std::vector<std::vector<std::string>>& segments); - uint32_t locate(const std::string& key); // helper func when determine k -}; - } \ No newline at end of file diff --git a/db/art/macros.h b/db/art/macros.h index 9d8a3e8e3..57fcef147 100644 --- a/db/art/macros.h +++ b/db/art/macros.h @@ -140,39 +140,53 @@ namespace ROCKSDB_NAMESPACE { // macros for HeatBuckets // hotness update formula -#define BUCKETS_ALPHA 0.2 +#define BUCKETS_ALPHA 0.4 // samples pool max size, using reservoir sampling -#define SAMPLES_LIMIT 10000 -// if recv samples exceed SAMPLES_MAXCNT, end reservoir sampling and init Heat Buckets -#define SAMPLES_MAXCNT 5000000 -// short period get count, if get count equal to or exceed PERIOD_COUNT, -// end this short period and start next short period -#define PERIOD_COUNT 50000 -// number of heat buckets (number of key ranges, see hotness estimating in the paper) -#define DEFAULT_BUCKETS_NUM 500 +#define SAMPLES_LIMIT 1000000 +#define SAMPLES_MAXCNT 10000000 // short period get count +#define PERIOD_COUNT 2000000 // key sample file // in order to init key ranges before first flush, // we need to read keys in a file, then init key ranges first. +#define SAMPLES_FILE "/home/guoteng_20241228_135/WaLSM+/key_sample.txt" // determine number of heat buckets, its value approximately equals to (this number - 1) // TODO: how to control the number of key ranges correctly?
For example, if we set to 999, this number is still 1001. +#define APPROXIMATE_BUCKETS_NUM 100000 // magic number in class HeatBuckets -#define MAGIC_FACTOR 500 +#define MAGIC_FACTOR 10 -// micros for Model Train + +// key number of each segment +#define KV_NUM_OF_SEGMENT 420 +// default size of one filter unit (bits) +// bits-per-key for every filter unit of every segment, +// found default bits-per-key = DEFAULT_UNITS_NUM * BITS_PER_KEY_PER_UNIT = 10 +// equal to primary value of paper benchmark config value +#define BITS_PER_KEY_PER_UNIT 2 +// TODO: needs to be set based on size of KV pairs +#define DEFAULT_UNIT_SIZE (KV_NUM_OF_SEGMENT * BITS_PER_KEY_PER_UNIT) + +// macros for Model Train // long period = TRAIN_PERIODS * short period. if one long period ends, evaluate model and retrain model if necessary -#define TRAIN_PERIODS 10 +#define TRAIN_PERIODS 15 // dataset csv file name #define DATASET_NAME "dataset.csv" // the path to save model txt file and train dataset csv file -#define MODEL_PATH "/pg_wal/ycc/" +#define MODEL_PATH "/home/guoteng_20241228_135/WaLSM+/log/" // we cannot send hotness value (double) to model side, // so we multiply hotness value by HOTNESS_SIGNIFICANT_DIGITS_FACTOR, then send its integer part to model // also we need to multiply key range rate by RATE_SIGNIFICANT_DIGITS_FACTOR #define HOTNESS_SIGNIFICANT_DIGITS_FACTOR 1e6 -#define RATE_SIGNIFICANT_DIGITS_FACTOR 1e3 +#define RATE_SIGNIFICANT_DIGITS_FACTOR 1e6 // model feature num max limit : 2 * 45 + 1 -#define MAX_FEATURES_NUM 91 +#define MAX_FEATURES_NUM 21 // config macros connecting to LightGBM server // we use Inet socket to connect server #define HOST "127.0.0.1" +// #define PORT "10090" #define PORT "9090" // max size of socket receive buffer size #define BUFFER_SIZE 1024 @@ -183,34 +197,35 @@ // macros for filter cache // before the model works, we enable DEFAULT_UNITS_NUM units for every segment -#define DEFAULT_UNITS_NUM 4 -// bits-per-key for every filter unit of every segment, -// found default bits-per-key = DEFAULT_UNITS_NUM * BITS_PER_KEY_PER_UNIT = 10 -// equal to primary value of paper benchmark config value -#define BITS_PER_KEY_PER_UNIT 4 +#define DEFAULT_UNITS_NUM 2 // max unit nums for every segment, we only generate MAX_UNITS_NUM units for every segment -#define MAX_UNITS_NUM 8 +#define MAX_UNITS_NUM 6 // we enable 0 units for the coldest segments #define MIN_UNITS_NUM 0 // default max size of cache space : 8 * 1024 * 1024 * 128 = 1073741824 bit = 128 MB -#define CACHE_SPACE_SIZE 1073741824 +#define CACHE_SPACE_SIZE (32 * 1024 * 1024 * 8) // filter cache helper heap type #define BENEFIT_HEAP 0 #define COST_HEAP 1 #define UNKNOWN_HEAP 2 // visit cnt update bound -#define VISIT_CNT_UPDATE_BOUND 10 +#define VISIT_CNT_UPDATE_BOUND 500 +// adjustment benefit bound +#define PURE_BENEFIT_BOUND 500 // filter cache map threshold -#define FULL_RATE 0.95 -#define READY_RATE 0.60 +#define FULL_RATE 1.00 +#define READY_RATE 0.80 // default init L0 counts #define INIT_LEVEL_0_COUNT 0 -// default size of one filter unit (bits) -#define DEFAULT_UNIT_SIZE 0 // inherit remain factor -#define INHERIT_REMAIN_FACTOR 0.5 +#define INHERIT_REMAIN_FACTOR 1 // filter cache client background threads num -#define FILTER_CACHE_THREADS_NUM 10 +#define FILTER_CACHE_THREADS_NUM 6 + +// #define KV_SIZE = 1024 // data block size for a segment + // #define SEGMENT_DATA_BLOCK_SIZE 32 * 1024 + // #define KEYS_PER_SEGMENT 4096 } // namespace ROCKSDB_NAMESPACE \ No newline at end of file diff --git
a/db/art/nvm_manager.cc b/db/art/nvm_manager.cc index 5de2dff71..36492814a 100644 --- a/db/art/nvm_manager.cc +++ b/db/art/nvm_manager.cc @@ -46,7 +46,9 @@ bool InitializeMemory(std::unordered_map& memory_usages, size_t mapped_len; base_memptr = (char*)pmem_map_file( nvm_path.c_str(), TotalSize, PMEM_FILE_CREATE, 0666, &mapped_len, &is_pmem); - //assert(is_pmem && mapped_len == (size_t)TotalSize); + // base_memptr = (char*)mmap( + // nvm_path.c_str(), TotalSize, PMEM_FILE_CREATE, 0666, &mapped_len, &is_pmem); + assert(is_pmem && mapped_len == (size_t)TotalSize); aligned_ptr = reinterpret_cast(ALIGN_UP(reinterpret_cast(base_memptr), 256)); close(fd); diff --git a/db/builder.cc b/db/builder.cc index a5deebff9..2140c3144 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -455,6 +455,8 @@ Status BuildTableFromArt( if (table_properties) { *table_properties = tp; } + // store SegmentBuilderResult + job->segment_builder_result = builder->GetSegmentBuilderResult(); } delete builder; @@ -516,6 +518,12 @@ Status BuildTableFromArt( s = Status::Corruption("Paranoid checksums do not match"); } } + + // // init table_reader for later use + if (s.ok()) { + s = table_cache->InitFileTableReader( + read_options, internal_comparator, *meta); + } } if (!s.ok() || meta->fd.GetFileSize() == 0) { diff --git a/db/column_family.cc b/db/column_family.cc index d9344f4bb..241b7fc87 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -26,6 +26,7 @@ #include "db/job_context.h" #include "db/range_del_aggregator.h" #include "db/table_properties_collector.h" +#include "db/version_edit.h" #include "db/version_set.h" #include "db/write_controller.h" #include "file/sst_file_manager_impl.h" diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc index 4f2d70ba7..7c893b972 100644 --- a/db/compaction/compaction.cc +++ b/db/compaction/compaction.cc @@ -284,65 +284,68 @@ bool Compaction::InputCompressionMatchesOutput() const { } bool Compaction::IsTrivialMove() const { - // Avoid a move if there is lots of overlapping grandparent data. - // Otherwise, the move could create a parent file that will require - // a very expensive merge later on. - // If start_level_== output_level_, the purpose is to force compaction - // filter to be applied to that level, and thus cannot be a trivial move. 
- - // Check if start level have files with overlapping ranges - if (start_level_ == 0 && input_vstorage_->level0_non_overlapping() == false) { - // We cannot move files from L0 to L1 if the files are overlapping - return false; - } - - if (is_manual_compaction_ && - (immutable_cf_options_.compaction_filter != nullptr || - immutable_cf_options_.compaction_filter_factory != nullptr)) { - // This is a manual compaction and we have a compaction filter that should - // be executed, we cannot do a trivial move - return false; - } - - // Used in universal compaction, where trivial move can be done if the - // input files are non overlapping - if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) && - (output_level_ != 0)) { - return is_trivial_move_; - } - - if (!(start_level_ != output_level_ && num_input_levels() == 1 && - input(0, 0)->fd.GetPathId() == output_path_id() && - InputCompressionMatchesOutput())) { - return false; - } - - // assert inputs_.size() == 1 - - std::unique_ptr partitioner = CreateSstPartitioner(); - - for (const auto& file : inputs_.front().files) { - std::vector file_grand_parents; - if (output_level_ + 1 >= number_levels_) { - continue; - } - input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest, - &file->largest, &file_grand_parents); - const auto compaction_size = - file->fd.GetFileSize() + TotalFileSize(file_grand_parents); - if (compaction_size > max_compaction_bytes_) { - return false; - } - - if (partitioner.get() != nullptr) { - if (!partitioner->CanDoTrivialMove(file->smallest.user_key(), - file->largest.user_key())) { - return false; - } - } - } - - return true; + // // Avoid a move if there is lots of overlapping grandparent data. + // // Otherwise, the move could create a parent file that will require + // // a very expensive merge later on. + // // If start_level_== output_level_, the purpose is to force compaction + // // filter to be applied to that level, and thus cannot be a trivial move. 
+ + // // Check if start level have files with overlapping ranges + // if (start_level_ == 0 && input_vstorage_->level0_non_overlapping() == false) { + // // We cannot move files from L0 to L1 if the files are overlapping + // return false; + // } + + // if (is_manual_compaction_ && + // (immutable_cf_options_.compaction_filter != nullptr || + // immutable_cf_options_.compaction_filter_factory != nullptr)) { + // // This is a manual compaction and we have a compaction filter that should + // // be executed, we cannot do a trivial move + // return false; + // } + + // // Used in universal compaction, where trivial move can be done if the + // // input files are non overlapping + // if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) && + // (output_level_ != 0)) { + // return is_trivial_move_; + // } + + // if (!(start_level_ != output_level_ && num_input_levels() == 1 && + // input(0, 0)->fd.GetPathId() == output_path_id() && + // InputCompressionMatchesOutput())) { + // return false; + // } + + // // assert inputs_.size() == 1 + + // std::unique_ptr partitioner = CreateSstPartitioner(); + + // for (const auto& file : inputs_.front().files) { + // std::vector file_grand_parents; + // if (output_level_ + 1 >= number_levels_) { + // continue; + // } + // input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest, + // &file->largest, &file_grand_parents); + // const auto compaction_size = + // file->fd.GetFileSize() + TotalFileSize(file_grand_parents); + // if (compaction_size > max_compaction_bytes_) { + // return false; + // } + + // if (partitioner.get() != nullptr) { + // if (!partitioner->CanDoTrivialMove(file->smallest.user_key(), + // file->largest.user_key())) { + // return false; + // } + // } + // } + + // return true; + + // disallow trivial move compaction in WaLSM+ + return false; } void Compaction::AddInputDeletions(VersionEdit* out_edit) { @@ -389,8 +392,8 @@ bool Compaction::KeyNotExistsBeyondOutputLevel( void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) { for (size_t i = 0; i < num_input_levels(); i++) { for (size_t j = 0; j < inputs_[i].size(); j++) { - assert(mark_as_compacted ? !inputs_[i][j]->being_compacted - : inputs_[i][j]->being_compacted); + // assert(mark_as_compacted ? !inputs_[i][j]->being_compacted + // : inputs_[i][j]->being_compacted); inputs_[i][j]->being_compacted = mark_as_compacted; } } diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc index 4555ec568..a1a2829db 100644 --- a/db/compaction/compaction_iterator.cc +++ b/db/compaction/compaction_iterator.cc @@ -151,6 +151,7 @@ void CompactionIterator::Next() { if (merge_out_iter_.Valid()) { key_ = merge_out_iter_.key(); value_ = merge_out_iter_.value(); + segment_id_ = merge_out_iter_.segment_id(); Status s = ParseInternalKey(key_, &ikey_); // MergeUntil stops when it encounters a corrupt key and does not // include them in the result, so we expect the keys here to be valid. @@ -268,6 +269,7 @@ void CompactionIterator::NextFromInput() { !IsShuttingDown()) { key_ = input_->key(); value_ = input_->value(); + segment_id_ = input_->segment_id(); iter_stats_.num_input_records++; Status pikStatus = ParseInternalKey(key_, &ikey_); @@ -625,6 +627,7 @@ void CompactionIterator::NextFromInput() { // These will be correctly set below. 
key_ = merge_out_iter_.key(); value_ = merge_out_iter_.value(); + segment_id_ = merge_out_iter_.segment_id(); pikStatus = ParseInternalKey(key_, &ikey_); // MergeUntil stops when it encounters a corrupt key and does not // include them in the result, so we expect the keys here to valid. diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h index 29dedd3c7..c2e385588 100644 --- a/db/compaction/compaction_iterator.h +++ b/db/compaction/compaction_iterator.h @@ -12,6 +12,7 @@ #include "db/compaction/compaction.h" #include "db/compaction/compaction_iteration_stats.h" +#include "db/dbformat.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "db/range_del_aggregator.h" @@ -116,6 +117,7 @@ class CompactionIterator { bool Valid() const { return valid_; } const Slice& user_key() const { return current_user_key_; } const CompactionIterationStats& iter_stats() const { return iter_stats_; } + uint32_t segment_id() { return segment_id_; } private: // Processes the input stream to find the next output @@ -206,6 +208,7 @@ class CompactionIterator { Slice current_user_key_; SequenceNumber current_user_key_sequence_; SequenceNumber current_user_key_snapshot_; + uint32_t segment_id_ = INVALID_SEGMENT_ID; // True if the iterator has already returned a record for the current key. bool has_outputted_key_ = false; diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 3b1ee2ae0..a30ccaa99 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -10,18 +10,24 @@ #include "db/compaction/compaction_job.h" #include +#include +#include #include #include +#include #include #include +#include #include #include +#include #include #include #include #include "db/art/compactor.h" #include "db/art/logger.h" +#include "db/art/art_metric.h" #include "db/builder.h" #include "db/db_impl/db_impl.h" #include "db/db_iter.h" @@ -49,12 +55,15 @@ #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" #include "rocksdb/sst_partitioner.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_factory.h" +#include "table/block_based/filter_block.h" #include "table/merging_iterator.h" #include "table/table_builder.h" #include "test_util/sync_point.h" @@ -67,6 +76,8 @@ namespace ROCKSDB_NAMESPACE { +static SSDWriteMetric writeMetric_; + const char* GetCompactionReasonString(CompactionReason compaction_reason) { switch (compaction_reason) { case CompactionReason::kUnknown: @@ -171,6 +182,9 @@ struct CompactionJob::SubcompactionState { // A flag determine whether the key has been seen in ShouldStopBefore() bool seen_key = false; + // stores segment_builder_result for each subcompaction + SegmentBuilderResult segment_builder_result; + SubcompactionState(Compaction* c, Slice* _start, Slice* _end, uint64_t size) : compaction(c), start(_start), end(_end), approx_size(size) { assert(compaction != nullptr); @@ -178,7 +192,7 @@ struct CompactionJob::SubcompactionState { // Adds the key and value to the builder // If paranoid is true, adds the key-value to the paranoid hash - Status AddToBuilder(const Slice& key, const Slice& value) { + Status AddToBuilder(const Slice& key, const Slice& value, uint32_t segment_id) { auto curr = current_output(); assert(builder != nullptr); assert(curr != nullptr); @@ -186,7 +200,7 @@ struct 
CompactionJob::SubcompactionState { if (!s.ok()) { return s; } - builder->Add(key, value); + builder->Add(key, value, segment_id); return Status::OK(); } @@ -560,6 +574,71 @@ void CompactionJob::GenSubcompactionBoundaries() { } } +void CompactionJob::CollectDataAndPrefetch() { + // aggregate SegmentBuilderResult from subcompactions + for (auto& state : compact_->sub_compact_states) { + auto& sub_result = state.segment_builder_result; + assert(!sub_result.merged_segment_ids.empty()); + assert(!sub_result.new_segment_ids.empty()); + segment_builder_result_.new_segment_ids.insert( + sub_result.new_segment_ids.begin(), + sub_result.new_segment_ids.end()); + + segment_builder_result_.merged_segment_ids.insert( + sub_result.merged_segment_ids.begin(), + sub_result.merged_segment_ids.end()); + + for (auto& per_segment_result : sub_result.per_segment_results) { + segment_builder_result_.per_segment_results.push_back( + std::move(per_segment_result)); + } + } + segment_builder_result_.output_level = compact_->compaction->output_level(); + + // static std::mutex debug_mutex; + // { + // std::lock_guard<std::mutex> lock_guard(debug_mutex); + // for (auto& segment_result : segment_builder_result_.per_segment_results) { + // double rate_sum = 0; + // std::cout << "segment_id=" << segment_result.segment_id << ": " << segment_result.range_rate_pairs.size() << " ranges, level=" << compact_->compaction->output_level() << ", count=" << segment_result.key_count; + // // std::cout << std::endl; + // for (const auto& range_pair : segment_result.range_rate_pairs) { + // rate_sum += range_pair.rate_in_segment; + // // std::cout << range_pair.range_id << "-" << range_pair.rate_in_segment << " "; + // } + // assert(rate_sum >= 0.98 && rate_sum <= 1.02); + // std::cout << std::endl; + // } + // } + + + // WaLSM+ debug + // for (auto& state : compact_->sub_compact_states) { + // for (auto& output : state.outputs) { + // std::cout << "c, filename=" << output.meta.fd.GetNumber() + // << ", smallest=" << output.meta.smallest.user_key().ToString() + // << ", largest=" << output.meta.largest.user_key().ToString() + // << std::endl; + // } + // } + + // insert all filter block handles to FilterCache + for (auto& state : compact_->sub_compact_states) { + for (auto& output : state.outputs) { + assert(output.meta.fd.table_reader != nullptr); + const auto* table = output.meta.fd.table_reader; + auto block_handles_map = table->GetSegmentBlockHandles(); + assert(block_handles_map.size() > 0); + for (const auto& segment_id_and_block_handles : block_handles_map) { + auto segment_id = segment_id_and_block_handles.first; + const auto& block_handles = segment_id_and_block_handles.second; + // dangerous cast, but we know that the table is a BlockBasedTable + filter_cache_client_->init_segment(segment_id, (BlockBasedTable*) table, block_handles); + } + } + } +} + // TODO(WaLSM+): pass temp recorders ptr and update Status CompactionJob::Run() { AutoThreadOperationStageUpdater stage_updater( @@ -627,9 +706,10 @@ Status CompactionJob::Run() { } if (status.ok()) { thread_pool.clear(); - std::vector files_output; - for (const auto& state : compact_->sub_compact_states) { - for (const auto& output : state.outputs) { + // WaLSM+: remove const qualifier to init file table reader + std::vector files_output; + for (auto& state : compact_->sub_compact_states) { + for (auto& output : state.outputs) { files_output.emplace_back(&output); } } @@ -685,6 +765,12 @@ Status CompactionJob::Run() { } } + // // init table_reader for later use + if (s.ok()) { + s =
cfd->table_cache()->InitFileTableReader( + read_options, cfd->internal_comparator(), files_output[file_idx]->meta); + } + delete iter; if (!s.ok()) { @@ -781,6 +867,9 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { (GetStartTime() - stats.micros) * 1e-6, compact_->compaction->output_level()); + // update WaLSM write metric + writeMetric_.updateMetric(stats.bytes_written); + ROCKS_LOG_BUFFER( log_buffer_, "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, " @@ -843,7 +932,6 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { stream.EndArray(); } - CleanupCompaction(); return status; } @@ -965,6 +1053,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { // returns true. const Slice& key = c_iter->key(); const Slice& value = c_iter->value(); + const uint32_t segment_id = c_iter->segment_id(); // If an end key (exclusive) is specified, check if the current key is // >= than it and exit if it is because the iterator is out of its range @@ -987,7 +1076,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { } } // TODO(WaLSM+): pass temp recorders ptr and update - status = sub_compact->AddToBuilder(key, value); + status = sub_compact->AddToBuilder(key, value, segment_id); if (!status.ok()) { break; } @@ -1487,6 +1576,21 @@ Status CompactionJob::FinishCompactionOutputFile( } #endif + // WaLSM+: collect data before resetting builder pointer + auto current_segment_builder_result = sub_compact->builder->GetSegmentBuilderResult(); + // merge segment_builder_result into sub_compact->segment_builder_result + for (const auto id : current_segment_builder_result.merged_segment_ids) { + sub_compact->segment_builder_result.merged_segment_ids.insert(id); + } + for (const auto id : current_segment_builder_result.new_segment_ids) { + sub_compact->segment_builder_result.new_segment_ids.insert(id); + } + for (auto& per_segment_result : current_segment_builder_result.per_segment_results) { + sub_compact->segment_builder_result.per_segment_results.emplace_back(std::move(per_segment_result)); + } + assert(!sub_compact->segment_builder_result.merged_segment_ids.empty()); + assert(!sub_compact->segment_builder_result.new_segment_ids.empty()); + sub_compact->builder.reset(); sub_compact->current_output_file_size = 0; return s; diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h index aafad8d3a..dbd9c0520 100644 --- a/db/compaction/compaction_job.h +++ b/db/compaction/compaction_job.h @@ -39,6 +39,7 @@ #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "rocksdb/transaction_log.h" +#include "table/block_based/filter_block.h" #include "table/scoped_arena_iterator.h" #include "util/autovector.h" #include "util/stop_watch.h" @@ -103,6 +104,21 @@ class CompactionJob { // Return the IO status IOStatus io_status() const { return io_status_; } + SegmentBuilderResult GetSegmentBuilderResult() const { + return segment_builder_result_; + } + + // should be called before Run() + void SetFilterCacheClient(FilterCacheClient* filter_cache_client) { + filter_cache_client_ = filter_cache_client; + } + + // collect data for WaLSM+ + void CollectDataAndPrefetch(); + + // call cleanup after CollectDataAndPrefetch() + void CleanupCompaction(); + private: struct SubcompactionState; @@ -130,7 +146,6 @@ class CompactionJob { Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options); void RecordCompactionIOStats(); Status 
OpenCompactionOutputFile(SubcompactionState* sub_compact); - void CleanupCompaction(); void UpdateCompactionJobStats( const InternalStats::CompactionStats& stats) const; void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats, @@ -200,6 +215,9 @@ class CompactionJob { Env::WriteLifeTimeHint write_hint_; Env::Priority thread_pri_; IOStatus io_status_; + + SegmentBuilderResult segment_builder_result_; + FilterCacheClient* filter_cache_client_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc index 0768f1958..e78f0a71b 100644 --- a/db/compaction/compaction_picker_universal.cc +++ b/db/compaction/compaction_picker_universal.cc @@ -8,6 +8,10 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/compaction/compaction_picker_universal.h" +#include +#include +#include +#include "db/version_edit.h" #ifndef ROCKSDB_LITE #include @@ -659,12 +663,17 @@ Compaction* UniversalCompactionBuilder::PickCompactionForQLearning() { if (!partition->is_tier[i] && !partition->is_compaction_work[i] && partition->files_[i].size() > 1) { bool ok = true; + std::set<FileMetaData*> file_metadata_pointers; for (FileMetaData* f : partition->files_[i]) { if (f->being_compacted) { ok = false; break; } + if (file_metadata_pointers.count(f)) { + std::cout << "duplicate FileMetaData pointer in compaction inputs" << std::endl; + } inputs[i].files.push_back(f); + file_metadata_pointers.insert(f); } if (!ok) { inputs[i].files.clear(); diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 18ed841ae..b892d970e 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -9,6 +9,7 @@ #include "db/db_impl/db_impl.h" #include +#include #ifdef OS_SOLARIS #include #endif @@ -109,6 +110,7 @@ #include "util/mutexlock.h" #include "util/stop_watch.h" #include "util/string_util.h" +#include "db/art/global_filter_cache_context.h" namespace ROCKSDB_NAMESPACE { @@ -250,25 +252,34 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, period_cnt_ = 0; last_train_period_ = 0; */ - segment_info_recorder_ = new std::unordered_map>; - level_recorder_ = new std::map; - level_0_base_count_ = 0; - - features_nums_except_level_0_ = new std::vector; - uint16_t features_num = MAX_FEATURES_NUM; - if (features_num > 0) { - features_nums_except_level_0_->emplace_back(features_num); + { + std::lock_guard<std::mutex> global_filter_cache_lock_guard(global_filter_cache_recorders_mutex); + global_level_0_base_count = 0; + uint16_t features_num = MAX_FEATURES_NUM; + if (features_num > 0) { + global_features_nums_except_level_0.emplace_back(features_num); + } + global_filter_cache.periods_work(); + global_filter_cache.retrain_or_keep_model( + &global_features_nums_except_level_0, &global_level_recorder, + &global_segment_ranges_recorder, &global_unit_size_recorder); + // global_filter_cache.make_adjustment(); + #ifdef SAMPLES_FILE + std::ifstream input(SAMPLES_FILE); + assert(input.is_open()); + + std::string art_key; + uint32_t key_count = 0; + while (std::getline(input, art_key)) { + global_filter_cache.prepare_heat_buckets(art_key, &global_segment_info_recorder); + } + assert(global_filter_cache.range_seperators().size() > 0); // heat buckets must be ready + // std::cout << "seperators size: " << global_filter_cache.range_seperators().size() << std::endl; + // for (std::string &seperator : global_filter_cache.range_seperators()) { + // std::cout << seperator << std::endl; + // } + #endif } - - segment_ranges_recorder_ = new std::map>; - - unit_size_recorder_ =
new std::map; - - filter_cache_.retrain_or_keep_model(features_nums_except_level_0_, - level_recorder_, - segment_ranges_recorder_, - unit_size_recorder_); - filter_cache_.make_adjustment(); #endif // !batch_per_trx_ implies seq_per_batch_ because it is only unset for // WriteUnprepared, which should use seq_per_batch_. @@ -1691,6 +1702,9 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, get_impl_options.column_family); auto cfd = cfh->cfd(); + // WaLSM+: update cfd pointer for future use + global_filter_cache.update_cfd_ptr_if_needed(cfd); + if (tracer_) { // TODO: This mutex should be removed later, to improve performance when // tracing is enabled. @@ -1777,7 +1791,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, #ifdef ART #ifdef ART_PLUS std::string art_key(key.data(), key.size()); - filter_cache_.get_updating_work(art_key); + global_filter_cache.hit_heat_buckets(art_key); // ready to estimate hotness, update heat buckets /* if (heat_buckets_.is_ready()) { @@ -1803,7 +1817,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, // only one thread can train model. if (need_train) { std::fstream f_model; - f_model.open("/pg_wal/ycc/model.log", std::ios::out | std::ios::app); + f_model.open("/home/guoteng_20241228_135/WaLSM+/log/model.log", std::ios::out | std::ios::app); f_model << "[DEBUG] try to train models" << std::endl; f_model << "[DEBUG] period_cnt_ : " << period_cnt_ << std::endl; f_model << "[DEBUG] PERIOD_COUNT : " << PERIOD_COUNT << std::endl; @@ -1908,7 +1922,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, get_impl_options.get_value); #else sv->current->Get( - filter_cache_, + global_filter_cache, read_options, lkey, get_impl_options.value, timestamp, &s, &merge_context, &max_covering_tombstone_seq, get_impl_options.get_value ? 
get_impl_options.value_found : nullptr, diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 32209ea75..7d411ba62 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -26,7 +26,6 @@ #include "db/art/vlog_manager.h" #include "db/art/heat_buckets.h" #include "db/art/clf_model.h" -#include "db/art/filter_cache_item.h" #include "db/art/filter_cache_heap.h" #include "db/art/filter_cache.h" #include "db/art/filter_cache_client.h" @@ -1907,54 +1906,7 @@ class DBImpl : public DB { HeatGroupManager* group_manager_; #ifdef ART_PLUS - // TODO: add necessary filter cache info structures - FilterCacheClient filter_cache_; // already contain FilterCacheManager - - // TODO: mutex for updating these recorders below - // will be locked when updating these recorders below, and unlock after updating ends - std::mutex filter_cache_mutex_; - - // these global recorders need to be latest after every flush or compaction: - // std::map* level_recorder_ - // std::map>* segment_ranges_recorder_ - // std::map* unit_size_recorder_ - // you may need filter_cache_.range_seperators() to receive key range seperators - // exactly, if key k < seperators[i+1] and key k >= seperators[i], then key k hit key range i - // HeatBuckets::locate(const std::string& key) will tell you how to binary search corresponding key range for one key - - // segment_info_recorder save every segments' min key and max key - // but we only need to pass empty segment_info_recorder now - // TODO: it should contain all levels segments' min key and max key, then pass to filter cache client, but not used now - // this recorder will help decide the key ranges' num, but it dont work in current work - // you can try to modify macro DEFAULT_BUCKETS_NUM to decide the key ranges' num - std::unordered_map>* segment_info_recorder_; - - // record every alive segments' level - // TODO: need to be latest all the time - std::map* level_recorder_; - - // record features num of every segments - // we choose max features num to define model feature num - // if you want to use a default features num, set MAX_FEATURES_NUM to non-zero value - // then do not insert any entry into this vector later - // TODO: we dont use this vector, so we set MAX_FEATURES_NUM to non-zero value - std::vector* features_nums_except_level_0_; - - // should be based level 0 visit cnt in a total long period - // simply we set level_0_base_count to 0, and use macro INIT_LEVEL_0_COUNT - // we can set this macro to ( PERIOD_COUNT * TRAIN_PERIODS ) * ( level 0 sorted runs num ) / ( max level 0 segments num ) - // TODO: modify INIT_LEVEL_0_COUNT to proper value - uint32_t level_0_base_count_; - - // record interacting ranges and their rates of alive segments - // TODO: should be latest all the time - std::map>* segment_ranges_recorder_; - - // every segment's filter unit size is the same - // this recorder should hold all alive segment - // simply, you can also use default macro DEFAULT_UNIT_SIZE for all segments, just leave this recorder empty - // TODO: modify DEFAULT_UNIT_SIZE - std::map* unit_size_recorder_; + // filter_cache context (moved to filter_cache_client.h/cc) /* HeatBuckets heat_buckets_; diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 846c262e6..df537005c 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -7,6 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
See the AUTHORS file for names of contributors. #include +#include +#include #include "db/art/logger.h" #include "db/builder.h" @@ -14,14 +16,18 @@ #include "db/error_handler.h" #include "db/event_helpers.h" #include "db/nvm_flush_job.h" +#include "db/table_cache.h" #include "file/sst_file_manager_impl.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" +#include "rocksdb/options.h" +#include "table/block_based/filter_block.h" #include "test_util/sync_point.h" #include "util/cast_util.h" #include "util/concurrent_task_limiter_impl.h" +#include "db/art/global_filter_cache_context.h" namespace ROCKSDB_NAMESPACE { @@ -2546,13 +2552,13 @@ void DBImpl::SyncCallFlush(std::vector& jobs) { // you may need filter_cache_.range_seperators() to receive key range seperators // exactly, if key k < seperators[i+1] and key k >= seperators[i], then key k hit key range i // HeatBuckets::locate(const std::string& key) will tell you how to binary search corresponding key range for one key - std::set* merged_segment_ids = new std::set; // the merged segments' id, we need to delete them from these 3 global recorders - std::map* new_level_recorder = new std::map; - std::map>* new_segment_ranges_recorder = new std::map>; - std::map* new_unit_size_recorder = new std::map; - std::vector& key_range_seperators = filter_cache_.range_seperators(); - std::set* new_segment_ids = new std::set; - std::map>* inherit_infos_recorder = new std::map>; + std::unique_ptr> merged_segment_ids (new std::set); // the merged segments' id, we need to delete them from these 3 global recorders + std::unique_ptr> new_level_recorder (new std::map); + std::unique_ptr>> new_segment_ranges_recorder (new std::map>); + std::unique_ptr> new_unit_size_recorder (new std::map); + std::vector& key_range_seperators = global_filter_cache.range_seperators(); + std::unique_ptr> new_segment_ids(new std::set); + std::unique_ptr>> inherit_infos_recorder (new std::map>); // TODO(WaLSM+): you can pass these var into NVMFlushJob and update them when flushing #endif @@ -2632,6 +2638,15 @@ void DBImpl::SyncCallFlush(std::vector& jobs) { } } + // // WaLSM+ debug + // for (auto& db_job : db_jobs) { + // auto& meta = db_job.nvm_flush_job->meta_; + // std::cout << "f, filename=" << meta.fd.GetNumber() + // << ", smallest=" << meta.smallest.user_key().ToString() + // << ", largest=" << meta.largest.user_key().ToString() + // << std::endl; + // } + TEST_SYNC_POINT("DBImpl::SyncCallFlush:FlushFinish:0"); ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); @@ -2657,80 +2672,114 @@ void DBImpl::SyncCallFlush(std::vector& jobs) { } TEST_SYNC_POINT("DBImpl::SyncCallFlush:ContextCleanedUp"); + atomic_flush_install_cv_.SignalAll(); + bg_cv_.SignalAll(); + + // sync first, we leave other data collection work at last. 
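The temp recorders above switch from raw new/delete pairs to std::unique_ptr, which is why the matching delete block disappears at the end of this function. The ownership pattern, reduced to its essentials (value types simplified to the ones that are certain from usage; the real recorders also carry range and inherit info):

#include <cstdint>
#include <map>
#include <memory>
#include <set>

void BuildTempRecordersSketch() {
  // owned for the duration of one flush; released automatically on every
  // early-return and error path, so no trailing delete block is needed
  std::unique_ptr<std::set<uint32_t>> merged_segment_ids(new std::set<uint32_t>());
  std::unique_ptr<std::map<uint32_t, uint16_t>> new_level_recorder(
      new std::map<uint32_t, uint16_t>());

  new_level_recorder->insert(std::make_pair(uint32_t{42}, uint16_t{0}));  // e.g. segment 42 flushed to L0
  // ... pass *merged_segment_ids / *new_level_recorder by reference as the patch does ...
}  // both recorders freed here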
+ // std::vector<SegmentBuilderResult> segment_builder_results; + SegmentBuilderResult agg_segment_builder_result; + for (auto& db_job : db_jobs) { + // segment_builder_results.emplace_back(std::move( + // db_job.nvm_flush_job->segment_builder_result_)); + auto& sub_result = db_job.nvm_flush_job->segment_builder_result_; + agg_segment_builder_result.new_segment_ids.insert( + sub_result.new_segment_ids.begin(), + sub_result.new_segment_ids.end()); + + agg_segment_builder_result.merged_segment_ids.insert( + sub_result.merged_segment_ids.begin(), + sub_result.merged_segment_ids.end()); + + for (auto& per_segment_result : sub_result.per_segment_results) { + agg_segment_builder_result.per_segment_results.push_back( + std::move(per_segment_result)); + } + } + agg_segment_builder_result.output_level = 0; // flushed + assert(agg_segment_builder_result.new_segment_ids.size() > 0); + // insert all filter block handles to FilterCache + // TableCache* table_cache = (TableCache*) table_cache_.get(); + for (auto& db_job : db_jobs) { + auto& meta = db_job.nvm_flush_job->meta_; + assert(meta.fd.table_reader != nullptr); + const auto* table = meta.fd.table_reader; + auto block_handles_map = table->GetSegmentBlockHandles(); + assert(block_handles_map.size() > 0); + for (const auto& segment_id_and_block_handles : block_handles_map) { + auto segment_id = segment_id_and_block_handles.first; + const auto& block_handles = segment_id_and_block_handles.second; + // dangerous cast, but we know that the table is a BlockBasedTable + global_filter_cache.init_segment(segment_id, (BlockBasedTable*) table, block_handles); + } + } + for (auto& db_job : db_jobs) { num_running_flushes_--; delete db_job.nvm_flush_job; } - atomic_flush_install_cv_.SignalAll(); - bg_cv_.SignalAll(); #ifdef ART_PLUS + // transfer agg_segment_builder_result to temp recorders + + // update merged_segment_ids and new_segment_ids + // agg_segment_builder_result.merged_segment_ids should only have one element - INVALID_SEGMENT_ID + assert(agg_segment_builder_result.merged_segment_ids.size() <= 1); + assert(merged_segment_ids->empty()); + + for (const auto& id : agg_segment_builder_result.new_segment_ids) { + new_segment_ids->insert(id); + } + + // update new_level_recorder + for (const auto id : agg_segment_builder_result.new_segment_ids) { + new_level_recorder->insert(std::make_pair(id, agg_segment_builder_result.output_level)); + } + + // update new_segment_ranges_recorder and inherit_infos_recorder + for (const auto& per_segment_result : agg_segment_builder_result.per_segment_results) { + const auto segment_id = per_segment_result.segment_id; + (*new_segment_ranges_recorder)[segment_id] = per_segment_result.range_rate_pairs; + // no inherit_info when flushing + } + // do the new SSTs already exist in the latest version? // TODO(WaLSM+): if all ok, merge temp recorders into global DBImpl recorders.
// we need a mutex to guarantee these recorders are modified by only one background thread at a time - filter_cache_mutex_.lock(); // std::map merged_level_recorder; // actually when flushing, there is no merged segment // remove merged segments - assert(merged_segment_ids->empty()); - /* - auto level_it = level_recorder_->begin(); - auto range_it = segment_ranges_recorder_->begin(); - auto units_it = unit_size_recorder_->begin(); - while (level_it != level_recorder_->end()) { - if (merged_segment_ids->count(level_it->first) > 0) { - merged_level_recorder.insert(std::make_pair(level_it->first, level_it->second)) - level_it = level_recorder_->erase(level_it); - } else { - level_it ++; + // lock and update global recorders + { + std::lock_guard<std::mutex> lock_guard(global_filter_cache_recorders_mutex); + assert(new_level_recorder->size() == new_segment_ranges_recorder->size()); + auto new_level_it = new_level_recorder->begin(); + auto new_range_it = new_segment_ranges_recorder->begin(); + auto new_units_it = new_unit_size_recorder->begin(); + while (new_level_it != new_level_recorder->end()) { + global_level_recorder.insert( + std::make_pair(new_level_it->first, new_level_it->second)); + new_level_it++; } - } - while (range_it != segment_ranges_recorder_->end()) { - if (merged_segment_ids->count(range_it->first) > 0) { - range_it = segment_ranges_recorder_->erase(range_it); - } else { - range_it ++; + while (new_range_it != new_segment_ranges_recorder->end()) { + global_segment_ranges_recorder.insert( + std::make_pair(new_range_it->first, new_range_it->second)); + new_range_it++; } - } - while (units_it != unit_size_recorder_->end()) { - if (merged_segment_ids->count(units_it->first) > 0) { - units_it = unit_size_recorder_->erase(units_it); - } else { - units_it ++; + while (new_units_it != new_unit_size_recorder->end()) { + // unit_size_recorder_.insert(std::make_pair(new_units_it->first, + // new_units_it->second)); we only use DEFAULT_UNIT_SIZE + new_units_it++; } } - */ + // recorder's lock released - // lock and update global recorders - global_recorder_mutex_.lock(); - // merge temp recorders into global DBImpl recorders.
- assert(new_level_recorder->size() == new_segment_ranges_recorder->size()); - auto new_level_it = new_level_recorder->begin(); - auto new_range_it = new_segment_ranges_recorder->begin(); - auto new_units_it = new_unit_size_recorder->begin(); - while (new_level_it != new_level_recorder->end()) { - level_recorder_.insert(std::make_pair(new_level_it->first, new_level_it->second)); - new_level_it ++; - } - while (new_range_it != new_segment_ranges_recorder->end()) { - segment_ranges_recorder_.insert(std::make_pair(new_range_it->first, new_range_it->second)); - new_range_it ++; - } - while (new_units_it != new_unit_size_recorder->end()) { - // unit_size_recorder_.insert(std::make_pair(new_units_it->first, new_units_it->second)); - // we only use DEFAULT_UNIT_SIZE - new_units_it ++; - } - global_recorder_mutex_.unlock(); - // call filter cache client DBImpl::filter_cache_ update work assert(merged_segment_ids->empty()); assert(inherit_infos_recorder->empty()); std::vector merged_segment_ids_vec, new_segment_ids_vec; - merged_segment_ids_vec.assign(merged_segment_ids.begin(), merged_segment_ids.end()); - new_segment_ids_vec.assign(new_segment_ids.begin(), new_segment_ids.end()); - filter_cache_.batch_insert_segments(merged_segment_ids_vec, new_segment_ids_vec, *inherit_infos_recorder, + merged_segment_ids_vec.assign(merged_segment_ids->begin(), merged_segment_ids->end()); + new_segment_ids_vec.assign(new_segment_ids->begin(), new_segment_ids->end()); + global_filter_cache.batch_insert_segments(merged_segment_ids_vec, new_segment_ids_vec, *inherit_infos_recorder.get(), *new_level_recorder, 0, *new_segment_ranges_recorder); // temp recorders below: @@ -2745,16 +2794,6 @@ void DBImpl::SyncCallFlush(std::vector& jobs) { // std::map* level_recorder_ // std::map>* segment_ranges_recorder_ // std::map* unit_size_recorder_ - - // release temp recorders? 
- delete merged_segment_ids; - delete new_level_recorder; - delete new_segment_ranges_recorder; - delete new_unit_size_recorder; - delete new_segment_ids; - delete inherit_infos_recorder; - - filter_cache_mutex_.unlock(); #endif } } @@ -3080,24 +3119,38 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, // you may need filter_cache_.range_seperators() to receive key range seperators // exactly, if key k < seperators[i+1] and key k >= seperators[i], then key k hit key range i // HeatBuckets::locate(const std::string& key) will tell you how to binary search corresponding key range for one key - std::set* merged_segment_ids = new std::set; // the merged segments' id, we need to delete them from these 3 global recorders - std::map* new_level_recorder = new std::map; - std::map>* new_segment_ranges_recorder = new std::map>; - std::map* new_unit_size_recorder = new std::map; - std::vector& key_range_seperators = filter_cache_.range_seperators(); - std::set* new_segment_ids = new std::set; - std::map>* inherit_infos_recorder = new std::map>; + std::unique_ptr> merged_segment_ids( + new std::set); // the merged segments' id, we need to delete + // them from these 3 global recorders + std::unique_ptr> new_level_recorder( + new std::map); + std::unique_ptr>> + new_segment_ranges_recorder( + new std::map>); + std::unique_ptr> new_unit_size_recorder( + new std::map); + const std::vector& key_range_seperators = + global_filter_cache.range_seperators(); + std::unique_ptr> new_segment_ids(new std::set); + std::unique_ptr>> + inherit_infos_recorder( + new std::map>); // TODO(WaLSM+): you can pass these var into NVMFlushJob and update them when compacting int compaction_flag = 0; // 0 = not defined, 1 = delete compaction, 2 = trivial compaction, 3 = other #endif + // WaLSM+: result from compaction + SegmentBuilderResult segment_builder_result; + IOStatus io_s; if (!c) { // Nothing to do ROCKS_LOG_BUFFER(log_buffer, "Compaction nothing to do"); - } else if (c->deletion_compaction()) { + } else if (UNLIKELY(c->deletion_compaction())) { // TODO(icanadi) Do we want to honor snapshots here? i.e. 
not delete old
     // file if there is alive snapshot pointing to it
+    assert(false);  // cannot get here
+    exit(1);
     TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
                              c->column_family_data());
     assert(c->num_input_files(1) == 0);
@@ -3115,10 +3168,17 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
 #ifdef ART_PLUS
     compaction_flag = 1;
 #endif
-    /*
-    std::set* merged_segment_ids = new std::set;
-    // the merged segments' id, we need to delete them from these 3 global recorders
-    */
+
+    // the merged segments' ids; we need to delete them from these 3 global recorders
+    std::unique_ptr> merged_segment_ids_f1(new std::set);
+    for (const auto& f : *c->inputs(0)) {
+      auto segment_handles_map = f->fd.table_reader->GetSegmentBlockHandles();
+      for (const auto& segment_id_and_block_handles : segment_handles_map) {
+        auto segment_id = segment_id_and_block_handles.first;
+        merged_segment_ids_f1->insert(segment_id);
+      }
+    }
+
     for (const auto& f : *c->inputs(0)) {
       c->edit()->DeleteFile(c->level(), f->fd.GetNumber());
     }
@@ -3135,7 +3195,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
     *made_progress = true;
     TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
                              c->column_family_data());
-  } else if (!trivial_move_disallowed && c->IsTrivialMove()) {
+  } else if (UNLIKELY(!trivial_move_disallowed && c->IsTrivialMove())) {
+    assert(false);  // cannot get here
+    exit(1);
     TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove");
     TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
                              c->column_family_data());
@@ -3156,21 +3218,39 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
     int64_t moved_bytes = 0;
 #ifdef ART_PLUS
     compaction_flag = 2;  // sign for TrivialMove
+
+    // TODO(WaLSM+): no new SST is generated and no SST is merged; we just move segments (from different levels) to their target levels
+    // we can copy the moved segment ids into merged_segment_ids,
+    // then record these moved segments' new level in new_level_recorder
+    // maybe we need to record segment ids for every SST for convenience?
+
+    std::unique_ptr> merged_segment_ids_f2(new std::set);
+    // the merged segments' ids; we need to delete them from these 3 global recorders
+    std::unique_ptr> new_level_recorder_f2(new std::map);
+    auto output_level_f2 = c->output_level();
+
+    for (unsigned int l = 0; l < c->num_input_levels(); l++) {
+      if (c->level(l) == c->output_level()) {
+        continue;
+      }
+      for (size_t i = 0; i < c->num_input_files(l); i++) {
+        FileMetaData* f = c->input(l, i);
+        auto segment_handles_map = f->fd.table_reader->GetSegmentBlockHandles();
+        for (const auto& segment_id_and_block_handles : segment_handles_map) {
+          auto segment_id = segment_id_and_block_handles.first;
+          merged_segment_ids_f2->insert(segment_id);
+          (*new_level_recorder_f2)[segment_id] = output_level_f2;
+        }
+      }
+    }
+
 #endif
+
     for (unsigned int l = 0; l < c->num_input_levels(); l++) {
       if (c->level(l) == c->output_level()) {
         continue;
       }
       for (size_t i = 0; i < c->num_input_files(l); i++) {
-        // TODO(WaLSM+): no new SST generated and no SST merged, just move segments(from different levels) to target levels
-        // we can copy moved segment ids into merged_segment_ids.
-        // then record these moved segments' new level to new_level_recorder
-        // maybe we need to record segment ids for every SST for convience?
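
The trivial-move bookkeeping added above never rewrites data: a moved segment keeps its id and only its level changes. A hedged sketch of that contract, where GetSegmentIds() is a hypothetical stand-in for the table reader's GetSegmentBlockHandles() call:

    #include <cstdint>
    #include <map>
    #include <set>
    #include <vector>

    // Hypothetical helper: ids of the segments stored in one SST file.
    // Dummy body for illustration only.
    std::vector<uint32_t> GetSegmentIds(uint64_t file_number) {
      return {static_cast<uint32_t>(file_number * 10),
              static_cast<uint32_t>(file_number * 10 + 1)};
    }

    void RecordTrivialMove(const std::vector<uint64_t>& moved_files, int output_level,
                           std::set<uint32_t>* moved_segment_ids,
                           std::map<uint32_t, int>* new_level_recorder) {
      for (uint64_t file_number : moved_files) {
        for (uint32_t segment_id : GetSegmentIds(file_number)) {
          // Same segment id before and after the move; only the level changes.
          moved_segment_ids->insert(segment_id);
          (*new_level_recorder)[segment_id] = output_level;
        }
      }
    }
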
- /* - std::set* merged_segment_ids; - // the merged segments' id, we need to delete them from these 3 global recorders - std::map* new_level_recorder = new std::map; - */ FileMetaData* f = c->input(l, i); c->edit()->DeleteFile(c->level(l), f->fd.GetNumber()); c->edit()->AddFile(c->output_level(), f->fd.GetNumber(), @@ -3293,6 +3373,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, TEST_SYNC_POINT_CALLBACK( "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr); // Should handle erorr? + compaction_job.SetFilterCacheClient(&global_filter_cache); compaction_job.Run().PermitUncheckedError(); TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun"); mutex_.Lock(); @@ -3308,9 +3389,40 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction", c->column_family_data()); - + #ifdef ART_PLUS + if (status.ok() && !io_s.ok()) { + status = io_s; + } else { + io_s.PermitUncheckedError(); + } + + if (c != nullptr) { + c->ReleaseCompactionFiles(status); + *made_progress = true; + + #ifndef ROCKSDB_LITE + // Need to make sure SstFileManager does its bookkeeping + auto sfm = static_cast( + immutable_db_options_.sst_file_manager.get()); + if (sfm && sfm_reserved_compact_space) { + sfm->OnCompactionCompletion(c.get()); + } + #endif // ROCKSDB_LITE + + NotifyOnCompactionCompleted(c->column_family_data(), c.get(), status, + compaction_job_stats, job_context->job_id); + } + #endif + + compaction_job.CollectDataAndPrefetch(); + segment_builder_result = compaction_job.GetSegmentBuilderResult(); + compaction_job.CleanupCompaction(); + assert(segment_builder_result.new_segment_ids.size() > 0); } +#ifdef ART_PLUS +if (compaction_flag != 3) { +#endif if (status.ok() && !io_s.ok()) { status = io_s; } else { @@ -3333,7 +3445,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, NotifyOnCompactionCompleted(c->column_family_data(), c.get(), status, compaction_job_stats, job_context->job_id); } - +#ifdef ART_PLUS +} +#endif if (status.ok() || status.IsCompactionTooLarge() || status.IsManualCompactionPaused()) { // Done @@ -3424,39 +3538,40 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, // do new SSTs already exist in latest version? // TODO(WaLSM+): if all ok, merge temp recorders into global DBImpl recorders. 
// we need a mutex to guarantee these recorders modified by only one background thread at one time - filter_cache_mutex_.lock(); assert(compaction_flag >= 0 && compaction_flag <= 3); - if (compaction_flag == 1) { + if (UNLIKELY(compaction_flag == 1)) { + assert(false); // cannot get here + exit(1); // lock and update global recorders - global_recorder_mutex_.lock(); + global_filter_cache_recorders_mutex.lock(); // remove merged segments - auto level_it = level_recorder_->begin(); - auto range_it = segment_ranges_recorder_->begin(); - auto units_it = unit_size_recorder_->begin(); + auto level_it = global_level_recorder.begin(); + auto range_it = global_segment_ranges_recorder.begin(); + auto units_it = global_unit_size_recorder.begin(); std::map merged_level_recorder; - while (level_it != level_recorder_->end()) { + while (level_it != global_level_recorder.end()) { if (merged_segment_ids->count(level_it->first) > 0) { merged_level_recorder.insert(std::make_pair(level_it->first, level_it->second)); - level_it = level_recorder_->erase(level_it); + level_it = global_level_recorder.erase(level_it); } else { level_it ++; } } - while (range_it != segment_ranges_recorder_->end()) { + while (range_it != global_segment_ranges_recorder.end()) { if (merged_segment_ids->count(range_it->first) > 0) { - range_it = segment_ranges_recorder_->erase(range_it); + range_it = global_segment_ranges_recorder.erase(range_it); } else { range_it ++; } } - while (units_it != unit_size_recorder_->end()) { + while (units_it != global_unit_size_recorder.end()) { if (merged_segment_ids->count(units_it->first) > 0) { - units_it = unit_size_recorder_->erase(units_it); + units_it = global_unit_size_recorder.erase(units_it); } else { units_it ++; } } - global_recorder_mutex_.unlock(); + global_filter_cache_recorders_mutex.unlock(); // merge merge temp recorders into global DBImpl recorders. assert(new_level_recorder->empty()); @@ -3488,8 +3603,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, // new segments id empty, that will not fit in batch_insert_segments // we need a new method batch_delete_segments to only delete merge segments std::vector merged_segment_ids_vec; - merged_segment_ids_vec.assign(merged_segment_ids.begin(), merged_segment_ids.end()); - filter_cache_.batch_delete_segments(merged_segment_ids_vec, merged_level_recorder); + merged_segment_ids_vec.assign(merged_segment_ids->begin(), merged_segment_ids->end()); + global_filter_cache.batch_delete_segments(merged_segment_ids_vec); // temp recorders below: // std::set* merged_segment_ids = new std::set; // the merged segments' id, we need to delete them from these 3 global recorders @@ -3503,42 +3618,36 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, // std::map* level_recorder_ // std::map>* segment_ranges_recorder_ // std::map* unit_size_recorder_ - - // release temp recorders? 
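
The recorder-pruning loops above all lean on the same idiom: std::map::erase(iterator) returns the iterator to the element after the erased one, which keeps the traversal valid while elements are removed. A minimal sketch, assuming plain uint32_t segment ids:

    #include <cstdint>
    #include <map>
    #include <set>

    void PruneMergedSegments(std::map<uint32_t, int>& level_recorder,
                             const std::set<uint32_t>& merged_segment_ids) {
      for (auto it = level_recorder.begin(); it != level_recorder.end();) {
        if (merged_segment_ids.count(it->first) > 0) {
          it = level_recorder.erase(it);  // erase() hands back the next valid iterator
        } else {
          ++it;
        }
      }
    }
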
- delete merged_segment_ids; - delete new_level_recorder; - delete new_segment_ranges_recorder; - delete new_unit_size_recorder; - delete new_segment_ids; - delete inherit_infos_recorder; - - } else if (compaction_flag == 2) { + } else if (UNLIKELY(compaction_flag == 2)) { + assert(false); // cannot get here + exit(1); // lock and update global recorders - global_recorder_mutex_.lock(); + global_filter_cache_recorders_mutex.lock(); // modify segments' level - auto level_it = level_recorder_->begin(); - auto range_it = segment_ranges_recorder_->begin(); + auto level_it = global_level_recorder.begin(); + auto range_it = global_segment_ranges_recorder.begin(); assert(new_level_recorder->size() > 0); assert(merged_segment_ids->size() == new_level_recorder->size()); std::map old_level_recorder; - while (level_it != level_recorder_->end()) { + while (level_it != global_level_recorder.end()) { if (merged_segment_ids->count(level_it->first) > 0) { old_level_recorder.insert(std::make_pair(level_it->first, level_it->second)); - level_it = level_recorder_->erase(level_it); + level_it = global_level_recorder.erase(level_it); } else { level_it ++; } } - while (range_it != segment_ranges_recorder_->end()) { + while (range_it != global_segment_ranges_recorder.end()) { if (merged_segment_ids->count(range_it->first) > 0) { - new_segment_ranges_recorder->insert(std::make_pair(range_it->first, range->second)); - range_it = segment_ranges_recorder_->erase(range_it); + new_segment_ranges_recorder->insert(std::make_pair(range_it->first, range_it->second)); + // no need to erase + // range_it = global_segment_ranges_recorder.erase(range_it); } else { range_it ++; } } - assert(unit_size_recorder_->empty()); + assert(new_unit_size_recorder->empty()); /* while (units_it != unit_size_recorder_->end()) { if (merged_segment_ids->count(units_it->first) > 0) { @@ -3551,27 +3660,27 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, assert(new_level_recorder->size() == new_segment_ranges_recorder->size()); auto new_level_it = new_level_recorder->begin(); - auto new_range_it = new_segment_ranges_recorder->begin(); + // auto new_range_it = new_segment_ranges_recorder->begin(); auto new_units_it = new_unit_size_recorder->begin(); while (new_level_it != new_level_recorder->end()) { - level_recorder_->insert(std::make_pair(new_level_it->first, new_level_it->second)); + global_level_recorder.insert(std::make_pair(new_level_it->first, new_level_it->second)); new_level_it ++; } - while (new_range_it != new_segment_ranges_recorder->end()) { - segment_ranges_recorder_->insert(std::make_pair(new_range_it->first, new_range_it->second)); - new_range_it ++; - } + // while (new_range_it != new_segment_ranges_recorder->end()) { + // global_segment_ranges_recorder.insert(std::make_pair(new_range_it->first, new_range_it->second)); + // new_range_it ++; + // } while (new_units_it != new_unit_size_recorder->end()) { // unit_size_recorder_.insert(std::make_pair(new_units_it->first, new_units_it->second)); new_units_it ++; } - global_recorder_mutex_.unlock(); + global_filter_cache_recorders_mutex.unlock(); // call filter cache client DBImpl::filter_cache_ update work // we need a new filter cache operation to support moving segments to a new level std::vector merged_segment_ids_vec; - merged_segment_ids_vec.assign(merged_segment_ids.begin(), merged_segment_ids.end()); - filter_cache_.batch_move_segments(merged_segment_ids_vec, old_level_recorder, *new_level_recorder, *new_segment_ranges_recorder); + 
merged_segment_ids_vec.assign(merged_segment_ids->begin(), merged_segment_ids->end()); + global_filter_cache.batch_move_segments(merged_segment_ids_vec, old_level_recorder, *new_level_recorder, *new_segment_ranges_recorder); // temp recorders below: // std::set* merged_segment_ids = new std::set; // the merged segments' id, we need to delete them from these 3 global recorders @@ -3585,44 +3694,58 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, // std::map* level_recorder_ // std::map>* segment_ranges_recorder_ // std::map* unit_size_recorder_ + } else if (LIKELY(compaction_flag == 3)) { + // get SegmentBuilderResult from compaction job + + // update merged_segment_ids and new_segment_ids + for (const auto& id : segment_builder_result.merged_segment_ids) { + merged_segment_ids->insert(id); + } + for (const auto& id : segment_builder_result.new_segment_ids) { + new_segment_ids->insert(id); + } + + // update new_level_recorder + for (const auto id : segment_builder_result.new_segment_ids) { + new_level_recorder->insert(std::make_pair(id, segment_builder_result.output_level)); + } + + // update new_segment_ranges_recorder and inherit_infos_recorder + for (const auto& per_segment_result : segment_builder_result.per_segment_results) { + const auto segment_id = per_segment_result.segment_id; + (*new_segment_ranges_recorder)[segment_id] = per_segment_result.range_rate_pairs; + (*inherit_infos_recorder)[segment_id] = per_segment_result.inherit_recorder; + } - // release temp recorders? - delete merged_segment_ids; - delete new_level_recorder; - delete new_segment_ranges_recorder; - delete new_unit_size_recorder; - delete new_segment_ids; - delete inherit_infos_recorder; - } else if (compaction_flag == 3) { // it is normal compaction (merge->split) std::map merged_level_recorder; // lock and update global recorders - global_recorder_mutex_.lock(); + global_filter_cache_recorders_mutex.lock(); // remove merged segments assert(!(merged_segment_ids->empty())); - auto level_it = level_recorder_->begin(); - auto range_it = segment_ranges_recorder_->begin(); - auto units_it = unit_size_recorder_->begin(); - while (level_it != level_recorder_->end()) { + auto level_it = global_level_recorder.begin(); + auto range_it = global_segment_ranges_recorder.begin(); + auto units_it = global_unit_size_recorder.begin(); + while (level_it != global_level_recorder.end()) { if (merged_segment_ids->count(level_it->first) > 0) { - merged_level_recorder.insert(std::make_pair(level_it->first, level_it->second)) - level_it = level_recorder_->erase(level_it); + merged_level_recorder.insert(std::make_pair(level_it->first, level_it->second)); + level_it = global_level_recorder.erase(level_it); } else { level_it ++; } } - while (range_it != segment_ranges_recorder_->end()) { + while (range_it != global_segment_ranges_recorder.end()) { if (merged_segment_ids->count(range_it->first) > 0) { - range_it = segment_ranges_recorder_->erase(range_it); + range_it = global_segment_ranges_recorder.erase(range_it); } else { range_it ++; } } - while (units_it != unit_size_recorder_->end()) { + while (units_it != global_unit_size_recorder.end()) { if (merged_segment_ids->count(units_it->first) > 0) { - units_it = unit_size_recorder_->erase(units_it); + units_it = global_unit_size_recorder.erase(units_it); } else { units_it ++; } @@ -3635,11 +3758,11 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, auto new_range_it = new_segment_ranges_recorder->begin(); auto new_units_it = new_unit_size_recorder->begin(); while 
(new_level_it != new_level_recorder->end()) {
-        level_recorder_.insert(std::make_pair(new_level_it->first, new_level_it->second));
+        global_level_recorder.insert(std::make_pair(new_level_it->first, new_level_it->second));
         new_level_it ++;
       }
       while (new_range_it != new_segment_ranges_recorder->end()) {
-        segment_ranges_recorder_.insert(std::make_pair(new_range_it->first, new_range_it->second));
+        global_segment_ranges_recorder.insert(std::make_pair(new_range_it->first, new_range_it->second));
         new_range_it ++;
       }
       while (new_units_it != new_unit_size_recorder->end()) {
@@ -3647,24 +3770,16 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
         // we only use DEFAULT_UNIT_SIZE
         new_units_it ++;
       }
-      global_recorder_mutex_.unlock();
-
-      // make sure that we also input merged segments' level
-      // batch_insert_segments argument need both merged and new segments' level
-      auto merged_it = merged_level_recorder.begin();
-      while (merged_it != merged_level_recorder.end()) {
-        assert(new_level_recorder->find(merged_it->first) == new_level_recorder.end());
-        new_level_recorder->insert(std::make_pair(merged_it->first, merged_it->second));
-        merged_it ++;
-      }
-      assert(new_level_recorder->size() == new_segment_ids->size() + merged_segment_ids->size());
+      global_filter_cache_recorders_mutex.unlock();
+      // there are no merged segments' ids in new_level_recorder
+      assert(new_level_recorder->size() == new_segment_ids->size());
       // call filter cache client DBImpl::filter_cache_ update work
       assert(inherit_infos_recorder->size() == new_segment_ids->size());
       std::vector merged_segment_ids_vec, new_segment_ids_vec;
-      merged_segment_ids_vec.assign(merged_segment_ids.begin(), merged_segment_ids.end());
-      new_segment_ids_vec.assign(new_segment_ids.begin(), new_segment_ids.end());
-      filter_cache_.batch_insert_segments(merged_segment_ids_vec, new_segment_ids_vec, *inherit_infos_recorder,
+      merged_segment_ids_vec.assign(merged_segment_ids->begin(), merged_segment_ids->end());
+      new_segment_ids_vec.assign(new_segment_ids->begin(), new_segment_ids->end());
+      global_filter_cache.batch_insert_segments(merged_segment_ids_vec, new_segment_ids_vec, *inherit_infos_recorder,
                                                 *new_level_recorder, 0, *new_segment_ranges_recorder);
       // temp recorders below:
@@ -3679,19 +3794,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
       // std::map* level_recorder_
       // std::map>* segment_ranges_recorder_
       // std::map* unit_size_recorder_
-
-      // release temp recorders?
-      delete merged_segment_ids;
-      delete new_level_recorder;
-      delete new_segment_ranges_recorder;
-      delete new_unit_size_recorder;
-      delete new_segment_ids;
-      delete inherit_infos_recorder;
-
     } else {
       assert(compaction_flag == 0);
     }
-    filter_cache_mutex_.unlock();
 #endif
   return status;
 }
diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc
index 7838c62b2..938f36418 100644
--- a/db/db_impl/db_impl_write.cc
+++ b/db/db_impl/db_impl_write.cc
@@ -15,6 +15,7 @@
 #include "options/options_helper.h"
 #include "test_util/sync_point.h"
 #include "util/cast_util.h"
+#include "db/art/global_filter_cache_context.h"

 namespace ROCKSDB_NAMESPACE {
 // Convenience methods
@@ -24,8 +25,13 @@ Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family,
 #ifdef ART_PLUS
   // heat_buckets not ready, still sample into pool
   // if ready, prepare func auto return and do nothing
+  #ifndef SAMPLES_FILE
+  // if a key sample file is used, do not use this code path.
+ assert(false); std::string art_key(key.data(), key.size()); - filter_cache_.prepare_heat_buckets(art_key, segment_info_recorder_); + global_filter_cache.prepare_heat_buckets(art_key, &global_segment_info_recorder); + + #endif #endif return DB::Put(o, column_family, key, val); } diff --git a/db/db_test3.cc b/db/db_test3.cc index c616a1b0f..7e11659fa 100644 --- a/db/db_test3.cc +++ b/db/db_test3.cc @@ -373,7 +373,7 @@ void DoTest(std::string test_name) { options.use_direct_io_for_flush_and_compaction = true; options.use_direct_reads = true; options.enable_pipelined_write = true; - options.nvm_path = "/pg_wal/ycc/memory_art"; + options.nvm_path = "/mnt/pmem0.7/guoteng/nodememory"; options.compression = rocksdb::kNoCompression; options.IncreaseParallelism(16); @@ -382,7 +382,7 @@ void DoTest(std::string test_name) { zipf->Prepare(); DB* db; - DB::Open(options, "/tmp/db_old_custom", &db); + DB::Open(options, "/mnt/nvme0n1/guoteng/walsmtest/tmp/db_old_custom", &db); std::thread read_threads[thread_num]; std::thread write_threads[thread_num]; diff --git a/db/dbformat.h b/db/dbformat.h index 38fea61ed..7aa425836 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -9,6 +9,7 @@ #pragma once #include +#include #include #include #include @@ -16,6 +17,7 @@ #include "db/merge_context.h" #include "logging/logging.h" #include "monitoring/perf_context_imp.h" +#include "port/port_posix.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/filter_policy.h" @@ -98,6 +100,8 @@ static const SequenceNumber kDisableGlobalSequenceNumber = port::kMaxUint64; constexpr uint64_t kNumInternalBytes = 8; +constexpr uint32_t INVALID_SEGMENT_ID = port::kMaxUint32; + // The data structure that represents an internal key in the way that user_key, // sequence number and type are stored in separated forms. struct ParsedInternalKey { @@ -174,11 +178,21 @@ inline Slice ExtractUserKey(const Slice& internal_key) { return Slice(internal_key.data(), internal_key.size() - kNumInternalBytes); } +#ifdef ART_PLUS +Slice generate_modified_internal_key(std::unique_ptr& buf, + Slice original_internal_key, + int filter_index, int segment_id); + +Slice generate_modified_user_key(std::unique_ptr& buf, + Slice original_user_key, int filter_index, + int segment_id); +#endif + #ifdef ART_PLUS // Returns the internal bytes portion of an internal key. 
(WaLSM+) inline Slice ExtractInternalBytes(const Slice& internal_key) { assert(internal_key.size() >= kNumInternalBytes); - return Slice(internal_key.data() + internal_key.size(), kNumInternalBytes); + return Slice(internal_key.data() + internal_key.size() - kNumInternalBytes, kNumInternalBytes); } #endif diff --git a/db/merge_helper.cc b/db/merge_helper.cc index 61ff22506..fff01e981 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -5,6 +5,7 @@ #include "db/merge_helper.h" +#include #include #include "db/dbformat.h" @@ -122,6 +123,7 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, assert(HasOperator()); keys_.clear(); merge_context_.Clear(); + segment_ids_.clear(); has_compaction_filter_skip_until_ = false; assert(user_merge_operator_); bool first_key = true; @@ -134,6 +136,7 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, // original_key_is_iter == (iter->key().ToString() == original_key) bool original_key_is_iter = true; std::string original_key = iter->key().ToString(); + uint32_t original_segment_id = iter->segment_id(); // Important: // orig_ikey is backed by original_key if keys_.empty() // orig_ikey is backed by keys_.back() if !keys_.empty() @@ -220,12 +223,15 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, if (s.ok()) { // The original key encountered original_key = std::move(keys_.back()); + original_segment_id = segment_ids_.back(); orig_ikey.type = kTypeValue; UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type); keys_.clear(); merge_context_.Clear(); + segment_ids_.clear(); keys_.emplace_front(std::move(original_key)); merge_context_.PushOperand(merge_result); + segment_ids_.push_front(original_segment_id); } // move iter to the next entry @@ -262,8 +268,10 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, if (original_key_is_iter) { // this is just an optimization that saves us one memcpy keys_.push_front(std::move(original_key)); + segment_ids_.push_front(original_segment_id); } else { keys_.push_front(iter->key().ToString()); + segment_ids_.push_front(iter->segment_id()); } if (keys_.size() == 1) { // we need to re-anchor the orig_ikey because it was anchored by @@ -285,6 +293,7 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, // (not just this operand), along with some keys following it. keys_.clear(); merge_context_.Clear(); + segment_ids_.clear(); has_compaction_filter_skip_until_ = true; return s; } @@ -329,11 +338,14 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, // We are certain that keys_ is not empty here (see assertions couple of // lines before). 
original_key = std::move(keys_.back()); + original_segment_id = segment_ids_.back(); orig_ikey.type = kTypeValue; UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type); keys_.clear(); merge_context_.Clear(); + segment_ids_.clear(); keys_.emplace_front(std::move(original_key)); + segment_ids_.push_front(original_segment_id); merge_context_.PushOperand(merge_result); } } else { @@ -362,6 +374,7 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, merge_context_.Clear(); merge_context_.PushOperand(merge_result); keys_.erase(keys_.begin(), keys_.end() - 1); + segment_ids_.erase(segment_ids_.begin(), segment_ids_.end() - 1); } } } @@ -373,19 +386,24 @@ MergeOutputIterator::MergeOutputIterator(const MergeHelper* merge_helper) : merge_helper_(merge_helper) { it_keys_ = merge_helper_->keys().rend(); it_values_ = merge_helper_->values().rend(); + it_segment_ids_ = merge_helper_->segment_ids().rend(); } void MergeOutputIterator::SeekToFirst() { const auto& keys = merge_helper_->keys(); const auto& values = merge_helper_->values(); + const auto& segment_ids = merge_helper_->segment_ids(); assert(keys.size() == values.size()); + assert(keys.size() == segment_ids.size()); it_keys_ = keys.rbegin(); it_values_ = values.rbegin(); + it_segment_ids_ = segment_ids.rbegin(); } void MergeOutputIterator::Next() { ++it_keys_; ++it_values_; + ++it_segment_ids_; } CompactionFilter::Decision MergeHelper::FilterMerge(const Slice& user_key, diff --git a/db/merge_helper.h b/db/merge_helper.h index c0534f08b..89b002cc6 100644 --- a/db/merge_helper.h +++ b/db/merge_helper.h @@ -120,6 +120,7 @@ class MergeHelper { } uint64_t TotalFilterTime() const { return total_filter_time_; } bool HasOperator() const { return user_merge_operator_ != nullptr; } + const std::deque& segment_ids() const { return segment_ids_; } // If compaction filter returned REMOVE_AND_SKIP_UNTIL, this method will // return true and fill *until with the key to which we should skip. 
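
The segment_ids_ deque introduced above is a third container kept strictly parallel with keys_ and the merge operands: every clear, push_front, and erase applied to keys_ is mirrored on segment_ids_, and MergeOutputIterator advances all of its reverse iterators together. A minimal sketch of that lockstep invariant (element types simplified; not the fork's actual classes):

    #include <cassert>
    #include <cstdint>
    #include <deque>
    #include <string>

    struct ParallelMergeState {
      std::deque<std::string> keys;
      std::deque<uint32_t> segment_ids;  // parallel with keys

      void PushFront(std::string key, uint32_t segment_id) {
        keys.push_front(std::move(key));
        segment_ids.push_front(segment_id);  // mirror every mutation of keys
      }

      void Clear() {
        keys.clear();
        segment_ids.clear();
      }
    };

    int main() {
      ParallelMergeState state;
      state.PushFront("k2", 7);
      state.PushFront("k1", 3);
      assert(state.keys.size() == state.segment_ids.size());
      // Emit entries the way MergeOutputIterator does: walk both
      // containers in reverse, advancing the iterators together.
      auto kit = state.keys.rbegin();
      auto sit = state.segment_ids.rbegin();
      for (; kit != state.keys.rend(); ++kit, ++sit) {
        // *kit is paired with *sit
      }
      return 0;
    }
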
@@ -155,6 +156,7 @@ class MergeHelper {
   std::deque keys_;
   // Parallel with keys_; stores the operands
   mutable MergeContext merge_context_;
+  std::deque segment_ids_;

   StopWatchNano filter_timer_;
   uint64_t total_filter_time_;
@@ -183,12 +185,14 @@ class MergeOutputIterator {
   Slice key() { return Slice(*it_keys_); }
   Slice value() { return Slice(*it_values_); }
+  uint32_t segment_id() { return *it_segment_ids_; }
   bool Valid() { return it_keys_ != merge_helper_->keys().rend(); }

 private:
   const MergeHelper* merge_helper_;
   std::deque::const_reverse_iterator it_keys_;
   std::vector::const_reverse_iterator it_values_;
+  std::deque::const_reverse_iterator it_segment_ids_;
 };

 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/nvm_flush_job.cc b/db/nvm_flush_job.cc
index 2059f2afb..ce592f008 100644
--- a/db/nvm_flush_job.cc
+++ b/db/nvm_flush_job.cc
@@ -15,6 +15,7 @@
 #include

 #include "db/art/logger.h"
+#include "db/art/art_metric.h"
 #include "db/builder.h"
 #include "db/db_iter.h"
 #include "db/dbformat.h"
@@ -50,6 +51,9 @@

 namespace ROCKSDB_NAMESPACE {

+static FlushMetric flushMetric_;
+// static NVMWriteMetric writeMetric_;
+
 // WaLSM+ Note: copy of FlushJob, add nvm reading
 NVMFlushJob::NVMFlushJob(SingleCompactionJob* job, const std::string& dbname,
                          ColumnFamilyData* cfd,
@@ -200,7 +204,12 @@ void NVMFlushJob::Build() {
       io_status_ = io_s;
     }
     LogFlush(db_options_.info_log);
+    segment_builder_result_ = std::move(job_->segment_builder_result);
   }
+
+  // update WaLSM Flush Metric
+  flushMetric_.updateMetric(job_->out_file_size);
+
   ROCKS_LOG_INFO(db_options_.info_log,
                  "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64
                  " bytes %s"
@@ -298,6 +307,9 @@ void NVMFlushJob::WriteResult(InternalStats::CompactionStats& stats) {
     stream << "file_cpu_read_nanos"
            << (IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos);
   }
+
+  // // update WaLSM write metric
+  // writeMetric_.updateMetric(stats.bytes_written);
 }

 void NVMFlushJob::Cancel() {
diff --git a/db/nvm_flush_job.h b/db/nvm_flush_job.h
index eadbc07ea..7795b8dbb 100644
--- a/db/nvm_flush_job.h
+++ b/db/nvm_flush_job.h
@@ -39,6 +39,7 @@
 #include "rocksdb/listener.h"
 #include "rocksdb/memtablerep.h"
 #include "rocksdb/transaction_log.h"
+#include "table/block_based/filter_block.h"
 #include "table/scoped_arena_iterator.h"
 #include "util/autovector.h"
 #include "util/stop_watch.h"
@@ -93,6 +94,7 @@ class NVMFlushJob {
   LogsWithPrepTracker* logs_with_prep_tracker_;

   FileMetaData meta_;
+  SegmentBuilderResult segment_builder_result_;

 private:
   void ReportStartedFlush();
@@ -162,7 +164,6 @@ class NVMFlushJob {

   const std::shared_ptr io_tracer_;
-
 };

 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 663ce8a94..6c1865a90 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -8,6 +8,7 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

 #include "db/table_cache.h"
+#include

 #include "db/dbformat.h"
 #include "db/range_tombstone_fragmenter.h"
@@ -514,6 +515,11 @@ Status TableCache::Get(FilterCacheClient& filter_cache,
       t = GetTableReaderFromHandle(handle);
     }
   }
+
+  BlockBasedTable* block_based_table = nullptr;
+  block_based_table = static_cast(t);
+  assert(block_based_table != nullptr);
+
   SequenceNumber* max_covering_tombstone_seq =
       get_context->max_covering_tombstone_seq();
   if (s.ok() && max_covering_tombstone_seq != nullptr &&
@@ -529,7 +535,7 @@ Status TableCache::Get(FilterCacheClient& filter_cache,
   if (s.ok()) {
     get_context->SetReplayLog(row_cache_entry);  // nullptr if no cache.
// only add filter_cache argument - s = t->Get(filter_cache, options, k, get_context, prefix_extractor, skip_filters); + s = block_based_table->Get(filter_cache, options, k, get_context, prefix_extractor, skip_filters); get_context->SetReplayLog(nullptr); } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) { // Couldn't find Table in cache but treat as kFound if no_io set diff --git a/db/table_cache.h b/db/table_cache.h index 6a1ed0e90..ec7b25ae5 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -14,7 +14,6 @@ #include #include -#include "db/art/filter_cache_client.h" #include "db/dbformat.h" #include "db/range_del_aggregator.h" #include "options/cf_options.h" @@ -33,6 +32,7 @@ class Arena; struct FileDescriptor; class GetContext; class HistogramImpl; +class FilterCacheClient; // Manages caching for TableReader objects for a column family. The actual // cache is allocated separately and passed to the constructor. TableCache diff --git a/db/version_set.cc b/db/version_set.cc index 799e1ca2d..47587a800 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -5447,8 +5447,8 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, for (int i = idx_start + 1; i < idx_end; ++i) { uint64_t file_size = files_brief.files[i].fd.GetFileSize(); // The entire file falls into the range, so we can just take its size. - assert(file_size == - ApproximateSize(v, files_brief.files[i], start, end, caller)); + // assert(file_size == + // ApproximateSize(v, files_brief.files[i], start, end, caller)); total_full_size += file_size; } diff --git a/env/io_posix.cc b/env/io_posix.cc index 689d89812..a144e8c86 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -37,6 +37,7 @@ #include "util/autovector.h" #include "util/coding.h" #include "util/string_util.h" +#include "db/art/art_metric.h" #if defined(OS_LINUX) && !defined(F_SET_RW_HINT) #define F_LINUX_SPECIFIC_BASE 1024 @@ -45,6 +46,8 @@ namespace ROCKSDB_NAMESPACE { +static ReadMetric readMetric_; + std::string IOErrorMsg(const std::string& context, const std::string& file_name) { if (file_name.empty()) { @@ -601,6 +604,8 @@ IOStatus PosixRandomAccessFile::Read(uint64_t offset, size_t n, filename_, errno); } *result = Slice(scratch, (r < 0) ? 0 : n - left); + // update WaLSM Read Metric + readMetric_.updateMetric(offset, offset + ((r < 0) ? 0 : n - left)); return s; } @@ -705,6 +710,10 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, "PosixRandomAccessFile::MultiRead:io_uring_result", &bytes_read); if (bytes_read == req_wrap->iov.iov_len) { req->result = Slice(req->scratch, req->len); + + // update WaLSM Read Metric + readMetric_.updateMetric(req->offset, req->offset + req->len); + req->status = IOStatus::OK(); } else if (bytes_read == 0) { // cqe->res == 0 can means EOF, or can mean partial results. See @@ -717,6 +726,10 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, // Bytes reads don't fill sectors. Should only happen at the end // of the file. 
req->result = Slice(req->scratch, req_wrap->finished_len); + + // update WaLSM Read Metric + readMetric_.updateMetric(req->offset, req->offset + req_wrap->finished_len); + req->status = IOStatus::OK(); } else { Slice tmp_slice; @@ -726,6 +739,10 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, req->scratch + req_wrap->finished_len, dbg); req->result = Slice(req->scratch, req_wrap->finished_len + tmp_slice.size()); + + // update WaLSM Read Metric + readMetric_.updateMetric(req->offset, + req->offset + req_wrap->finished_len + tmp_slice.size()); } } else if (bytes_read < req_wrap->iov.iov_len) { assert(bytes_read > 0); @@ -865,6 +882,8 @@ IOStatus PosixMmapReadableFile::Read(uint64_t offset, size_t n, n = static_cast(length_ - offset); } *result = Slice(reinterpret_cast(mmapped_region_) + offset, n); + // update WaLSM Read Metric + readMetric_.updateMetric(offset, offset + n); return s; } @@ -1497,4 +1516,4 @@ IOStatus PosixDirectory::Fsync(const IOOptions& /*opts*/, return IOStatus::OK(); } } // namespace ROCKSDB_NAMESPACE -#endif +#endif \ No newline at end of file diff --git a/examples/Makefile b/examples/Makefile index 90f8cf9a8..9720b11e2 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -19,7 +19,7 @@ all: write_example simple_example column_families_example compact_files_example write_example: librocksdb write_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) -simple_example: librocksdb simple_example.cc +simple_example: simple_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) custom: librocksdb custom.cc diff --git a/examples/custom.cc b/examples/custom.cc index 4c509686f..770968a21 100644 --- a/examples/custom.cc +++ b/examples/custom.cc @@ -548,10 +548,10 @@ void DoTest(double zipf) { options.use_direct_reads = true; options.enable_pipelined_write = true; options.compression = rocksdb::kNoCompression; - options.nvm_path = "/pg_wal/ycc/memory_art"; + options.nvm_path = "/mnt/pmem0.7/guoteng/nodememory"; options.IncreaseParallelism(16); - std::string db_path = "/tmp/db_old_custom"; + std::string db_path = "/mnt/nvme0n1/guoteng/walsmtest/tmp/db_old_custom"; DB* db; DB::Open(options, db_path, &db); diff --git a/examples/mini_benchmark.cc b/examples/mini_benchmark.cc index c841fd6dd..8579f87de 100644 --- a/examples/mini_benchmark.cc +++ b/examples/mini_benchmark.cc @@ -593,12 +593,12 @@ int main(int argc, char* argv[]) { options.use_direct_reads = true; options.enable_pipelined_write = true; options.compression = rocksdb::kNoCompression; - options.nvm_path = "/pg_wal/ycc/memory_art"; + options.nvm_path = "/mnt/pmem0.7/guoteng/nodememory"; options.IncreaseParallelism(16); std::remove(options.nvm_path.c_str()); - std::string db_path = "/tmp/tmp_data/db_art"; + std::string db_path = "/mnt/nvme0n1/guoteng/walsmtest/tmp/db_old_custom"; DB* db; DB::Open(options, db_path, &db); diff --git a/examples/run.sh b/examples/run.sh index 5e736eb33..96e431ff5 100755 --- a/examples/run.sh +++ b/examples/run.sh @@ -1,8 +1,8 @@ #!/bin/bash SingleTest() { - sudo rm -rf /mnt/chen/* - sudo rm -rf /tmp/db_old_custom + sudo rm -rf /mnt/pmem0.7/guoteng/* + sudo rm -rf /mnt/nvme0n1/guoteng/walsmtest/tmp/* numactl -N 1 ./ycsb $1 $2 #mv /tmp/db_old_custom/compaction_art.txt /home/chen/result/art_reset_$1_$2.txt } diff --git a/examples/rw_example.cc b/examples/rw_example.cc index 97ac580ee..5119cfe35 100644 
--- a/examples/rw_example.cc +++ b/examples/rw_example.cc @@ -258,11 +258,11 @@ int main() { options.use_direct_io_for_flush_and_compaction = true; options.use_direct_reads = true; options.enable_pipelined_write = true; - options.nvm_path = "/pg_wal/ycc/memory_art"; + options.nvm_path = "/mnt/pmem0.7/guoteng/nodememory"; options.compression = rocksdb::kNoCompression; DB* db; - DB::Open(options, "/tmp/tmp_data/db_test_art", &db); + DB::Open(options, "/mnt/nvme0n1/guoteng/walsmtest/tmp/db_test_art", &db); std::thread read_threads[thread_num]; std::thread write_threads[thread_num]; diff --git a/examples/simple_example.cc b/examples/simple_example.cc index 6bca594e5..bf90d7c78 100644 --- a/examples/simple_example.cc +++ b/examples/simple_example.cc @@ -10,8 +10,10 @@ #include #include "rocksdb/db.h" +#include "rocksdb/filter_policy.h" #include "rocksdb/slice.h" #include "rocksdb/options.h" +#include "rocksdb/table.h" #include #include @@ -416,7 +418,7 @@ void ParseOptions(Options& options) { } void DoTest(std::string test_name) { - int thread_num = 8; + int thread_num = 1; int total_count = 320000000; int sample_range = 1000000000; @@ -426,20 +428,43 @@ void DoTest(std::string test_name) { options.use_direct_reads = true; options.enable_pipelined_write = true; options.compression = rocksdb::kNoCompression; - options.nvm_path = "/mnt/walsm/node_memory"; - options.IncreaseParallelism(16); + options.nvm_path = "/mnt/pmem0.7/guoteng/nodememory"; + // options.IncreaseParallelism(16); - std::string db_path = "/tmp/tmp_data/db_test_" + test_name; + options.create_if_missing = true; + options.use_direct_io_for_flush_and_compaction = true; + options.use_direct_reads = true; + options.compression = rocksdb::kNoCompression; + options.compaction_style = rocksdb::kCompactionStyleUniversal; + options.IncreaseParallelism(1); + options.statistics = rocksdb::CreateDBStatistics(); + + rocksdb::BlockBasedTableOptions block_based_options; + block_based_options.pin_top_level_index_and_filter = false; + block_based_options.pin_l0_filter_and_index_blocks_in_cache = false; + block_based_options.cache_index_and_filter_blocks_with_high_priority = false; + block_based_options.index_type = rocksdb::BlockBasedTableOptions::kTwoLevelIndexSearch; + block_based_options.partition_filters = true; + block_based_options.cache_index_and_filter_blocks = true; + block_based_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false)); + block_based_options.block_cache = + rocksdb::NewLRUCache(static_cast(128 * 1024 * 1024)); + options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(block_based_options)); + options.memtable_prefix_bloom_size_ratio = 0.02; + std::string db_path = "/mnt/nvme0n1/guoteng/walsmtest/tmp/db_test_" + test_name; DB* db; DB::Open(options, db_path, &db); + std::cout << " Open OK" << std::endl; Inserter inserter(thread_num, db); inserter.SetGenerator( new YCSBZipfianGenerator(total_count, sample_range, 0.98, 26.49)); inserter.DoInsert(); + std::cout << "Insert OK" << std::endl; db->Close(); + std::cout << "Close OK" << std::endl; delete db; } diff --git a/examples/walsm_benchmark.cc b/examples/walsm_benchmark.cc index 447cca829..c113ed604 100644 --- a/examples/walsm_benchmark.cc +++ b/examples/walsm_benchmark.cc @@ -545,11 +545,11 @@ int main(int argc, char* argv[]) { options.IncreaseParallelism(16); options.OptimizeForPointLookup(512); options.statistics = CreateDBStatistics(); - // options.nvm_path = "/mnt/pmem1/crh/nodememory"; + // options.nvm_path = "/mnt/pmem0.7/guoteng/nodememory"; // 
std::remove(options.nvm_path.c_str()); - std::string db_path = "/home/crh/db_test_nvm_l0"; + std::string db_path = "/mnt/nvme0n1/guoteng/walsmtest/tmp/db_test_nvm_l0"; DB* db; DB::Open(options, db_path, &db); diff --git a/examples/write_example.cc b/examples/write_example.cc index 71a9a0183..72b6ef3e1 100644 --- a/examples/write_example.cc +++ b/examples/write_example.cc @@ -85,7 +85,7 @@ int main() { DB* db; - assert(DB::Open(options, "/tmp/db_test", &db).ok()); + assert(DB::Open(options, "/mnt/nvme0n1/guoteng/walsmtest/tmp/db_nvm_l0", &db).ok()); int n = 0; diff --git a/examples/ycsb.cc b/examples/ycsb.cc index 5bca080dd..e54bc9333 100644 --- a/examples/ycsb.cc +++ b/examples/ycsb.cc @@ -419,14 +419,14 @@ void DoTest(double zipf, double read_ratio) { options.use_direct_reads = true; options.enable_pipelined_write = true; options.compression = rocksdb::kNoCompression; - options.nvm_path = "/pg_wal/ycc/memory_art"; + options.nvm_path = "/mnt/pmem0.7/guoteng/nodememory"; options.IncreaseParallelism(16); BlockBasedTableOptions table_options; table_options.filter_policy.reset(NewBloomFilterPolicy(10)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - std::string db_path = "/tmp/db_old_custom"; + std::string db_path = "/mnt/nvme0n1/guoteng/walsmtest/tmp/db_old_custom"; DB* db; DB::Open(options, db_path, &db); diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 9a11c0220..595959628 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1224,7 +1224,7 @@ struct DBOptions { bool enable_rewrite = true; // Path for nvm file, don't pass directory. - std::string nvm_path = "/pg_wal/ycc/memory_art"; + std::string nvm_path = "/mnt/pmem0.7/guoteng/nodememory"; }; // Options to control the behavior of a database (passed to DB::Open) diff --git a/include/task_thread_pool.hpp b/include/task_thread_pool.hpp new file mode 100644 index 000000000..6029e8572 --- /dev/null +++ b/include/task_thread_pool.hpp @@ -0,0 +1,486 @@ +// SPDX-License-Identifier: BSD-2-Clause OR MIT OR BSL-1.0 +/** + * @brief A fast and lightweight thread pool for C++11 and newer. + * @see https://github.com/alugowski/task-thread-pool + * @author Adam Lugowski + * @copyright Copyright (C) 2023 Adam Lugowski. + * Licensed under any of the following open-source licenses: + * BSD-2-Clause license, MIT license, Boost Software License 1.0 + * + * + * BSD-2-Clause license: + * + * Copyright (C) 2023 Adam Lugowski + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + * + * + * + * MIT License: + * + * Copyright (c) 2023 Adam Lugowski + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * + * + * Boost Software License 1.0: + * + * Permission is hereby granted, free of charge, to any person or organization + * obtaining a copy of the software and accompanying documentation covered by + * this license (the "Software") to use, reproduce, display, distribute, execute, + * and transmit the Software, and to prepare derivative works of the Software, + * and to permit third-parties to whom the Software is furnished to do so, + * all subject to the following: + * + * The copyright notices in the Software and this entire statement, including + * the above license grant, this restriction and the following disclaimer, must + * be included in all copies of the Software, in whole or in part, and all + * derivative works of the Software, unless such copies or derivative works + * are solely in the form of machine-executable object code generated by a + * source language processor. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT + * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE + * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef AL_TASK_THREAD_POOL_HPP +#define AL_TASK_THREAD_POOL_HPP + +// Version macros. 
+#define TASK_THREAD_POOL_VERSION_MAJOR 1 +#define TASK_THREAD_POOL_VERSION_MINOR 0 +#define TASK_THREAD_POOL_VERSION_PATCH 10 + +#include +#include +#include +#include +#include +#include +#include + +// MSVC does not correctly set the __cplusplus macro by default, so we must read it from _MSVC_LANG +// See https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/ +#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) +#define TTP_CXX17 1 +#else +#define TTP_CXX17 0 +#endif + +#if TTP_CXX17 +#define TTP_NODISCARD [[nodiscard]] +#else +#define TTP_NODISCARD +#endif + +namespace task_thread_pool { + +#if !TTP_CXX17 + /** + * A reimplementation of std::decay_t, which is only available since C++14. + */ + template + using decay_t = typename std::decay::type; +#endif + + /** + * A fast and lightweight thread pool that uses C++11 threads. + */ + class task_thread_pool { + public: + /** + * Create a task_thread_pool and start worker threads. + * + * @param num_threads Number of worker threads. If 0 then number of threads is equal to the + * number of physical cores on the machine, as given by std::thread::hardware_concurrency(). + */ + explicit task_thread_pool(unsigned int num_threads = 0) { + if (num_threads < 1) { + num_threads = std::thread::hardware_concurrency(); + if (num_threads < 1) { num_threads = 1; } + } + start_threads(num_threads); + } + + /** + * Finish all tasks left in the queue then shut down worker threads. + * If the pool is currently paused then it is resumed. + */ + ~task_thread_pool() { + unpause(); + wait_for_queued_tasks(); + stop_all_threads(); + } + + /** + * Drop all tasks that have been submitted but not yet started by a worker. + * + * Tasks already in progress continue executing. + */ + void clear_task_queue() { + const std::lock_guard tasks_lock(task_mutex); + tasks = {}; + } + + /** + * Get number of enqueued tasks. + * + * @return Number of tasks that have been enqueued but not yet started. + */ + TTP_NODISCARD size_t get_num_queued_tasks() const { + const std::lock_guard tasks_lock(task_mutex); + return tasks.size(); + } + + /** + * Get number of in-progress tasks. + * + * @return Approximate number of tasks currently being processed by worker threads. + */ + TTP_NODISCARD size_t get_num_running_tasks() const { + const std::lock_guard tasks_lock(task_mutex); + return num_inflight_tasks; + } + + /** + * Get total number of tasks in the pool. + * + * @return Approximate number of tasks both enqueued and running. + */ + TTP_NODISCARD size_t get_num_tasks() const { + const std::lock_guard tasks_lock(task_mutex); + return tasks.size() + num_inflight_tasks; + } + + /** + * Get number of worker threads. + * + * @return Number of worker threads. + */ + TTP_NODISCARD unsigned int get_num_threads() const { + const std::lock_guard threads_lock(thread_mutex); + return static_cast(threads.size()); + } + + /** + * Set number of worker threads. Will start or stop worker threads as necessary. + * + * @param num_threads Number of worker threads. If 0 then number of threads is equal to the + * number of physical cores on the machine, as given by std::thread::hardware_concurrency(). + * @return Previous number of worker threads. 
+ */ + unsigned int set_num_threads(unsigned int num_threads) { + const std::lock_guard threads_lock(thread_mutex); + unsigned int previous_num_threads = get_num_threads(); + + if (num_threads < 1) { + num_threads = std::thread::hardware_concurrency(); + if (num_threads < 1) { num_threads = 1; } + } + + if (previous_num_threads <= num_threads) { + // expanding the thread pool + start_threads(num_threads - previous_num_threads); + } else { + // contracting the thread pool + stop_all_threads(); + { + const std::lock_guard tasks_lock(task_mutex); + pool_running = true; + } + start_threads(num_threads); + } + + return previous_num_threads; + } + + /** + * Stop executing queued tasks. Use `unpause()` to resume. Note: Destroying the pool will implicitly unpause. + * + * Any in-progress tasks continue executing. + */ + void pause() { + const std::lock_guard tasks_lock(task_mutex); + pool_paused = true; + } + + /** + * Resume executing queued tasks. + */ + void unpause() { + const std::lock_guard tasks_lock(task_mutex); + pool_paused = false; + task_cv.notify_all(); + } + + /** + * Check whether the pool is paused. + * + * @return true if pause() has been called without an intervening unpause(). + */ + TTP_NODISCARD bool is_paused() const { + const std::lock_guard tasks_lock(task_mutex); + return pool_paused; + } + + /** + * Submit a Callable for the pool to execute and return a std::future. + * + * @param func The Callable to execute. Can be a function, a lambda, std::packaged_task, std::function, etc. + * @param args Arguments for func. Optional. + * @return std::future that can be used to get func's return value or thrown exception. + */ + template , std::decay_t...> +#else + typename R = typename std::result_of(decay_t...)>::type +#endif + > + TTP_NODISCARD std::future submit(F&& func, A&&... args) { +#if defined(_MSC_VER) + // MSVC's packaged_task is not movable even though it should be. + // Discussion about this bug and its future fix: + // https://developercommunity.visualstudio.com/t/unable-to-move-stdpackaged-task-into-any-stl-conta/108672 + std::shared_ptr> ptask = + std::make_shared>(std::bind(std::forward(func), std::forward(args)...)); + submit_detach([ptask] { (*ptask)(); }); + return ptask->get_future(); +#else + std::packaged_task task(std::bind(std::forward(func), std::forward(args)...)); + auto ret = task.get_future(); + submit_detach(std::move(task)); + return ret; +#endif + } + + /** + * Submit a zero-argument Callable for the pool to execute. + * + * @param func The Callable to execute. Can be a function, a lambda, std::packaged_task, std::function, etc. + */ + template + void submit_detach(F&& func) { + const std::lock_guard tasks_lock(task_mutex); + tasks.emplace(std::forward(func)); + task_cv.notify_one(); + } + + /** + * Submit a Callable with arguments for the pool to execute. + * + * @param func The Callable to execute. Can be a function, a lambda, std::packaged_task, std::function, etc. + */ + template + void submit_detach(F&& func, A&&... args) { + const std::lock_guard tasks_lock(task_mutex); + tasks.emplace(std::bind(std::forward(func), std::forward(args)...)); + task_cv.notify_one(); + } + + /** + * Block until the task queue is empty. Some tasks may be in-progress when this method returns. + */ + void wait_for_queued_tasks() { + std::unique_lock tasks_lock(task_mutex); + notify_task_finish = true; + task_finished_cv.wait(tasks_lock, [&] { return tasks.empty(); }); + notify_task_finish = false; + } + + /** + * Block until all tasks have finished. 
+ */ + void wait_for_tasks() { + std::unique_lock tasks_lock(task_mutex); + notify_task_finish = true; + task_finished_cv.wait(tasks_lock, [&] { return tasks.empty() && num_inflight_tasks == 0; }); + notify_task_finish = false; + } + + protected: + + /** + * Main function for worker threads. + */ + void worker_main() { + bool finished_task = false; + + while (true) { + std::unique_lock tasks_lock(task_mutex); + + if (finished_task) { + --num_inflight_tasks; + if (notify_task_finish) { + task_finished_cv.notify_all(); + } + } + + task_cv.wait(tasks_lock, [&]() { return !pool_running || (!pool_paused && !tasks.empty()); }); + + if (!pool_running) { + break; + } + + // Must mean that (!pool_paused && !tasks.empty()) is true + + std::packaged_task task{std::move(tasks.front())}; + tasks.pop(); + ++num_inflight_tasks; + tasks_lock.unlock(); + + try { + task(); + } catch (...) { + // std::packaged_task::operator() may throw in some error conditions, such as if the task + // had already been run. Nothing that the pool can do anything about. + } + + finished_task = true; + } + } + + /** + * Start worker threads. + * + * @param num_threads How many threads to start. + */ + void start_threads(const unsigned int num_threads) { + const std::lock_guard threads_lock(thread_mutex); + + for (unsigned int i = 0; i < num_threads; ++i) { + threads.emplace_back(&task_thread_pool::worker_main, this); + } + } + + /** + * Stop, join, and destroy all worker threads. + */ + void stop_all_threads() { + const std::lock_guard threads_lock(thread_mutex); + + { + const std::lock_guard tasks_lock(task_mutex); + pool_running = false; + task_cv.notify_all(); + } + + for (auto& thread : threads) { + if (thread.joinable()) { + thread.join(); + } + } + threads.clear(); + } + + /** + * The worker threads. + * + * Access protected by thread_mutex + */ + std::vector threads; + + /** + * A mutex for methods that start/stop threads. + */ + mutable std::recursive_mutex thread_mutex; + + /** + * The task queue. + * + * Access protected by task_mutex. + */ + std::queue> tasks = {}; + + /** + * A mutex for all variables related to tasks. + */ + mutable std::mutex task_mutex; + + /** + * Used to notify changes to the task queue, such as a new task added, pause/unpause, etc. + */ + std::condition_variable task_cv; + + /** + * Used to notify of finished tasks. + */ + std::condition_variable task_finished_cv; + + /** + * A signal for worker threads that the pool is either running or shutting down. + * + * Access protected by task_mutex. + */ + bool pool_running = true; + + /** + * A signal for worker threads to not pull new tasks from the queue. + * + * Access protected by task_mutex. + */ + bool pool_paused = false; + + /** + * A signal for worker threads that they should notify task_finished_cv when they finish a task. + * + * Access protected by task_mutex. + */ + bool notify_task_finish = false; + + /** + * A counter of the number of tasks in-progress by worker threads. + * Incremented when a task is popped off the task queue and decremented when that task is complete. + * + * Access protected by task_mutex. 
+     */
+    int num_inflight_tasks = 0;
+  };
+}

+// clean up
+#undef TTP_NODISCARD
+#undef TTP_CXX17
+
+#endif
diff --git a/lgb_server/model.py b/lgb_server/model.py
index b8a235a0e..f2e258732 100644
--- a/lgb_server/model.py
+++ b/lgb_server/model.py
@@ -3,7 +3,7 @@
 import numpy
 import math

-model_path = '/pg_wal/ycc/'
+model_path = '/home/guoteng_20241228_135/WaLSM+/log/'
 # model_path = ''

 class LGBModel():
@@ -11,11 +11,14 @@ def __init__(self) -> None:
         self.__model = None
         # one unit is 4 bits-per-key, class = 2 mean bits-per-key = 4 * 2 = 8
         # the default bits-per-key value of previous benchmark is 10
-        self.__default_class = 4
-        self.__bits_per_key = 4 # bits_per_key for one filter unit
+        self.__min_class = 0
+        self.__max_class = 6
+        self.__num_classes = (self.__max_class - self.__min_class) + 1
+        self.__default_class = 2
+        self.__bits_per_key = 2 # bits_per_key for one filter unit; must be larger than 1
         self.__num_probes = math.floor(self.__bits_per_key * 0.69) # 4 * 0.69 = 2.76 -> 2
         self.__rate_per_unit = math.pow(1.0 - math.exp(-self.__num_probes/self.__bits_per_key), self.__num_probes) # false positive rate of one unit
-        self.__cost_rate_line = 0.10 # we can torelate deviation that is no more than self.__cost_rate_line * (best I/O cost) (compared to best I/O cost)
+        self.__cost_threshold = 1.2 # we can tolerate a deviation of no more than self.__cost_threshold * (best I/O cost)
         self.__model_name = 'model.txt'
         # self.__host = '127.0.0.1'
         # self.__port = '6666'
@@ -37,29 +40,37 @@ def __evaluate_model(self, X: pd.DataFrame, y: pd.Series, c: pd.Series) -> bool:
         assert len(count_list) == len(class_list)
         assert len(preds_list) == len(class_list)
+        assert self.__bits_per_key > 1
         best_cost = 0.0
         pred_cost = 0.0
         for i in range(0, len(class_list)):
             best_cost += math.pow(self.__rate_per_unit, class_list[i]) * count_list[i]
             pred_cost += math.pow(self.__rate_per_unit, preds_list[i]) * count_list[i]

+        # if pred_cost < best_cost, the predicted classes would use more memory than the limit,
+        # so we force the model to be retrained
+        if pred_cost < best_cost:
+            print("pred_cost smaller than best_cost, forced to retrain")
+            pred_cost += best_cost + best_cost * self.__cost_threshold

-        # print("best cost : " + str(best_cost) + ", pred cost: " + str(pred_cost))
-        return math.fabs((pred_cost-best_cost)/best_cost) < self.__cost_rate_line
+        print("best cost : " + str(best_cost) + ", pred cost: " + str(pred_cost))
+        return math.fabs((pred_cost-best_cost)/best_cost) < self.__cost_threshold

     def train(self, dataset: str) -> str:
         df = pd.read_csv(dataset)
         y = df['Target']
         c = df['Count'] # used to check I/O cost metric
         X = df.drop(columns=['Target', 'Count'])
+        # print("Length of X:", X.shape[1])
         if self.__model is not None and self.__evaluate_model(X, y, c):
             # still work well
-            return
+            return 'no need to train'

         # clf = lightgbm.LGBMClassifier(min_child_samples=1, n_estimators=1, objective="multiclass")
-        clf = lightgbm.LGBMClassifier()
+        clf = lightgbm.LGBMClassifier(verbosity=-1, n_estimators=3, objective="multiclass", num_class=self.__num_classes)
         clf.fit(X, y)

         # if we directly set self.__model = clf, then self.__model always predict class 0
         # we need save clf to txt file, then read this model to init self.__model
+        print('train a new model')
         clf.booster_.save_model(model_path + self.__model_name)
         self.__model = lightgbm.Booster(model_file=model_path+self.__model_name)
         # print('load a new model')
@@ -72,6 +83,8 @@ def predict(self, datas: pd.DataFrame) -> str:
             result = 
self.__model.predict(datas) return str(numpy.argmax(result[0])) else: + assert self.__default_class <= self.__max_class + assert self.__default_class >= self.__min_class return str(self.__default_class) ''' diff --git a/lgb_server/utils.py b/lgb_server/utils.py index 3d61825cb..84bdb5862 100644 --- a/lgb_server/utils.py +++ b/lgb_server/utils.py @@ -2,7 +2,7 @@ import pandas as pd import sys -dataset_path = '/pg_wal/ycc/' +dataset_path = '/home/guoteng_20241228_135/WaLSM+/log/' # dataset_path = '' # msg should be like 'dataset1.csv' @@ -25,6 +25,7 @@ def parse_pred_msg(msg: str) -> list[int]: # build predict data row from list[int] def prepare_data(data: list[int]) -> pd.DataFrame: assert type(data) is list and type(data[0]) is int + assert data[0] > 0 datas = pd.DataFrame([data]) return datas diff --git a/notes.txt b/notes.txt new file mode 100644 index 000000000..6b98fa62a --- /dev/null +++ b/notes.txt @@ -0,0 +1,5 @@ +sudo mount /dev/nvme0n1 /mnt/nvme0n1 +sudo mount -o dax,noatime /dev/pmem0.7 /mnt/pmem0.7 +sudo mount -o dax,noatime /dev/pmem0.8 /mnt/pmem0.8 +rm /mnt/nvme0n1/guoteng/walsmtest/tmp/db_nvm_l0/* -rf ; rm /mnt/pmem0.7/guoteng/nodememory +./ycsb -load -run -db rocksdb -P workloads/workload_test -P rocksdb/rocksdb.properties -p threadcount=8 -s \ No newline at end of file diff --git a/src.mk b/src.mk index f39f6809c..036988551 100644 --- a/src.mk +++ b/src.mk @@ -33,7 +33,8 @@ LIB_SOURCES = \ db/art/heat_buckets.cc \ db/art/clf_model.cc \ db/art/filter_cache_heap.cc \ - db/art/filter_cache_item.cc \ + db/art/filter_cache_entry.cc \ + db/art/global_filter_cache_context.cc \ db/art/filter_cache.cc \ db/art/filter_cache_client.cc \ db/art/greedy_algo.cc \ @@ -164,6 +165,7 @@ LIB_SOURCES = \ table/block_based/block_based_table_factory.cc \ table/block_based/block_based_table_iterator.cc \ table/block_based/block_based_table_reader.cc \ + table/block_based/block_based_table_segment_aware_iterator.cc \ table/block_based/block_builder.cc \ table/block_based/block_prefetcher.cc \ table/block_based/block_prefix_index.cc \ diff --git a/table/block_based/block.cc b/table/block_based/block.cc index 7b3ddb1c7..b07f6854a 100644 --- a/table/block_based/block.cc +++ b/table/block_based/block.cc @@ -11,6 +11,7 @@ #include "table/block_based/block.h" #include +#include #include #include #include diff --git a/table/block_based/block_based_filter_block.cc b/table/block_based/block_based_filter_block.cc index 2e457e32f..77e6ebf1f 100644 --- a/table/block_based/block_based_filter_block.cc +++ b/table/block_based/block_based_filter_block.cc @@ -80,7 +80,7 @@ void BlockBasedFilterBlockBuilder::StartBlock(uint64_t block_offset) { } } -void BlockBasedFilterBlockBuilder::Add(const Slice& key) { +void BlockBasedFilterBlockBuilder::Add(const Slice& key, uint32_t /*segment_id*/) { if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { AddPrefix(key); } diff --git a/table/block_based/block_based_filter_block.h b/table/block_based/block_based_filter_block.h index 67ded1ee3..3f89966dc 100644 --- a/table/block_based/block_based_filter_block.h +++ b/table/block_based/block_based_filter_block.h @@ -44,7 +44,7 @@ class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { virtual bool IsBlockBased() override { return true; } virtual void StartBlock(uint64_t block_offset) override; - virtual void Add(const Slice& key) override; + virtual void Add(const Slice& key, uint32_t segment_id) override; virtual size_t NumAdded() const override { return num_added_; } virtual Slice Finish(const BlockHandle& tmp, 
Status* status) override; using FilterBlockBuilder::Finish; diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index 7bd6eb3ca..246a906f9 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -20,6 +20,7 @@ #include #include "db/dbformat.h" +#include "db/art/global_filter_cache_context.h" #include "index_builder.h" #include "port/lang.h" @@ -66,7 +67,8 @@ FilterBlockBuilder* CreateFilterBlockBuilder( const ImmutableCFOptions& /*opt*/, const MutableCFOptions& mopt, const FilterBuildingContext& context, const bool use_delta_encoding_for_index_values, - PartitionedIndexBuilder* const p_index_builder) { + PartitionedIndexBuilder* const p_index_builder, + const InternalKeyComparator* internal_comparator) { const BlockBasedTableOptions& table_opt = context.table_options; if (table_opt.filter_policy == nullptr) return nullptr; @@ -91,7 +93,8 @@ FilterBlockBuilder* CreateFilterBlockBuilder( return new PartitionedFilterBlockBuilder( mopt.prefix_extractor.get(), table_opt.whole_key_filtering, filter_bits_builder, table_opt.index_block_restart_interval, - use_delta_encoding_for_index_values, p_index_builder, partition_size); + use_delta_encoding_for_index_values, p_index_builder, partition_size, + global_filter_cache.range_seperators(), internal_comparator); } else { return new FullFilterBlockBuilder(mopt.prefix_extractor.get(), table_opt.whole_key_filtering, @@ -265,7 +268,7 @@ struct BlockBasedTableBuilder::Rep { // compressing any data blocks. // TODO(ajkr): ideally we don't buffer all keys and all uncompressed data // blocks as it's redundant, but it's easier to implement for now. - std::vector>> + std::vector, std::vector>> data_block_and_keys_buffers; BlockBuilder range_del_block; @@ -476,7 +479,7 @@ struct BlockBasedTableBuilder::Rep { context.info_log = ioptions.info_log; filter_builder.reset(CreateFilterBlockBuilder( ioptions, moptions, context, use_delta_encoding_for_index_values, - p_index_builder_)); + p_index_builder_, &internal_comparator)); } for (auto& collector_factories : *int_tbl_prop_collector_factories) { @@ -711,7 +714,7 @@ BlockBasedTableBuilder::~BlockBasedTableBuilder() { delete rep_; } -void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { +void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value, uint32_t segment_id) { Rep* r = rep_; assert(rep_->state != Rep::State::kClosed); if (!ok()) return; @@ -761,7 +764,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { if (r->filter_builder != nullptr) { size_t ts_sz = r->internal_comparator.user_comparator()->timestamp_size(); - r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz), segment_id); } } } @@ -774,7 +777,9 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { if (r->data_block_and_keys_buffers.empty() || should_flush) { r->data_block_and_keys_buffers.emplace_back(); } - r->data_block_and_keys_buffers.back().second.emplace_back(key.ToString()); + // r->data_block_and_keys_buffers.back().second.emplace_back(key.ToString()); + std::get<1>(r->data_block_and_keys_buffers.back()).emplace_back(key.ToString()); + std::get<2>(r->data_block_and_keys_buffers.back()).emplace_back(segment_id); } else { if (r->compression_opts.parallel_threads == 1) { r->index_builder->OnKeyAdded(key); @@ -889,8 +894,10 @@ void BlockBasedTableBuilder::WriteBlock(const 
Slice& raw_block_contents, if (r->state == Rep::State::kBuffered) { assert(is_data_block); assert(!r->data_block_and_keys_buffers.empty()); - r->data_block_and_keys_buffers.back().first = raw_block_contents.ToString(); - r->data_begin_offset += r->data_block_and_keys_buffers.back().first.size(); + // r->data_block_and_keys_buffers.back().first = raw_block_contents.ToString(); + // r->data_begin_offset += r->data_block_and_keys_buffers.back().first.size(); + std::get<0>(r->data_block_and_keys_buffers.back()) = raw_block_contents.ToString(); + r->data_begin_offset += std::get<0>(r->data_block_and_keys_buffers.back()).size(); return; } Status compress_status; @@ -1156,6 +1163,7 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, } } +// WaLSM+: only used in parallel compaction, which is not supported in WaLSM+ void BlockBasedTableBuilder::BGWorkWriteRawBlock() { Rep* r = rep_; ParallelCompressionRep::BlockRepSlot* slot; @@ -1520,11 +1528,16 @@ void BlockBasedTableBuilder::EnterUnbuffered() { size_t rand_idx = static_cast( generator.Uniform(r->data_block_and_keys_buffers.size())); + // size_t copy_len = + // std::min(kSampleBytes - compression_dict_samples.size(), + // r->data_block_and_keys_buffers[rand_idx].first.size()); size_t copy_len = std::min(kSampleBytes - compression_dict_samples.size(), - r->data_block_and_keys_buffers[rand_idx].first.size()); + std::get<0>(r->data_block_and_keys_buffers[rand_idx]).size()); + // compression_dict_samples.append( + // r->data_block_and_keys_buffers[rand_idx].first, 0, copy_len); compression_dict_samples.append( - r->data_block_and_keys_buffers[rand_idx].first, 0, copy_len); + std::get<0>(r->data_block_and_keys_buffers[rand_idx]), 0, copy_len); compression_dict_sample_lens.emplace_back(copy_len); } } @@ -1546,11 +1559,16 @@ void BlockBasedTableBuilder::EnterUnbuffered() { r->compression_type == kZSTDNotFinalCompression)); for (size_t i = 0; ok() && i < r->data_block_and_keys_buffers.size(); ++i) { - auto& data_block = r->data_block_and_keys_buffers[i].first; - auto& keys = r->data_block_and_keys_buffers[i].second; + // auto& data_block = r->data_block_and_keys_buffers[i].first; + // auto& keys = r->data_block_and_keys_buffers[i].second; + auto& data_block = std::get<0>(r->data_block_and_keys_buffers[i]); + auto& keys = std::get<1>(r->data_block_and_keys_buffers[i]); + auto& segment_ids = std::get<2>(r->data_block_and_keys_buffers[i]); assert(!data_block.empty()); assert(!keys.empty()); + assert(!segment_ids.empty()); + // WaLSM+: no parallel compression for now, so no need to modify? 
if (r->compression_opts.parallel_threads > 1) { ParallelCompressionRep::BlockRep* block_rep = nullptr; r->pc_rep->block_rep_pool.pop(block_rep); @@ -1561,10 +1579,13 @@ void BlockBasedTableBuilder::EnterUnbuffered() { block_rep->compression_type = r->compression_type; block_rep->keys->SwapAssign(keys); + // assign segment_ids here if needed if (i + 1 < r->data_block_and_keys_buffers.size()) { + // block_rep->first_key_in_next_block->assign( + // r->data_block_and_keys_buffers[i + 1].second.front()); block_rep->first_key_in_next_block->assign( - r->data_block_and_keys_buffers[i + 1].second.front()); + std::get<1>(r->data_block_and_keys_buffers[i + 1]).front()); } else { if (r->first_key_in_next_block == nullptr) { block_rep->first_key_in_next_block.reset(nullptr); @@ -1608,19 +1629,24 @@ void BlockBasedTableBuilder::EnterUnbuffered() { lock, [r] { return !r->pc_rep->first_block; }); } } else { - for (const auto& key : keys) { + assert(keys.size() == segment_ids.size()); + for (size_t j = 0; j < keys.size(); ++j) { + const auto& key = keys[j]; + const auto segment_id = segment_ids[j]; if (r->filter_builder != nullptr) { size_t ts_sz = r->internal_comparator.user_comparator()->timestamp_size(); - r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz), segment_id); } r->index_builder->OnKeyAdded(key); } WriteBlock(Slice(data_block), &r->pending_handle, true /* is_data_block */); if (ok() && i + 1 < r->data_block_and_keys_buffers.size()) { + // Slice first_key_in_next_block = + // r->data_block_and_keys_buffers[i + 1].second.front(); Slice first_key_in_next_block = - r->data_block_and_keys_buffers[i + 1].second.front(); + std::get<1>(r->data_block_and_keys_buffers[i + 1]).front(); Slice* first_key_in_next_block_ptr = &first_key_in_next_block; r->index_builder->AddIndexEntry( &keys.back(), first_key_in_next_block_ptr, r->pending_handle); @@ -1762,6 +1788,10 @@ const char* BlockBasedTableBuilder::GetFileChecksumFuncName() const { } } +SegmentBuilderResult BlockBasedTableBuilder::GetSegmentBuilderResult() { + return rep_->filter_builder->GetSegmentBuilderResult(); +} + const std::string BlockBasedTable::kFilterBlockPrefix = "filter."; const std::string BlockBasedTable::kFullFilterBlockPrefix = "fullfilter."; const std::string BlockBasedTable::kPartitionedFilterBlockPrefix = diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h index 38ad948af..728e8100d 100644 --- a/table/block_based/block_based_table_builder.h +++ b/table/block_based/block_based_table_builder.h @@ -19,6 +19,7 @@ #include "rocksdb/listener.h" #include "rocksdb/options.h" #include "rocksdb/status.h" +#include "table/block_based/filter_block.h" #include "table/meta_blocks.h" #include "table/table_builder.h" #include "util/compression.h" @@ -65,7 +66,7 @@ class BlockBasedTableBuilder : public TableBuilder { // REQUIRES: key is after any previously added key according to comparator. // REQUIRES: Finish(), Abandon() have not been called // WaLSM+ Note: call filter_builder->add() - void Add(const Slice& key, const Slice& value) override; + void Add(const Slice& key, const Slice& value, uint32_t segment_id) override; // Return non-ok iff some error has been detected. 
Status status() const override; @@ -111,6 +112,8 @@ class BlockBasedTableBuilder : public TableBuilder { // Get file checksum function name const char* GetFileChecksumFuncName() const override; + SegmentBuilderResult GetSegmentBuilderResult() override; + private: bool ok() const { return status().ok(); } diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 2257d10ea..2521d856d 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -32,6 +33,7 @@ #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/statistics.h" +#include "rocksdb/status.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "table/block_based/binary_search_index_reader.h" @@ -39,7 +41,9 @@ #include "table/block_based/block_based_filter_block.h" #include "table/block_based/block_based_table_factory.h" #include "table/block_based/block_based_table_iterator.h" +#include "table/block_based/block_based_table_segment_aware_iterator.h" #include "table/block_based/block_prefix_index.h" +#include "table/block_based/cachable_entry.h" #include "table/block_based/filter_block.h" #include "table/block_based/full_filter_block.h" #include "table/block_based/hash_index_reader.h" @@ -53,6 +57,7 @@ #include "table/multiget_context.h" #include "table/persistent_cache_helper.h" #include "table/sst_file_writer_collectors.h" +#include "table/table_reader_caller.h" #include "table/two_level_iterator.h" #include "monitoring/perf_context_imp.h" @@ -1040,9 +1045,9 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( lookup_context); if (filter) { // Refer to the comment above about paritioned indexes always being cached - if (prefetch_all) { - filter->CacheDependencies(ro, pin_all); - } + // if (prefetch_all) { + // filter->CacheDependencies(ro, pin_all); + // } rep_->filter = std::move(filter); } @@ -2128,6 +2133,49 @@ InternalIterator* BlockBasedTable::NewIterator( need_upper_bound_check && rep_->index_type == BlockBasedTableOptions::kHashSearch, /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context)); + + // WaLSM+ behavior: when compaction, return a SegmentAwareIterator + if (caller == TableReaderCaller::kCompaction) { + std::unique_ptr data_iter; + std::unique_ptr filter_index_iter; + CachableEntry filter_block; + Status s = GetFilterIndexBlock(read_options, true, nullptr, &lookup_context, + &filter_block); + assert(s.ok()); + filter_index_iter.reset(filter_block.GetValue()->NewIndexIterator( + rep_->segment_id_removing_comparator.get(), + get_rep()->get_global_seqno(BlockType::kFilter), nullptr, nullptr, + true /* total_order_seek */, false /* have_first_key */, + rep_->index_key_includes_seq, rep_->index_value_is_full)); + if (arena == nullptr) { + data_iter.reset(new BlockBasedTableIterator( + this, read_options, rep_->internal_comparator, std::move(index_iter), + !skip_filters && !read_options.total_order_seek && + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, caller, + compaction_readahead_size, allow_unprepared_value)); + + return new BlockBasedTableSegmentAwareIterator( + std::move(data_iter), std::move(filter_block), std::move(filter_index_iter), + rep_->internal_comparator, caller); + } else { + auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator)); + data_iter.reset(new (mem) BlockBasedTableIterator( + this, read_options, 
rep_->internal_comparator, std::move(index_iter), + !skip_filters && !read_options.total_order_seek && + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, caller, + compaction_readahead_size, allow_unprepared_value)); + + mem = arena->AllocateAligned(sizeof(BlockBasedTableSegmentAwareIterator)); + return new BlockBasedTableSegmentAwareIterator( + std::move(data_iter), std::move(filter_block), std::move(filter_index_iter), + rep_->internal_comparator, caller); + } + + // unreachable + } + if (arena == nullptr) { return new BlockBasedTableIterator( this, read_options, rep_->internal_comparator, std::move(index_iter), @@ -3431,10 +3479,19 @@ std::string BlockBasedTable::ApproximateMiddleKey(const Slice& start, uint64_t middle_offset = (end_offset + start_offset) / 2; index_iter->SeekToFirst(); + if (!index_iter->Valid()) { + return std::string(); + } + while (index_iter->Valid() && index_iter->value().handle.offset() < middle_offset) { index_iter->Next(); } + + if (!index_iter->Valid()) { + index_iter->SeekToLast(); + } + std::string prefix = index_iter->user_key().ToString(); return prefix; @@ -3796,4 +3853,53 @@ void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, out_stream << " ------\n"; } +Status BlockBasedTable::GetFilterIndexBlock( + const ReadOptions& read_options, bool use_cache, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const { + assert(filter_block); + assert(filter_block->IsEmpty()); + + const BlockBasedTable::Rep* const rep = get_rep(); + assert(rep); + bool for_compaction = lookup_context != nullptr + && lookup_context->caller == TableReaderCaller::kCompaction; + + Status s = RetrieveBlock( + nullptr /* prefetch_buffer */, read_options, rep->filter_handle, + UncompressionDict::GetEmptyDict(), filter_block, BlockType::kFilter, + get_context, lookup_context, for_compaction, use_cache); + + return s; +} + +std::map> +BlockBasedTable::GetSegmentBlockHandles() const { + CachableEntry filter_block; + Status s = + GetFilterIndexBlock(ReadOptions(), true, nullptr, nullptr, &filter_block); + assert(s.ok()); + + std::unique_ptr filter_index_iter; + filter_index_iter.reset(filter_block.GetValue()->NewIndexIterator( + rep_->segment_id_removing_comparator.get(), + get_rep()->get_global_seqno(BlockType::kFilter), nullptr, nullptr, + true /* total_order_seek */, false /* have_first_key */, + rep_->index_key_includes_seq, rep_->index_value_is_full)); + + std::map> segment_block_handles; + + filter_index_iter->SeekToFirst(); + while (filter_index_iter->Valid()) { + BlockHandle block_handle = filter_index_iter->value().handle; + const auto filter_key = filter_index_iter->user_key(); + uint32_t segment_id = DecodeFixed32R(filter_key.data() + filter_key.size() - 4); + segment_block_handles[segment_id].push_back(block_handle); + + filter_index_iter->Next(); + } + + return segment_block_handles; +} + } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index ce7c4ed8a..0a61e273d 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -9,8 +9,8 @@ #pragma once +#include #include -#include "db/art/filter_cache_client.h" #include "db/range_tombstone_fragmenter.h" #include "file/filename.h" #include "rocksdb/comparator.h" @@ -42,6 +42,8 @@ struct BlockBasedTableOptions; struct EnvOptions; struct ReadOptions; class GetContext; +class FilterCacheClient; +class 
FilterCacheEntry; typedef std::vector> KVPairBlock; @@ -142,7 +144,9 @@ class BlockBasedTable : public TableReader { Status Get(FilterCacheClient& filter_cache, const ReadOptions& readOptions, const Slice& key, GetContext* get_context, const SliceTransform* prefix_extractor, - bool skip_filters = false) override; + bool skip_filters = false); + + std::map> GetSegmentBlockHandles() const override; #endif // WaLSM+ Note: call FullFilterKeyMayMatch() method in this file @@ -261,6 +265,20 @@ class BlockBasedTable : public TableReader { CachableEntry& block, TBlockIter* input_iter, Status s) const; +#ifdef ART_PLUS + // Similar to the above, with one crucial difference: it will retrieve the + // block from the file even if there are no caches configured (assuming the + // read options allow I/O). + template + Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& ro, const BlockHandle& handle, + const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, + BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const; +#endif + class PartitionedIndexIteratorState; template @@ -270,6 +288,8 @@ class BlockBasedTable : public TableReader { friend class UncompressionDictReader; + friend class FilterCacheEntry; + protected: Rep* rep_; explicit BlockBasedTable(Rep* rep, BlockCacheTracer* const block_cache_tracer) @@ -320,6 +340,7 @@ class BlockBasedTable : public TableReader { GetContext* get_context, BlockCacheLookupContext* lookup_context, BlockContents* contents) const; +#ifndef ART_PLUS // Similar to the above, with one crucial difference: it will retrieve the // block from the file even if there are no caches configured (assuming the // read options allow I/O). @@ -331,6 +352,7 @@ class BlockBasedTable : public TableReader { BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, bool for_compaction, bool use_cache) const; +#endif void RetrieveMultipleBlocks( const ReadOptions& options, const MultiGetRange* batch, @@ -511,6 +533,11 @@ class BlockBasedTable : public TableReader { void DumpKeyValue(const Slice& key, const Slice& value, std::ostream& out_stream); + Status GetFilterIndexBlock(const ReadOptions& read_options, bool use_cache, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const; + // A cumulative data block file read in MultiGet lower than this size will // use a stack buffer static constexpr size_t kMultiGetReadStackBufSize = 8192; diff --git a/table/block_based/block_based_table_segment_aware_iterator.cc b/table/block_based/block_based_table_segment_aware_iterator.cc new file mode 100644 index 000000000..aaaf34c06 --- /dev/null +++ b/table/block_based/block_based_table_segment_aware_iterator.cc @@ -0,0 +1,149 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "table/block_based/block_based_table_segment_aware_iterator.h" +#include +#include +#include "db/dbformat.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { +bool BlockBasedTableSegmentAwareIterator::Valid() const { + return data_iter_ && data_iter_->Valid(); +} + +void BlockBasedTableSegmentAwareIterator::SeekToFirst() { + data_iter_->SeekToFirst(); + SeekFilterAndUpdateSegmentID(); +} + +void BlockBasedTableSegmentAwareIterator::SeekToLast() { + data_iter_->SeekToLast(); + SeekFilterAndUpdateSegmentID(); +} + +void BlockBasedTableSegmentAwareIterator::Seek(const Slice& target) { + // data_iter receives internal key, while filter_index_iter receives modified_key (maybe internal key or just internal key) + data_iter_->Seek(target); + SeekFilterAndUpdateSegmentID(); +} + +void BlockBasedTableSegmentAwareIterator::SeekForPrev(const Slice& target) { + data_iter_->SeekForPrev(target); + SeekFilterAndUpdateSegmentID(); +} + +void BlockBasedTableSegmentAwareIterator::Next() { + data_iter_->Next(); + UpdateSegmentID(); +} + +void BlockBasedTableSegmentAwareIterator::Prev() { + data_iter_->Prev(); + // degraded performance + SeekFilterAndUpdateSegmentID(); +} + +Slice BlockBasedTableSegmentAwareIterator::key() const { + return data_iter_->key(); +} + +Slice BlockBasedTableSegmentAwareIterator::user_key() const { + return data_iter_->user_key(); +} + +Slice BlockBasedTableSegmentAwareIterator::value() const { + return data_iter_->value(); +} + +Status BlockBasedTableSegmentAwareIterator::status() const { + Status data_iter_status = data_iter_->status(); + if (!data_iter_status.ok()) { + return data_iter_status; + } + return status_; +} + +uint32_t BlockBasedTableSegmentAwareIterator::segment_id() const { + return current_segment_id_; +} + +void BlockBasedTableSegmentAwareIterator::SeekFilterAndUpdateSegmentID() { + if (!data_iter_->Valid()) { + status_ = data_iter_->status(); + current_segment_id_ = INVALID_SEGMENT_ID; + return; + } + + // we should return segment_id corresponding to the user_key() when called segment_id(), + // so here we use data_iter_->user_key() to get the user_key for filter_index_iter + Slice current_user_key = data_iter_->user_key(); + + std::unique_ptr current_modified_key_buf; + Slice current_modified_key = generate_modified_user_key( + current_modified_key_buf, current_user_key, 0, 0); + + filter_index_iter_->Seek(current_modified_key); + UpdateSegmentID(); +} + +// assumes we will get the entire filter partition key (including user_key, seq_num, segment_id) +// may iterate over the filter_index_iter to find correct filter_index, then extract segment_id +void BlockBasedTableSegmentAwareIterator::UpdateSegmentID() { + if (!data_iter_->Valid()) { + // TODO: (TODO: how to handle this situation?) + current_segment_id_ = INVALID_SEGMENT_ID; + return; + } + + if (!filter_index_iter_ || !filter_index_iter_->Valid()) { + // TODO: (TODO: how to handle this situation?) 
+// assumes we get the entire filter partition key (including user_key, seq_num, segment_id);
+// may iterate filter_index_iter_ forward to find the matching filter index entry, then
+// extract segment_id from it
+void BlockBasedTableSegmentAwareIterator::UpdateSegmentID() {
+  if (!data_iter_->Valid()) {
+    // TODO: how should this situation be handled?
+    current_segment_id_ = INVALID_SEGMENT_ID;
+    return;
+  }
+
+  if (!filter_index_iter_ || !filter_index_iter_->Valid()) {
+    // TODO: how should this situation be handled?
+    current_segment_id_ = INVALID_SEGMENT_ID;
+    return;
+  }
+
+  Slice current_user_key = data_iter_->user_key();
+
+  std::unique_ptr<const char[]> current_modified_key_buf;
+  Slice current_modified_key = generate_modified_user_key(
+      current_modified_key_buf, current_user_key, 0, 0);
+
+  Slice filter_key = filter_index_iter_->user_key();
+  // forward lookup
+  while (segment_id_removing_comparator_->Compare(current_modified_key, filter_key) > 0) {
+    filter_index_iter_->Next();
+    if (!filter_index_iter_->Valid()) {
+      // TODO: how should this situation be handled?
+      current_segment_id_ = INVALID_SEGMENT_ID;
+      return;
+    }
+    filter_key = filter_index_iter_->user_key();
+  }
+  // backward lookup is not implemented: we already seeked when the position moved
+  // backward, and frequent backward seeks would hurt performance
+
+  uint32_t filter_index = DecodeFixed32R(filter_key.data());
+  if (filter_index > 0) {
+    // we always seek with filter_index 0, so a non-zero index here is unexpected
+    current_segment_id_ = INVALID_SEGMENT_ID;
+    return;
+  }
+
+  uint32_t segment_id = INVALID_SEGMENT_ID;
+  if (filter_key.size() >= 8) {
+    segment_id = DecodeFixed32R(filter_key.data() + filter_key.size() - 4);
+  }
+
+  current_segment_id_ = segment_id;
+}
+
+} // namespace ROCKSDB_NAMESPACE
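To make the intent of this new iterator concrete, here is a hedged sketch of how a compaction-side consumer might drive it. segment_id() is only meaningful while Valid(), and the merging iterator forwards it from its current child (see the merging_iterator.cc hunk later in this diff); nothing below is part of the patch itself.

// Sketch only: draining a table iterator during compaction while tracking which
// source segment each key belongs to. `it` is assumed to be the iterator returned
// by BlockBasedTable::NewIterator() with TableReaderCaller::kCompaction, which
// this diff wraps in a BlockBasedTableSegmentAwareIterator.
#include <cstdint>
#include "table/internal_iterator.h"

void DrainForCompaction(ROCKSDB_NAMESPACE::InternalIterator* it) {
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    const uint32_t source_segment = it->segment_id();  // INVALID_SEGMENT_ID when unknown
    // A compaction's table builder would forward this id, e.g.:
    // builder->Add(it->key(), it->value(), source_segment);
    (void)source_segment;
  }
}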
diff --git a/table/block_based/block_based_table_segment_aware_iterator.h b/table/block_based/block_based_table_segment_aware_iterator.h
new file mode 100644
index 000000000..926f96ea0
--- /dev/null
+++ b/table/block_based/block_based_table_segment_aware_iterator.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include
+#include
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_reader.h"
+
+#include "table/block_based/block_based_table_reader_impl.h"
+#include "table/block_based/block_prefetcher.h"
+#include "table/block_based/cachable_entry.h"
+#include "table/block_based/reader_common.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Iterates over the contents of a BlockBasedTable and also provides segment_id
+// information by iterating over the filter index in step.
+class BlockBasedTableSegmentAwareIterator : public InternalIteratorBase<Slice> {
+ public:
+  BlockBasedTableSegmentAwareIterator(
+      std::unique_ptr<InternalIteratorBase<Slice>> data_iter,
+      CachableEntry<Block> filter_index_block_entry,
+      std::unique_ptr<IndexBlockIter> filter_index_iter,
+      const InternalKeyComparator& icomp,
+      TableReaderCaller caller)
+      : data_iter_(std::move(data_iter)),
+        filter_index_block_entry_(std::move(filter_index_block_entry)),
+        filter_index_iter_(std::move(filter_index_iter)),
+        icmp_(&icomp),
+        segment_id_removing_comparator_(new SegmentIdRemovingComparator(icomp.user_comparator())),
+        lookup_context_(caller),
+        user_comparator_(icomp.user_comparator()) {}
+
+  ~BlockBasedTableSegmentAwareIterator() {}
+
+  // Assuming that `target` is the original user key, not the modified key.
+  void Seek(const Slice& target) override;
+  // Assuming that `target` is the original user key, not the modified key.
+  void SeekForPrev(const Slice& target) override;
+  void SeekToFirst() override;
+  void SeekToLast() override;
+  void Next() final override;
+  void Prev() override;
+  bool Valid() const override;
+  Slice key() const override;
+  Slice user_key() const override;
+  Slice value() const override;
+  Status status() const override;
+  uint32_t segment_id() const override;
+
+ private:
+  std::unique_ptr<InternalIteratorBase<Slice>> data_iter_;
+  CachableEntry<Block> filter_index_block_entry_;
+  std::unique_ptr<IndexBlockIter> filter_index_iter_;
+  const InternalKeyComparator* icmp_;
+  std::unique_ptr<const Comparator> segment_id_removing_comparator_;
+  const SliceTransform* prefix_extractor_;
+  TableReaderCaller lookup_context_;
+  InternalKeyComparator user_comparator_;
+  HistogramImpl* file_read_hist_;
+  uint32_t current_segment_id_ = 0;
+  InternalKey current_partition_key_;
+  Status status_;
+  int level_;
+
+  void UpdateSegmentID();
+  void SeekFilterAndUpdateSegmentID();
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/cachable_entry.h b/table/block_based/cachable_entry.h
index 8b34ada54..71116f5f6 100644
--- a/table/block_based/cachable_entry.h
+++ b/table/block_based/cachable_entry.h
@@ -186,6 +186,15 @@ class CachableEntry {
     assert(!own_value_);
   }
 
+#ifdef ART_PLUS
+  T* ReleaseValue() {
+    assert(own_value_ && cache_ == nullptr && cache_handle_ == nullptr);
+    T* value = value_;
+    ResetFields();
+    return value;
+  }
+#endif
+
  private:
   // release cache entry in cache or release owned value
   void ReleaseResource() {
diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h
index d94c7e606..8a9c4cc63 100644
--- a/table/block_based/filter_block.h
+++ b/table/block_based/filter_block.h
@@ -20,9 +20,13 @@
 #include
 #include
+#include
+#include
 #include
 #include
 #include
+#include "db/art/clf_model.h"
+#include "db/art/filter_cache_client.h"
 #include "db/dbformat.h"
 #include "rocksdb/options.h"
 #include "rocksdb/slice.h"
@@ -41,6 +45,23 @@ class FilterPolicy;
 class GetContext;
 using MultiGetRange = MultiGetContext::Range;
 
+struct SegmentBuilderResult {
+  struct PerSegmentResult {
+    uint32_t segment_id;
+    std::vector<RangeRatePair> range_rate_pairs;
+    std::unordered_map<uint32_t, double> inherit_recorder;
+
+    std::string smallest_key;
+    std::string largest_key;
+    uint32_t key_count;
+  };
+
+  std::set<uint32_t> new_segment_ids;
+  std::vector<PerSegmentResult> per_segment_results;
+  std::set<uint32_t> merged_segment_ids;
+  int output_level;
+};
+
 // A FilterBlockBuilder is used to construct all of the filters for a
 // particular Table. It generates a single string which is stored as
 // a special block in the Table.
@@ -60,7 +81,7 @@ class FilterBlockBuilder {
   virtual bool IsBlockBased() = 0;                    // If is blockbased filter
   virtual void StartBlock(uint64_t block_offset) = 0;  // Start new block filter
-  virtual void Add(const Slice& key) = 0;  // Add a key to current filter
+  virtual void Add(const Slice& key, uint32_t segment_id = INVALID_SEGMENT_ID) = 0;  // Add a key to current filter
   virtual size_t NumAdded() const = 0;     // Number of keys added
   Slice Finish() {  // Generate Filter
     const BlockHandle empty_handle;
@@ -70,6 +91,8 @@
     return ret;
   }
   virtual Slice Finish(const BlockHandle& tmp, Status* status) = 0;
+  // default invalid method to keep the compiler happy; must not be called
+  virtual SegmentBuilderResult GetSegmentBuilderResult() { assert(false); return SegmentBuilderResult{}; }
 };
 
 // A FilterBlockReader is used to parse filter from SST table.
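The new SegmentBuilderResult is the hand-off from table building back to the filter cache. A hedged sketch of how its fields can be read once PartitionedFilterBlockBuilder::GetSegmentBuilderResult() (later in this diff) has normalized the recorders; names are those of the struct above, the function itself is illustrative only.

// Sketch only: invariants a consumer may assume after normalization.
#include <cassert>
#include "table/block_based/filter_block.h"

void InspectResult(const ROCKSDB_NAMESPACE::SegmentBuilderResult& result) {
  for (const auto& seg : result.per_segment_results) {
    // inherit_recorder maps source segment id -> fraction of this segment's
    // keys inherited from that source; after normalization the fractions sum to ~1.
    double sum = 0.0;
    for (const auto& kv : seg.inherit_recorder) sum += kv.second;
    assert(seg.inherit_recorder.empty() || (sum > 0.99 && sum < 1.01));
    // range_rate_pairs analogously records (key-range index, share of keys).
  }
}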
@@ -103,6 +126,19 @@ class FilterBlockReader { GetContext* get_context, BlockCacheLookupContext* lookup_context) = 0; +#ifdef ART_PLUS + virtual bool KeyMayMatch(FilterCacheClient& filter_cache, + const Slice& key, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) { + assert(false); + return false; + } +#endif + virtual void KeysMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, @@ -174,4 +210,5 @@ class FilterBlockReader { } }; + } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc index da03f1d5f..27ed3a192 100644 --- a/table/block_based/filter_policy.cc +++ b/table/block_based/filter_policy.cc @@ -16,6 +16,7 @@ #include "rocksdb/filter_policy.h" +#include "db/art/macros.h" #include "rocksdb/slice.h" #include "table/block_based/block_based_filter_block.h" #include "table/block_based/full_filter_block.h" @@ -571,7 +572,7 @@ class MultiLegacyBloomBitsBuilder : public FilterBitsBuilder { virtual Slice Finish(std::unique_ptr* buf) override; virtual Slice FinishWithId(std::unique_ptr* buf, const int hash_id) override; - + virtual int CalculateNumEntry(const uint32_t bytes) override; private: std::vector bits_builders_; @@ -616,6 +617,10 @@ Slice MultiLegacyBloomBitsBuilder::FinishWithId(std::unique_ptr* b int hash_id) { return bits_builders_[hash_id]->Finish(buf); } + +int MultiLegacyBloomBitsBuilder::CalculateNumEntry(const uint32_t bytes) { + return bits_builders_[0]->CalculateNumEntry(bytes); +} #endif class LegacyBloomBitsReader : public FilterBitsReader { @@ -875,8 +880,8 @@ FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext( #else // TODO: determine filter_count, // and maybe move this property to some kind of options (WaLSM+) - const int filter_count = 10; - return new MultiLegacyBloomBitsBuilder(filter_count, whole_bits_per_key_, context.info_log); + const int filter_count = MAX_UNITS_NUM; + return new MultiLegacyBloomBitsBuilder(filter_count, BITS_PER_KEY_PER_UNIT, context.info_log); #endif } } diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc index f6ecbeb1c..7912461a6 100644 --- a/table/block_based/full_filter_block.cc +++ b/table/block_based/full_filter_block.cc @@ -27,7 +27,7 @@ FullFilterBlockBuilder::FullFilterBlockBuilder( filter_bits_builder_.reset(filter_bits_builder); } -void FullFilterBlockBuilder::Add(const Slice& key) { +void FullFilterBlockBuilder::Add(const Slice& key, uint32_t segment_id) { const bool add_prefix = prefix_extractor_ && prefix_extractor_->InDomain(key); if (whole_key_filtering_) { if (!add_prefix) { diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h index deda30c6f..a91404ba0 100644 --- a/table/block_based/full_filter_block.h +++ b/table/block_based/full_filter_block.h @@ -55,7 +55,7 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { // not implemented in FullFilterBlock virtual void StartBlock(uint64_t /*block_offset*/) override {} // if not use prefix bloom, only call AddKey(key) - virtual void Add(const Slice& key) override; + virtual void Add(const Slice& key, uint32_t segment_id) override; // return num_added_, num of keys virtual size_t NumAdded() const override { return num_added_; } // only return the slice from LegacyBloomBitsBuilder(format version < 5) diff --git 
a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc
index e077603f4..73397b55a 100644
--- a/table/block_based/partitioned_filter_block.cc
+++ b/table/block_based/partitioned_filter_block.cc
@@ -4,12 +4,27 @@
 // (found in the LICENSE.Apache file in the root directory).
 
 #include "table/block_based/partitioned_filter_block.h"
+#include
+#include
+#include
 #include
+#include
+#include
+#include
+#include
 #include
+#include
+#include
 #include
+#include
+#include
+#include
 #include
+#include "db/art/clf_model.h"
+#include "db/art/logger.h"
+#include "db/art/macros.h"
 #include "db/dbformat.h"
 #include "file/file_util.h"
 #include "monitoring/perf_context_imp.h"
@@ -21,21 +36,20 @@
 #include "rocksdb/status.h"
 #include "table/block_based/block.h"
 #include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/filter_block.h"
+#include "table/block_based/index_builder.h"
+#include "table/format.h"
 #include "util/coding.h"
 
 namespace ROCKSDB_NAMESPACE {
 
-#ifdef ART_PLUS
-Slice generate_modified_internal_key(std::unique_ptr<const char[]>& buf,
-                                     Slice original_internal_key,
-                                     int filter_index, int segment_id);
-#endif
 PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder(
     const SliceTransform* _prefix_extractor, bool whole_key_filtering,
     FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval,
     const bool use_value_delta_encoding,
     PartitionedIndexBuilder* const p_index_builder,
-    const uint32_t partition_size)
+    const uint32_t partition_size, const std::vector& range_separators,
+    const InternalKeyComparator* const internal_comparator)
     : FullFilterBlockBuilder(_prefix_extractor, whole_key_filtering,
                              filter_bits_builder),
       index_on_filter_block_builder_(index_block_restart_interval,
@@ -45,11 +59,14 @@ PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder(
                                      true /*use_delta_encoding*/,
                                      use_value_delta_encoding),
       p_index_builder_(p_index_builder),
-      keys_added_to_partition_(0) {
+      keys_added_to_partition_(0),
+      range_separators_(range_separators),
+      internal_comparator_(internal_comparator),
+      user_comparator_(internal_comparator->user_comparator()) {
   keys_per_partition_ = filter_bits_builder_->CalculateNumEntry(partition_size);
   if (keys_per_partition_ < 1) {
     // partition_size (minus buffer, ~10%) might be smaller than minimum
     // filter size, sometimes based on cache line size. Try to find that
     // minimum size without CalculateSpace (not necessarily available).
     uint32_t larger = std::max(partition_size + 4, uint32_t{16});
@@ -68,18 +85,25 @@
     }
   }
 
+  // keys_per_partition_ = std::min(keys_per_partition_, (uint32_t) KEYS_PER_SEGMENT);
+  // keys_per_partition_ = KEYS_PER_SEGMENT;
+
 #ifdef ART_PLUS
   filter_count_ = filter_bits_builder->filter_count_;
   filter_gc.resize(filter_count_);
   filters.resize(filter_count_);
   finishing_filter_index_ = 0;
+
+  keys_in_current_segment_.reserve(keys_per_partition_);
+  segment_ids_in_current_segment_.reserve(keys_per_partition_);
+  current_range_index_ = 0;
 #endif
 }
 
 PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {}
 
 void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock(
-    const Slice* next_key) {
+    const Slice* next_key, uint32_t next_key_segment_id) {
   // Use == to send the request only once
   if (keys_added_to_partition_ == keys_per_partition_) {
     // Currently only index builder is in charge of cutting a partition. We keep
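Both hunks around this point center on the partition cut: when keys_added_to_partition_ reaches keys_per_partition_, every filter unit finishes its bits for the current partition, and all of them are stamped with one freshly allocated segment id. A minimal sketch of just the id allocation, assuming nothing beyond the atomic counter visible in the hunk below:

// Sketch only: ids must be unique across all builders, so a single shared
// atomic counter is bumped once per cut, mirroring MaybeCutAFilterBlock();
// relaxed ordering suffices because only uniqueness matters, not ordering.
#include <atomic>
#include <cstdint>

uint32_t AllocateSegmentId(std::atomic<uint32_t>& segment_id_base) {
  return segment_id_base.fetch_add(1, std::memory_order_relaxed);
}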
@@ -103,24 +127,141 @@ void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock(
   }
 
 #ifdef ART_PLUS
+  const uint32_t new_segment_id = segment_id_base_.fetch_add(1, std::memory_order_relaxed);
   for (int i = 0; i < filter_count_; ++i) {
     filter_gc[i].push_back(std::unique_ptr<const char[]>(nullptr));
     Slice filter = filter_bits_builder_->FinishWithId(&filter_gc[i].back(), i);
     std::string& index_key = p_index_builder_->GetPartitionKey();
-    filters[i].push_back({index_key, filter, segment_id_base_.fetch_add(1, std::memory_order_relaxed)});
+    filters[i].push_back({index_key, filter, new_segment_id});
   }
+  ProcessSegmentCut(new_segment_id);
 #else
   Slice filter = filter_bits_builder_->Finish(&filter_gc.back());
   std::string& index_key = p_index_builder_->GetPartitionKey();
   filters.push_back({index_key, filter});
 #endif
+  // std::cerr << "keys_added_to_partition = " << keys_per_partition_ << "\n";
   keys_added_to_partition_ = 0;
   Reset();
 }
 
-void PartitionedFilterBlockBuilder::Add(const Slice& key) {
-  MaybeCutAFilterBlock(&key);
-  FullFilterBlockBuilder::Add(key);
+void PartitionedFilterBlockBuilder::ProcessSegmentCut(uint32_t new_segment_id) {
+  assert(keys_in_current_segment_.size() ==
+         segment_ids_in_current_segment_.size());
+  size_t siz = keys_in_current_segment_.size();
+
+  for (size_t i = 0; i + 1 < siz; ++i) {
+    if (user_comparator_->Compare(keys_in_current_segment_[i],
+                                  keys_in_current_segment_[i + 1]) > 0) {
+      assert(false);
+      std::cout << std::endl;
+      std::cout << "segment_id: " << new_segment_id << ", key count: " << siz
+                << std::endl;
+      for (size_t k = 0; k < siz; ++k) {
+        std::cout << std::setw(5) << k << " " << keys_in_current_segment_[k]
+                  << std::endl;
+      }
+      std::cout << std::endl;
+    }
+  }
+
+  // for range_recorder
+  std::vector<RangeRatePair> range_rate_pairs;
+  uint32_t cnt_in_current_range = 0;
+
+  // for inherit_infos_recorders
+  std::unordered_map<uint32_t, double> inherit_counts;
+
+  // for the first key in the segment, perform a binary search to find its key range
+  {
+    auto it = std::upper_bound(
+        range_separators_.begin(), range_separators_.end(),
+        keys_in_current_segment_[0], [this](const Slice& a, const Slice& b) {
+          return this->user_comparator_->Compare(a, b) < 0;
+        });
+    if (it != range_separators_.begin()) {
+      it--;
+      current_range_index_ = std::distance(range_separators_.begin(), it);
+    } else {
+      // the first key is smaller than all the range separators,
+      // which should only happen when range_separators_ is empty
+      assert(range_separators_.empty());
+    }
+  }
+
+  // perform a linear scan to find the range for each key
+  for (size_t i = 0; i < siz; ++i) {
+    const Slice& key = keys_in_current_segment_[i];
+    const uint32_t source_segment_id = segment_ids_in_current_segment_[i];
+
+    // for merged_segment_ids
+    source_segment_ids_count[source_segment_id]++;
+
+    // for range_recorder
+    while (current_range_index_ + 1 < range_separators_.size() &&
+           user_comparator_->Compare(
+               key, range_separators_[current_range_index_ + 1]) >= 0) {
+      current_range_index_++;
+      if (cnt_in_current_range > 0) {
+        range_rate_pairs.emplace_back(
+            RangeRatePair{uint32_t(current_range_index_),
+                          double(cnt_in_current_range) / siz});
+        cnt_in_current_range = 0;
+      }
+    }
+
+    // for inherit_infos_recorders
+    inherit_counts[source_segment_id]++;
+    cnt_in_current_range++;
+  }
+
+  // process the last range
+  if (current_range_index_ + 1 < range_separators_.size()) {
+    if (cnt_in_current_range > 0) {
+      range_rate_pairs.emplace_back(RangeRatePair{
+          uint32_t(current_range_index_), double(cnt_in_current_range) / siz});
+    }
+
cnt_in_current_range = 0; + } + + // update smallest & largest key + std::string smallest_key = keys_in_current_segment_[0]; + std::string largest_key = keys_in_current_segment_.back(); + + // update result + // inherit_counts should be updated when GetSegmentBuilderResult() is called + segment_builder_result_.new_segment_ids.insert(new_segment_id); + segment_builder_result_.per_segment_results.push_back(SegmentBuilderResult::PerSegmentResult{ + new_segment_id, range_rate_pairs, inherit_counts, smallest_key, + largest_key, (uint32_t) keys_in_current_segment_.size()}); + + // clear + keys_in_current_segment_.clear(); + segment_ids_in_current_segment_.clear(); + +} + +void PartitionedFilterBlockBuilder::Add(const Slice& key, uint32_t segment_id) { + const std::string key_str = key.ToString(); + for (int i = 0; i < key_str.size(); i++) { + char c = key_str[i]; + if (!std::isprint(c)) { + std::cout << "error" << std::endl; + } + } + if (!keys_in_current_segment_.empty() && user_comparator_->Compare(keys_in_current_segment_.back(), key) > 0) { + std::cout << "error" << std::endl; + } + if (key.ends_with(Slice("\1", 1))) { + std::cout << "error" << std::endl; + } + MaybeCutAFilterBlock(&key, segment_id); + FullFilterBlockBuilder::Add(key, segment_id); + + keys_in_current_segment_.emplace_back(key.ToString()); + segment_ids_in_current_segment_.emplace_back(segment_id); } void PartitionedFilterBlockBuilder::AddKey(const Slice& key) { @@ -170,7 +311,7 @@ Slice PartitionedFilterBlockBuilder::Finish( filters.pop_front(); #endif } else { - MaybeCutAFilterBlock(nullptr); + MaybeCutAFilterBlock(nullptr, INVALID_SEGMENT_ID); } // If there is no filter partition left, then return the index on filter // partitions @@ -350,6 +491,33 @@ BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( return fltr_blk_handle; } +#ifdef ART_PLUS +std::pair PartitionedFilterBlockReader::GetFilterPartitionKeyAndHandle( + const CachableEntry& filter_block, const Slice& entry) const { + IndexBlockIter iter; + const Comparator* const segment_id_removing_comparator = table()->get_rep()->segment_id_removing_comparator.get(); + Statistics* kNullStats = nullptr; + filter_block.GetValue()->NewIndexIterator( + segment_id_removing_comparator, + table()->get_rep()->get_global_seqno(BlockType::kFilter), &iter, + kNullStats, true /* total_order_seek */, false /* have_first_key */, + index_key_includes_seq(), index_value_is_full()); + iter.Seek(entry); + if (UNLIKELY(!iter.Valid())) { + // entry is larger than all the keys. However its prefix might still be + // present in the last partition. If this is called by PrefixMayMatch this + // is necessary for correct behavior. Otherwise it is unnecessary but safe. + // Assuming this is an unlikely case for full key search, the performance + // overhead should be negligible. + iter.SeekToLast(); + } + assert(iter.Valid()); + Slice fltr_block_key = iter.key(); + BlockHandle fltr_blk_handle = iter.value().handle; + return {fltr_block_key, fltr_blk_handle}; +} +#endif + // TODO: retrieve filter block from filter cache (WaLSM+) Status PartitionedFilterBlockReader::GetFilterPartitionBlock( FilePrefetchBuffer* prefetch_buffer, const BlockHandle& fltr_blk_handle, @@ -464,40 +632,50 @@ bool PartitionedFilterBlockReader::MayMatch( return true; } - #ifdef ART_PLUS // find key "0 original_internal key". filter_index=segment_id=0. (WaLSM+) // segment_id itself is useless in comparison, // but must be appended otherwise the extracted user key will be incorrect. 
  std::unique_ptr<const char[]> modified_key_buf;
  Slice modified_key = generate_modified_internal_key(modified_key_buf, *const_ikey_ptr, 0, 0);
-  auto filter_handle = GetFilterPartitionHandle(filter_block, modified_key);
-  #else
-  auto filter_handle = GetFilterPartitionHandle(filter_block, *const_ikey_ptr);
-  #endif
+  // auto filter_handle = GetFilterPartitionHandle(filter_block, modified_key);
+  auto key_and_handle = GetFilterPartitionKeyAndHandle(filter_block, modified_key);
+  Slice filter_key = key_and_handle.first;
+  auto filter_handle = key_and_handle.second;
 
  if (UNLIKELY(filter_handle.size() == 0)) {  // key is out of range
    return false;
  }
 
+  assert(filter_key.size() >= 8);
+  // TODO: validate that the useless internal key suffix has been stripped (WaLSM+)
+  uint32_t segment_id = DecodeFixed32R(filter_key.data() + filter_key.size() - 4);
+
  // TODO: get some filter blocks from the filter cache and check (WaLSM+)
-  CachableEntry<ParsedFullFilterBlock> filter_partition_block;
-  s = GetFilterPartitionBlock(nullptr /* prefetch_buffer */, filter_handle,
-                              no_io, get_context, lookup_context,
-                              &filter_partition_block);
-  if (UNLIKELY(!s.ok())) {
-    IGNORE_STATUS_IF_ERROR(s);
-    return true;
+  std::vector<CachableEntry<ParsedFullFilterBlock>> filter_partition_blocks =
+      filter_cache.get_filter_blocks(segment_id);
+
+  // static std::array, MAX_UNITS_NUM+1> filter_unit_num_hits;
+  // static std::atomic maymatch_calls{0};
+  // filter_unit_num_hits[filter_partition_blocks.size()]++;
+  // if (maymatch_calls.fetch_add(1) % 500000 == 0) {
+  //   std::cout << "maymatch_calls: " << maymatch_calls.load() << std::endl;
+  //   for (size_t i = 0; i < filter_unit_num_hits.size(); ++i) {
+  //     std::cout << "filter_unit_num_hits[" << i << "]: " << filter_unit_num_hits[i].load() << std::endl;
+  //   }
+  // }
+
+  for (size_t hash_id = 0; hash_id < filter_partition_blocks.size(); ++hash_id) {
+    FullFilterBlockReader filter_partition(
+        table(), std::move(filter_partition_blocks[hash_id]), hash_id);
+    bool may_exist = (filter_partition.*filter_function)(
+        slice, prefix_extractor, block_offset, no_io, const_ikey_ptr,
+        get_context, lookup_context);
+    if (!may_exist) {
+      return false;
+    }
  }
-  FullFilterBlockReader filter_partition(table(),
-                                         std::move(filter_partition_block));
-  // initialize the reader with hash_id (WaLSM+)
-  // FullFilterBlockReader filter_partition(table(),
-  //                                        std::move(filter_partition_block),
-  //                                        1);
-  return (filter_partition.*filter_function)(
-      slice, prefix_extractor, block_offset, no_io, const_ikey_ptr, get_context,
-      lookup_context);
+  return true;
 }
 #endif
@@ -714,6 +892,33 @@ bool PartitionedFilterBlockReader::index_value_is_full() const {
 }
 
 #ifdef ART_PLUS
+
+// should be called only once
+SegmentBuilderResult PartitionedFilterBlockBuilder::GetSegmentBuilderResult() {
+  // update inherit_recorders
+  for (auto& segment_result : segment_builder_result_.per_segment_results) {
+    auto& inherit_counts = segment_result.inherit_recorder;
+    int segment_size = 0;
+    for (auto& inherit_count : inherit_counts) {
+      segment_size += (int) inherit_count.second;
+    }
+    for (auto& inherit_count : inherit_counts) {
+      inherit_count.second /= segment_size;
+    }
+  }
+
+  // update merged_segment_ids
+  for (const auto& source_segment_id_count : source_segment_ids_count) {
+    const auto segment_id = source_segment_id_count.first;
+    segment_builder_result_.merged_segment_ids.insert(segment_id);
+  }
+
+  assert(!segment_builder_result_.merged_segment_ids.empty());
+  assert(!segment_builder_result_.new_segment_ids.empty());
+
+  return segment_builder_result_;
+}
+
+std::atomic<uint32_t> PartitionedFilterBlockBuilder::segment_id_base_{0};
 #endif
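To make the normalization above concrete: a segment whose 200 keys split as 150 inherited from segment 7 and 50 from segment 9 ends up with inherit_recorder = {7: 0.75, 9: 0.25}. A standalone sketch of that count-to-fraction step (with a divide-by-zero guard that the in-tree version omits; not part of the patch):

// Sketch only: normalize inherit counts into fractions, in place.
#include <cstdint>
#include <unordered_map>

void NormalizeInheritCounts(std::unordered_map<uint32_t, double>& counts) {
  double total = 0.0;
  for (const auto& kv : counts) total += kv.second;
  if (total <= 0.0) return;  // nothing recorded for this segment
  for (auto& kv : counts) kv.second /= total;
}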
@@ -737,6 +942,23 @@ Slice generate_modified_internal_key(std::unique_ptr<const char[]>& buf, Slice original_internal_key, int filter_index, int segment_id)
   buf.reset(modified_key_buf);
   return modified_key;
 }
+
+Slice generate_modified_user_key(std::unique_ptr<const char[]>& buf, Slice original_user_key, int filter_index, int segment_id) {
+  // calculate modified_key (WaLSM+)
+  // +--------------+--------------------------------+------------+
+  // | filter_index | original_user_key              | segment_id |
+  // | 4 bytes      | original_user_key.size() bytes | 4 bytes    |
+  // +--------------+--------------------------------+------------+
+  size_t modified_key_buf_size = 4 + original_user_key.size() + 4;
+  char *modified_key_buf = new char[modified_key_buf_size];
+  EncodeFixed32R(modified_key_buf, filter_index);
+  std::memcpy(modified_key_buf + 4, original_user_key.data(), original_user_key.size());
+  EncodeFixed32R(modified_key_buf + 4 + original_user_key.size(), segment_id);
+  Slice modified_key = Slice(modified_key_buf, modified_key_buf_size);
+
+  buf.reset(modified_key_buf);
+  return modified_key;
+}
 #endif
 
 } // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h
index 0d970a7a6..4c74f8082 100644
--- a/table/block_based/partitioned_filter_block.h
+++ b/table/block_based/partitioned_filter_block.h
@@ -6,6 +6,7 @@
 #pragma once
 
 #include
+#include
 #include
 #include
 #include
@@ -14,10 +15,12 @@
 #include "db/dbformat.h"
 #include "db/art/filter_cache_client.h"
 #include "index_builder.h"
+#include "rocksdb/comparator.h"
 #include "rocksdb/options.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
 #include "table/block_based/block.h"
+#include "table/block_based/filter_block.h"
 #include "table/block_based/filter_block_reader_common.h"
 #include "table/block_based/full_filter_block.h"
 #include "util/autovector.h"
@@ -31,16 +34,20 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
       FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval,
       const bool use_value_delta_encoding,
       PartitionedIndexBuilder* const p_index_builder,
-      const uint32_t partition_size);
+      const uint32_t partition_size,
+      const std::vector& range_separators,
+      const InternalKeyComparator* const internal_comparator);
 
   virtual ~PartitionedFilterBlockBuilder();
 
   void AddKey(const Slice& key) override;
-  void Add(const Slice& key) override;
+  void Add(const Slice& key, uint32_t segment_id) override;
 
   virtual Slice Finish(const BlockHandle& last_partition_block_handle,
                        Status* status) override;
 
+  virtual SegmentBuilderResult GetSegmentBuilderResult() override;
+
  private:
   // Filter data
   BlockBuilder index_on_filter_block_builder_;  // top-level index builder
@@ -65,7 +72,8 @@
   bool finishing_filters = false;  // true if Finish is called once but not complete yet.
 
   // The policy of when cut a filter block and Finish it
-  void MaybeCutAFilterBlock(const Slice* next_key);
+  void MaybeCutAFilterBlock(const Slice* next_key, uint32_t next_key_segment_id);
+  void ProcessSegmentCut(uint32_t new_segment_id);
 
   // Currently we keep the same number of partitions for filters and indexes.
   // This would allow for some potentioal optimizations in future.
If such // optimizations did not realize we can use different number of partitions and @@ -83,6 +91,14 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { // When Finish() is called, return filters[filter_index].front() (WaLSM+) int finishing_filter_index_; static std::atomic segment_id_base_; + std::vector keys_in_current_segment_; + std::vector segment_ids_in_current_segment_; + std::map source_segment_ids_count; + std::size_t current_range_index_; + const std::vector& range_separators_; + const InternalKeyComparator* const internal_comparator_; + const Comparator* user_comparator_; + SegmentBuilderResult segment_builder_result_; #endif }; @@ -106,7 +122,7 @@ class PartitionedFilterBlockReader : public FilterBlockReaderCommon { const Slice& key, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, GetContext* get_context, - BlockCacheLookupContext* lookup_context); + BlockCacheLookupContext* lookup_context) override; #endif // TODO: not used in WaLSM+ Benchmark, meybe used in MultiGet interface ? void KeysMayMatch(MultiGetRange* range, @@ -129,6 +145,11 @@ class PartitionedFilterBlockReader : public FilterBlockReaderCommon { size_t ApproximateMemoryUsage() const override; private: + #ifdef ART_PLUS + std::pair GetFilterPartitionKeyAndHandle( + const CachableEntry& filter_block, const Slice& entry) const; + #endif + BlockHandle GetFilterPartitionHandle(const CachableEntry& filter_block, const Slice& entry) const; Status GetFilterPartitionBlock( diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index b0880d516..edf56d446 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -214,6 +214,7 @@ inline void BlockFetcher::GetBlockContents() { } Status BlockFetcher::ReadBlockContents() { + if (file_ == nullptr) return IOStatus::NotFound(Status::SubCode::kNone); // handle to special error of enable_units if (TryGetUncompressBlockFromPersistentCache()) { compression_type_ = kNoCompression; #ifndef NDEBUG diff --git a/table/cuckoo/cuckoo_table_builder.cc b/table/cuckoo/cuckoo_table_builder.cc index f42e87bdf..1ed444772 100644 --- a/table/cuckoo/cuckoo_table_builder.cc +++ b/table/cuckoo/cuckoo_table_builder.cc @@ -3,6 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#include #ifndef ROCKSDB_LITE #include "table/cuckoo/cuckoo_table_builder.h" @@ -84,7 +85,7 @@ CuckooTableBuilder::CuckooTableBuilder( properties_.db_session_id = db_session_id; } -void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { +void CuckooTableBuilder::Add(const Slice& key, const Slice& value, uint32_t /*segment_id*/) { if (num_entries_ >= kMaxVectorIdx - 1) { status_ = Status::NotSupported("Number of keys in a file must be < 2^32-1"); return; diff --git a/table/cuckoo/cuckoo_table_builder.h b/table/cuckoo/cuckoo_table_builder.h index 8e8026487..4a13830e9 100644 --- a/table/cuckoo/cuckoo_table_builder.h +++ b/table/cuckoo/cuckoo_table_builder.h @@ -40,7 +40,7 @@ class CuckooTableBuilder: public TableBuilder { // Add key,value to the table being constructed. // REQUIRES: key is after any previously added key according to comparator. // REQUIRES: Finish(), Abandon() have not been called - void Add(const Slice& key, const Slice& value) override; + void Add(const Slice& key, const Slice& value, uint32_t segment_id) override; // Return non-ok iff some error has been detected. 
diff --git a/table/cuckoo/cuckoo_table_builder.h b/table/cuckoo/cuckoo_table_builder.h
index 8e8026487..4a13830e9 100644
--- a/table/cuckoo/cuckoo_table_builder.h
+++ b/table/cuckoo/cuckoo_table_builder.h
@@ -40,7 +40,7 @@ class CuckooTableBuilder: public TableBuilder {
   // Add key,value to the table being constructed.
   // REQUIRES: key is after any previously added key according to comparator.
   // REQUIRES: Finish(), Abandon() have not been called
-  void Add(const Slice& key, const Slice& value) override;
+  void Add(const Slice& key, const Slice& value, uint32_t segment_id) override;
 
   // Return non-ok iff some error has been detected.
   Status status() const override { return status_; }
diff --git a/table/internal_iterator.h b/table/internal_iterator.h
index c4382a54e..7ee4dda52 100644
--- a/table/internal_iterator.h
+++ b/table/internal_iterator.h
@@ -6,6 +6,7 @@
 #pragma once
 
+#include <cstdint>
 #include <string>
 
 #include "db/dbformat.h"
 #include "rocksdb/comparator.h"
@@ -172,6 +173,8 @@ class InternalIteratorBase : public Cleanable {
     return Status::NotSupported("");
   }
 
+  virtual uint32_t segment_id() const { return INVALID_SEGMENT_ID; }
+
  protected:
   void SeekForPrevImpl(const Slice& target, const Comparator* cmp) {
     Seek(target);
diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h
index ff46f2536..a38923ea5 100644
--- a/table/iterator_wrapper.h
+++ b/table/iterator_wrapper.h
@@ -9,6 +9,7 @@
 #pragma once
 
+#include <cstdint>
 #include <set>
 
 #include "table/internal_iterator.h"
@@ -65,6 +66,10 @@ class IteratorWrapperBase {
     assert(Valid());
     return iter_->value();
   }
+  uint32_t segment_id() const {
+    assert(Valid());
+    return iter_->segment_id();
+  }
   // Methods below require iter() != nullptr
   Status status() const {
     assert(iter_);
diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc
index fdd1a4910..2a406d0a9 100644
--- a/table/merging_iterator.cc
+++ b/table/merging_iterator.cc
@@ -242,6 +242,11 @@ class MergingIterator : public InternalIterator {
     return current_->value();
   }
 
+  uint32_t segment_id() const override {
+    assert(Valid());
+    return current_->segment_id();
+  }
+
   bool PrepareValue() override {
     assert(Valid());
     if (current_->PrepareValue()) {
diff --git a/table/plain/plain_table_builder.cc b/table/plain/plain_table_builder.cc
index faebcfe2f..d83506343 100644
--- a/table/plain/plain_table_builder.cc
+++ b/table/plain/plain_table_builder.cc
@@ -122,7 +122,7 @@ PlainTableBuilder::~PlainTableBuilder() {
   io_status_.PermitUncheckedError();
 }
 
-void PlainTableBuilder::Add(const Slice& key, const Slice& value) {
+void PlainTableBuilder::Add(const Slice& key, const Slice& value, uint32_t /*segment_id*/) {
   // temp buffer for metadata bytes between key and value.
   char meta_bytes_buf[6];
   size_t meta_bytes_buf_size = 0;
diff --git a/table/plain/plain_table_builder.h b/table/plain/plain_table_builder.h
index 6ab5d59e3..3d9ed23db 100644
--- a/table/plain/plain_table_builder.h
+++ b/table/plain/plain_table_builder.h
@@ -57,7 +57,7 @@ class PlainTableBuilder: public TableBuilder {
   // Add key,value to the table being constructed.
   // REQUIRES: key is after any previously added key according to comparator.
   // REQUIRES: Finish(), Abandon() have not been called
-  void Add(const Slice& key, const Slice& value) override;
+  void Add(const Slice& key, const Slice& value, uint32_t segment_id) override;
 
   // Return non-ok iff some error has been detected.
   Status status() const override { return status_; }
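The three iterator changes above share one shape: InternalIteratorBase grows a virtual segment_id() defaulting to an invalid id, IteratorWrapperBase forwards it, and MergingIterator reports the id of whichever child currently supplies the key. A compressed, self-contained restatement of that shape, with made-up names (kInvalidSegmentId stands in for INVALID_SEGMENT_ID):

    // Minimal sketch of the segment-id plumbing pattern; not RocksDB code.
    #include <cstdint>
    #include <vector>

    constexpr uint32_t kInvalidSegmentId = UINT32_MAX;

    struct Iter {
      virtual ~Iter() = default;
      virtual bool Valid() const = 0;
      // Default: iterators that know nothing about segments report "invalid".
      virtual uint32_t segment_id() const { return kInvalidSegmentId; }
    };

    struct LeafIter : Iter {
      uint32_t id;
      explicit LeafIter(uint32_t i) : id(i) {}
      bool Valid() const override { return true; }
      uint32_t segment_id() const override { return id; }
    };

    struct MergeIter : Iter {
      std::vector<Iter*> children;
      Iter* current = nullptr;  // child owning the smallest key right now
      bool Valid() const override { return current && current->Valid(); }
      // Forward to the winning child, like MergingIterator above.
      uint32_t segment_id() const override { return current->segment_id(); }
    };

    int main() {
      LeafIter a(3), b(7);
      MergeIter m;
      m.children = {&a, &b};
      m.current = &b;  // pretend b holds the smallest key
      return m.segment_id() == 7 ? 0 : 1;
    }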
diff --git a/table/table_builder.h b/table/table_builder.h
index 36475c143..2f5d2779e 100644
--- a/table/table_builder.h
+++ b/table/table_builder.h
@@ -19,6 +19,7 @@
 #include "options/cf_options.h"
 #include "rocksdb/options.h"
 #include "rocksdb/table_properties.h"
+#include "table/block_based/filter_block.h"
 #include "trace_replay/block_cache_tracer.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -150,7 +151,7 @@ class TableBuilder {
   // Add key,value to the table being constructed.
   // REQUIRES: key is after any previously added key according to comparator.
   // REQUIRES: Finish(), Abandon() have not been called
-  virtual void Add(const Slice& key, const Slice& value) = 0;
+  virtual void Add(const Slice& key, const Slice& value, uint32_t segment_id = INVALID_SEGMENT_ID) = 0;
 
   // Return non-ok iff some error has been detected.
   virtual Status status() const = 0;
@@ -198,6 +199,9 @@
   // Return file checksum function name
   virtual const char* GetFileChecksumFuncName() const = 0;
+
+  // Default body so non-block-based builders still compile; not yet implemented (WaLSM+)
+  virtual SegmentBuilderResult GetSegmentBuilderResult() { assert(false); return SegmentBuilderResult(); }
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/table_reader.h b/table/table_reader.h
index b011790b9..9c5939840 100644
--- a/table/table_reader.h
+++ b/table/table_reader.h
@@ -8,9 +8,12 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #pragma once
+#include <map>
 #include <memory>
+#include <vector>
 
 #include "db/range_tombstone_fragmenter.h"
 #include "rocksdb/slice_transform.h"
+#include "table/format.h"
 #include "table/get_context.h"
 #include "table/internal_iterator.h"
 #include "table/multiget_context.h"
@@ -146,6 +149,10 @@ class TableReader {
                                 TableReaderCaller /*caller*/) {
     return Status::NotSupported("VerifyChecksum() not supported");
   }
+
+  virtual std::map<uint32_t, std::vector<BlockHandle>> GetSegmentBlockHandles() const {
+    return {};
+  }
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/test_sh/test.sh b/test_sh/test.sh
index ab28f37c1..15bf7d76a 100644
--- a/test_sh/test.sh
+++ b/test_sh/test.sh
@@ -3,7 +3,7 @@
 value_array=(32 64 128)
 
 test_all_size=81920000000 #8G
-pmem_path="/mnt/chen/test"
+pmem_path="/mnt/pmem0.7/guoteng/nodememory"
 
 bench_benchmarks="fillrandom, stats, wait, clean_cache, stats, readrandom, stats, clean_cache"
 bench_readnum="1000000"
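The GetSegmentBlockHandles() hook added to table_reader.h above exposes a per-segment map of block handles, with an empty map as the default for readers that do not track segments. One plausible consumer, sketched with a stand-in BlockHandle and a hypothetical BytesForSegments helper, would size a warm-up pass over only the hot segments' blocks; this is illustrative, not code from the patch.

    // Hypothetical consumer of a per-segment block-handle map: total the
    // bytes a cache-warming pass would touch for a set of hot segments.
    #include <cstdint>
    #include <map>
    #include <vector>

    struct BlockHandle {  // stand-in with just the fields the sketch needs
      uint64_t offset = 0;
      uint64_t size = 0;
    };

    uint64_t BytesForSegments(
        const std::map<uint32_t, std::vector<BlockHandle>>& handles,
        const std::vector<uint32_t>& hot_segments) {
      uint64_t total = 0;
      for (uint32_t seg : hot_segments) {
        auto it = handles.find(seg);
        if (it == handles.end()) continue;  // reader may not know this segment
        for (const BlockHandle& h : it->second) total += h.size;
      }
      return total;
    }

    int main() {
      std::map<uint32_t, std::vector<BlockHandle>> handles;
      handles[0] = {{0, 4096}, {4096, 4096}};
      handles[1] = {{8192, 4096}};
      // Segment 2 is unknown and is skipped; expect 8192 bytes for segment 0.
      return BytesForSegments(handles, {0, 2}) == 8192 ? 0 : 1;
    }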
diff --git a/util/bloom_test.cc b/util/bloom_test.cc
index 0fea9c662..f64c72a09 100644
--- a/util/bloom_test.cc
+++ b/util/bloom_test.cc
@@ -7,6 +7,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#include "table/block_based/full_filter_block.h"
 #ifndef GFLAGS
 #include <cstdio>
 int main() {
@@ -28,6 +29,9 @@
 #include "test_util/testutil.h"
 #include "util/gflags_compat.h"
 #include "util/hash.h"
+#include "table/block_based/partitioned_filter_block.h"
+#include "include/rocksdb/filter_policy.h"
+#include "rocksdb/slice.h"
 
 using GFLAGS_NAMESPACE::ParseCommandLineFlags;
@@ -977,6 +981,111 @@
 INSTANTIATE_TEST_CASE_P(Full, FullBloomTest,
                         testing::Values(BloomFilterPolicy::kLegacyBloom,
                                         BloomFilterPolicy::kFastLocalBloom));
+
+#ifdef ART_PLUS
+class MultiUnitBloomEffectTest : public testing::Test {
+ protected:
+  size_t filter_count_ = 8;
+  int bits_per_key_per_unit_ = 2;
+  int num_keys_ = 1000;
+  std::vector<std::string> keys_;
+  std::vector<std::unique_ptr<const char[]>> filter_bufs_;
+  std::vector<Slice> filter_slices_;
+  std::shared_ptr<BloomFilterPolicy> policy_;
+  std::unique_ptr<FilterBitsBuilder> bits_builder_;
+
+  void SetUp() override {
+    policy_.reset(new BloomFilterPolicy(bits_per_key_per_unit_,
+                                        BloomFilterPolicy::kLegacyBloom));
+
+    BlockBasedTableOptions table_options;
+    table_options.filter_policy = policy_;
+
+    FilterBuildingContext context(table_options);
+
+    // Obtain the bits builder directly from the policy
+    bits_builder_.reset(policy_->GetBuilderWithContext(context));
+
+    // Generate the keys
+    for (int i = 0; i < num_keys_; ++i) {
+      keys_.push_back("key" + std::to_string(i));
+    }
+
+    // Add every key to the filter
+    for (const auto& k : keys_) {
+      bits_builder_->AddKey(Slice(k));
+    }
+
+    // Build each unit's filter separately via FinishWithId
+    filter_bufs_.resize(filter_count_);
+    filter_slices_.resize(filter_count_);
+    for (size_t i = 0; i < filter_count_; ++i) {
+      filter_slices_[i] = bits_builder_->FinishWithId(&filter_bufs_[i], i);
+      std::cout << "slice " << i << " size: " << filter_slices_[i].size()
+                << std::endl;
+    }
+  }
+
+  // AND-query across the first n units
+  bool MayMatchWithNUnits(const std::string& key, int n) {
+    for (int i = 0; i < n; ++i) {
+      std::unique_ptr<FilterBitsReader> reader(
+          policy_->GetFilterBitsReader(filter_slices_[i]));
+      if (!reader->MayMatchWithId(Slice(key), i)) {
+        return false;
+      }
+    }
+    return true;
+  }
+};
+
+TEST_F(MultiUnitBloomEffectTest, FalsePositiveRateDecreasesWithMoreUnits) {
+  const int test_total = 100000000;
+  const int unit_counts[] = {1, 2, 4, 8};
+  double fp_rates[4];
+  for (int u = 0; u < 4; ++u) {
+    int test_fp = 0;
+    // Query keys that were never inserted; any match is a false positive
+    for (int i = num_keys_; i < num_keys_ + test_total; ++i) {
+      if (MayMatchWithNUnits("key" + std::to_string(i), unit_counts[u])) {
+        test_fp++;
+      }
+    }
+    fp_rates[u] = test_fp / double(test_total);
+    printf("FP rate with %d unit(s): %f\n", unit_counts[u], fp_rates[u]);
+  }
+
+  // AND-ing in more units must strictly lower the false positive rate
+  ASSERT_GT(fp_rates[0], fp_rates[1]);
+  ASSERT_GT(fp_rates[1], fp_rates[2]);
+  ASSERT_GT(fp_rates[2], fp_rates[3]);
+}
+
+TEST_F(MultiUnitBloomEffectTest, AllKeysAlwaysMatch) {
+  // Every inserted key must still match with all units enabled
+  for (const auto& k : keys_) {
+    ASSERT_TRUE(MayMatchWithNUnits(k, 8));
+  }
+}
+#endif  // ART_PLUS
 
 }  // namespace ROCKSDB_NAMESPACE
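A sanity check on the ordering that FalsePositiveRateDecreasesWithMoreUnits asserts: if the filter units hash independently, AND-ing n units turns a per-unit false-positive rate p into roughly p^n. The standalone sketch below plugs in the textbook Bloom approximation, assuming a single probe per unit at 2 bits per key (the legacy policy may pick a different probe count), just to show the expected direction and rough scale.

    // Back-of-the-envelope estimate, not RocksDB code. Classic Bloom
    // approximation: p = (1 - e^{-k*n/m})^k with k probes and m/n bits per
    // key; for 2 bits/key and k = 1 this gives p ~ 0.39 per unit.
    #include <cmath>
    #include <cstdio>

    int main() {
      double bits_per_key = 2.0, k = 1.0;
      double p = std::pow(1.0 - std::exp(-k / bits_per_key), k);
      for (int units : {1, 2, 4, 8}) {
        // Independence assumption: AND of n units => p^n
        std::printf("%d unit(s): expected fp ~ %.4f\n", units,
                    std::pow(p, units));
      }
    }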
 int main(int argc, char** argv) {
diff --git a/util/comparator.cc b/util/comparator.cc
index f82a6dd14..b7dfa40f0 100644
--- a/util/comparator.cc
+++ b/util/comparator.cc
@@ -232,11 +232,13 @@ class SegmentIdRemovingComparatorImpl : public Comparator {
 
   void FindShortestSeparator(std::string* start,
                              const Slice& limit) const override {
-    real_comparator->FindShortestSeparator(start, limit);
+    // real_comparator->FindShortestSeparator(start, limit);
+    // do nothing, keeping the key shortening feature disabled
   }
 
   void FindShortSuccessor(std::string* key) const override {
-    real_comparator->FindShortSuccessor(key);
+    // real_comparator->FindShortSuccessor(key);
+    // do nothing, keeping the key shortening feature disabled
   }
 
   bool IsSameLengthImmediateSuccessor(const Slice& s,