diff --git a/.gitignore b/.gitignore
index f2424a220..e2d24c97b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -103,4 +103,3 @@ __pycache__/
 include/csv2/
 debug.*
-*.log
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 30b431e50..cb5a05e2f 100644
--- a/Makefile
+++ b/Makefile
@@ -14,6 +14,10 @@ ifeq ($(origin PYTHON), undefined)
 endif
 export PYTHON

+# EXTRA_CXXFLAGS += -I$(HOME)/local/include -gdwarf-4 -fsanitize=address
+EXTRA_CXXFLAGS += -I$(HOME)/local/include -gdwarf-4 -DGFLAGS
+LDFLAGS += -L$(HOME)/local/lib -lsocket++ -lgflags
+
 CLEAN_FILES = # deliberately empty, so we can append below.
 CFLAGS += ${EXTRA_CFLAGS}
 CXXFLAGS += ${EXTRA_CXXFLAGS}
@@ -112,6 +116,8 @@ ifneq ($(findstring rocksdbjava, $(MAKECMDGOALS)),)
 endif
 endif

+# DEBUG_LEVEL=1
+DEBUG_LEVEL=0
 $(info $$DEBUG_LEVEL is ${DEBUG_LEVEL})

 # Lite build flag.
@@ -1310,6 +1316,11 @@ $(STATIC_LIBRARY): $(LIB_OBJECTS)
	$(AM_V_AR)rm -f $@ $(SHARED1) $(SHARED2) $(SHARED3) $(SHARED4)
	$(AM_V_at)$(AR) $(ARFLAGS) $@ $(LIB_OBJECTS)

+# Add a target to build bloom_test with ART_PLUS defined
+bloom_test_plus: util/bloom_test.cc $(LIBRARY) $(GTEST)
+	$(AM_V_CCLD)$(CXX) $(CXXFLAGS) -DART_PLUS -c util/bloom_test.cc -o $(OBJ_DIR)/util/bloom_test_plus.o
+	$(AM_V_CCLD)$(CXX) -o bloom_test_plus $(OBJ_DIR)/util/bloom_test_plus.o $(GTEST) $(LIBRARY) $(EXEC_LDFLAGS) $(LDFLAGS) $(COVERAGEFLAGS)
+
 $(STATIC_TEST_LIBRARY): $(TEST_OBJECTS)
	$(AM_V_AR)rm -f $@ $(SHARED_TEST_LIBRARY)
	$(AM_V_at)$(AR) $(ARFLAGS) $@ $^
diff --git a/YCSB/.gitignore b/YCSB/.gitignore
index 50f9d154b..056f2b01b 100644
--- a/YCSB/.gitignore
+++ b/YCSB/.gitignore
@@ -8,3 +8,7 @@ tags
 compile_commands.json
 .clangd/
 .cache/
+
+test_logs/
+compile.log
+*.log
\ No newline at end of file
diff --git a/YCSB/Makefile b/YCSB/Makefile
index 10ba2d280..f15bbed47 100644
--- a/YCSB/Makefile
+++ b/YCSB/Makefile
@@ -9,16 +9,19 @@

 #---------------------build config-------------------------

+CXXFLAGS += -I$(HOME)/local/include -gdwarf-4
+LDFLAGS += -lstdc++
+LDFLAGS += -L$(HOME)/local/lib -lsocket++
+
 DEBUG_BUILD ?= 0
-EXTRA_CXXFLAGS ?= -I../include -I../include/rocksdb
-EXTRA_LDFLAGS ?= -L../ -lpmem -ldl
+# EXTRA_CXXFLAGS += -I../include -I../include/rocksdb -fsanitize=address
+EXTRA_CXXFLAGS += -I../include -I../include/rocksdb
+EXTRA_LDFLAGS += -L../ -lpmem -ldl

 BIND_ROCKSDB ?= 1
 BIND_LEVELDB ?= 0
 BIND_LMDB ?= 0

-EXTRA_LDFLAGS += -lstdc++
-EXTRA_LDFLAGS += -lsocket++
 # EXTRA_LDFLAGS += -lpython3.12
 # EXTRA_CXXFLAGS += -I$(PYTHON_INCLUDE_PATH)
 # EXTRA_CXXFLAGS += -L$(PYTHON_LIBRARY_PATH)
diff --git a/YCSB/batch_test.sh b/YCSB/batch_test.sh
new file mode 100755
index 000000000..8faf77130
--- /dev/null
+++ b/YCSB/batch_test.sh
@@ -0,0 +1,408 @@
+#!/bin/bash
+
+# YCSB batch test script
+# Author: auto-generated
+# Purpose: run a series of YCSB tests with different workload and properties configurations
+
+# ===========================================
+# Global configuration variables
+# ===========================================
+
+# Default configuration
+DEFAULT_YCSB_PATH="./ycsb"
+DEFAULT_DB_TYPE="rocksdb"
+DEFAULT_PROPERTIES_FILE="rocksdb/rocksdb.properties"
+DEFAULT_THREAD_COUNT=8
+
+# Database and NVM path configuration
+DB_PATH="/mnt/nvme0n1/guoteng/walsmtest/tmp/db_nvm_l0"
+NVM_PATH="/mnt/pmem0.7/guoteng/nodememory"
+
+# Logging configuration
+LOG_DIR="./test_logs"
+TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
+
+# ===========================================
+# Utility functions
+# ===========================================
+
+# Print an info message with a timestamp
+log_info() {
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO: $1"
+}
+
+# Print an error message
+log_error() {
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: $1" >&2
+}
+
+# Print a warning message
+log_warn() {
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] 
WARN: $1"
+}
+
+# ===========================================
+# Resource cleanup functions
+# ===========================================
+
+# Clean up database and NVM storage resources
+cleanup_resources() {
+    log_info "Starting resource cleanup..."
+
+    local cleanup_success=true
+
+    # Clean the database directory
+    if [ -d "${DB_PATH}" ]; then
+        log_info "Cleaning database directory: ${DB_PATH}"
+        if [ "$(ls -A "${DB_PATH}" 2>/dev/null)" ]; then
+            rm -rf "${DB_PATH}"/*
+            if [ $? -eq 0 ]; then
+                log_info "Database directory cleaned"
+            else
+                log_error "Failed to clean database directory"
+                cleanup_success=false
+            fi
+        else
+            log_info "Database directory is empty, nothing to clean"
+        fi
+    else
+        log_info "Database directory does not exist: ${DB_PATH} (expected)"
+    fi
+
+    # Clean the NVM path
+    if [ -f "${NVM_PATH}" ] || [ -d "${NVM_PATH}" ]; then
+        log_info "Cleaning NVM path: ${NVM_PATH}"
+        rm -rf "${NVM_PATH}"
+        if [ $? -eq 0 ]; then
+            log_info "NVM path cleaned"
+        else
+            log_error "Failed to clean NVM path"
+            cleanup_success=false
+        fi
+    else
+        log_info "NVM path does not exist: ${NVM_PATH} (expected)"
+    fi
+
+    if [ "$cleanup_success" = true ]; then
+        log_info "Resource cleanup finished"
+        return 0
+    else
+        log_error "Errors occurred during resource cleanup"
+        return 1
+    fi
+}
+
+# ===========================================
+# YCSB test execution functions
+# ===========================================
+
+# Run a single YCSB test
+run_single_ycsb_test() {
+    local workload="$1"
+    local properties_file="$2"
+    local additional_params="$3"
+    local test_name="$4"
+
+    log_info "Starting YCSB test: ${test_name}"
+    log_info "  - Workload: ${workload}"
+    log_info "  - Properties: ${properties_file}"
+    log_info "  - Thread Count: ${DEFAULT_THREAD_COUNT}"
+
+    # Check that the workload file exists
+    if [ ! -f "${workload}" ]; then
+        log_error "Workload file does not exist: ${workload}"
+        return 1
+    fi
+
+    # Check that the properties file exists
+    if [ ! -f "${properties_file}" ]; then
+        log_error "Properties file does not exist: ${properties_file}"
+        return 1
+    fi
+
+    # Create the log directory
+    mkdir -p "${LOG_DIR}"
+
+    # Generate the log file name
+    local log_file="${LOG_DIR}/${test_name}_${TIMESTAMP}.log"
+
+    # Build the YCSB command
+    local ycsb_cmd="${DEFAULT_YCSB_PATH} -load -run -db ${DEFAULT_DB_TYPE} -P ${workload} -P ${properties_file} -p threadcount=${DEFAULT_THREAD_COUNT}"
+
+    # Append extra parameters
+    if [ -n "${additional_params}" ]; then
+        ycsb_cmd="${ycsb_cmd} ${additional_params}"
+    fi
+
+    # Enable status reporting
+    ycsb_cmd="${ycsb_cmd} -s"
+
+    log_info "Running command: ${ycsb_cmd}"
+
+    # Run the YCSB test and capture its log
+    echo "Start time: $(date)" > "${log_file}"
+    echo "Command: ${ycsb_cmd}" >> "${log_file}"
+    echo "======================================" >> "${log_file}"
+
+    eval "${ycsb_cmd}" 2>&1 | tee -a "${log_file}"
+    local exit_code=${PIPESTATUS[0]}
+
+    echo "======================================" >> "${log_file}"
+    echo "End time: $(date)" >> "${log_file}"
+    echo "Exit code: ${exit_code}" >> "${log_file}"
+
+    if [ ${exit_code} -eq 0 ]; then
+        log_info "Test finished: ${test_name} (log: ${log_file})"
+        return 0
+    else
+        log_error "Test failed: ${test_name} (exit code: ${exit_code})"
+        return 1
+    fi
+}
+
+# ===========================================
+# Batch test functions
+# ===========================================
+
+# Run batch tests
+run_batch_tests() {
+    local config_file="$1"
+
+    if [ ! -f "${config_file}" ]; then
+        log_error "Config file does not exist: ${config_file}"
+        return 1
+    fi
+
+    log_info "Starting batch tests, config file: ${config_file}"
+
+    local test_count=0
+    local success_count=0
+    local failed_tests=()
+
+    # Read the config file and run each test
+    while IFS='|' read -r test_name workload properties_file additional_params; do
+        # Skip comment and empty lines
+        [[ ${test_name} =~ ^#.*$ ]] && continue
+        [[ -z ${test_name} ]] && continue
+
+        test_count=$((test_count + 1))
+
+        log_info "Running test ${test_count}: ${test_name}"
+
+        # Clean up resources
+        cleanup_resources
+        if [ $? 
-ne 0 ]; then
+            log_error "Resource cleanup failed, skipping test: ${test_name}"
+            failed_tests+=("${test_name}")
+            continue
+        fi
+
+        # Determine the properties file path
+        local actual_properties_file
+        if [ -n "${properties_file}" ] && [ "${properties_file}" != " " ]; then
+            actual_properties_file="${properties_file}"
+        else
+            actual_properties_file="${DEFAULT_PROPERTIES_FILE}"
+        fi
+
+        # Run the test
+        run_single_ycsb_test "${workload}" "${actual_properties_file}" "${additional_params}" "${test_name}"
+        if [ $? -eq 0 ]; then
+            success_count=$((success_count + 1))
+            log_info "Test succeeded: ${test_name}"
+        else
+            failed_tests+=("${test_name}")
+            log_error "Test failed: ${test_name}"
+        fi
+
+        log_info "Test ${test_count} finished: ${test_name}"
+        echo "----------------------------------------"
+
+    done < "${config_file}"
+
+    # Print the test result summary
+    log_info "Batch testing finished"
+    log_info "Total tests: ${test_count}"
+    log_info "Successful tests: ${success_count}"
+    log_info "Failed tests: $((test_count - success_count))"
+
+    if [ ${#failed_tests[@]} -gt 0 ]; then
+        log_warn "Failed tests:"
+        for failed_test in "${failed_tests[@]}"; do
+            log_warn "  - ${failed_test}"
+        done
+    fi
+
+    return 0
+}
+
+# ===========================================
+# Main entry point
+# ===========================================
+
+# Show help information
+show_help() {
+    cat << EOF
+YCSB batch test script usage:
+
+Usage:
+    $0 [options] [arguments]
+
+Options:
+    -h, --help         Show this help message
+    -c, --cleanup      Only perform resource cleanup
+    -s, --single       Run a single test
+    -b, --batch        Run batch tests
+    -l, --list         List available workload files
+
+Single test arguments:
+    -w, --workload     Workload file (required)
+    -P, --properties   Properties file (optional, default: ${DEFAULT_PROPERTIES_FILE})
+    -p, --params       Extra parameters (optional)
+    -n, --name         Test name (optional)
+
+Batch test arguments:
+    -f, --config-file  Batch test config file (required)
+
+Config file format (fields separated by |):
+    test name|workload path|properties file path (optional)|extra parameters (optional)
+
+Examples:
+    # Run a single test
+    $0 -s -w workloads/workloada -n test1
+
+    # Run a single test with a specific properties file
+    $0 -s -w workloads/workloada -P custom.properties -n test1
+
+    # Run batch tests
+    $0 -b -f batch_config.txt
+
+    # Only clean up resources
+    $0 -c
+
+EOF
+}
+
+# List available workload files
+list_workloads() {
+    log_info "Available workload files:"
+    find workloads/ -name "workload*" -type f | sort | while read -r workload; do
+        echo "  - ${workload}"
+    done
+}
+
+# Main function
+main() {
+    local action=""
+    local workload=""
+    local properties_file=""
+    local additional_params=""
+    local test_name=""
+    local config_file=""
+
+    # Parse command-line arguments
+    while [[ $# -gt 0 ]]; do
+        case $1 in
+            -h|--help)
+                show_help
+                exit 0
+                ;;
+            -c|--cleanup)
+                action="cleanup"
+                shift
+                ;;
+            -s|--single)
+                action="single"
+                shift
+                ;;
+            -b|--batch)
+                action="batch"
+                shift
+                ;;
+            -l|--list)
+                list_workloads
+                exit 0
+                ;;
+            -w|--workload)
+                workload="$2"
+                shift 2
+                ;;
+            -P|--properties)
+                properties_file="$2"
+                shift 2
+                ;;
+            -p|--params)
+                additional_params="$2"
+                shift 2
+                ;;
+            -n|--name)
+                test_name="$2"
+                shift 2
+                ;;
+            -f|--config-file)
+                config_file="$2"
+                shift 2
+                ;;
+            *)
+                log_error "Unknown argument: $1"
+                show_help
+                exit 1
+                ;;
+        esac
+    done
+
+    # Check the YCSB executable
+    if [ ! -f "${DEFAULT_YCSB_PATH}" ]; then
+        log_error "YCSB executable does not exist: ${DEFAULT_YCSB_PATH}"
+        exit 1
+    fi
+
+    # Dispatch based on the selected action
+    case "${action}" in
+        cleanup)
+            cleanup_resources
+            ;;
+        single)
+            if [ -z "${workload}" ]; then
+                log_error "A workload file is required for a single test"
+                show_help
+                exit 1
+            fi
+
+            if [ -z "${test_name}" ]; then
+                test_name="single_test_$(basename ${workload})"
+            fi
+
+            # Determine the properties file path
+            if [ -z "${properties_file}" ]; then
+                properties_file="${DEFAULT_PROPERTIES_FILE}"
+            fi
+
+            # Clean up resources
+            cleanup_resources
+
+            # Run the test
+            run_single_ycsb_test "${workload}" "${properties_file}" "${additional_params}" "${test_name}"
+            local result=$? 
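+            # Propagate the single test's exit status to the caller so
+            # wrapper scripts and CI jobs can detect a failed run.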
+
+            exit ${result}
+            ;;
+        batch)
+            if [ -z "${config_file}" ]; then
+                log_error "A config file is required for batch tests"
+                show_help
+                exit 1
+            fi
+
+            run_batch_tests "${config_file}"
+            ;;
+        *)
+            log_error "An action must be specified (-c, -s, -b, -l)"
+            show_help
+            exit 1
+            ;;
+    esac
+}
+
+# Run the main function
+main "$@"
diff --git a/YCSB/build.sh b/YCSB/build.sh
new file mode 100755
index 000000000..f7c979322
--- /dev/null
+++ b/YCSB/build.sh
@@ -0,0 +1,12 @@
+cd ..
+rm -rf ../log/*
+make clean
+make static_lib -j32
+
+# mv librocksdb_debug.a librocksdb.a
+
+cd YCSB
+make clean && make -j4
+
+rm -rf /mnt/nvme0n1/guoteng/walsmtest/tmp/db_nvm_l0
+rm -rf /mnt/pmem0.7/guoteng/nodememory
diff --git a/YCSB/buildall.sh b/YCSB/buildall.sh
new file mode 100755
index 000000000..1b684ca01
--- /dev/null
+++ b/YCSB/buildall.sh
@@ -0,0 +1,7 @@
+cd ..
+# make clean
+make static_lib -j32
+
+cd YCSB
+make clean && make DEBUG_BUILD=1
+
diff --git a/YCSB/core/core_workload.cc b/YCSB/core/core_workload.cc
index a559d5509..693a9eb6a 100644
--- a/YCSB/core/core_workload.cc
+++ b/YCSB/core/core_workload.cc
@@ -68,6 +68,7 @@ const string CoreWorkload::REQUEST_DISTRIBUTION_PROPERTY = "requestdistribution"
 const string CoreWorkload::REQUEST_DISTRIBUTION_DEFAULT = "uniform";

 const string CoreWorkload::ZERO_PADDING_PROPERTY = "zeropadding";
+// const string CoreWorkload::ZERO_PADDING_DEFAULT = "96";
 const string CoreWorkload::ZERO_PADDING_DEFAULT = "1";

 const string CoreWorkload::MIN_SCAN_LENGTH_PROPERTY = "minscanlength";
diff --git a/YCSB/rocksdb/rocksdb.properties b/YCSB/rocksdb/rocksdb.properties
index d4b7f9774..a33f6f67a 100644
--- a/YCSB/rocksdb/rocksdb.properties
+++ b/YCSB/rocksdb/rocksdb.properties
@@ -1,29 +1,33 @@
-rocksdb.dbname=/mnt/walsm/tmp/tmp_data/db_test_art
-rocksdb.nvm_path=/mnt/walsm/node_memory
+rocksdb.dbname=/mnt/nvme0n1/guoteng/walsmtest/tmp/gt_test
+rocksdb.nvm_path=/mnt/pmem0.8/guoteng/nodememory
 rocksdb.format=single
-rocksdb.destroy=false
+# rocksdb.destroy=false

 # Load options from file
 #rocksdb.optionsfile=rocksdb/options.ini

 # Below options are ignored if options file is used
-rocksdb.compression=no
-rocksdb.max_background_jobs=2
-rocksdb.target_file_size_base=67108864
-rocksdb.target_file_size_multiplier=1
-rocksdb.max_bytes_for_level_base=268435456
-rocksdb.write_buffer_size=67108864
+# rocksdb.compression=no
+rocksdb.max_background_jobs=4
+# rocksdb.target_file_size_base=67108864
+# rocksdb.target_file_size_multiplier=1
+# rocksdb.max_bytes_for_level_base=268435456
+# rocksdb.write_buffer_size=67108864
 rocksdb.max_open_files=-1
-rocksdb.max_write_buffer_number=2
+# rocksdb.max_write_buffer_number=2
 rocksdb.use_direct_io_for_flush_compaction=true
 rocksdb.use_direct_reads=true
-rocksdb.allow_mmap_writes=false
-rocksdb.allow_mmap_reads=false
-rocksdb.cache_size=8388608
+# rocksdb.allow_mmap_writes=false
+# rocksdb.allow_mmap_reads=false
+rocksdb.cache_size=100663296
 rocksdb.compressed_cache_size=0
-rocksdb.bloom_bits=0
+rocksdb.bloom_bits=2

 # set total_threads to 32, see rocksdb_db.cc
 rocksdb.increase_parallelism=true
 # rocksdb.optimize_level_style_compaction=true
 rocksdb.optimize_universal_style_compaction=true
+
+rocksdb.block_size=4096
+rocksdb.metadata_size=4096
+rocksdb.max_subcompactions=1
\ No newline at end of file
diff --git a/YCSB/rocksdb/rocksdb_cacheio.properties b/YCSB/rocksdb/rocksdb_cacheio.properties
new file mode 100644
index 000000000..fbca92902
--- /dev/null
+++ b/YCSB/rocksdb/rocksdb_cacheio.properties
@@ -0,0 +1,33 @@
+rocksdb.dbname=/mnt/nvme0n1/guoteng/walsmtest/tmp/db_nvm_l0 
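+# Assumed layout: dbname is the NVMe-backed directory holding the on-disk
+# LSM-tree, while nvm_path points at the persistent-memory (pmem) area used
+# for NVM node memory.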
+rocksdb.nvm_path=/mnt/pmem0.7/guoteng/nodememory +rocksdb.format=single +# rocksdb.destroy=false + +# Load options from file +#rocksdb.optionsfile=rocksdb/options.ini + +# Below options are ignored if options file is used +# rocksdb.compression=no +rocksdb.max_background_jobs=4 +# rocksdb.target_file_size_base=67108864 +# rocksdb.target_file_size_multiplier=1 +# rocksdb.max_bytes_for_level_base=268435456 +# rocksdb.write_buffer_size=67108864 +rocksdb.max_open_files=-1 +# rocksdb.max_write_buffer_number=2 +# rocksdb.use_direct_io_for_flush_compaction=true +# rocksdb.use_direct_reads=true +# rocksdb.allow_mmap_writes=false +# rocksdb.allow_mmap_reads=false +rocksdb.cache_size=134217728 +rocksdb.compressed_cache_size=0 +rocksdb.bloom_bits=2 + +# set total_threads to 32, see rocksdb_db.cc +rocksdb.increase_parallelism=true +# rocksdb.optimize_level_style_compaction=true +rocksdb.optimize_universal_style_compaction=true + +rocksdb.block_size=4096 +rocksdb.metadata_size=8192 +rocksdb.max_subcompactions=4 \ No newline at end of file diff --git a/YCSB/rocksdb/rocksdb_db.cc b/YCSB/rocksdb/rocksdb_db.cc index 4f8dda0cd..212ad3f79 100644 --- a/YCSB/rocksdb/rocksdb_db.cc +++ b/YCSB/rocksdb/rocksdb_db.cc @@ -18,7 +18,15 @@ #include #include #include +#include #include +#include +#include +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +#include namespace { const std::string PROP_NAME = "rocksdb.dbname"; @@ -111,6 +119,15 @@ namespace { const std::string PROP_FS_URI = "rocksdb.fs_uri"; const std::string PROP_FS_URI_DEFAULT = ""; + const std::string PROP_BLOCK_SIZE = "rocksdb.block_size"; + const std::string PROP_BLOCK_SIZE_DEFAULT = "0"; + + const std::string PROP_METADATA_SIZE = "rocksdb.metadata_size"; + const std::string PROP_METADATA_SIZE_DEFAULT = "0"; + + const std::string PROP_MAX_SUBCOMPACTION = "rocksdb.max_subcompactions"; + const std::string PROP_MAX_SUBCOMPACTION_DEFAULT = "0"; + static std::shared_ptr env_guard; static std::shared_ptr block_cache; static std::shared_ptr block_cache_compressed; @@ -121,6 +138,7 @@ namespace ycsbc { rocksdb::DB *RocksdbDB::db_ = nullptr; int RocksdbDB::ref_cnt_ = 0; std::mutex RocksdbDB::mu_; +rocksdb::Options opt; void RocksdbDB::Init() { // merge operator disabled by default due to link error @@ -198,7 +216,6 @@ void RocksdbDB::Init() { throw utils::Exception("RocksDB db path is missing"); } - rocksdb::Options opt; opt.create_if_missing = true; opt.nvm_path = nvm_path; std::vector cf_descs; @@ -230,6 +247,8 @@ void RocksdbDB::Cleanup() { if (--ref_cnt_) { return; } + std::cout << "Statistics: " << opt.statistics->ToString() << std::endl; + sleep(5); // sleep 5 seconds to wait for final reports delete db_; } @@ -309,6 +328,10 @@ void RocksdbDB::GetOptions(const utils::Properties &props, rocksdb::Options *opt if (val != 0) { opt->max_open_files = val; } + val = std::stoi(props.GetProperty(PROP_MAX_SUBCOMPACTION, PROP_MAX_SUBCOMPACTION_DEFAULT)); + if (val != 0) { + opt->max_subcompactions = val; + } val = std::stoi(props.GetProperty(PROP_L0_COMPACTION_TRIGGER, PROP_L0_COMPACTION_TRIGGER_DEFAULT)); if (val != 0) { @@ -337,6 +360,21 @@ void RocksdbDB::GetOptions(const utils::Properties &props, rocksdb::Options *opt } rocksdb::BlockBasedTableOptions table_options; + table_options.pin_top_level_index_and_filter = true; + table_options.pin_l0_filter_and_index_blocks_in_cache = false; + table_options.cache_index_and_filter_blocks_with_high_priority = true; + table_options.index_type = 
rocksdb::BlockBasedTableOptions::kTwoLevelIndexSearch; + table_options.partition_filters = true; + table_options.cache_index_and_filter_blocks = true; + table_options.index_shortening = rocksdb::BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + size_t block_size = std::stoul(props.GetProperty(PROP_BLOCK_SIZE, PROP_BLOCK_SIZE_DEFAULT)); + if (block_size > 0) { + table_options.block_size = block_size; + } + size_t metadata_block_size = std::stoul(props.GetProperty(PROP_METADATA_SIZE, PROP_METADATA_SIZE_DEFAULT)); + if (metadata_block_size > 0) { + table_options.metadata_block_size = metadata_block_size; + } size_t cache_size = std::stoul(props.GetProperty(PROP_CACHE_SIZE, PROP_CACHE_SIZE_DEFAULT)); if (cache_size > 0) { block_cache = rocksdb::NewLRUCache(cache_size); @@ -348,13 +386,16 @@ void RocksdbDB::GetOptions(const utils::Properties &props, rocksdb::Options *opt block_cache_compressed = rocksdb::NewLRUCache(cache_size); table_options.block_cache_compressed = rocksdb::NewLRUCache(compressed_cache_size); } - int bloom_bits = std::stoul(props.GetProperty(PROP_BLOOM_BITS, PROP_BLOOM_BITS_DEFAULT)); + int bloom_bits = std::stoul(props.GetProperty(PROP_BLOOM_BITS, PROP_BLOOM_BITS_DEFAULT)); if (bloom_bits > 0) { table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(bloom_bits)); } opt->table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_options)); if (props.GetProperty(PROP_INCREASE_PARALLELISM, PROP_INCREASE_PARALLELISM_DEFAULT) == "true") { + // unlimit the thread count of compactions and flushes. let it depend on total thread: 32 + opt->max_background_compactions = -1; + opt->max_background_flushes = -1; opt->IncreaseParallelism(32); } if (props.GetProperty(PROP_OPTIMIZE_LEVELCOMP, PROP_OPTIMIZE_LEVELCOMP_DEFAULT) == "true") { @@ -363,6 +404,7 @@ void RocksdbDB::GetOptions(const utils::Properties &props, rocksdb::Options *opt if (props.GetProperty(PROP_OPTIMIZE_UNIVERSALCOMP, PROP_OPTIMIZE_UNIVERSALCOMP_DEFAULT) == "true") { opt->OptimizeUniversalStyleCompaction(); } + opt->statistics = rocksdb::CreateDBStatistics(); } } @@ -431,12 +473,6 @@ DB::Status RocksdbDB::ReadSingle(const std::string &table, const std::string &ke std::vector &result) { std::string data; rocksdb::Status s = db_->Get(rocksdb::ReadOptions(), key, &data); - #ifdef GEN_WORKLOAD - std::fstream f; - f.open("../workload/workload", std::ios::out | std::ios::app); - f << key < &values) { - /* std::string data; rocksdb::Status s = db_->Get(rocksdb::ReadOptions(), key, &data); if (s.IsNotFound()) { @@ -505,9 +540,6 @@ DB::Status RocksdbDB::UpdateSingle(const std::string &table, const std::string & throw utils::Exception(std::string("RocksDB Put: ") + s.ToString()); } return kOK; - */ - // use insert, not read-modify-write - return InsertSingle(table, key, values); } DB::Status RocksdbDB::MergeSingle(const std::string &table, const std::string &key, diff --git a/YCSB/rocksdb/rocksdb_directio.properties b/YCSB/rocksdb/rocksdb_directio.properties new file mode 100644 index 000000000..a38b865c4 --- /dev/null +++ b/YCSB/rocksdb/rocksdb_directio.properties @@ -0,0 +1,33 @@ +rocksdb.dbname=/mnt/nvme0n1/guoteng/walsmtest/tmp/db_nvm_l0 +rocksdb.nvm_path=/mnt/pmem0.7/guoteng/nodememory +rocksdb.format=single +# rocksdb.destroy=false + +# Load options from file +#rocksdb.optionsfile=rocksdb/options.ini + +# Below options are ignored if options file is used +# rocksdb.compression=no +rocksdb.max_background_jobs=4 +# rocksdb.target_file_size_base=67108864 +# rocksdb.target_file_size_multiplier=1 +# 
rocksdb.max_bytes_for_level_base=268435456 +# rocksdb.write_buffer_size=67108864 +rocksdb.max_open_files=-1 +# rocksdb.max_write_buffer_number=2 +rocksdb.use_direct_io_for_flush_compaction=true +rocksdb.use_direct_reads=true +# rocksdb.allow_mmap_writes=false +# rocksdb.allow_mmap_reads=false +rocksdb.cache_size=134217728 +rocksdb.compressed_cache_size=0 +rocksdb.bloom_bits=2 + +# set total_threads to 32, see rocksdb_db.cc +rocksdb.increase_parallelism=true +# rocksdb.optimize_level_style_compaction=true +rocksdb.optimize_universal_style_compaction=true + +rocksdb.block_size=4096 +rocksdb.metadata_size=8192 +rocksdb.max_subcompactions=4 \ No newline at end of file diff --git a/YCSB/test.sh b/YCSB/test.sh new file mode 100644 index 000000000..2116bcfed --- /dev/null +++ b/YCSB/test.sh @@ -0,0 +1,15 @@ +cd .. +rm -rf log/* +make clean +make static_lib -j32 + +#mv librocksdb_debug.a librocksdb.a + +cd YCSB +make clean && make -j4 + +rm -rf /mnt/nvme0n1/guoteng/walsmtest/tmp/gt_test +rm -rf /mnt/pmem0.8/guoteng/nodememory + +#gdb --args ./ycsb -load -run -db rocksdb -P workloads/workloadt -P rocksdb/rocksdb.properties -p threadcount=8 -s +./ycsb -load -run -db rocksdb -P workloads/workloadt -P rocksdb/rocksdb.properties -p threadcount=8 -p sleepafterload=60 -s diff --git a/YCSB/workloads/search_0p.spec b/YCSB/workloads/search_0p.spec new file mode 100644 index 000000000..624bd1646 --- /dev/null +++ b/YCSB/workloads/search_0p.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0 +updateproportion=1.0 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/search_100p.spec b/YCSB/workloads/search_100p.spec new file mode 100644 index 000000000..3403b9c5e --- /dev/null +++ b/YCSB/workloads/search_100p.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=1.0 +updateproportion=0 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/search_25p.spec b/YCSB/workloads/search_25p.spec new file mode 100644 index 000000000..d2a8dd2a6 --- /dev/null +++ b/YCSB/workloads/search_25p.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.25 +updateproportion=0.75 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/search_50p.spec b/YCSB/workloads/search_50p.spec new file mode 100644 index 000000000..a53b084dd --- /dev/null +++ b/YCSB/workloads/search_50p.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.50 +updateproportion=0.50 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/search_75p.spec b/YCSB/workloads/search_75p.spec new file mode 100644 index 000000000..f3ff43f48 --- /dev/null +++ b/YCSB/workloads/search_75p.spec @@ 
-0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.75 +updateproportion=0.25 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/value_16kb.spec b/YCSB/workloads/value_16kb.spec new file mode 100644 index 000000000..158825ec5 --- /dev/null +++ b/YCSB/workloads/value_16kb.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=1600 + +recordcount=5000000 +operationcount=20000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.5 +updateproportion=0.5 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/value_1kb.spec b/YCSB/workloads/value_1kb.spec new file mode 100644 index 000000000..8daafe878 --- /dev/null +++ b/YCSB/workloads/value_1kb.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.5 +updateproportion=0.5 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/value_256b.spec b/YCSB/workloads/value_256b.spec new file mode 100644 index 000000000..98decf924 --- /dev/null +++ b/YCSB/workloads/value_256b.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=25 + +recordcount=320000000 +operationcount=1280000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.5 +updateproportion=0.5 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/value_4kb.spec b/YCSB/workloads/value_4kb.spec new file mode 100644 index 000000000..7f916ccdf --- /dev/null +++ b/YCSB/workloads/value_4kb.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=400 + +recordcount=20000000 +operationcount=80000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.5 +updateproportion=0.5 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/workload_A.spec b/YCSB/workloads/workload_A.spec new file mode 100644 index 000000000..8daafe878 --- /dev/null +++ b/YCSB/workloads/workload_A.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.5 +updateproportion=0.5 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/workload_B.spec b/YCSB/workloads/workload_B.spec new file mode 100644 index 000000000..b3df53ae4 --- /dev/null +++ b/YCSB/workloads/workload_B.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.95 +updateproportion=0.05 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/workload_C.spec 
b/YCSB/workloads/workload_C.spec new file mode 100644 index 000000000..dd7d41a08 --- /dev/null +++ b/YCSB/workloads/workload_C.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=1 +updateproportion=0 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/workload_D.spec b/YCSB/workloads/workload_D.spec new file mode 100644 index 000000000..e7c4e6986 --- /dev/null +++ b/YCSB/workloads/workload_D.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.95 +updateproportion=0 +scanproportion=0 +insertproportion=0.05 + +requestdistribution=latest +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/workload_E.spec b/YCSB/workloads/workload_E.spec new file mode 100644 index 000000000..26c9fd0e3 --- /dev/null +++ b/YCSB/workloads/workload_E.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0 +updateproportion=0 +scanproportion=0.95 +insertproportion=0.05 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/workload_F.spec b/YCSB/workloads/workload_F.spec new file mode 100644 index 000000000..81a922969 --- /dev/null +++ b/YCSB/workloads/workload_F.spec @@ -0,0 +1,18 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.5 +updateproportion=0 +scanproportion=0 +insertproportion=0 +readmodifywriteproportion=0.5 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/workloadt b/YCSB/workloads/workloadt index a69512474..efcc59b5d 100644 --- a/YCSB/workloads/workloadt +++ b/YCSB/workloads/workloadt @@ -2,8 +2,8 @@ # Workload T: For Debug -recordcount=5000000 -operationcount=2200000 +recordcount=80000000 +operationcount=80000000 workload=com.yahoo.ycsb.workloads.CoreWorkload readallfields=true diff --git a/YCSB/workloads/zipfian_0p.spec b/YCSB/workloads/zipfian_0p.spec new file mode 100644 index 000000000..5780ccf0f --- /dev/null +++ b/YCSB/workloads/zipfian_0p.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.50 +updateproportion=0.50 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/zipfian_105p.spec b/YCSB/workloads/zipfian_105p.spec new file mode 100644 index 000000000..1bca10d31 --- /dev/null +++ b/YCSB/workloads/zipfian_105p.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.50 +updateproportion=0.50 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=1.05 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/zipfian_110p.spec 
b/YCSB/workloads/zipfian_110p.spec new file mode 100644 index 000000000..d544be44c --- /dev/null +++ b/YCSB/workloads/zipfian_110p.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.50 +updateproportion=0.50 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=1.1 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/zipfian_70p.spec b/YCSB/workloads/zipfian_70p.spec new file mode 100644 index 000000000..3211f81c7 --- /dev/null +++ b/YCSB/workloads/zipfian_70p.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.50 +updateproportion=0.50 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.70 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/zipfian_90p.spec b/YCSB/workloads/zipfian_90p.spec new file mode 100644 index 000000000..f40478267 --- /dev/null +++ b/YCSB/workloads/zipfian_90p.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.50 +updateproportion=0.50 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.90 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/zipfian_95p.spec b/YCSB/workloads/zipfian_95p.spec new file mode 100644 index 000000000..b4292293d --- /dev/null +++ b/YCSB/workloads/zipfian_95p.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.50 +updateproportion=0.50 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.95 +maxscanlength=100 \ No newline at end of file diff --git a/YCSB/workloads/zipfian_98p.spec b/YCSB/workloads/zipfian_98p.spec new file mode 100644 index 000000000..a53b084dd --- /dev/null +++ b/YCSB/workloads/zipfian_98p.spec @@ -0,0 +1,17 @@ +fieldcount=10 +fieldlength=100 + +recordcount=80000000 +operationcount=320000000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=0.50 +updateproportion=0.50 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian +zipfianvalue=0.98 +maxscanlength=100 \ No newline at end of file diff --git a/db/art/art_metric.h b/db/art/art_metric.h new file mode 100644 index 000000000..a71d6a029 --- /dev/null +++ b/db/art/art_metric.h @@ -0,0 +1,133 @@ +// +// Created by Guo Teng. 
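+// The counters below track flush/compaction bytes written and 4KB blocks
+// read, so the WaLSM evaluation can report I/O volume from plain stdout
+// logs instead of an external profiler.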
+//

+#pragma once
+#include <cstdint>
+#include <string>
+#include <iostream>
+#include <algorithm>
+
+namespace ROCKSDB_NAMESPACE {
+
+// record evaluation metric for WaLSM paper
+constexpr bool EVALUATE_METRIC = true;
+
+// FlushMetric and SSDWriteMetric added together give the total Write Data Size
+
+// metric of WaLSM flush write evaluation
+// writes from the NVM MemTable to L0
+// struct NVMWriteMetric {
+//   uint64_t WriteSsdDataBytes = 0;
+//   uint64_t LastPrintedBytes = 0;

+//   std::string getMetric() {
+//     double WriteSSdDataGB = WriteSsdDataBytes * 1.0 / (1024 * 1024 * 1024);
+//     std::string s = "Write Metric analysis:";
+//     s = s + " From NVM - " + std::to_string(WriteSSdDataGB) + " GB";
+//     return s;
+//   }

+//   void printMetric() {
+//     if (WriteSsdDataBytes - LastPrintedBytes > 1024 * 1024 * 1024) {
+//       LastPrintedBytes = WriteSsdDataBytes;
+//       std::cout << getMetric() << std::endl;
+//     }
+//   }

+//   void updateMetric(uint64_t add_bytes) {
+//     if (EVALUATE_METRIC) {
+//       WriteSsdDataBytes += add_bytes;
+//       printMetric();
+//     }
+//   }
+// };
+
+// metric of WaLSM compaction write evaluation
+// writes from L0-L1, L1-L2, ... compactions
+struct SSDWriteMetric {
+  uint64_t WriteSsdDataBytes = 0;
+  uint64_t LastPrintedBytes = 0;
+
+  std::string getMetric() {
+    double WriteSSdDataGB = WriteSsdDataBytes * 1.0 / (1024 * 1024 * 1024);
+    std::string s = "Write Metric analysis:";
+    s = s + " From SSD - " + std::to_string(WriteSSdDataGB) + " GB";
+    return s;
+  }
+
+  void printMetric() {
+    if (WriteSsdDataBytes - LastPrintedBytes > 1024 * 1024 * 1024) {
+      LastPrintedBytes = WriteSsdDataBytes;
+      std::cout << getMetric() << std::endl;
+    }
+  }
+
+  void updateMetric(uint64_t add_bytes) {
+    if (EVALUATE_METRIC) {
+      WriteSsdDataBytes += add_bytes;
+      printMetric();
+    }
+  }
+};
+
+// metric of WaLSM flush evaluation
+// size of the KV data flushed from NVM to SSD
+struct FlushMetric {
+  uint64_t FlushSsdDataBytes = 0;
+  uint64_t LastPrintedBytes = 0;
+
+  std::string getMetric() {
+    double FlushSSdDataGB = FlushSsdDataBytes * 1.0 / (1024 * 1024 * 1024);
+    std::string s = "Flush Metric analysis:";
+    s = s + " - " + std::to_string(FlushSSdDataGB) + " GB";
+    return s;
+  }
+
+  void printMetric() {
+    if (FlushSsdDataBytes - LastPrintedBytes > 512 * 1024 * 1024) {
+      LastPrintedBytes = FlushSsdDataBytes;
+      std::cout << getMetric() << std::endl;
+    }
+  }
+
+  void updateMetric(uint64_t add_bytes) {
+    if (EVALUATE_METRIC) {
+      FlushSsdDataBytes += add_bytes;
+      printMetric();
+    }
+  }
+};
+
+// metric of WaLSM read evaluation
+// number of physical file blocks read; one block is 4KB
+struct ReadMetric {
+  uint64_t ReadSsdBlocksCnt = 0;
+  uint64_t LastPrintedCount = 0;
+
+  std::string getMetric() {
+    std::string s = "Read Metric analysis:";
+    s = s + " - " + std::to_string(ReadSsdBlocksCnt) + " Blocks";
+    return s;
+  }
+
+  void printMetric() {
+    if (ReadSsdBlocksCnt - LastPrintedCount > 1000 * 1000) {
+      LastPrintedCount = ReadSsdBlocksCnt;
+      std::cout << getMetric() << std::endl;
+    }
+  }
+
+  void updateMetric(uint64_t offset_start, uint64_t offset_end) {
+    uint64_t block_start = offset_start / 4096;
+    uint64_t block_end = offset_end / 4096;
+
+    if (EVALUATE_METRIC) {
+      // count the 4KB blocks spanned by [offset_start, offset_end); at least one
+      ReadSsdBlocksCnt += std::max(block_end - block_start, uint64_t(1));
+      printMetric();
+    }
+  }
+};
+
+
+} // namespace rocksdb
\ No newline at end of file
diff --git a/db/art/clf_model.cc b/db/art/clf_model.cc
index 2c8224979..333e00f1c 100644
--- a/db/art/clf_model.cc
+++ b/db/art/clf_model.cc
@@ -7,15 +7,9 @@
 #include
 #include
 #include
+#include "port/likely.h"

 namespace ROCKSDB_NAMESPACE {
-
-uint16_t ClfModel::feature_num_;
-std::string ClfModel::dataset_name_;
-std::string 
ClfModel::dataset_path_; -std::string ClfModel::host_, ClfModel::port_; -size_t ClfModel::buffer_size_; - void ClfModel::write_debug_dataset() { assert(feature_num_ > 0); // ready for writer @@ -125,13 +119,17 @@ void ClfModel::write_real_dataset(std::vector>& datas, std void ClfModel::write_dataset(std::vector>& datas, std::vector& tags, std::vector& get_cnts) { assert(feature_num_ > 0); - if (datas.empty()) { - write_debug_dataset(); - // dataset_cnt_ += 1; - return; - } + if (UNLIKELY(datas.empty())) return; + assert(datas.size() > 0); + // if (datas.empty()) { + // assert(false); // we have to write dataset + // write_debug_dataset(); + // // dataset_cnt_ += 1; + // return; + // } assert(feature_num_ % 2 != 0); // features num: 2r + 1 + assert(feature_num_ >= 3); write_real_dataset(datas, tags, get_cnts); // dataset_cnt_ += 1; @@ -139,9 +137,10 @@ void ClfModel::write_dataset(std::vector>& datas, std::vec } void ClfModel::make_train(std::vector>& datas, std::vector& tags, std::vector& get_cnts) { - assert(feature_num_ > 0); + assert(feature_num_ > 0); // model is ready write_dataset(datas, tags, get_cnts); + // // TODO: avoid python model training // already write dataset // send msg to LightGBM server, let server read dataset and train new model libsocket::inet_stream sock(host_, port_, LIBSOCKET_IPv4); @@ -153,6 +152,7 @@ void ClfModel::make_train(std::vector>& datas, std::vector sock << message; sock >> recv_buffer; // wait for training end // will destroy sock when leaving this func scope + std::cout << "[MODEL] model training end, message: " << recv_buffer << std::endl; } void ClfModel::make_predict_samples(std::vector>& datas) { @@ -194,7 +194,7 @@ void ClfModel::make_real_predict(std::vector>& datas, std: libsocket::inet_stream sock(host_, port_, LIBSOCKET_IPv4); std::string message, recv_buffer; for (std::vector& data : datas) { - if (!data.empty()) { + if (LIKELY(!data.empty())) { prepare_data(data); message.clear(); recv_buffer.clear(); @@ -220,10 +220,12 @@ void ClfModel::make_real_predict(std::vector>& datas, std: void ClfModel::make_predict(std::vector>& datas, std::vector& preds) { preds.clear(); + if (UNLIKELY(datas.empty())) return; + assert(datas.size() > 0); // datas empty means we are debuging class ClfModel - if (datas.empty()) { - make_predict_samples(datas); - } + // if (datas.empty()) { + // make_predict_samples(datas); + // } // only write pred result to vector preds, and return nothing make_real_predict(datas, preds); return; diff --git a/db/art/clf_model.h b/db/art/clf_model.h index def2bf820..ab9087e26 100644 --- a/db/art/clf_model.h +++ b/db/art/clf_model.h @@ -7,6 +7,7 @@ #include #include #include "macros.h" +#include "port/likely.h" // dataset data point format: // every data point accounts for one segment @@ -29,17 +30,23 @@ namespace ROCKSDB_NAMESPACE { struct RangeRatePair; +struct RangeHeatPair; class ClfModel; bool RangeRatePairLessorComparor(const RangeRatePair& pair_1, const RangeRatePair& pair_2); bool RangeRatePairGreatorComparor(const RangeRatePair& pair_1, const RangeRatePair& pair_2); +bool RangeHeatPairLessorComparor(const RangeHeatPair& pair_1, const RangeHeatPair& pair_2); +bool RangeHeatPairGreatorComparor(const RangeHeatPair& pair_1, const RangeHeatPair& pair_2); + struct RangeRatePair { uint32_t range_id; double rate_in_segment; - RangeRatePair(const uint32_t& id, const double& rate) { - range_id = id; rate_in_segment = rate; - } +}; + +struct RangeHeatPair { + double rate_in_segment; + double heat_value; }; inline bool 
RangeRatePairLessorComparor(const RangeRatePair& pair_1, const RangeRatePair& pair_2) { @@ -50,13 +57,21 @@ inline bool RangeRatePairGreatorComparor(const RangeRatePair& pair_1, const Rang return pair_1.rate_in_segment > pair_2.rate_in_segment; } +inline bool RangeHeatPairLessorComparor(const RangeHeatPair& pair_1, const RangeHeatPair& pair_2) { + return pair_1.heat_value < pair_2.heat_value; +} + +inline bool RangeHeatPairGreatorComparor(const RangeHeatPair& pair_1, const RangeHeatPair& pair_2) { + return pair_1.heat_value > pair_2.heat_value; +} + class ClfModel { private: - static uint16_t feature_num_; // model input features num - static std::string dataset_name_; // dataset csv file name - static std::string dataset_path_; // path to save dataset csv file - static std::string host_, port_; // lightgbm server connection - static size_t buffer_size_; // socket receive buffer max size + uint16_t feature_num_; // model input features num + std::string dataset_name_; // dataset csv file name + std::string dataset_path_; // path to save dataset csv file + std::string host_, port_; // lightgbm server connection + size_t buffer_size_; // socket receive buffer max size public: // init member vars ClfModel() { @@ -78,7 +93,7 @@ class ClfModel { // feature num = level feature num (1) + 2 * num of key ranges // we set features_num_ to largest feature num void make_ready(std::vector& features_nums) { - if (features_nums.empty()) { + if (UNLIKELY(features_nums.empty())) { feature_num_ = 41; // debug feature num, see ../lgb_server files } else { // we may limit feature_num_ because of the socket transmit size limit is 1024 bytes @@ -88,7 +103,7 @@ class ClfModel { feature_num_ = MAX_FEATURES_NUM; } } - + assert(feature_num_ == MAX_FEATURES_NUM); // std::cout << "[DEBUG] ClfModel ready, feature_num_: " << feature_num_ << std::endl; } diff --git a/db/art/compactor.h b/db/art/compactor.h index 76d884096..b066f9a77 100644 --- a/db/art/compactor.h +++ b/db/art/compactor.h @@ -11,6 +11,7 @@ #include #include #include +#include "table/block_based/filter_block.h" #include #include #include @@ -45,6 +46,8 @@ struct SingleCompactionJob { std::vector keys_in_node; autovector* compacted_indexes; + SegmentBuilderResult segment_builder_result; + void Reset() { candidates.clear(); candidates_removed.clear(); diff --git a/db/art/filter_cache.cc b/db/art/filter_cache.cc index 731f12ccb..871d24078 100644 --- a/db/art/filter_cache.cc +++ b/db/art/filter_cache.cc @@ -1,97 +1,140 @@ #include "filter_cache.h" #include +#include +#include +#include +#include +#include +#include +#include "table/block_based/parsed_full_filter_block.h" +#include "filter_cache_entry.h" +#include "port/likely.h" namespace ROCKSDB_NAMESPACE { -FilterCache FilterCacheManager::filter_cache_; -HeatBuckets FilterCacheManager::heat_buckets_; -ClfModel FilterCacheManager::clf_model_; -GreedyAlgo FilterCacheManager::greedy_algo_; -FilterCacheHeapManager FilterCacheManager::heap_manager_; -uint32_t FilterCacheManager::get_cnt_; -uint32_t FilterCacheManager::period_cnt_; -uint32_t FilterCacheManager::last_long_period_; -uint32_t FilterCacheManager::last_short_period_; -std::mutex FilterCacheManager::update_mutex_; -bool FilterCacheManager::train_signal_; -std::map FilterCacheManager::last_count_recorder_; -std::map FilterCacheManager::current_count_recorder_; -std::mutex FilterCacheManager::count_mutex_; -bool FilterCacheManager::is_ready_; - -bool FilterCache::check_key(const uint32_t& segment_id, const std::string& key) { +std::vector> 
FilterCache::get_filter_blocks(const uint32_t segment_id) { auto it = filter_cache_.find(segment_id); - if (it == filter_cache_.end()) { + if (UNLIKELY(it == filter_cache_.end())) { // not in cache, that means we havent insert segment FilterCacheItem info into cache // actually, we start inserting after every segment becomes available - return true; - } else { - return (it->second).check_key(key); + // we return a empty vector here + return {}; + } + + return it->second.get_filter_blocks(); +} + +void FilterCache::init_segment(uint32_t segment_id, const BlockBasedTable* table, const std::vector& block_handles) { + // filter_cache_[segment_id] = FilterCacheEntry(segment_id, table, this, block_handles); + + if (LIKELY(table != nullptr && block_handles.size() == MAX_UNITS_NUM)) { + filter_cache_.emplace(std::piecewise_construct, std::make_tuple(segment_id), std::make_tuple(segment_id, table, this, block_handles)); } } void FilterCache::enable_for_segments(std::unordered_map& segment_units_num_recorder, const bool& is_forced, - std::set& level_0_segment_ids, std::set& failed_segment_ids) { + std::set& new_level_0_segment_ids, std::set& failed_segment_ids) { failed_segment_ids.clear(); filter_cache_mutex_.lock(); + // uint32_t enable_non_l0_count = 0, enable_l0_count = 0, fail_count = 0; + // std::cout << "level 0 filter usage before enable: " << level_0_used_space_size_ << std::endl; + // std::cout << "non level 0 filter usage before enable: " << used_space_size_ << std::endl; for (auto it = segment_units_num_recorder.begin(); it != segment_units_num_recorder.end(); it ++) { const uint32_t segment_id = it->first; const uint16_t units_num = it->second; auto cache_it = filter_cache_.find(segment_id); - bool is_level_0 = level_0_segment_ids.count(segment_id); + bool is_level_0 = new_level_0_segment_ids.count(segment_id); if (cache_it != filter_cache_.end()) { // filter units cached + const uint32_t old_size = (cache_it->second).approximate_size(); + assert(old_size >= 0); // should not cache it before if (is_forced || is_level_0 || !is_full()) { - const uint32_t old_size = (cache_it->second).approximate_size(); (cache_it->second).enable_units(units_num); - used_space_size_ = used_space_size_ - old_size + (cache_it->second).approximate_size(); if (is_level_0) { level_0_used_space_size_ = level_0_used_space_size_ - old_size + (cache_it->second).approximate_size(); + // enable_l0_count++; + // std::cout << "enable " << int((cache_it->second).approximate_size()) - int(old_size) + // << " bits for l0 segment " << segment_id << ", units num: " << units_num << std::endl; + } + else { + used_space_size_ = used_space_size_ - old_size + (cache_it->second).approximate_size(); + // enable_non_l0_count++; + // std::cout << "enable " << int((cache_it->second).approximate_size()) - int(old_size) + // << " bits for non l0 segment " << segment_id << ", units num: " << units_num << std::endl; } } else { failed_segment_ids.insert(segment_id); + // fail_count++; + assert(new_level_0_segment_ids.count(segment_id) == 0); + // std::cout << "failed to enable filters for segment " << segment_id << std::endl; } } else { + // already call FIlterCache::init_segment, + // so new segment already inserted into filter cache, but no filter units cached + // filter units not cached // now cache it - if (is_forced || is_level_0 || !is_full()) { - FilterCacheItem cache_item(units_num); - filter_cache_.insert(std::make_pair(segment_id, cache_item)); - used_space_size_ = used_space_size_ + cache_item.approximate_size(); - if 
(is_level_0) { - level_0_used_space_size_ = level_0_used_space_size_ + cache_item.approximate_size(); - } - } else { - failed_segment_ids.insert(segment_id); - } + // if (is_forced || is_level_0 || !is_full()) { + // FilterCacheEntry cache_item(units_num); + // filter_cache_.insert(std::make_pair(segment_id, cache_item)); + // used_space_size_ = used_space_size_ + cache_item.approximate_size(); + // if (is_level_0) { + // level_0_used_space_size_ = level_0_used_space_size_ + cache_item.approximate_size(); + // } + // } else { + // failed_segment_ids.insert(segment_id); + // } + + // all segments to be enabled must have been inited + // std::cout << "filter handle not exist, segment id: " << segment_id << std::endl; + // assert(false); } } + // std::cout << "enable l0 count: " << enable_l0_count << ", enable non l0 count: " << enable_non_l0_count << ", fail count: " << fail_count << std::endl; + // std::cout << "level 0 filter usage after enable: " << level_0_used_space_size_ << std::endl; + // std::cout << "non level 0 filter usage after enable: " << used_space_size_ << std::endl; + // assert(enable_l0_count == new_level_0_segment_ids.size()); + // assert(enable_l0_count + enable_non_l0_count + fail_count == segment_units_num_recorder.size()); filter_cache_mutex_.unlock(); } -void FilterCache::update_for_segments(std::unordered_map& segment_units_num_recorder, const bool& is_forced, - std::set& level_0_segment_ids, std::set& failed_segment_ids) { +void FilterCache::update_for_segments(std::unordered_map& segment_units_num_recorder, + std::set& old_level_0_segment_ids, std::set& failed_segment_ids) { + assert(false); // only used in move_segment, but it is disallowed, so this func never used. + exit(0); + // because no new segments is generated, no reason to increase the usage of filter cache + bool is_forced = true; + failed_segment_ids.clear(); filter_cache_mutex_.lock(); for (auto it = segment_units_num_recorder.begin(); it != segment_units_num_recorder.end(); it ++) { const uint32_t segment_id = it->first; const uint16_t units_num = it->second; auto cache_it = filter_cache_.find(segment_id); - bool is_level_0 = level_0_segment_ids.count(segment_id); + bool is_level_0 = old_level_0_segment_ids.count(segment_id); if (cache_it != filter_cache_.end()) { + const uint32_t old_size = (cache_it->second).approximate_size(); // filter units cached if (is_forced || is_level_0 || !is_full()) { - const uint32_t old_size = (cache_it->second).approximate_size(); (cache_it->second).enable_units(units_num); - used_space_size_ = used_space_size_ - old_size + (cache_it->second).approximate_size(); if (is_level_0) { - level_0_used_space_size_ = level_0_used_space_size_ - old_size + (cache_it->second).approximate_size(); + assert(old_size > 0); // should already cache filter for level 0. + level_0_used_space_size_ -= old_size; + used_space_size_ += (cache_it->second).approximate_size(); + } else { + used_space_size_ = used_space_size_ - old_size + (cache_it->second).approximate_size(); } } else { + // never reach this statement, because is_forced is always true + assert(false); failed_segment_ids.insert(segment_id); } } else { // filter units not cached // do nothing!!! 
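+      // A miss here is harmless: this segment never had filter units
+      // cached, so there is no cache-space accounting to adjust.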
+ + // all segments to be enabled must have been inited + // std::cout << "error segment_id: " << segment_id << std::endl; + // assert(false); } } filter_cache_mutex_.unlock(); @@ -105,25 +148,38 @@ bool FilterCache::is_ready() { return double(used_space_size_) / double(cache_size_) >= READY_RATE; } -void FilterCache::release_for_segments(std::vector& segment_ids, std::set& level_0_segment_ids) { +void FilterCache::release_for_segments(std::vector& segment_ids, std::set& old_level_0_segment_ids) { std::sort(segment_ids.begin(), segment_ids.end()); // delete key-value pair in filter_cache_ filter_cache_mutex_.lock(); auto it = filter_cache_.begin(); size_t idx = 0; + // uint32_t release_non_l0_count = 0, release_l0_count = 0; + // std::cout << "level 0 filter usage before release: " << level_0_used_space_size_ << std::endl; + // std::cout << "non level 0 filter usage before release: " << used_space_size_ << std::endl; while (it != filter_cache_.end() && idx < segment_ids.size()) { if (it->first < segment_ids[idx]) { it ++; } else if (it->first > segment_ids[idx]) { idx ++; } else { - used_space_size_ = used_space_size_ - (it->second).approximate_size(); - if (level_0_segment_ids.count(it->first)) { + if (old_level_0_segment_ids.count(it->first)) { level_0_used_space_size_ = level_0_used_space_size_ - (it->second).approximate_size(); + // release_l0_count++; + // std::cout << "free " << (it->second).approximate_size() << " bits of level 0 segment " << it->first << std::endl; + } else { + used_space_size_ = used_space_size_ - (it->second).approximate_size(); + // release_non_l0_count++; + // std::cout << "free " << (it->second).approximate_size() << " bits of non level 0 segment " << it->first << std::endl; } - it = filter_cache_.erase(it); + it = filter_cache_.erase(it); idx++; } } + // assert(release_non_l0_count + release_l0_count == segment_ids.size()); + // assert(release_l0_count == old_level_0_segment_ids.size()); + // std::cout << "release l0 count: " << release_l0_count << ", release non l0 count: " << release_non_l0_count << std::endl; + // std::cout << "level 0 filter usage after release: " << level_0_used_space_size_ << std::endl; + // std::cout << "non level 0 filter usage after release: " << used_space_size_ << std::endl; filter_cache_mutex_.unlock(); } @@ -143,39 +199,58 @@ bool FilterCacheManager::make_heat_buckets_ready(const std::string& key, } void FilterCacheManager::hit_heat_buckets(const std::string& key) { - if (heat_buckets_.is_ready()) { + bool signal = false; + if (LIKELY(heat_buckets_.is_ready())) { get_cnt_ += 1; - if (get_cnt_ >= PERIOD_COUNT) { - heat_buckets_.hit(key, true); + heat_buckets_.hit(key, signal); // if one period end, return true signal + if (signal) { + period_mutex_.WriteLock(); get_cnt_ = 0; period_cnt_ += 1; - } else { - heat_buckets_.hit(key, false); + // std::cout << "get cnt updated, current period cnt: " << period_cnt_ << std::endl; + period_mutex_.WriteUnlock(); } } - if (period_cnt_ - last_long_period_ >= TRAIN_PERIODS) { - update_mutex_.lock(); +} - if (period_cnt_ - last_long_period_ >= TRAIN_PERIODS) { - last_long_period_ = period_cnt_; - update_count_recorder(); - train_signal_ = true; - } +void FilterCacheManager::do_periods_work() { + bool need_retrain = false; - update_mutex_.unlock(); + // called by a background thread, never need to lock + // update_mutex_.lock(); + if (period_cnt_ - last_long_period_ >= TRAIN_PERIODS) { + // std::cout << "period_cnt_: " << period_cnt_ << std::endl; + // std::cout << "last_long_period_: " << 
last_long_period_ << std::endl; + last_long_period_ = period_cnt_; + update_count_recorder(); + // debug_count_recorder(); + std::map recent_count_recorder; + std::vector empty_needed_segment_ids; + estimate_recent_counts(recent_count_recorder, empty_needed_segment_ids); + assert(recent_count_recorder.size() > 0); + // std::cout << "long period end, sync visit cnt." << std::endl; + heap_manager_.sync_visit_cnt(recent_count_recorder); + train_signal_ = true; + need_retrain = true; } if (period_cnt_ - last_short_period_ >= 1) { - update_mutex_.lock(); - - if (period_cnt_ - last_short_period_ >= 1) { - last_short_period_ = period_cnt_; - std::map estimate_count_recorder; - estimate_counts_for_all(estimate_count_recorder); - heap_manager_.sync_visit_cnt(estimate_count_recorder); + last_short_period_ = period_cnt_; + // if already updated, do not update again + if (!need_retrain) { + // std::cout << "period_cnt_: " << period_cnt_ << std::endl; + // std::cout << "last_short_period_: " << last_short_period_ << std::endl; + // debug_count_recorder(); + std::map recent_count_recorder; + std::vector empty_needed_segment_ids; + estimate_recent_counts(recent_count_recorder, empty_needed_segment_ids); + assert(recent_count_recorder.size() > 0); + // std::cout << "short period end, sync visit cnt." << std::endl; + heap_manager_.sync_visit_cnt(recent_count_recorder); } - - update_mutex_.unlock(); } + + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + // update_mutex_.unlock(); } bool FilterCacheManager::make_clf_model_ready(std::vector& features_nums) { @@ -183,126 +258,281 @@ bool FilterCacheManager::make_clf_model_ready(std::vector& features_nu return clf_model_.is_ready(); } -bool FilterCacheManager::check_key(const uint32_t& segment_id, const std::string& key) { +std::vector> FilterCacheManager::get_filter_blocks(uint32_t segment_id) { // move hit_count_recorder to a background thread // hit_count_recorder(segment_id); // one get opt will cause query to many segments. 
// so one get opt only call one hit_heat_buckets, but call many hit_count_recorder - return filter_cache_.check_key(segment_id, key); + return filter_cache_.get_filter_blocks(segment_id); +} + +void FilterCacheManager::init_segment(uint32_t segment_id, const BlockBasedTable* table, const std::vector& block_handles) { + filter_cache_.init_segment(segment_id, table, block_handles); } -void FilterCacheManager::hit_count_recorder(const uint32_t& segment_id) { - count_mutex_.lock(); +void FilterCacheManager::hit_count_recorder(uint32_t segment_id) { + count_mutex_.ReadLock(); auto it = current_count_recorder_.find(segment_id); if (it == current_count_recorder_.end()) { // segment havent been visited, need to insert count - current_count_recorder_.insert(std::make_pair(segment_id, 1)); + // current_count_recorder_.insert(std::make_pair(segment_id, 1)); + // do nothing, wait for insertion } else { // segment have been visited, only update count it->second = it->second + 1; } - count_mutex_.unlock(); + count_mutex_.ReadUnlock(); } void FilterCacheManager::update_count_recorder() { - count_mutex_.lock(); + count_mutex_.WriteLock(); last_count_recorder_.clear(); - last_count_recorder_.insert(current_count_recorder_.begin(), current_count_recorder_.end()); + // last_count_recorder_.insert(current_count_recorder_.begin(), current_count_recorder_.end()); + std::copy(current_count_recorder_.begin(), current_count_recorder_.end(), + std::inserter(last_count_recorder_, last_count_recorder_.begin())); + assert(last_count_recorder_.size() == current_count_recorder_.size()); for (auto it = current_count_recorder_.begin(); it != current_count_recorder_.end(); it++) { it->second = 0; } - count_mutex_.unlock(); + count_mutex_.WriteUnlock(); +} + +void FilterCacheManager::debug_count_recorder() { + uint32_t get_cnt = 0; + + count_mutex_.ReadLock(); + + std::cout << "last_count_recorder: " << std::endl; + for (auto it = last_count_recorder_.begin(); it != last_count_recorder_.end(); it++) { + std::cout << it->first << ": " << it->second << std::endl; + } + std::cout << "current_count_recorder: " << std::endl; + for (auto it = current_count_recorder_.begin(); it != current_count_recorder_.end(); it++) { + get_cnt += it->second; + std::cout << it->first << ": " << it->second << std::endl; + } + std::cout << "get_cnt: " << get_cnt << std::endl; + + count_mutex_.ReadUnlock(); } void FilterCacheManager::inherit_count_recorder(std::vector& merged_segment_ids, std::vector& new_segment_ids, const uint32_t& level_0_base_count, std::map>& inherit_infos_recorder) { - count_mutex_.lock(); + count_mutex_.WriteLock(); + // copy last count and current count of merged segments std::map merged_last_count_recorder, merged_current_count_recorder; // cache merged segment count temporarily + // std::cout << std::endl << std::endl; + // std::cout << "merged segments id: "; for (uint32_t& merged_segment_id : merged_segment_ids) { merged_last_count_recorder.insert(std::make_pair(merged_segment_id, last_count_recorder_[merged_segment_id])); last_count_recorder_.erase(merged_segment_id); merged_current_count_recorder.insert(std::make_pair(merged_segment_id, current_count_recorder_[merged_segment_id])); current_count_recorder_.erase(merged_segment_id); + + // std::cout << merged_segment_id << " "; + assert(last_count_recorder_.find(merged_segment_id) == last_count_recorder_.end()); + assert(current_count_recorder_.find(merged_segment_id) == current_count_recorder_.end()); } + // std::cout << std::endl; + // std::cout << "merged segments 
size: " << merged_segment_ids.size() << std::endl; + // std::cout << "new segments size: " << new_segment_ids.size() << std::endl; + // std::cout << "inherit_infos_recorder size: " << inherit_infos_recorder.size() << std::endl; + + // init last count and current count of new segments based on inherit method (that not on Level 0) std::map new_last_count_recorder, new_current_count_recorder; for (auto infos_it = inherit_infos_recorder.begin(); infos_it != inherit_infos_recorder.end(); infos_it ++) { double last_count = 0, current_count = 0; + double weight_sum = 0; + // std::cout << "child segment: " << infos_it->first << std::endl; std::unordered_map& info = infos_it->second; for (auto info_it = info.begin(); info_it != info.end(); info_it ++) { - last_count = last_count + INHERIT_REMAIN_FACTOR * (merged_last_count_recorder[info_it->first] * info_it->second); - current_count = current_count + INHERIT_REMAIN_FACTOR * (merged_current_count_recorder[info_it->first] * info_it->second); + last_count += INHERIT_REMAIN_FACTOR * (merged_last_count_recorder[info_it->first] * info_it->second); + current_count += INHERIT_REMAIN_FACTOR * (merged_current_count_recorder[info_it->first] * info_it->second); + weight_sum += info_it->second; + // std::cout << "parent segment: " << info_it->first << " weight: " << info_it->second + // << " last count: " << merged_last_count_recorder[info_it->first] + // << " current count: " << merged_current_count_recorder[info_it->first] << std::endl; + assert(merged_last_count_recorder.find(info_it->first) != merged_last_count_recorder.end()); + assert(merged_current_count_recorder.find(info_it->first) != merged_current_count_recorder.end()); } + + // std::cout << "temp last count: " << uint32_t(last_count) << " temp currrent count: " << uint32_t(current_count) << std::endl; + // assert(weight_sum > 0.90); + // weight sum should be 1.0, we multiple the inherited count by (1.0 / weight_sum) + // assert(weight_sum > 0.98 && weight_sum < 1.02); // weight_sum approximately equals to 1.0 + last_count *= (1.0 / weight_sum); current_count *= (1.0 / weight_sum); // actually weight_sum always equals to 1.0 + // std::cout << "weight sum: " << weight_sum << " final last count: " << uint32_t(last_count) << " final currrent count: " << uint32_t(current_count) << std::endl; new_last_count_recorder.insert(std::make_pair(infos_it->first, uint32_t(last_count))); new_current_count_recorder.insert(std::make_pair(infos_it->first, uint32_t(current_count))); } + assert(inherit_infos_recorder.size() == new_last_count_recorder.size()); + assert(inherit_infos_recorder.size() == new_current_count_recorder.size()); + assert(inherit_infos_recorder.size() <= new_segment_ids.size()); + + // uint32_t last_insert_num = 0, current_insert_num = 0; + // uint32_t last_update_num = 0, current_update_num = 0; + // uint32_t last_check_num = 0, current_check_num = 0; + + // insert last count and current count of new segments for (uint32_t& new_segment_id : new_segment_ids) { + // insert last count auto last_it = last_count_recorder_.find(new_segment_id); uint32_t new_last_count = level_0_base_count; // level 0 segments init + // if true, this means new segment not on level 0, also means this segments are inherited from some segments if (new_last_count_recorder.count(new_segment_id) > 0) { new_last_count = new_last_count_recorder[new_segment_id]; + // last_check_num ++; } if (last_it != last_count_recorder_.end()) { last_it->second = last_it->second + new_last_count; + // last_update_num ++; } else { 
-void FilterCacheManager::estimate_counts_for_all(std::map& approximate_counts_recorder) {
+void FilterCacheManager::estimate_recent_counts(std::map& approximate_counts_recorder, const std::vector& needed_segment_ids) {
     const uint32_t long_period_total_count = TRAIN_PERIODS * PERIOD_COUNT;
     uint32_t current_long_period_count = PERIOD_COUNT * (period_cnt_ % TRAIN_PERIODS) + get_cnt_;
     double current_long_period_rate = std::min(double(current_long_period_count) / double(long_period_total_count), 1.0);
-    approximate_counts_recorder.clear();
-    approximate_counts_recorder.insert(current_count_recorder_.begin(), current_count_recorder_.end());
-    auto approx_it = approximate_counts_recorder.begin();
-    auto last_it = last_count_recorder_.begin();
-    while (approx_it != approximate_counts_recorder.end() && last_it != last_count_recorder_.end()) {
-        if (approx_it->first > last_it->first) {
-            last_it ++;
-        } else if(approx_it->first < last_it->first) {
-            approx_it ++;
-        } else {
-            approx_it->second = approx_it->second + uint32_t((1 - current_long_period_rate) * last_it->second);
+    if (needed_segment_ids.empty()) {
+        count_mutex_.ReadLock();
+        approximate_counts_recorder.clear();
+        // approximate_counts_recorder.insert(current_count_recorder_.begin(), current_count_recorder_.end());
+        std::copy(current_count_recorder_.begin(), current_count_recorder_.end(),
+                  std::inserter(approximate_counts_recorder, approximate_counts_recorder.begin()));
+        assert(approximate_counts_recorder.size() == current_count_recorder_.size());
+        auto approx_it = approximate_counts_recorder.begin();
+        auto last_it = last_count_recorder_.begin();
+        // std::cout << "estimate all segments' recent frequency." << std::endl;
+        while (approx_it != approximate_counts_recorder.end() && last_it != last_count_recorder_.end()) {
+            if (approx_it->first > last_it->first) {
+                last_it ++;
+            } else if(approx_it->first < last_it->first) {
+                approx_it ++;
+            } else {
+                uint32_t recent_result = approx_it->second + uint32_t((1 - current_long_period_rate) * last_it->second);
+                // if (current_long_period_rate > 0) {
+                //     std::cout << "current rate: " << current_long_period_rate
+                //               << ", current count: " << approx_it->second
+                //               << ", last count: " << last_it->second
+                //               << ", final recent count: " << recent_result << std::endl;
+                // }
+                approx_it->second = recent_result;
+                assert(approximate_counts_recorder[approx_it->first] == recent_result);
+                if (uint32_t((1 - current_long_period_rate) * last_it->second) > 0)
+                    assert(current_count_recorder_[approx_it->first] != recent_result);
+                approx_it ++;
+                last_it ++;
+            }
+        }
+        count_mutex_.ReadUnlock();
+    } else {
+        count_mutex_.ReadLock();
+        approximate_counts_recorder.clear();
+        for (uint32_t segment_id : needed_segment_ids) {
+            approximate_counts_recorder.insert(std::make_pair(segment_id, current_count_recorder_[segment_id]));
+        }
+        assert(approximate_counts_recorder.size() == needed_segment_ids.size());
+        auto approx_it = approximate_counts_recorder.begin();
+        // std::cout << "estimate some segments' recent frequency." << std::endl;
+        while (approx_it != approximate_counts_recorder.end()) {
+            uint32_t recent_result = approx_it->second +
+                uint32_t((1 - current_long_period_rate) * last_count_recorder_[approx_it->first]);
+            // if (current_long_period_rate > 0) {
+            //     std::cout << "current rate: " << current_long_period_rate
+            //               << ", current count: " << approx_it->second
+            //               << ", last count: " << last_count_recorder_[approx_it->first]
+            //               << ", final recent count: " << recent_result << std::endl;
+            // }
+            approx_it->second = recent_result;
+            assert(approximate_counts_recorder[approx_it->first] == recent_result);
+            if (uint32_t((1 - current_long_period_rate) * last_count_recorder_[approx_it->first]) > 0)
+                assert(current_count_recorder_[approx_it->first] != recent_result);
             approx_it ++;
         }
+        count_mutex_.ReadUnlock();
     }
-    // return nothing, already write result to approximate_counts_recorder
 }
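estimate_recent_counts() blends the two counters by how far the current long period has progressed: early in a period the last period's counts dominate, and their weight decays linearly to zero as the period completes. A worked example under assumed constants (TRAIN_PERIODS and PERIOD_COUNT values below are illustrative):

    #include <cstdint>

    // rate   = min((PERIOD_COUNT * (period_cnt_ % TRAIN_PERIODS) + get_cnt_)
    //              / double(TRAIN_PERIODS * PERIOD_COUNT), 1.0)
    // recent = current + uint32_t((1 - rate) * last)
    // Assuming TRAIN_PERIODS = 4 and PERIOD_COUNT = 10000: halfway through the
    // long period rate = 0.5, so a segment with current = 300 and last = 1000
    // is estimated at 300 + uint32_t(0.5 * 1000) = 800 recent gets.
    uint32_t recent_estimate(uint32_t current, uint32_t last, double rate) {
        return current + uint32_t((1.0 - rate) * last);
    }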
-void FilterCacheManager::try_retrain_model(std::map& level_recorder,
+bool FilterCacheManager::try_retrain_model(std::map& level_recorder,
                                            std::map>& segment_ranges_recorder,
                                            std::map& unit_size_recorder) {
     // we should guarantee these 3 external recorder share the same keys set
     // we need to do this job outside FilterCacheManager
     assert(level_recorder.size() == segment_ranges_recorder.size());
     // assert(level_recorder.size() == unit_size_recorder.size());
+    // should not train when loading, train_signal_ only true when starting YCSB run.
     if (train_signal_ == false) {
-        return;
+        return false;
     }
+    // auto level_it_0 = level_recorder.begin();
+    // while (level_it_0 != level_recorder.end()) {
+    //     if (last_count_recorder_.find(level_it_0->first) == last_count_recorder_.end()) continue;
+    //     uint32_t cnt = last_count_recorder_[level_it_0->first];
+    //     std::cout << level_it_0->first << " : " << cnt << ", level: " << level_it_0->second << std::endl;
+    //     level_it_0++;
+    // }
+
+    // recheck whether each segment includes at least one key range.
+ // auto ranges_it = segment_ranges_recorder.begin(); + // while(ranges_it != segment_ranges_recorder.end()) + // { + // // std::cout << "segment " << ranges_it->first + // // << " ranges num : " << (ranges_it->second).size() << std::endl; + // assert((ranges_it->second).size() > 0); + + // double rate_sum = 0; + // for (RangeRatePair& pair : ranges_it->second) { + // rate_sum += pair.rate_in_segment; + // } + // assert(rate_sum <= 1.02 && rate_sum >= 0.98); + + // ranges_it++; + // } + // solve programming problem std::map label_recorder; std::map algo_infos; @@ -322,24 +552,64 @@ void FilterCacheManager::try_retrain_model(std::map& level_r greedy_algo_.solve(algo_infos, label_recorder, filter_cache_.cache_size_except_level_0()); */ assert(unit_size_recorder.size() == 0); - auto get_cnt_it = last_count_recorder_.begin(); - while (get_cnt_it != last_count_recorder_.end()) { + + std::map last_count_recorder_copy; + count_mutex_.ReadLock(); + last_count_recorder_copy = last_count_recorder_; + count_mutex_.ReadUnlock(); + + auto get_cnt_it = last_count_recorder_copy.begin(); + while (get_cnt_it != last_count_recorder_copy.end()) { // unit_size_recorder always empty, so we only use DEFAULT_UNIT_SIZE - algo_infos.insert(std::make_pair(get_cnt_it->first, SegmentAlgoInfo(get_cnt_it->second, DEFAULT_UNIT_SIZE))); + // exclude level 0 segments + if (level_recorder[get_cnt_it->first] > 0) { + algo_infos.insert(std::make_pair(get_cnt_it->first, SegmentAlgoInfo(get_cnt_it->second, DEFAULT_UNIT_SIZE))); + } get_cnt_it ++; } + assert(algo_infos.size() > 0); + if (UNLIKELY(algo_infos.empty())) return false; + std::cout << "[ALGO] algo_infos size: " << algo_infos.size() << std::endl; greedy_algo_.solve(algo_infos, label_recorder, filter_cache_.cache_size_except_level_0()); + std::cout << "[ALGO] stage 1: recorder size (exclude level 0): " << label_recorder.size() << std::endl; + assert(algo_infos.size() == label_recorder.size()); + // // need to verify solutions + // greedy_algo_.verify(algo_infos, label_recorder, filter_cache_.cache_size_except_level_0() / 256); + + // assert(level_recorder.size() == segment_ranges_recorder.size()); + // should make these two recorders share the same segment ids + auto level_it_1 = level_recorder.begin(); + auto range_it_1 = segment_ranges_recorder.begin(); + while (level_it_1 != level_recorder.end() + && range_it_1 != segment_ranges_recorder.end()) + { + if (level_it_1->first < range_it_1->first) { + level_it_1 = level_recorder.erase(level_it_1); + } else if (level_it_1->first > range_it_1->first) { + range_it_1 = segment_ranges_recorder.erase(range_it_1); + } else { + level_it_1++; range_it_1++; + } + } + while (level_it_1 != level_recorder.end()) { + level_it_1 = level_recorder.erase(level_it_1); + } + while (range_it_1 != segment_ranges_recorder.end()) { + range_it_1 = segment_ranges_recorder.erase(range_it_1); + } + assert(level_recorder.size() == segment_ranges_recorder.size()); - // programming problem may include some merged segments, we need to ignore them + // level_recorder and segment_ranges_recorder may include some merged segments, we need to ignore them auto old_level_it = level_recorder.begin(); auto old_range_it = segment_ranges_recorder.begin(); auto old_label_it = label_recorder.begin(); while (old_level_it != level_recorder.end() && old_range_it != segment_ranges_recorder.end() && old_label_it != label_recorder.end()) { + // std::cout << "debug : " << old_level_it->first << " : " << old_range_it->first << std::endl; assert(old_level_it->first == 
old_range_it->first); if (old_level_it->first < old_label_it->first) { - old_level_it = level_recorder.erase(old_label_it); + old_level_it = level_recorder.erase(old_level_it); old_range_it = segment_ranges_recorder.erase(old_range_it); } else if (old_level_it->first > old_label_it->first) { old_label_it = label_recorder.erase(old_label_it); @@ -349,27 +619,43 @@ void FilterCacheManager::try_retrain_model(std::map& level_r old_label_it ++; } } + // if some different elements remain in recorder's tail, we need to erase them while (old_level_it != level_recorder.end() && old_range_it != segment_ranges_recorder.end()) { assert(old_level_it->first == old_range_it->first); - old_level_it = level_recorder.erase(old_label_it); + old_level_it = level_recorder.erase(old_level_it); old_range_it = segment_ranges_recorder.erase(old_range_it); } while (old_label_it != label_recorder.end()) { old_label_it = label_recorder.erase(old_label_it); } + // recheck whether these 3 recorder have same size + assert(level_recorder.size() == segment_ranges_recorder.size()); + assert(level_recorder.size() == label_recorder.size()); + // auto check_level_it_2 = level_recorder.begin(); + // auto check_label_it_2 = label_recorder.begin(); + // while (check_level_it_2 != level_recorder.end() + // && check_label_it_2 != label_recorder.end()) + // { + // assert(check_level_it_2->first == check_label_it_2->first); + // check_level_it_2++; check_label_it_2++; + // } + std::cout << "[ALGO] stage 2: recorder size (exclude level 0): " << label_recorder.size() << std::endl; + std::vector buckets = heat_buckets_.buckets(); std::vector> datas; std::vector labels; std::vector get_cnts; - auto level_it = level_recorder.begin(); // key range id start with 0 + std::cout << "[ALGO] stage 3: current count recorder size (include level 0): " << last_count_recorder_copy.size() << std::endl; + // remember key range id starts with 0 + auto level_it = level_recorder.begin(); auto range_it = segment_ranges_recorder.begin(); - auto count_it = last_count_recorder_.begin(); + auto count_it = last_count_recorder_copy.begin(); auto label_it = label_recorder.begin(); while (level_it != level_recorder.end() && range_it != segment_ranges_recorder.end() && - count_it != last_count_recorder_.end() && label_it != label_recorder.end()) { + count_it != last_count_recorder_copy.end() && label_it != label_recorder.end()) { assert(level_it->first == range_it->first); assert(level_it->first == label_it->first); if (count_it->first < level_it->first) { @@ -379,19 +665,42 @@ void FilterCacheManager::try_retrain_model(std::map& level_r range_it ++; label_it ++; } else { - if (level_it->second > 0) { + // only train with non level 0 data + if (LIKELY(level_it->second > 0)) { // add data row std::vector data; - std::sort((range_it->second).begin(), (range_it->second).end(), RangeRatePairGreatorComparor); - data.emplace_back(level_it->second); + std::vector heat_pairs; + // double rate_sum = 0; for (RangeRatePair& pair : range_it->second) { + // rate_sum += pair.rate_in_segment; + + RangeHeatPair heat_pair; assert(pair.range_id >= 0 && pair.range_id < buckets.size()); - data.emplace_back(uint32_t(RATE_SIGNIFICANT_DIGITS_FACTOR * pair.rate_in_segment)); - data.emplace_back(uint32_t(HOTNESS_SIGNIFICANT_DIGITS_FACTOR * buckets[pair.range_id].hotness_)); + heat_pair.rate_in_segment = pair.rate_in_segment; + heat_pair.heat_value = buckets[pair.range_id].hotness_; + heat_pairs.emplace_back(heat_pair); } + // assert(rate_sum >= 0.98 && rate_sum <= 1.02); + 
assert(heat_pairs.size() == (range_it->second).size()); + + std::sort(heat_pairs.begin(), heat_pairs.end(), RangeHeatPairGreatorComparor); + for (size_t i = 0; i < heat_pairs.size() - 1; i ++) { + assert(heat_pairs[i].heat_value >= heat_pairs[i+1].heat_value); + } + + data.emplace_back(level_it->second); + for (RangeHeatPair& heat_pair : heat_pairs) { + data.emplace_back(uint32_t(RATE_SIGNIFICANT_DIGITS_FACTOR * heat_pair.rate_in_segment)); + data.emplace_back(uint32_t(HOTNESS_SIGNIFICANT_DIGITS_FACTOR * heat_pair.heat_value)); + } + // std::cout << "[DEBUG] segment " << level_it->first << " data features num: " << data.size() << std::endl; + assert(data.size() >= 3 && data.size() % 2 == 1); + assert((range_it->second).size() * 2 + 1 == data.size()); + assert(data[0] > 0); datas.emplace_back(data); // add label row labels.emplace_back(label_it->second); + assert(label_it->second <= MAX_UNITS_NUM); // add get cnt row get_cnts.emplace_back(count_it->second); } @@ -405,15 +714,17 @@ void FilterCacheManager::try_retrain_model(std::map& level_r // check three vectors have same length assert(datas.size() == labels.size()); assert(get_cnts.size() == labels.size()); + std::cout << "[ALGO] stage 3: training labels size (exclude level 0): " << labels.size() << std::endl; clf_model_.make_train(datas, labels, get_cnts); train_signal_ = false; + + return true; } void FilterCacheManager::update_cache_and_heap(std::map& level_recorder, std::map>& segment_ranges_recorder) { - assert(level_recorder.size() == segment_ranges_recorder.size()); std::vector segment_ids; std::vector> datas; std::vector preds; @@ -421,9 +732,28 @@ void FilterCacheManager::update_cache_and_heap(std::map& lev std::map current_units_num_limit_recorder; std::vector buckets = heat_buckets_.buckets(); + // check whether level 0 segments exist? + // assert(level_recorder.size() == segment_ranges_recorder.size()); + // auto level_it_1 = level_recorder.begin(); + // auto range_it_1 = segment_ranges_recorder.begin(); + // while (level_it_1 != level_recorder.end() + // && range_it_1 != segment_ranges_recorder.end()) { + // assert(level_it_1->first == range_it_1->first); + // assert(level_it_1->second > 0); + // level_it_1++; + // range_it_1++; + // } + // check whether level 0 segments exist? 
+ // level_it_1 = level_recorder.begin(); + // while (level_it_1 != level_recorder.end()) { + // assert(level_it_1->second > 0); + // level_it_1++; + // } + // build data rows into datas auto level_it = level_recorder.begin(); auto range_it = segment_ranges_recorder.begin(); + assert(level_recorder.size() == segment_ranges_recorder.size()); while (level_it != level_recorder.end() && range_it != segment_ranges_recorder.end()) { if (level_it->first < range_it->first) { level_it ++; @@ -431,17 +761,39 @@ void FilterCacheManager::update_cache_and_heap(std::map& lev range_it ++; } else { assert(level_it->first == range_it->first); - - if (level_it->second > 0) { + assert(level_it->second > 0); + if (LIKELY(level_it->second > 0)) { + segment_ids.emplace_back(level_it->first); // add data row std::vector data; - std::sort((range_it->second).begin(), (range_it->second).end(), RangeRatePairGreatorComparor); - data.emplace_back(level_it->second); + std::vector heat_pairs; + // double rate_sum = 0; for (RangeRatePair& pair : range_it->second) { + // rate_sum += pair.rate_in_segment; + + RangeHeatPair heat_pair; assert(pair.range_id >= 0 && pair.range_id < buckets.size()); - data.emplace_back(uint32_t(RATE_SIGNIFICANT_DIGITS_FACTOR * pair.rate_in_segment)); - data.emplace_back(uint32_t(HOTNESS_SIGNIFICANT_DIGITS_FACTOR * buckets[pair.range_id].hotness_)); + heat_pair.rate_in_segment = pair.rate_in_segment; + heat_pair.heat_value = buckets[pair.range_id].hotness_; + heat_pairs.emplace_back(heat_pair); + } + // assert(rate_sum >= 0.98 && rate_sum <= 1.02); + assert(heat_pairs.size() == (range_it->second).size()); + + std::sort(heat_pairs.begin(), heat_pairs.end(), RangeHeatPairGreatorComparor); + for (size_t i = 0; i < heat_pairs.size() - 1; i ++) { + assert(heat_pairs[i].heat_value >= heat_pairs[i+1].heat_value); + } + + data.emplace_back(level_it->second); + for (RangeHeatPair& heat_pair : heat_pairs) { + data.emplace_back(uint32_t(RATE_SIGNIFICANT_DIGITS_FACTOR * heat_pair.rate_in_segment)); + data.emplace_back(uint32_t(HOTNESS_SIGNIFICANT_DIGITS_FACTOR * heat_pair.heat_value)); } + // std::cout << "[DEBUG] segment " << level_it->first << " data features num: " << data.size() << std::endl; + assert(data.size() >= 3 && data.size() % 2 == 1); + assert((range_it->second).size() * 2 + 1 == data.size()); + assert(data[0] > 0); datas.emplace_back(data); } @@ -454,11 +806,15 @@ void FilterCacheManager::update_cache_and_heap(std::map& lev clf_model_.make_predict(datas, preds); assert(segment_ids.size() == preds.size()); size_t idx = 0; + // std::cout << std::endl << "sync units num limit" << std::endl; while (idx < segment_ids.size() && idx < preds.size()) { segment_units_num_recorder.insert(std::make_pair(segment_ids[idx], preds[idx])); current_units_num_limit_recorder.insert(std::make_pair(segment_ids[idx], preds[idx])); + // std::cout << "segment id: " << segment_ids[idx] << ", units limit: " << preds[idx] << std::endl; idx = idx + 1; } + assert(segment_units_num_recorder.size() == current_units_num_limit_recorder.size()); + assert(segment_ids.size() == segment_units_num_recorder.size()); // update filter cache helper heaps heap_manager_.sync_units_num_limit(current_units_num_limit_recorder); @@ -466,14 +822,9 @@ void FilterCacheManager::update_cache_and_heap(std::map& lev // update filter cache std::set empty_level_0_segment_ids; // no level 0 segment in heaps and model data, dont worry std::set empty_failed_segment_ids; - filter_cache_.update_for_segments(segment_units_num_recorder, true, 
empty_level_0_segment_ids, empty_failed_segment_ids); -} + filter_cache_.enable_for_segments(segment_units_num_recorder, true, empty_level_0_segment_ids, empty_failed_segment_ids); + assert(empty_failed_segment_ids.empty()); -void FilterCacheManager::remove_segments(std::vector& segment_ids, std::set& level_0_segment_ids) { - // update filter cache helper heaps - heap_manager_.batch_delete(segment_ids); - // update filter cache map - filter_cache_.release_for_segments(segment_ids, level_0_segment_ids); } bool FilterCacheManager::adjust_cache_and_heap() { @@ -500,14 +851,16 @@ bool FilterCacheManager::adjust_cache_and_heap() { std::set empty_failed_segment_ids; // force to update segments' filter units group, so dont worry for cache space segment_units_num_recorder.insert(std::make_pair(result.enable_segment_id, result.enable_segment_next_units_num)); segment_units_num_recorder.insert(std::make_pair(result.disable_segment_id, result.disable_segment_next_units_num)); - filter_cache_.update_for_segments(segment_units_num_recorder, true, empty_level_0_segment_ids, empty_failed_segment_ids); - } + filter_cache_.enable_for_segments(segment_units_num_recorder, true, empty_level_0_segment_ids, empty_failed_segment_ids); + assert(empty_failed_segment_ids.empty()); + } + // std::this_thread::sleep_for(std::chrono::milliseconds(10)); return can_adjust; } void FilterCacheManager::insert_segments(std::vector& merged_segment_ids, std::vector& new_segment_ids, std::map>& inherit_infos_recorder, - std::map& level_recorder, const uint32_t& level_0_base_count, + std::map& new_level_recorder, const uint32_t& level_0_base_count, std::map>& segment_ranges_recorder) { std::unordered_map segment_units_num_recorder; std::map approximate_counts_recorder; @@ -518,50 +871,111 @@ void FilterCacheManager::insert_segments(std::vector& merged_segment_i std::sort(merged_segment_ids.begin(), merged_segment_ids.end()); std::sort(new_segment_ids.begin(), new_segment_ids.end()); - // pick up merged or new level 0 segments - // assume level_recorder keys set equals to merged_segment_ids + new_segment_ids - assert(new_segment_ids.size() == 0 || merged_segment_ids.size() + new_segment_ids.size() == level_recorder.size()); - auto level_it = level_recorder.begin(); - size_t merged_idx = 0, new_idx = 0; - while (level_it != level_recorder.end()) { - if (merged_idx < merged_segment_ids.size() && level_it->first == merged_segment_ids[merged_idx]) { - if (level_it->second == 0) { - old_level_0_segment_ids.insert(level_it->first); - } - merged_idx ++; - } else if (new_idx < new_segment_ids.size() && level_it->first == new_segment_ids[new_idx]) { - if (level_it->second == 0) { - new_level_0_segment_ids.insert(level_it->first); - segment_units_num_recorder.insert(std::make_pair(level_it->first, MAX_UNITS_NUM)); - } else { - // not a level 0 segment, set default units num - segment_units_num_recorder.insert(std::make_pair(level_it->first, DEFAULT_UNITS_NUM)); - } - new_idx ++; - } - level_it ++; + assert(new_segment_ids.size() == new_level_recorder.size()); + assert(new_segment_ids.size() == segment_ranges_recorder.size()); + assert(new_segment_ids.size() >= inherit_infos_recorder.size()); + // uint32_t new_l0_count = 0, new_non_l0_count = 0; + // size_t cached_l0_count = cached_level_0_segment_ids_.size(); + assert(DEFAULT_UNITS_NUM <= MAX_UNITS_NUM && DEFAULT_UNITS_NUM >= MIN_UNITS_NUM); + for (auto& item : new_level_recorder) { + auto segment_id = item.first; + auto level = item.second; + if (level == 0) { + 
new_level_0_segment_ids.insert(segment_id); + cached_level_0_segment_ids_.insert(segment_id); // update current cached level 0 segments + segment_units_num_recorder.insert(std::make_pair(segment_id, MAX_UNITS_NUM)); + // new_l0_count++; + } else { + segment_units_num_recorder.insert(std::make_pair(segment_id, DEFAULT_UNITS_NUM)); + // new_non_l0_count++; + } + } + // cached_l0_count += new_l0_count; + // assert(new_l0_count + new_non_l0_count == new_segment_ids.size()); + assert(segment_units_num_recorder.size() == new_segment_ids.size()); + + // // print new segment ids + // std::cout << std::endl; + // std::cout << "new level-0 segment id: "; + // for (uint32_t segment_id : new_level_0_segment_ids) { + // std::cout << segment_id << " "; + // } + // std::cout << std::endl; + // std::cout << "new non level-0 segment id: "; + // for (uint32_t segment_id : new_segment_ids) { + // if (new_level_0_segment_ids.count(segment_id) == 0) { + // std::cout << segment_id << " "; + // } + // } + // std::cout << std::endl; + + // collect old segments id on level 0 + // uint32_t old_l0_count = 0, old_non_l0_count = 0; + // std::cout << "merged segment ids: " << std::endl; + for (uint32_t& merged_segment_id : merged_segment_ids) { + if (cached_level_0_segment_ids_.count(merged_segment_id) > 0) { + old_level_0_segment_ids.insert(merged_segment_id); + cached_level_0_segment_ids_.erase(merged_segment_id); + // old_l0_count++; + } else { + // old_non_l0_count++; + // std::cout << merged_segment_id << " "; + } } + // std::cout << std::endl; + + // cached_l0_count -= old_l0_count; + // assert(cached_l0_count == cached_level_0_segment_ids_.size()); + // assert(old_l0_count + old_non_l0_count == merged_segment_ids.size()); + + // // print merged segment ids + // std::cout << "merged level-0 segment id: "; + // for (uint32_t segment_id : old_level_0_segment_ids) { + // std::cout << segment_id << " "; + // } + // std::cout << std::endl; + // std::cout << "merged non level-0 segment id: "; + // for (uint32_t segment_id : merged_segment_ids) { + // if (old_level_0_segment_ids.count(segment_id) == 0) { + // std::cout << segment_id << " "; + // } + // } + // std::cout << std::endl; if (!is_ready_) { // if is_ready_ is false, no need to enable two-heaps adjustment, remember to update is_ready_ in the end // remove merged segments' units in filter cache and nodes in filter heaps - heap_manager_.batch_delete(merged_segment_ids); + std::vector merged_segment_ids_except_l0; + for (uint32_t &segment_id : merged_segment_ids) { + if (old_level_0_segment_ids.count(segment_id) == 0) { + merged_segment_ids_except_l0.emplace_back(segment_id); + } + } + heap_manager_.batch_delete(merged_segment_ids_except_l0); + // std::cout << merged_segment_ids_except_l0.size() << " " << merged_segment_ids.size() << std::endl; filter_cache_.release_for_segments(merged_segment_ids, old_level_0_segment_ids); // inherit merged segments' counts to new segments' counts // ensure that new segments that are not in inherit_infos_recorder keys set are only level 0 segments + // this function will remove moved segments from last_count_recorder_ and current_count_recorder_ inherit_count_recorder(merged_segment_ids, new_segment_ids, level_0_base_count, inherit_infos_recorder); - estimate_counts_for_all(approximate_counts_recorder); + + std::vector needed_segment_ids; + for (uint32_t segment_id : new_segment_ids) { + needed_segment_ids.emplace_back(segment_id); + } + estimate_recent_counts(approximate_counts_recorder, needed_segment_ids); + 
assert(approximate_counts_recorder.size() > 0); // insert units into filter cache filter_cache_.enable_for_segments(segment_units_num_recorder, false, new_level_0_segment_ids, failed_segment_ids); // insert nodes into filter heaps for (uint32_t& new_segment_id : new_segment_ids) { - if (new_level_0_segment_ids.count(new_segment_id)) { + if (new_level_0_segment_ids.count(new_segment_id) > 0) { // no need to insert level 0 segment nodes into heap continue; - } else if (failed_segment_ids.count(new_segment_id)) { + } else if (failed_segment_ids.count(new_segment_id) > 0) { // failed to insert filter units uint16_t units_num = segment_units_num_recorder[new_segment_id]; new_segment_items.emplace_back(FilterCacheHeapItem(new_segment_id, approximate_counts_recorder[new_segment_id], @@ -573,6 +987,7 @@ void FilterCacheManager::insert_segments(std::vector& merged_segment_i units_num, 0, units_num)); } } + assert(new_segment_items.size() + new_level_0_segment_ids.size() == new_segment_ids.size()); heap_manager_.batch_upsert(new_segment_items); // remember to update is_ready_ @@ -582,32 +997,70 @@ void FilterCacheManager::insert_segments(std::vector& merged_segment_i } else { // is_ready_ is true, then we will not update is_ready_, that means is_ready_ will be always true // remove merged segments' units in filter cache and nodes in filter heaps - heap_manager_.batch_delete(merged_segment_ids); + std::vector merged_segment_ids_except_l0; + for (uint32_t &segment_id : merged_segment_ids) { + if (old_level_0_segment_ids.count(segment_id) == 0) { + merged_segment_ids_except_l0.emplace_back(segment_id); + } + } + heap_manager_.batch_delete(merged_segment_ids_except_l0); + // std::cout << merged_segment_ids_except_l0.size() << " " << merged_segment_ids.size() << std::endl; filter_cache_.release_for_segments(merged_segment_ids, old_level_0_segment_ids); // inherit merged segments' counts to new segments' counts // ensure that new segments that are not in inherit_infos_recorder keys set are only level 0 segments + // this function will remove moved segments from last_count_recorder_ and current_count_recorder_ inherit_count_recorder(merged_segment_ids, new_segment_ids, level_0_base_count, inherit_infos_recorder); - estimate_counts_for_all(approximate_counts_recorder); + + std::vector needed_segment_ids; + for (uint32_t segment_id : new_segment_ids) { + needed_segment_ids.emplace_back(segment_id); + } + estimate_recent_counts(approximate_counts_recorder, needed_segment_ids); + assert(approximate_counts_recorder.size() > 0); // predict units num for new non level 0 segments and update segment_units_num_recorder std::vector> pred_datas; std::vector pred_segment_ids; std::vector pred_results; for (uint32_t& new_segment_id : new_segment_ids) { - if (new_level_0_segment_ids.count(new_segment_id)) { + if (new_level_0_segment_ids.count(new_segment_id) > 0) { // no need to predict for level 0 segments continue; } else { pred_segment_ids.emplace_back(new_segment_id); + auto range_it = segment_ranges_recorder.find(new_segment_id); + assert(range_it != segment_ranges_recorder.end()); std::vector pred_data; - pred_data.emplace_back(level_recorder[new_segment_id]); - for (RangeRatePair& pair : segment_ranges_recorder[new_segment_id]) { + std::vector heat_pairs; + // double rate_sum = 0; + for (RangeRatePair& pair : range_it->second) { + // rate_sum += pair.rate_in_segment; + + RangeHeatPair heat_pair; assert(pair.range_id >= 0 && pair.range_id < buckets.size()); - 
pred_data.emplace_back(uint32_t(RATE_SIGNIFICANT_DIGITS_FACTOR * pair.rate_in_segment)); - pred_data.emplace_back(uint32_t(HOTNESS_SIGNIFICANT_DIGITS_FACTOR * buckets[pair.range_id].hotness_)); + heat_pair.rate_in_segment = pair.rate_in_segment; + heat_pair.heat_value = buckets[pair.range_id].hotness_; + heat_pairs.emplace_back(heat_pair); + } + // assert(rate_sum >= 0.98 && rate_sum <= 1.02); + assert(heat_pairs.size() == (range_it->second).size()); + + std::sort(heat_pairs.begin(), heat_pairs.end(), RangeHeatPairGreatorComparor); + for (size_t i = 0; i < heat_pairs.size() - 1; i ++) { + assert(heat_pairs[i].heat_value >= heat_pairs[i+1].heat_value); + } + + pred_data.emplace_back(new_level_recorder[new_segment_id]); + for (RangeHeatPair& heat_pair : heat_pairs) { + pred_data.emplace_back(uint32_t(RATE_SIGNIFICANT_DIGITS_FACTOR * heat_pair.rate_in_segment)); + pred_data.emplace_back(uint32_t(HOTNESS_SIGNIFICANT_DIGITS_FACTOR * heat_pair.heat_value)); } + // std::cout << "[DEBUG] segment " << level_it->first << " data features num: " << data.size() << std::endl; + assert(pred_data.size() >= 3 && pred_data.size() % 2 == 1); + assert((range_it->second).size() * 2 + 1 == pred_data.size()); + assert(pred_data[0] > 0); pred_datas.emplace_back(pred_data); } } @@ -617,18 +1070,22 @@ void FilterCacheManager::insert_segments(std::vector& merged_segment_i size_t pred_idx = 0; while (pred_idx < pred_segment_ids.size() && pred_idx < pred_results.size()) { segment_units_num_recorder[pred_segment_ids[pred_idx]] = pred_results[pred_idx]; + assert(new_level_0_segment_ids.count(pred_segment_ids[pred_idx]) == 0); + assert(pred_results[pred_idx] >= MIN_UNITS_NUM && pred_results[pred_idx] <= MAX_UNITS_NUM); pred_idx = pred_idx + 1; } + // std::cout << "insert predict " << pred_results.size() << " segments" << std::endl; + assert(pred_results.size() + new_level_0_segment_ids.size() == new_segment_ids.size()); // insert units into filter cache filter_cache_.enable_for_segments(segment_units_num_recorder, false, new_level_0_segment_ids, failed_segment_ids); // insert nodes into filter heaps for (uint32_t& new_segment_id : new_segment_ids) { - if (new_level_0_segment_ids.count(new_segment_id)) { + if (new_level_0_segment_ids.count(new_segment_id) > 0) { // no need to insert level 0 segment nodes into heap continue; - } else if (failed_segment_ids.count(new_segment_id)) { + } else if (failed_segment_ids.count(new_segment_id) > 0) { // failed to insert filter units uint16_t units_num = segment_units_num_recorder[new_segment_id]; new_segment_items.emplace_back(FilterCacheHeapItem(new_segment_id, approximate_counts_recorder[new_segment_id], @@ -640,27 +1097,25 @@ void FilterCacheManager::insert_segments(std::vector& merged_segment_i units_num, 0, units_num)); } } + assert(new_segment_items.size() + new_level_0_segment_ids.size() == new_segment_ids.size()); heap_manager_.batch_upsert(new_segment_items); } } -void FilterCacheManager::delete_segments(std::vector& merged_segment_ids, std::map& level_recorder) { +void FilterCacheManager::delete_segments(std::vector& merged_segment_ids) { + assert(false); + exit(1); std::set old_level_0_segment_ids; - std::sort(merged_segment_ids.begin(), merged_segment_ids.end()); - // level_recorder is a copy of global level_recorder - assert(merged_segment_ids.size() == level_recorder.size()); - auto level_it = level_recorder.begin(); - size_t merged_idx = 0; - while (level_it != level_recorder.end()) { - assert(merged_idx < merged_segment_ids.size() && level_it->first == 
merged_segment_ids[merged_idx]); - if (merged_idx < merged_segment_ids.size() && level_it->first == merged_segment_ids[merged_idx]) { - if (level_it->second == 0) { - old_level_0_segment_ids.insert(level_it->first); - } - merged_idx ++; + // collect old segments id on level 0 + for (uint32_t& merged_segment_id : merged_segment_ids) { + if (cached_level_0_segment_ids_.count(merged_segment_id)) { + old_level_0_segment_ids.insert(merged_segment_id); + cached_level_0_segment_ids_.erase(merged_segment_id); } - level_it ++; + // remove merged segments' count + last_count_recorder_.erase(merged_segment_id); + current_count_recorder_.erase(merged_segment_id); } if (!is_ready_) { @@ -685,6 +1140,8 @@ void FilterCacheManager::move_segments(std::vector& moved_segment_ids, std::map& old_level_recorder, std::map& move_level_recorder, std::map>& move_segment_ranges_recorder) { + assert(false); + exit(1); std::unordered_map segment_units_num_recorder; std::map approximate_counts_recorder; std::vector new_segment_items; @@ -698,13 +1155,10 @@ void FilterCacheManager::move_segments(std::vector& moved_segment_ids, assert(moved_segment_ids.size() == move_level_recorder.size()); assert(moved_segment_ids.size() == move_segment_ranges_recorder.size()); auto level_it = old_level_recorder.begin(); - size_t moved_idx = 0, new_idx = 0; + size_t moved_idx = 0; while (level_it != old_level_recorder.end()) { assert(moved_idx < moved_segment_ids.size() && level_it->first == moved_segment_ids[moved_idx]); if (moved_idx < moved_segment_ids.size() && level_it->first == moved_segment_ids[moved_idx]) { - if (level_it->second == 0) { - old_level_0_segment_ids.insert(level_it->first); - } segment_units_num_recorder.insert(std::make_pair(level_it->first, DEFAULT_UNITS_NUM)); // actually, we cannot move segments to level 0 in trivial move compaction (only flushing do this). 
moved_idx ++; @@ -712,27 +1166,39 @@ void FilterCacheManager::move_segments(std::vector& moved_segment_ids, level_it ++; } + // collect old segments id on level 0 + for (uint32_t moved_segment_id : moved_segment_ids) { + if (cached_level_0_segment_ids_.count(moved_segment_id)) { + old_level_0_segment_ids.insert(moved_segment_id); + cached_level_0_segment_ids_.erase(moved_segment_id); + } + } + if (!is_ready_) { // firstly, delete moved segments heap_manager_.batch_delete(moved_segment_ids); - filter_cache_.release_for_segments(moved_segment_ids, old_level_0_segment_ids); // inherit these segments' count - for (uint32_t& segment_id : moved_segment_ids) { - auto last_it = last_count_recorder_.find(segment_id); - auto current_it = current_count_recorder_.find(segment_id); - if (last_it != last_count_recorder_.end()) { - last_it->second = INHERIT_REMAIN_FACTOR * (last_it->second); - } - if (current_it != current_count_recorder_.end()) { - current_it->second = INHERIT_REMAIN_FACTOR * (current_it->second); - } + // for (uint32_t& segment_id : moved_segment_ids) { + // auto last_it = last_count_recorder_.find(segment_id); + // auto current_it = current_count_recorder_.find(segment_id); + // if (last_it != last_count_recorder_.end()) { + // last_it->second = INHERIT_REMAIN_FACTOR * (last_it->second); + // } + // if (current_it != current_count_recorder_.end()) { + // current_it->second = INHERIT_REMAIN_FACTOR * (current_it->second); + // } + // } + std::vector needed_segment_ids; + for (uint32_t segment_id : moved_segment_ids) { + needed_segment_ids.emplace_back(segment_id); } - estimate_counts_for_all(approximate_counts_recorder); + estimate_recent_counts(approximate_counts_recorder, needed_segment_ids); + assert(approximate_counts_recorder.size() > 0); - // insert units into filter cache - std::set empty_new_level_0_segment_ids, empty_failed_segment_ids; - filter_cache_.enable_for_segments(segment_units_num_recorder, true, empty_new_level_0_segment_ids, empty_failed_segment_ids); + // modify units into filter cache + std::set empty_failed_segment_ids; + filter_cache_.update_for_segments(segment_units_num_recorder, old_level_0_segment_ids, empty_failed_segment_ids); // insert nodes into filter heaps for (uint32_t& segment_id : moved_segment_ids) { @@ -750,36 +1216,65 @@ void FilterCacheManager::move_segments(std::vector& moved_segment_ids, } else { // firstly, delete moved segments heap_manager_.batch_delete(moved_segment_ids); - filter_cache_.release_for_segments(moved_segment_ids, old_level_0_segment_ids); // inherit these segments' count - for (uint32_t& segment_id : moved_segment_ids) { - auto last_it = last_count_recorder_.find(segment_id); - auto current_it = current_count_recorder_.find(segment_id); - if (last_it != last_count_recorder_.end()) { - last_it->second = INHERIT_REMAIN_FACTOR * (last_it->second); - } - if (current_it != current_count_recorder_.end()) { - current_it->second = INHERIT_REMAIN_FACTOR * (current_it->second); - } + // for (uint32_t& segment_id : moved_segment_ids) { + // auto last_it = last_count_recorder_.find(segment_id); + // auto current_it = current_count_recorder_.find(segment_id); + // if (last_it != last_count_recorder_.end()) { + // last_it->second = INHERIT_REMAIN_FACTOR * (last_it->second); + // } + // if (current_it != current_count_recorder_.end()) { + // current_it->second = INHERIT_REMAIN_FACTOR * (current_it->second); + // } + // } + std::vector needed_segment_ids; + for (uint32_t segment_id : moved_segment_ids) { + 
needed_segment_ids.emplace_back(segment_id); } - estimate_counts_for_all(approximate_counts_recorder); + estimate_recent_counts(approximate_counts_recorder, needed_segment_ids); + assert(approximate_counts_recorder.size() > 0); // predict units num for new non level 0 segments and update segment_units_num_recorder std::vector> pred_datas; std::vector pred_segment_ids; std::vector pred_results; - for (uint32_t& segment_id : moved_segment_ids) { - assert(move_level_recorder[segment_id] > 0); - pred_segment_ids.emplace_back(segment_id); + for (uint32_t moved_segment_id : moved_segment_ids) { + assert(move_level_recorder[moved_segment_id] > 0); + pred_segment_ids.emplace_back(moved_segment_id); + auto range_it = move_segment_ranges_recorder.find(moved_segment_id); + assert(range_it != move_segment_ranges_recorder.end()); std::vector pred_data; - pred_data.emplace_back(move_level_recorder[segment_id]); - for (RangeRatePair& pair : move_segment_ranges_recorder[segment_id]) { + std::vector heat_pairs; + double rate_sum = 0; + for (RangeRatePair& pair : range_it->second) { + rate_sum += pair.rate_in_segment; + + RangeHeatPair heat_pair; assert(pair.range_id >= 0 && pair.range_id < buckets.size()); - pred_data.emplace_back(uint32_t(RATE_SIGNIFICANT_DIGITS_FACTOR * pair.rate_in_segment)); - pred_data.emplace_back(uint32_t(HOTNESS_SIGNIFICANT_DIGITS_FACTOR * buckets[pair.range_id].hotness_)); + heat_pair.rate_in_segment = pair.rate_in_segment; + heat_pair.heat_value = buckets[pair.range_id].hotness_; + heat_pairs.emplace_back(heat_pair); + } + assert(rate_sum >= 0.98 && rate_sum <= 1.02); + assert(heat_pairs.size() == (range_it->second).size()); + + std::sort(heat_pairs.begin(), heat_pairs.end(), RangeHeatPairGreatorComparor); + for (size_t i = 0; i < heat_pairs.size() - 1; i ++) { + assert(heat_pairs[i].heat_value >= heat_pairs[i+1].heat_value); } + + pred_data.emplace_back(move_level_recorder[moved_segment_id]); + for (RangeHeatPair& heat_pair : heat_pairs) { + pred_data.emplace_back(uint32_t(RATE_SIGNIFICANT_DIGITS_FACTOR * heat_pair.rate_in_segment)); + pred_data.emplace_back(uint32_t(HOTNESS_SIGNIFICANT_DIGITS_FACTOR * heat_pair.heat_value)); + } + // std::cout << "[DEBUG] segment " << level_it->first << " data features num: " << data.size() << std::endl; + assert(pred_data.size() >= 3 && pred_data.size() % 2 == 1); + assert((range_it->second).size() * 2 + 1 == pred_data.size()); + assert(pred_data[0] > 0); + pred_datas.emplace_back(pred_data); } assert(pred_datas.size() == pred_segment_ids.size()); clf_model_.make_predict(pred_datas, pred_results); @@ -789,13 +1284,14 @@ void FilterCacheManager::move_segments(std::vector& moved_segment_ids, segment_units_num_recorder[pred_segment_ids[pred_idx]] = pred_results[pred_idx]; pred_idx = pred_idx + 1; } + assert(pred_results.size() == moved_segment_ids.size()); - // insert units into filter cache - std::set empty_new_level_0_segment_ids, empty_failed_segment_ids; - filter_cache_.enable_for_segments(segment_units_num_recorder, true, empty_new_level_0_segment_ids, empty_failed_segment_ids); + // modify units into filter cache + std::set empty_failed_segment_ids; + filter_cache_.update_for_segments(segment_units_num_recorder, old_level_0_segment_ids, empty_failed_segment_ids); // insert nodes into filter heaps - for (uint32_t& segment_id : moved_segment_ids) { + for (uint32_t segment_id : moved_segment_ids) { assert(move_level_recorder[segment_id] > 0); uint16_t units_num = segment_units_num_recorder[segment_id]; 
 new_segment_items.emplace_back(FilterCacheHeapItem(segment_id, approximate_counts_recorder[segment_id],
@@ -805,4 +1301,65 @@ void FilterCacheManager::move_segments(std::vector& moved_segment_ids,
     }
 }
+
+  const char* FilterCache::Name() const { return "FilterCache"; }
+
+  // overrides rocksdb::Cache but does nothing
+  Status FilterCache::Insert(const Slice& key, void* value, size_t charge,
+                             void (*deleter)(const Slice& key, void* value),
+                             Handle** handle,
+                             Priority priority) {
+      assert(false);
+      return Status::OK();
+  }
+
+  // overrides rocksdb::Cache but does nothing
+  Cache::Handle* FilterCache::Lookup(const Slice& key, Statistics* stats) {
+      assert(false);
+      return nullptr;
+  }
+
+  // overrides rocksdb::Cache but does nothing
+  bool FilterCache::Ref(Handle* handle) { return false; }
+
+  // used by CachableEntry
+  bool FilterCache::Release(Cache::Handle* handle, bool force_erase) { return false; }
+
+  // overrides rocksdb::Cache but does nothing
+  void* FilterCache::Value(Cache::Handle* handle) { assert(false); return nullptr; }
+
+  // overrides rocksdb::Cache but does nothing
+  void FilterCache::Erase(const Slice& key) { assert(false); }
+  // overrides rocksdb::Cache but does nothing
+  uint64_t FilterCache::NewId() { assert(false); return 0; }
+
+  // overrides rocksdb::Cache but does nothing
+  void FilterCache::SetCapacity(size_t capacity) { assert(false); }
+
+  // overrides rocksdb::Cache but does nothing
+  void FilterCache::SetStrictCapacityLimit(bool strict_capacity_limit) { assert(false); }
+
+  // overrides rocksdb::Cache but does nothing
+  bool FilterCache::HasStrictCapacityLimit() const { assert(false); return false; }
+
+  // overrides rocksdb::Cache but does nothing
+  size_t FilterCache::GetCapacity() const { assert(false); return 0; }
+
+  // overrides rocksdb::Cache but does nothing
+  size_t FilterCache::GetUsage() const { assert(false); return 0; }
+
+  // overrides rocksdb::Cache but does nothing
+  size_t FilterCache::GetUsage(Handle* handle) const { assert(false); return 0; }
+
+  // overrides rocksdb::Cache but does nothing
+  size_t FilterCache::GetPinnedUsage() const { assert(false); return 0; }
+
+  // overrides rocksdb::Cache but does nothing
+  size_t FilterCache::GetCharge(Handle* handle) const { assert(false); return 0; }
+
+  // overrides rocksdb::Cache but does nothing
+  void FilterCache::ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                                           bool thread_safe) { assert(false); }
+
+  // overrides rocksdb::Cache but does nothing
+  void FilterCache::EraseUnRefEntries() { assert(false); }
 }
\ No newline at end of file
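Of the overrides above, only Ref() and Release() return quietly instead of trapping: the CachableEntry objects handed out by get_filter_blocks() release their handle on destruction, so those two calls are reachable at runtime while everything else asserts. A small sketch of that contract (the function and parameter names here are illustrative, not part of the patch):

    #include <cassert>

    // Reachable surface of the Cache shim:
    //   Ref(h)                  -> false  (no reference counting here)
    //   Release(h, force_erase) -> false  (never erases; filter units are
    //                                      owned and evicted by FilterCache)
    // Every other override asserts in debug builds, so any unexpected caller
    // of the generic Cache interface is caught immediately, and is a no-op
    // in release builds.
    void sanity_check_shim(FilterCache& shim, Cache::Handle* handle) {
        assert(!shim.Ref(handle));
        assert(!shim.Release(handle, /*force_erase=*/false));
    }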
diff --git a/db/art/filter_cache.h b/db/art/filter_cache.h
index 5578e58f7..45ab8d5a2 100644
--- a/db/art/filter_cache.h
+++ b/db/art/filter_cache.h
@@ -1,5 +1,7 @@
 #pragma once
+#include
+#include
 #include
 #include
 #include
@@ -8,17 +10,21 @@
 #include
 #include
 #include
+#include "db/art/filter_cache_entry.h"
+#include "db/version_edit.h"
 #include "macros.h"
 #include "greedy_algo.h"
 #include "clf_model.h"
 #include "heat_buckets.h"
 #include "filter_cache_heap.h"
-#include "filter_cache_item.h"
+#include "rocksdb/cache.h"
+#include "table/block_based/cachable_entry.h"
+#include "table/block_based/parsed_full_filter_block.h"

 namespace ROCKSDB_NAMESPACE {

-class FilterCache; class FilterCacheManager;
+class FilterCacheManager;
+class BlockBasedTable;

 // FilterCache main component is a STL Map, key -- segment id, value -- Structure of Filter Units ( called FilterCacheItem)
 // its main job is auto enable/disable filter units for one segment, and check whether one key exists in enabled units
@@ -28,9 +34,9 @@ class FilterCacheManager;
 // 3. check whether filter cache is approximately full
 // 4. check whether ready to train first model
 // 5. release FilterCacheItem of these merged (outdated) segments
-class FilterCache {
+class FilterCache : public Cache {
   private:
-    std::map filter_cache_;
+    std::map filter_cache_;
     uint32_t used_space_size_;
     uint32_t level_0_used_space_size_;
     uint32_t cache_size_; // max size of cache
@@ -39,24 +45,29 @@ class FilterCache {
   public:
     FilterCache() { filter_cache_.clear(); cache_size_ = CACHE_SPACE_SIZE; used_space_size_ = 0; level_0_used_space_size_ = 0; }
-    ~FilterCache() { /* do nothing */ }
+    ~FilterCache() override { /* do nothing */ }

     // other levels total cache size
-    uint32_t cache_size_except_level_0() { return cache_size_ * FULL_RATE - level_0_used_space_size_; }
+    // assume level 0 segments' filter never use filter cache space
+    uint32_t cache_size_except_level_0() { return cache_size_ * FULL_RATE; }
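With level-0 filters pinned and accounted in level_0_used_space_size_, the budget exposed to the greedy solver is now a constant fraction of the cache rather than a moving target. Numerically, with illustrative sizes (not the configured CACHE_SPACE_SIZE / FULL_RATE):

    #include <cstdint>

    // Old: budget = cache_size_ * FULL_RATE - level_0_used_space_size_
    //      (shrinks every time a flush adds level-0 filters)
    // New: budget = cache_size_ * FULL_RATE   (constant)
    // e.g. a 128 MiB filter cache with FULL_RATE = 0.95 always hands the
    // solver 0.95 * 128 MiB, even while level-0 usage fluctuates.
    uint64_t solver_budget(uint64_t cache_size_bytes, double full_rate) {
        return uint64_t(cache_size_bytes * full_rate);
    }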
-    // check whether one given key exist in one segment
-    bool check_key(const uint32_t& segment_id, const std::string& key);
+    // get all cached filter blocks of one segment
+    std::vector> get_filter_blocks(const uint32_t segment_id);

     // enable / disable units for a batch of segments (one segment may not exist in FilterCache)
     // if enabled units num exceed given units num, it will disable units
     void enable_for_segments(std::unordered_map& segment_units_num_recorder, const bool& is_forced,
-                             std::set& level_0_segment_ids, std::set& failed_segment_ids);
+                             std::set& new_level_0_segment_ids, std::set& failed_segment_ids);

     // the only difference from enable_for_segments is:
-    // this func dont insert any filter units for segments that dont exist in cache, but enable_for_segments unc does
-    void update_for_segments(std::unordered_map& segment_units_num_recorder, const bool& is_forced,
-                             std::set& level_0_segment_ids, std::set& failed_segment_ids);
-
+    // this is designed for moved compaction
+    // if one segment moved from L0 to L1, we update L0 cached filter usage and filter cache usage
+    // we do not remove filter handle of these segments, because these segments' ids are still valid
+    void update_for_segments(std::unordered_map& segment_units_num_recorder,
+                             std::set& old_level_0_segment_ids, std::set& failed_segment_ids);
+
+    // should be called in/after compaction, before any filter adjustment operation that may affect given segment
+    void init_segment(uint32_t segment_id, const BlockBasedTable* table, const std::vector& block_handles);

     // check whether filter cache is approximately full
     // actually, we will leave (1-FULL_RATE) * cache_size_ space for emergency usage
     bool is_full();
@@ -65,7 +76,75 @@
     bool is_ready();

     // release filter units of merged segments
-    void release_for_segments(std::vector& segment_ids, std::set& level_0_segment_ids);
+    void release_for_segments(std::vector& segment_ids, std::set& old_level_0_segment_ids);
+
+    // The type of the Cache
+    virtual const char* Name() const override;
+
+    // overrides rocksdb::Cache but does nothing
+    Status Insert(const Slice& key, void* value, size_t charge,
+                  void (*deleter)(const Slice& key, void* value),
+                  Handle** handle = nullptr,
+                  Priority priority = Priority::LOW) override;
+
+    // overrides rocksdb::Cache but does nothing
+    Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override;
+
+    // overrides rocksdb::Cache but does nothing
+    bool Ref(Handle* handle) override;
+
+    /**
+     * Release a mapping returned by a previous Lookup(). A released entry might
+     * still remain in cache in case it is later looked up by others. If
+     * force_erase is set then it also erase it from the cache if there is no
+     * other reference to it. Erasing it should call the deleter function that
+     * was provided when the
+     * entry was inserted.
+     *
+     * Returns true if the entry was also erased.
+     */
+    // REQUIRES: handle must not have been released yet.
+    // REQUIRES: handle must have been returned by a method on *this.
+    bool Release(Handle* handle, bool force_erase = false) override;
+
+    // overrides rocksdb::Cache but does nothing
+    void* Value(Handle* handle) override;

+    // overrides rocksdb::Cache but does nothing
+    void Erase(const Slice& key) override;
+    // overrides rocksdb::Cache but does nothing
+    uint64_t NewId() override;
+
+    // overrides rocksdb::Cache but does nothing
+    void SetCapacity(size_t capacity) override;
+
+    // overrides rocksdb::Cache but does nothing
+    void SetStrictCapacityLimit(bool strict_capacity_limit) override;
+
+    // overrides rocksdb::Cache but does nothing
+    bool HasStrictCapacityLimit() const override;
+
+    // overrides rocksdb::Cache but does nothing
+    size_t GetCapacity() const override;
+
+    // overrides rocksdb::Cache but does nothing
+    size_t GetUsage() const override;
+
+    // overrides rocksdb::Cache but does nothing
+    size_t GetUsage(Handle* handle) const override;
+
+    // overrides rocksdb::Cache but does nothing
+    size_t GetPinnedUsage() const override;
+
+    // overrides rocksdb::Cache but does nothing
+    size_t GetCharge(Handle* handle) const override;
+
+    // overrides rocksdb::Cache but does nothing
+    void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                                bool thread_safe) override;
+
+    // overrides rocksdb::Cache but does nothing
+    void EraseUnRefEntries() override;
 };
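The class above only stores and hands back filter partitions; the membership test itself moves to the caller. A sketch of how a read path might consult a segment's enabled units, assuming get_filter_blocks() returns std::vector<CachableEntry<ParsedFullFilterBlock>> (consistent with the new cachable_entry.h and parsed_full_filter_block.h includes) and that, as in ElasticBF-style multi-unit filters, a key counts as possibly present only if every enabled unit may-match:

    // A key can be declared absent as soon as ANY enabled unit rejects it.
    bool segment_may_contain(FilterCacheManager& mgr, uint32_t segment_id,
                             const Slice& user_key) {
        auto blocks = mgr.get_filter_blocks(segment_id);
        if (blocks.empty()) return true;  // nothing cached: cannot rule the key out
        for (auto& entry : blocks) {
            FilterBitsReader* reader = entry.GetValue()->filter_bits_reader();
            if (!reader->MayMatch(user_key)) return false;  // definitely absent
        }
        return true;
    }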
// FilterCacheManager is combined of these components: @@ -87,25 +166,29 @@ class FilterCacheManager { private: // TODO: mutex can be optimized or use a message queue or a thread pool to reduce the time cost of the mutex - static FilterCache filter_cache_; - static HeatBuckets heat_buckets_; - static ClfModel clf_model_; - static GreedyAlgo greedy_algo_; - static FilterCacheHeapManager heap_manager_; - static uint32_t get_cnt_; // record get cnt in current period, when exceeding PERIOD_COUNT, start next period - static uint32_t period_cnt_; // record period cnt, if period_cnt_ - last_train_period_ >= TRAIN_PERIODS, start to evaluate or retrain ClfModel - static uint32_t last_long_period_; // record last short period cnt of last long period - static uint32_t last_short_period_; // helper var for update job when one short period ends - static std::mutex update_mutex_; // guarantee counts records only updated once - static bool train_signal_; // if true, try to retrain model. we call one background thread to monitor this flag and retrain - static std::map last_count_recorder_; // get cnt recorder of segments in last long period - static std::map current_count_recorder_; // get cnt recorder of segments in current long period - static std::mutex count_mutex_; // guarentee last_count_recorder and current_count_recorder treated orderedly - static bool is_ready_; // check whether ready to use adaptive filter assignment + std::set cached_level_0_segment_ids_; + FilterCache filter_cache_; + HeatBuckets heat_buckets_; + ClfModel clf_model_; + GreedyAlgo greedy_algo_; + FilterCacheHeapManager heap_manager_; + uint32_t get_cnt_; // record get cnt in current period, when exceeding PERIOD_COUNT, start next period + uint32_t period_cnt_; // record period cnt, if period_cnt_ - last_train_period_ >= TRAIN_PERIODS, start to evaluate or retrain ClfModel + uint32_t last_long_period_; // record last short period cnt of last long period + uint32_t last_short_period_; // helper var for update job when one short period ends + mutable port::RWMutex period_mutex_; // guarantee heat buckets, get_cnt_ and period_cnt_ are updated in order + // std::mutex update_mutex_; // guarantee counts records only updated once + bool train_signal_; // if true, try to retrain model. we call one background thread to monitor this flag and retrain + std::map last_count_recorder_; // get cnt recorder of segments in last long period + std::map current_count_recorder_; // get cnt recorder of segments in current long period + mutable port::RWMutex count_mutex_; // guarantee last_count_recorder and current_count_recorder are handled in order + bool is_ready_; // check whether ready to use adaptive filter assignment + std::map segment_in_file; // map segment_id to SST file + std::atomic cfd_; // In WaLSM+, we only support one column family public: FilterCacheManager() { get_cnt_ = 0; last_long_period_ = 0; last_short_period_ = 0; train_signal_ = false; } - ~FilterCacheManager(); + ~FilterCacheManager() {} // one background thread monitors this func; if it returns true, call try_retrain_model at once, wait for training to end, and call update_cache_and_heap bool need_retrain() { return train_signal_; } @@ -122,24 +205,27 @@ class FilterCacheManager { // normal bloom filter units query, can we put hit_count_recorder outside this func? this will make the get op faster // will be called by a get operation, this will block the get operation // remember to call hit_count_recorder in a background thread - bool check_key(const uint32_t& segment_id, const std::string& key); + std::vector> get_filter_blocks(const uint32_t segment_id); // add 1 to get cnt of specified segment in current long period // will be called when calling check_key // remember to move this func to a single background thread aside check_key // because this func shouldn't block get operations - void hit_count_recorder(const uint32_t& segment_id); + void hit_count_recorder(uint32_t segment_id); // copy counts to last_count_recorder and reset counts of current_count_recorder void update_count_recorder();
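// [editor's sketch -- not part of the patch] How the counters above are meant
// to interact with update_count_recorder() and the periods loop (declared
// below as do_periods_work()): PERIOD_COUNT gets close one short period, and
// TRAIN_PERIODS short periods close one long period, per the comments above.
// The free function below restates that bookkeeping over explicit state with
// stand-in callbacks, since the real member logic is not shown in this patch
// (assumes <cstdint> and <functional>):
struct PeriodStateSketch {
  uint32_t period_cnt = 0;         // finished short periods
  uint32_t last_short_period = 0;  // last short period already processed
  uint32_t last_long_period = 0;   // last long period already processed
  bool train_signal = false;       // polled by the model-training thread
};
inline void periods_pass_sketch(PeriodStateSketch& s, uint32_t train_periods,
                                const std::function<void()>& sync_heaps,
                                const std::function<void()>& roll_counters) {
  if (s.period_cnt > s.last_short_period) {
    sync_heaps();  // estimate recent per-segment counts, then sync the heaps
    s.last_short_period = s.period_cnt;
  }
  if (s.period_cnt - s.last_long_period >= train_periods) {
    roll_counters();        // update_count_recorder(): copy current into last
    s.train_signal = true;  // need_retrain() now returns true
    s.last_long_period = s.period_cnt;
  }
}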
+ // when debugging, we need to print out counters of each segment. + void debug_count_recorder(); + + // inherit counts of merged segments to counts of new segments and remove counts of merged segments // inherit_infos_recorder: { {new segment 1: [{old segment 1: inherit rate 1}, {old segment 2: inherit rate 2}, ...]}, ...} void inherit_count_recorder(std::vector& merged_segment_ids, std::vector& new_segment_ids, const uint32_t& level_0_base_count, std::map>& inherit_infos_recorder); // estimate approximate get cnts for every alive segment - void estimate_counts_for_all(std::map& approximate_counts_recorder); + void estimate_recent_counts(std::map& approximate_counts_recorder, const std::vector& needed_segment_ids); // noticed that at the beginning, heat buckets need to sample put keys to init themselves before heat buckets start to work // segment_info_recorder is an external variable that records every alive segment's min key and max key @@ -168,6 +254,11 @@ class FilterCacheManager { // we should use one background thread to call this func in every get operation void hit_heat_buckets(const std::string& key); + // when one short period ends, we estimate the recent access counter of each segment, then update heaps + // when one long period ends, we reset counters and send a training signal. then the classifier will be evaluated and retrained. + // we leave this function to one single thread; it executes forever and never returns. + void do_periods_work(); + // if one long period ends, we need to check the effectiveness of the model. // if the model doesn't work well in the current workload, we retrain this model // 1. use greedy algorithm to solve filter units allocation problem (receive ideal enabled units num for every current segments) @@ -183,7 +274,7 @@ class FilterCacheManager { // we ignore all level 0 segments !!! 3 recorders keys set should be the same ------ all alive segments' ids (except level 0) // because of the time cost of writing csv file, we need to do this func with a background thread // need real benchmark data to debug this func - void try_retrain_model(std::map& level_recorder, + bool try_retrain_model(std::map& level_recorder, std::map>& segment_ranges_recorder, std::map& unit_size_recorder); @@ -200,15 +291,6 @@ class FilterCacheManager { void update_cache_and_heap(std::map& level_recorder, std::map>& segment_ranges_recorder); - // remove merged segments' filter units in the filter cache - // also remove related items in FilterCacheHeap - // segment_ids: [level_1_segment_1, level_0_segment_1, ...] - // level_0_segment_ids: [level_0_segment_1, ...] 
- // should be called by one background thread - // this func will be called by insert_segments - // you can also call this func alone after segments are merged (not suggested) - void remove_segments(std::vector& segment_ids, std::set& level_0_segment_ids); - // insert new segments into cache // all level 0 segments must enable all filter units // if is_ready_ is not true, set default filter units num (except level 0), insert into filter_cache_ and heaps @@ -232,7 +314,7 @@ class FilterCacheManager { // when old segments are merged into some new segments, call this func in one background thread void insert_segments(std::vector& merged_segment_ids, std::vector& new_segment_ids, std::map>& inherit_infos_recorder, - std::map& level_recorder, const uint32_t& level_0_base_count, + std::map& new_level_recorder, const uint32_t& level_0_base_count, std::map>& segment_ranges_recorder); // in func insert_segments above, we will also remove merged segments, this work well for normal compaction and flush @@ -241,7 +323,7 @@ class FilterCacheManager { // this func only delete merged segments // we only need argument merged_segment_ids (all merged segments' ids) // and level_recorder which only include merged segments' level - void delete_segments(std::vector& merged_segment_ids, std::map& level_recorder); + void delete_segments(std::vector& merged_segment_ids); // move segments to another level, used for trivial move compaction void move_segments(std::vector& moved_segment_ids, @@ -261,6 +343,18 @@ class FilterCacheManager { std::vector& range_seperators() { return heat_buckets_.seperators(); } + + inline void update_cfd(ColumnFamilyData* cfd) { + ColumnFamilyData* expected = nullptr; + cfd_.compare_exchange_strong(expected, cfd, std::memory_order_release); + } + + inline ColumnFamilyData* get_cfd() { + return cfd_.load(std::memory_order_acquire); + } + + // should be called in/after compaction, before any filter adjustment operation that may affect given segment + void init_segment(uint32_t segment_id, const BlockBasedTable* table, const std::vector& block_handles); }; } \ No newline at end of file diff --git a/db/art/filter_cache_client.cc b/db/art/filter_cache_client.cc index 72a483235..f6a23e5b1 100644 --- a/db/art/filter_cache_client.cc +++ b/db/art/filter_cache_client.cc @@ -1,155 +1,244 @@ #include "filter_cache_client.h" +#include +#include +#include +#include "db/art/macros.h" +#include "table/block_based/parsed_full_filter_block.h" +#include "db/art/global_filter_cache_context.h" namespace ROCKSDB_NAMESPACE { -task_thread_pool::task_thread_pool FilterCacheClient::pool_{FILTER_CACHE_THREADS_NUM}; -FilterCacheManager FilterCacheClient::filter_cache_manager_; -bool FilterCacheClient::heat_buckets_ready_; - -void FilterCacheClient::do_prepare_heat_buckets(const std::string& key, std::unordered_map>* const segment_info_recorder) { +void FilterCacheClient::do_prepare_heat_buckets(const std::string& key, std::unordered_map>* segment_info_recorder) { filter_cache_manager_.make_heat_buckets_ready(key, *segment_info_recorder); } -bool FilterCacheClient::prepare_heat_buckets(const std::string& key, std::unordered_map>* const segment_info_recorder) { +bool FilterCacheClient::prepare_heat_buckets(const std::string& key, std::unordered_map>* segment_info_recorder) { heat_buckets_ready_ = filter_cache_manager_.heat_buckets_ready(); if (!heat_buckets_ready_) { // if heat_buckets_ready_ false assert(segment_info_recorder->size() == 0); // should always empty heat_buckets_ready_ = 
filter_cache_manager_.heat_buckets_ready(); if (!heat_buckets_ready_) { - pool_.submit_detach(do_prepare_heat_buckets, key, segment_info_recorder); + // will leak memory if we pass a ref + // pool_.submit_detach([this, &key, segment_info_recorder]() { + do_prepare_heat_buckets(key, segment_info_recorder); + // }); heat_buckets_ready_ = filter_cache_manager_.heat_buckets_ready(); } } return heat_buckets_ready_; } -void FilterCacheClient::do_retrain_or_keep_model(std::vector* const features_nums_except_level_0, - std::map* const level_recorder, - std::map>* const segment_ranges_recorder, - std::map* const unit_size_recorder) { +void FilterCacheClient::do_retrain_or_keep_model(std::vector* features_nums_except_level_0, + const std::map* level_recorder, + const std::map>* segment_ranges_recorder, + const std::map* unit_size_recorder) { std::map level_copy; std::map> segment_ranges_copy; std::map unit_size_copy; + bool clf_ready = false; + bool clf_train = false; // if true, then clf model evaluated or retrained + assert(READY_RATE <= FULL_RATE && READY_RATE >= 0); // if this func monitors the signal in the background, how can it receive the latest arguments? via input pointers! - while (!filter_cache_manager_.heat_buckets_ready()); - while (!filter_cache_manager_.ready_work()); // wait for manager ready + while (!filter_cache_manager_.heat_buckets_ready()) + std::this_thread::sleep_for(std::chrono::milliseconds(100)); assert(filter_cache_manager_.heat_buckets_ready()); // must guarantee that heat buckets are ready before we make the filter cache manager ready + std::cout << "[MODEL] heat buckets are ready." << std::endl; + while (!filter_cache_manager_.ready_work()) + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait for manager ready + assert(filter_cache_manager_.ready_work()); + std::cout << "[MODEL] filter cache is ready." << std::endl; // actually we will load data before we test, so we can ensure that heat buckets are ready first - filter_cache_manager_.make_clf_model_ready(*features_nums_except_level_0); + std::cout << "[MODEL] model feature number: " << (*features_nums_except_level_0)[0] << std::endl; + assert((*features_nums_except_level_0)[0] == MAX_FEATURES_NUM); + clf_ready = filter_cache_manager_.make_clf_model_ready(*features_nums_except_level_0); + assert(clf_ready); // lock and copy recorders - global_recorder_mutex_.lock(); + global_filter_cache_recorders_mutex.lock(); level_copy = *level_recorder; segment_ranges_copy = *segment_ranges_recorder; unit_size_copy = *unit_size_recorder; - global_recorder_mutex_.unlock(); - // train first time, before that, there is no model left - filter_cache_manager_.try_retrain_model(level_copy, segment_ranges_copy, unit_size_copy); - filter_cache_manager_.update_cache_and_heap(level_copy, segment_ranges_copy); + global_filter_cache_recorders_mutex.unlock(); + assert(level_copy.size() == segment_ranges_copy.size()); + std::cout << "[MODEL] level recorder size (include level 0): " << level_copy.size() << std::endl; + std::cout << "[MODEL] range recorder size (include level 0): " << segment_ranges_copy.size() << std::endl; + // train first time, before that, there is no model left. + // Note: if we reach here while YCSB is loading, we don't train a model; we will train the first model when one long period ends. + clf_train = filter_cache_manager_.try_retrain_model(level_copy, segment_ranges_copy, unit_size_copy); + if (UNLIKELY(clf_train)) { + assert(false); // only train the first model when the YCSB load ends. 
+ std::cout << "[MODEL] we retrain a new model, thus we update filter cache and heaps" << std::endl; + std::cout << "[MODEL] level recorder size (exclude level 0): " << level_copy.size() << std::endl; + std::cout << "[MODEL] range recorder size (exclude level 0): " << segment_ranges_copy.size() << std::endl; + filter_cache_manager_.update_cache_and_heap(level_copy, segment_ranges_copy); + } // retrain in long periods while (true) { + bool adjusted = false; // in one long period - while (!filter_cache_manager_.need_retrain()); // wait for long period end + while (!filter_cache_manager_.need_retrain()) { + // std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait for long period end + adjusted = filter_cache_manager_.adjust_cache_and_heap(); + // if (adjusted) std::cout << "[ADJUST] filter cache adjustment!" << std::endl; + } + assert(filter_cache_manager_.need_retrain()); // lock and copy recorders - global_recorder_mutex_.lock(); + global_filter_cache_recorders_mutex.lock(); level_copy = *level_recorder; segment_ranges_copy = *segment_ranges_recorder; unit_size_copy = *unit_size_recorder; - global_recorder_mutex_.unlock(); + global_filter_cache_recorders_mutex.unlock(); + assert(level_copy.size() == segment_ranges_copy.size()); + std::cout << "[MODEL] level recorder size (include level 0): " << level_copy.size() << std::endl; + std::cout << "[MODEL] range recorder size (include level 0): " << segment_ranges_copy.size() << std::endl; // train first time, before that, there is no model left - filter_cache_manager_.try_retrain_model(level_copy, segment_ranges_copy, unit_size_copy); - filter_cache_manager_.update_cache_and_heap(level_copy, segment_ranges_copy); + clf_train = filter_cache_manager_.try_retrain_model(level_copy, segment_ranges_copy, unit_size_copy); + if (LIKELY(clf_train)) { + std::cout << "[MODEL] we retrain a new model, thus we update filter cache and heaps" << std::endl; + std::cout << "[MODEL] level recorder size (exclude level 0): " << level_copy.size() << std::endl; + std::cout << "[MODEL] range recorder size (exclude level 0): " << segment_ranges_copy.size() << std::endl; + filter_cache_manager_.update_cache_and_heap(level_copy, segment_ranges_copy); + } } // this loop never end } -void FilterCacheClient::retrain_or_keep_model(std::vector* const features_nums_except_level_0, - std::map* const level_recorder, - std::map>* const segment_ranges_recorder, - std::map* const unit_size_recorder) { - pool_.submit_detach(do_retrain_or_keep_model, features_nums_except_level_0, level_recorder, segment_ranges_recorder, unit_size_recorder); +void FilterCacheClient::retrain_or_keep_model(std::vector* features_nums_except_level_0, + const std::map* level_recorder, + const std::map>* segment_ranges_recorder, + const std::map* unit_size_recorder) { + pool_.submit_detach([this, features_nums_except_level_0, level_recorder, segment_ranges_recorder, unit_size_recorder]() { + do_retrain_or_keep_model(features_nums_except_level_0, level_recorder, segment_ranges_recorder, unit_size_recorder); + }); // if first model training not end, python lgb_model server still return default units num // then retrain model when every long period end. 
if the model still works well, keep this model instead // no need to return any value } -void FilterCacheClient::do_hit_count_recorder(const uint32_t& segment_id) { +// // TODO: make it an atomic operation rather than a mutex + threading +void FilterCacheClient::do_hit_count_recorder(uint32_t segment_id) { filter_cache_manager_.hit_count_recorder(segment_id); } -bool FilterCacheClient::check_key(const uint32_t& segment_id, const std::string& key) { - bool result = filter_cache_manager_.check_key(segment_id, key); - pool_.submit_detach(do_hit_count_recorder, segment_id); - return result; +std::vector> FilterCacheClient::get_filter_blocks(uint32_t segment_id) { + // pool_.submit_detach([this, segment_id]() { + // do_hit_count_recorder(segment_id); + // }); + do_hit_count_recorder(segment_id); + return filter_cache_manager_.get_filter_blocks(segment_id); } void FilterCacheClient::do_hit_heat_buckets(const std::string& key) { filter_cache_manager_.hit_heat_buckets(key); } -void FilterCacheClient::get_updating_work(const std::string& key) { - pool_.submit_detach(do_hit_heat_buckets, key); +void FilterCacheClient::hit_heat_buckets(const std::string& key) { + // pool_.submit_detach([this, key]() { + // do_hit_heat_buckets(key); + // }); + do_hit_heat_buckets(key); } -void FilterCacheClient::do_make_adjustment() { +void FilterCacheClient::do_periods_work() { while (true) { - // never stop making heap adjustment - filter_cache_manager_.adjust_cache_and_heap(); + filter_cache_manager_.do_periods_work(); } } -void FilterCacheClient::make_adjustment() { - pool_.submit_detach(do_make_adjustment); +void FilterCacheClient::periods_work() { + pool_.submit_detach([this]() { + do_periods_work(); + }); } +// void FilterCacheClient::do_make_adjustment() { +// assert(false); +// while (true) { +// // never stop making heap adjustment +// filter_cache_manager_.adjust_cache_and_heap(); +// } +// } + +// void FilterCacheClient::make_adjustment() { +// assert(false); +// pool_.submit_detach([this]() { +// do_make_adjustment(); +// }); +// } + void FilterCacheClient::do_batch_insert_segments(std::vector& merged_segment_ids, std::vector& new_segment_ids, std::map>& inherit_infos_recorder, - std::map& level_recorder, const uint32_t& level_0_base_count, + std::map& new_level_recorder, uint32_t level_0_base_count, std::map>& segment_ranges_recorder) { filter_cache_manager_.insert_segments(merged_segment_ids, new_segment_ids, inherit_infos_recorder, - level_recorder, level_0_base_count, segment_ranges_recorder); + new_level_recorder, level_0_base_count, segment_ranges_recorder); } void FilterCacheClient::batch_insert_segments(std::vector merged_segment_ids, std::vector new_segment_ids, std::map> inherit_infos_recorder, - std::map level_recorder, const uint32_t& level_0_base_count, + std::map new_level_recorder, uint32_t level_0_base_count, std::map> segment_ranges_recorder) { - assert(merged_segment_ids.size() > 0 && new_segment_ids.size() > 0); - assert(new_segment_ids.size() == inherit_infos_recorder.size()); - assert(merged_segment_ids.size() + new_segment_ids.size() == level_recorder.size()); + assert(new_segment_ids.size() == new_level_recorder.size()); assert(new_segment_ids.size() == segment_ranges_recorder.size()); + assert(new_segment_ids.size() > 0); if (level_0_base_count == 0) { - pool_.submit_detach(do_batch_insert_segments, merged_segment_ids, new_segment_ids, inherit_infos_recorder, level_recorder, INIT_LEVEL_0_COUNT, segment_ranges_recorder); + pool_.submit_detach([this, merged_segment_ids, new_segment_ids, 
inherit_infos_recorder, new_level_recorder, segment_ranges_recorder]() mutable { + do_batch_insert_segments(merged_segment_ids, new_segment_ids, inherit_infos_recorder, new_level_recorder, INIT_LEVEL_0_COUNT, segment_ranges_recorder); + }); } else { - pool_.submit_detach(do_batch_insert_segments, merged_segment_ids, new_segment_ids, inherit_infos_recorder, level_recorder, level_0_base_count, segment_ranges_recorder); + pool_.submit_detach([this, merged_segment_ids, new_segment_ids, inherit_infos_recorder, new_level_recorder, level_0_base_count, segment_ranges_recorder]() mutable { + do_batch_insert_segments(merged_segment_ids, new_segment_ids, inherit_infos_recorder, new_level_recorder, level_0_base_count, segment_ranges_recorder); + }); } } -void FilterCacheClient::do_batch_delete_segments(std::vector& merged_segment_ids, std::map& level_recorder) { - filter_cache_manager_.delete_segments(merged_segment_ids, level_recorder); +void FilterCacheClient::update_cfd_ptr_if_needed(ColumnFamilyData* cfd) { + filter_cache_manager_.update_cfd(cfd); +} + +void FilterCacheClient::do_batch_delete_segments(std::vector& merged_segment_ids) { + assert(false); + exit(1); + filter_cache_manager_.delete_segments(merged_segment_ids); } -void FilterCacheClient::batch_delete_segments(std::vector merged_segment_ids, std::map level_recorder) { - assert(merged_segment_ids.size() == level_recorder.size()); - pool_.submit_detach(do_batch_delete_segments, merged_segment_ids, level_recorder); +// disallowed in WaLSM+ +void FilterCacheClient::batch_delete_segments(std::vector merged_segment_ids) { + assert(false); + exit(1); + pool_.submit_detach([this, merged_segment_ids]() mutable { + do_batch_delete_segments(merged_segment_ids); + }); } void FilterCacheClient::do_batch_move_segments(std::vector& moved_segment_ids, std::map& old_level_recorder, std::map& move_level_recorder, std::map>& move_segment_ranges_recorder) { + assert(false); + exit(1); filter_cache_manager_.move_segments(moved_segment_ids, old_level_recorder, move_level_recorder, move_segment_ranges_recorder); } +// disallowed in WaLSM+ void FilterCacheClient::batch_move_segments(std::vector moved_segment_ids, std::map old_level_recorder, std::map move_level_recorder, std::map> move_segment_ranges_recorder) { + assert(false); + exit(1); assert(moved_segment_ids.size() == move_level_recorder.size()); assert(moved_segment_ids.size() == move_segment_ranges_recorder.size()); - pool_.submit_detach(do_batch_move_segments, moved_segment_ids, old_level_recorder, move_level_recorder, move_segment_ranges_recorder); + pool_.submit_detach([this, &moved_segment_ids, &old_level_recorder, &move_level_recorder, &move_segment_ranges_recorder]() { + do_batch_move_segments(moved_segment_ids, old_level_recorder, move_level_recorder, move_segment_ranges_recorder); + }); +} + +void FilterCacheClient::init_segment(uint32_t segment_id, const BlockBasedTable* table, const std::vector& block_handles) { + assert(block_handles.size() > 0); + filter_cache_manager_.init_segment(segment_id, table, block_handles); } } \ No newline at end of file diff --git a/db/art/filter_cache_client.h b/db/art/filter_cache_client.h index ff9414071..dbe3717f3 100644 --- a/db/art/filter_cache_client.h +++ b/db/art/filter_cache_client.h @@ -5,55 +5,60 @@ #include #include "macros.h" #include "filter_cache.h" +#include "table/block_based/parsed_full_filter_block.h" +#include "table/format.h" namespace ROCKSDB_NAMESPACE { -// global mutex to control global level recorder, ... 
-static std::mutex global_recorder_mutex_; - class FilterCacheClient; +class FilterCacheManager; +class ParsedFullFilterBlock; class FilterCacheClient { private: - static task_thread_pool::task_thread_pool pool_; - static FilterCacheManager filter_cache_manager_; + task_thread_pool::task_thread_pool pool_{FILTER_CACHE_THREADS_NUM}; + FilterCacheManager filter_cache_manager_; // we need heat_buckets_ready_ to become true before filter_cache_ready_ // In YCSB benchmark, we first load data (insert key-value pairs) then may try get operation // so we can guarantee that heat_buckets_ready_ become true before filter_cache_ready_ - static bool heat_buckets_ready_; // the same as FilterCacheManager.heat_buckets_.is_ready() + bool heat_buckets_ready_; // the same as FilterCacheManager.heat_buckets_.is_ready() // background thread part of prepare_heat_buckets - static void do_prepare_heat_buckets(const std::string& key, std::unordered_map>* const segment_info_recorder); + void do_prepare_heat_buckets(const std::string& key, std::unordered_map>* segment_info_recorder); // background thread part of retrain_or_keep_model - static void do_retrain_or_keep_model(std::vector* const features_nums_except_level_0, - std::map* const level_recorder, - std::map>* const segment_ranges_recorder, - std::map* const unit_size_recorder); + void do_retrain_or_keep_model(std::vector* features_nums_except_level_0, + const std::map* level_recorder, + const std::map>* segment_ranges_recorder, + const std::map* unit_size_recorder); // background thread part of check_key - static void do_hit_count_recorder(const uint32_t& segment_id); + void do_hit_count_recorder(uint32_t segment_id); - // background thread part of get_updating_work - static void do_hit_heat_buckets(const std::string& key); + // background thread part of hit_heat_buckets + void do_hit_heat_buckets(const std::string& key); - // background thread part of make_adjustment - static void do_make_adjustment(); + // // background thread part of make_adjustment + // void do_make_adjustment(); // background thread part of batch_insert_segments - static void do_batch_insert_segments(std::vector& merged_segment_ids, std::vector& new_segment_ids, + void do_batch_insert_segments(std::vector& merged_segment_ids, std::vector& new_segment_ids, std::map>& inherit_infos_recorder, - std::map& level_recorder, const uint32_t& level_0_base_count, + std::map& new_level_recorder, uint32_t level_0_base_count, std::map>& segment_ranges_recorder); // background thread part of batch_delete_segments - void do_batch_delete_segments(std::vector& merged_segment_ids, std::map& level_recorder); + void do_batch_delete_segments(std::vector& merged_segment_ids); // background thread part of batch_move_segments void do_batch_move_segments(std::vector& moved_segment_ids, std::map& old_level_recorder, std::map& move_level_recorder, std::map>& move_segment_ranges_recorder); + + // background thread part of periods_work; + void do_periods_work(); + public: FilterCacheClient() { heat_buckets_ready_ = false; @@ -79,35 +84,44 @@ class FilterCacheClient { // please ensure that 3 recorders need to keep the same segments set, or error will occur in train func // you can use mutex in compaction and flushing to guarantee this // then when every long period end, try to retrain a new model or keep last model - void retrain_or_keep_model(std::vector* const features_nums_except_level_0, - std::map* const level_recorder, - std::map>* const segment_ranges_recorder, - std::map* const unit_size_recorder); + void 
retrain_or_keep_model(std::vector* features_nums_except_level_0, + const std::map* level_recorder, + const std::map>* segment_ranges_recorder, + const std::map* unit_size_recorder); // corresponding to FilterCacheManager work: check_key and hit_count_recorder // return FilterCacheManager.check_key() and leave hit_count_recorder to the background - bool check_key(const uint32_t& segment_id, const std::string& key); + std::vector> get_filter_blocks(uint32_t segment_id); // every db get operation needs one hit_heat_buckets - void get_updating_work(const std::string& key); + void hit_heat_buckets(const std::string& key); + + // keep track of period count, update access counters and retrain the classifier model + void periods_work(); - // heap based adjustment - void make_adjustment(); + // // heap based adjustment + // void make_adjustment(); // batch insert segments into filter cache manager, will also delete merged segments void batch_insert_segments(std::vector merged_segment_ids, std::vector new_segment_ids, std::map> inherit_infos_recorder, - std::map level_recorder, const uint32_t& level_0_base_count, + std::map new_level_recorder, uint32_t level_0_base_count, std::map> segment_ranges_recorder); + + // In WaLSM+, we only support one column family, so we just save the cfd ptr here + void update_cfd_ptr_if_needed(ColumnFamilyData* cfd); // batch delete segments from filter cache manager - void batch_delete_segments(std::vector merged_segment_ids, std::map level_recorder); + void batch_delete_segments(std::vector merged_segment_ids); // batch of moving segments to one level void batch_move_segments(std::vector moved_segment_ids, std::map old_level_recorder, std::map move_level_recorder, std::map> move_segment_ranges_recorder); + + + void init_segment(uint32_t segment_id, const BlockBasedTable* table, const std::vector& block_handles); }; } diff --git a/db/art/filter_cache_entry.cc b/db/art/filter_cache_entry.cc new file mode 100644 index 000000000..bd5c8e85b --- /dev/null +++ b/db/art/filter_cache_entry.cc @@ -0,0 +1,157 @@ +#include "filter_cache_entry.h" + +#include +#include +#include +#include +#include +#include + +#include "db/table_cache.h" +#include "rocksdb/options.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/cachable_entry.h" +#include "table/block_based/filter_block.h" +#include "table/block_based/parsed_full_filter_block.h" +#include "table/format.h" +#include "table/table_reader.h" + +namespace ROCKSDB_NAMESPACE { + +// constructor; member variables can be initialized here +// TODO pass right parameters +FilterCacheEntry::FilterCacheEntry( + const uint32_t segment_id, const BlockBasedTable* table, + FilterCache* filter_cache, const std::vector& block_handles) { + segment_id_ = segment_id; + table_ = table; + filter_cache_ = filter_cache; + loaded_units_num_ = 0; + + // fill block_handles from the input vector, or fill with null handles + assert(block_handles.size() == MAX_UNITS_NUM); + block_handles_.fill(BlockHandle::NullBlockHandle()); + cache_handles_.fill(nullptr); + for (size_t i = 0; i < block_handles.size(); i++) { + block_handles_[i] = block_handles[i]; + } + + // load units into memory, then only modify loaded_units_num_ + prefetch_units(); +} + +// clean up member variables to avoid memory leaks; anything allocated with new may need to be released here +FilterCacheEntry::~FilterCacheEntry() {} + +size_t FilterCacheEntry::approximate_size() { + uint32_t sum = 0; + // for (size_t i = 0; i < loaded_units_num_; i++) { + // if (cache_handles_[i] == nullptr) { + // continue; + // } + // sum += cache_handles_[i]->value_->ApproximateMemoryUsage(); + // } + 
// sum *= 8; // convert to bits + sum += DEFAULT_UNIT_SIZE * loaded_units_num_; + return sum; +} + +std::vector> +FilterCacheEntry::get_filter_blocks() { + uint32_t units_num = loaded_units_num_; + + rwlock.ReadLock(); + std::vector> result; + + result.reserve(units_num); + for (size_t i = 0; i < units_num; i++) { + if (UNLIKELY(cache_handles_[i] == nullptr)) { + result.emplace_back(nullptr, nullptr, nullptr, false); + result[i].Reset(); + continue; + } + result.emplace_back(cache_handles_[i]->value_.get(), filter_cache_, + cache_handles_[i].get(), false); + } + rwlock.ReadUnlock(); + return result; +} + +void FilterCacheEntry::enable_units(uint32_t target_unit_num) { + if (target_unit_num > MAX_UNITS_NUM) { + target_unit_num = MAX_UNITS_NUM; + } + + // std::cout << "segment id: " << segment_id_ << ", enable units num from " << loaded_units_num_ << " to " << target_unit_num << std::endl; + + rwlock.WriteLock(); + loaded_units_num_ = target_unit_num; + rwlock.WriteUnlock(); + + // static std::atomic target_unit_num_5_counter{0}; + // static std::atomic target_unit_num_12_counter{0}; + // if (target_unit_num == 5) { + // target_unit_num_5_counter.fetch_add(1); + // } else if (target_unit_num == 12) { + // target_unit_num_12_counter.fetch_add(1); + // } + // std::cout << "target_unit_num_5_counter: " << target_unit_num_5_counter.load() << ", target_unit_num_12_counter: " << target_unit_num_12_counter.load() << std::endl; +} + +void FilterCacheEntry::prefetch_units() { + uint32_t target_unit_num = MAX_UNITS_NUM; + uint32_t prefetch_success_num = 0; + + rwlock.WriteLock(); + const ReadOptions read_options; + for (uint32_t i = 0; i < target_unit_num; i++) { + // do nothing for null block handle + if (block_handles_[i] == BlockHandle::NullBlockHandle()) { + continue; + } + CachableEntry block_entry; + Status s = table_->RetrieveBlock( + nullptr, read_options, block_handles_[i], + UncompressionDict::GetEmptyDict(), &block_entry, BlockType::kFilter, + nullptr, nullptr, + /* for_compaction */ false, /* use_cache */ false); + + + if (s.ok()) { + prefetch_success_num++; + } + + // do nothing if no data retrieved + if (!s.ok()) { + std::cout << "failed to retrieve filter data, segment id: " << segment_id_ + << std::endl; + cache_handles_[i].reset(); + units_[i].reset(); + break; + } + + units_[i] = + std::shared_ptr(block_entry.ReleaseValue()); + cache_handles_[i] = + // std::make_shared(units_[i], filter_cache_); + std::shared_ptr( + new FilterCacheDataHandle(units_[i], filter_cache_)); + } + rwlock.WriteUnlock(); + + // std::cout << "segment id: " << segment_id_ << ", prefetch success num: " << prefetch_success_num << std::endl; + // std::cout << "used bytes:" ; + // for (uint32_t i = 0; i < target_unit_num; i++) { + // if (cache_handles_[i] == nullptr) { + // std::cout << "null "; + // continue; + // } + // std::cout << cache_handles_[i]->value_->ApproximateMemoryUsage() << " "; + // } + // std::cout << std::endl; +} + +FilterCacheEntry::FilterCacheDataHandle::FilterCacheDataHandle( + DataPtr value, FilterCache* cache) + : value_(value), cache_(cache) {} +} // namespace ROCKSDB_NAMESPACE \ No newline at end of file diff --git a/db/art/filter_cache_entry.h b/db/art/filter_cache_entry.h new file mode 100644 index 000000000..0778c9883 --- /dev/null +++ b/db/art/filter_cache_entry.h @@ -0,0 +1,83 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "macros.h" +#include "port/port_posix.h" +#include "table/block_based/cachable_entry.h" +#include 
"table/block_based/parsed_full_filter_block.h" +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { +class TableCache; // forward declaration +class BlockBasedTable; +class FilterCache; + +// 先在filter +// cache里为每个segment默认启用总bits-per-key=8,随着写入的segment的增加, +// 一旦已经占用了filter cache最大容量的一定阈值(如80%), +// 就利用GreedyAlgo计算规划问题,并进行模型训练 一旦filter +// cache已满,就进入filter cache的double +// heap调整,我们只需将新的segment用模型进行预测 +// 将新segment的node插入到两个heap里,在后台启动一个线程,自行调整两个堆,并不断返回调整的结果 +// 得到结果后,我们可以立即对filter +// units的启用情况进行调节,也可以先保存后面批量调整 具体见文档 + +// 注意加上一些必要的英文注释 +// filter cache主要为一个map, key是segment id(uint32_t), +// value就为FilterCacheItem类 成员函数需要在filter_cache_item.cc里定义 + +// TODO: how to get block_handles? +class FilterCacheEntry { + using DataPtr = std::shared_ptr; + + private: + const BlockBasedTable* table_; + FilterCache* filter_cache_; + + struct FilterCacheDataHandle : public Cache::Handle { + DataPtr value_; + FilterCache* cache_; + + FilterCacheDataHandle(DataPtr value, FilterCache* cache); + }; + using HandlePtr = std::shared_ptr; + + uint32_t segment_id_; + uint32_t loaded_units_num_; + std::array units_{}; + std::array block_handles_{}; + std::array cache_handles_{}; + mutable port::RWMutex rwlock; + + public: + // 构造函数,可以初始化成员变量 + // TODO pass right parameters + FilterCacheEntry(const uint32_t segment_id, const BlockBasedTable* table, + FilterCache* filter_cache, const std::vector& block_handles); + + // 清理成员变量,避免内存泄漏,如果new了空间,就可能需要在这里清理 + ~FilterCacheEntry(); + + // 占用的内存空间,这里估计总共使用的filter units占用的空间就行了 + // 注意,返回的空间大小为占用的bits数量,不是bytes数量 + size_t approximate_size(); + + // 根据目前已经启用的units数,启用或禁用filter units + // 输入需要启用的units数,决定启用、禁用还是不处理 + // units_num : [MIN_UNITS_NUM, MAX_UNITS_NUM] + void enable_units(uint32_t units_num); + + void prefetch_units(); + + // 获取缓存的 filter Block + std::vector> get_filter_blocks(); +}; +} // namespace ROCKSDB_NAMESPACE \ No newline at end of file diff --git a/db/art/filter_cache_heap.cc b/db/art/filter_cache_heap.cc index fd2afb48c..35cefb59f 100644 --- a/db/art/filter_cache_heap.cc +++ b/db/art/filter_cache_heap.cc @@ -1,20 +1,16 @@ #include "filter_cache_heap.h" #include #include +#include "port/likely.h" namespace ROCKSDB_NAMESPACE { -FilterCacheHeap FilterCacheHeapManager::benefit_heap_; -FilterCacheHeap FilterCacheHeapManager::cost_heap_; -std::map FilterCacheHeapManager::heap_visit_cnt_recorder_; -std::map FilterCacheHeapManager::units_num_limit_recorder_; -std::mutex FilterCacheHeapManager::manager_mutex_; FilterCacheHeapNode FilterCacheHeap::heap_top() { // need lock heap, or we may retrive outdated node // heap_mutex_.lock(); - if (!heap_.empty()) { + if (LIKELY(!heap_.empty())) { return heap_[0]; } else { return nullptr; @@ -23,6 +19,46 @@ FilterCacheHeapNode FilterCacheHeap::heap_top() { // heap_mutex_.unlock(); } +void FilterCacheHeap::heap_check(bool max_heap) { + // need lock heap, or we may retrive outdated node + // heap_mutex_.lock(); + + if (LIKELY(!heap_.empty())) { + if (max_heap) { + for (auto &node : heap_) { + assert(node->benefit_or_cost <= heap_[0]->benefit_or_cost); + } + } else { + for (auto &node : heap_) { + assert(node->benefit_or_cost >= heap_[0]->benefit_or_cost); + } + } + } + + // heap_mutex_.unlock(); +} + +void FilterCacheHeap::heap_print(std::vector& needed_segment_ids, const bool should_exist) { + // heap_mutex_.lock(); + + for (uint32_t &segment_id : needed_segment_ids) { + auto it = heap_index_.find(segment_id); + if (should_exist) + assert(it != heap_index_.end()); + if (UNLIKELY(it == heap_index_.end())) continue; + 
std::cout << "segment id: " << it->second->segment_id + << ", visit cnt: " << it->second->approx_visit_cnt + << ", benefit/cost: " << it->second->benefit_or_cost + << ", current units: " << it->second->current_units_num + << ", units limit: " << it->second->units_num_limit + << std::endl; + assert(it->second->is_alive); + assert(it->second->current_units_num <= it->second->units_num_limit); + } + + // heap_mutex_.unlock(); +} + /* void FilterCacheHeap::pop() { // heap_mutex_.lock(); @@ -94,6 +130,7 @@ void FilterCacheHeap::push(FilterCacheHeapNode& node) { void FilterCacheHeap::batch_query(std::vector& segment_ids, std::vector& return_nodes) { // heap_mutex_.lock(); + // uint32_t return_count = 0; return_nodes.clear(); for (uint32_t& segment_id : segment_ids) { auto it = heap_index_.find(segment_id); @@ -102,9 +139,11 @@ void FilterCacheHeap::batch_query(std::vector& segment_ids, std::vecto // so we should return null when query a merged segment id if (it != heap_index_.end() && (it->second)->is_alive == true) { return_node = it->second; // node exists in heap_index_ and segment alive + // return_count++; } return_nodes.emplace_back(return_node); } + // assert(segment_ids.size() == return_count); // heap_mutex_.unlock(); } @@ -120,14 +159,27 @@ void FilterCacheHeap::batch_upsert(std::vector& nodes) { // exist in heap_index_ and heap_ // we may query nodes from this heap, and update var in nodes, then upsert original nodes // check it->second != node to make sure that we won't free a refered sapce + assert(it->second != node); + assert(it->second->segment_id == segment_id); if (it->second != node) { *(it->second) = *(node); // only copy content, this will update content of node in heap_index_ and heap_ delete node; // remember to free unnecessary space! 
} + // bool found = false; + // for (auto &node : heap_) { + // found = found || (heap_index_[segment_id]->segment_id == node->segment_id); + // if (found) { + // assert(node == heap_index_[segment_id]); + // break; + // } + // } + // assert(found); // should found } else { // not exist in heap_index_ and heap_ heap_index_.insert(std::make_pair(segment_id, node)); // insert into heap_index_ heap_.emplace_back(node); // push into heap_ + assert(heap_index_[segment_id] == heap_[heap_.size()-1]); + assert(heap_index_[segment_id]->segment_id == segment_id); } } @@ -140,16 +192,34 @@ void FilterCacheHeap::batch_upsert(std::vector& nodes) { void FilterCacheHeap::batch_delete(std::vector& segment_ids) { // heap_mutex_.lock(); + // uint32_t delete_count = 0; + // uint32_t size_before = heap_.size(); + // we guarantee that if one node not exist in heap_index_, it must not exist in heap for (uint32_t& segment_id : segment_ids) { auto it = heap_index_.find(segment_id); - if (it == heap_index_.end()) { + if (UNLIKELY(it == heap_index_.end())) { // not exist in heap_index_ and heap_ // do nothing + // for (auto &node : heap_) { + // assert(node->segment_id != segment_id); + // } } else { // exist in heap_index_ and heap_ // set is_alive to false and delete after that it->second->is_alive = false; + assert(heap_index_[segment_id]->is_alive == false); + // bool found = false; + // for (auto &node : heap_) { + // found = found || (heap_index_[segment_id]->segment_id == node->segment_id); + // if (found) { + // assert(node == heap_index_[segment_id]); + // assert(node->is_alive == false); + // break; + // } + // } + // assert(found); // should found + // delete_count++; } } @@ -172,6 +242,11 @@ void FilterCacheHeap::batch_delete(std::vector& segment_ids) { } } + // check already deleted? + // for (uint32_t &segment_id : segment_ids) { + // assert(heap_index_.find(segment_id) == heap_index_.end()); + // } + // assert(heap_.size() + delete_count == size_before); // delete done, need to rebuild heap_ rebuild_heap(); @@ -181,20 +256,47 @@ void FilterCacheHeap::batch_delete(std::vector& segment_ids) { void FilterCacheHeapManager::batch_delete(std::vector& segment_ids) { manager_mutex_.lock(); + // std::set segment_ids_set; for (uint32_t& segment_id : segment_ids) { auto cnt_it = heap_visit_cnt_recorder_.find(segment_id); auto limit_it = units_num_limit_recorder_.find(segment_id); - if (cnt_it != heap_visit_cnt_recorder_.end()) { + // assert((cnt_it != heap_visit_cnt_recorder_.end() && limit_it != units_num_limit_recorder_.end()) + // || (cnt_it == heap_visit_cnt_recorder_.end() && limit_it == units_num_limit_recorder_.end())); + if (LIKELY(cnt_it != heap_visit_cnt_recorder_.end())) { heap_visit_cnt_recorder_.erase(segment_id); } - if (limit_it != units_num_limit_recorder_.end()) { + if (LIKELY(limit_it != units_num_limit_recorder_.end())) { units_num_limit_recorder_.erase(segment_id); } + // segment_ids_set.insert(segment_id); } + // assert(segment_ids_set.size() == segment_ids.size()); // all segment ids should be unique + + // // print before deletion + // if (segment_ids_set.size() > 0) { + // std::cout << std::endl; + // std::cout << "before deletion, print benefit heap items: " << std::endl; + // benefit_heap_.heap_print(segment_ids, false); + // std::cout << "before deletion, print cost heap items: " << std::endl; + // cost_heap_.heap_print(segment_ids, false); + // std::cout << std::endl; + // } benefit_heap_.batch_delete(segment_ids); cost_heap_.batch_delete(segment_ids); + // check whether it is heap? 
+ // benefit_heap_.heap_check(true); + // cost_heap_.heap_check(false); + // for (uint32_t &segment_id : segment_ids) { + // assert(heap_visit_cnt_recorder_.find(segment_id) == heap_visit_cnt_recorder_.end()); + // assert(units_num_limit_recorder_.find(segment_id) == units_num_limit_recorder_.end()); + // } + + assert(benefit_heap_.heap_size() == heap_visit_cnt_recorder_.size()); + assert(benefit_heap_.heap_size() == units_num_limit_recorder_.size()); + assert(benefit_heap_.heap_size() == cost_heap_.heap_size()); + manager_mutex_.unlock(); } @@ -202,11 +304,15 @@ void FilterCacheHeapManager::batch_upsert(std::vector& item manager_mutex_.lock(); std::vector benefit_nodes, cost_nodes; + // std::set segment_ids_set; for (FilterCacheHeapItem& item : items) { assert(item.current_units_num >= MIN_UNITS_NUM); assert(item.current_units_num <= item.units_num_limit); + assert(item.units_num_limit <= MAX_UNITS_NUM); double benefit = StandardBenefitWithMaxBound(item.approx_visit_cnt, item.current_units_num, item.units_num_limit); double cost = StandardCostWithMinBound(item.approx_visit_cnt, item.current_units_num, MIN_UNITS_NUM); + // if (item.units_num_limit == item.current_units_num) assert(benefit == 0); + // if (item.current_units_num == MIN_UNITS_NUM) assert(cost == __DBL_MAX__); // item meets at least one conditions // so that item always upsert into heap // if item.approx_visit_cnt = 0, still push into heap @@ -234,20 +340,18 @@ void FilterCacheHeapManager::batch_upsert(std::vector& item } */ - if (item.current_units_num <= item.units_num_limit) { - cost_nodes.emplace_back(new FilterCacheHeapItem(item.segment_id, - item.approx_visit_cnt, - item.current_units_num, - cost, - item.units_num_limit) - ); - benefit_nodes.emplace_back(new FilterCacheHeapItem(item.segment_id, - item.approx_visit_cnt, - item.current_units_num, - benefit, - item.units_num_limit) - ); - } + cost_nodes.emplace_back(new FilterCacheHeapItem(item.segment_id, + item.approx_visit_cnt, + item.current_units_num, + cost, + item.units_num_limit) + ); + benefit_nodes.emplace_back(new FilterCacheHeapItem(item.segment_id, + item.approx_visit_cnt, + item.current_units_num, + benefit, + item.units_num_limit) + ); // update visit cnt, we need to keep recorder visit cnt and heap visit cnt the same const uint32_t segment_id = item.segment_id; @@ -265,12 +369,42 @@ void FilterCacheHeapManager::batch_upsert(std::vector& item } else { units_num_limit_recorder_.insert(std::make_pair(segment_id, units_limit)); } + // segment_ids_set.insert(segment_id); + assert(heap_visit_cnt_recorder_[segment_id] == visit_cnt); + assert(units_num_limit_recorder_[segment_id] == units_limit); } + // assert(segment_ids_set.size() == items.size()); // upsert nodes into heaps benefit_heap_.batch_upsert(benefit_nodes); cost_heap_.batch_upsert(cost_nodes); + // std::vector segment_ids; + // std::copy(segment_ids_set.begin(), segment_ids_set.end(), std::back_inserter(segment_ids)); + // assert(segment_ids.size() == segment_ids_set.size()); + + // // print after upsertion + // if (segment_ids_set.size() > 0) { + // std::cout << std::endl; + // std::cout << "after upsertion, print benefit heap items: " << std::endl; + // benefit_heap_.heap_print(segment_ids, true); + // std::cout << "after upsertion, print cost heap items: " << std::endl; + // cost_heap_.heap_print(segment_ids, true); + // std::cout << std::endl; + // } + + // check whether is heap? 
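// [editor's sketch -- not part of the patch] StandardBenefitWithMaxBound and
// StandardCostWithMinBound are not shown in this diff; the stand-ins below
// only encode the boundary behaviour the commented asserts above rely on:
// zero benefit once a segment already sits at its units limit, and a DBL_MAX
// cost once it sits at the minimum. The linear visit-count score is a
// placeholder, not the real formula (assumes <cfloat>, <cstdint>):
inline double benefit_with_max_bound_sketch(uint32_t visit_cnt, uint16_t units,
                                            uint16_t units_limit) {
  if (units >= units_limit) return 0.0;    // nothing left to enable
  return static_cast<double>(visit_cnt);   // placeholder monotone score
}
inline double cost_with_min_bound_sketch(uint32_t visit_cnt, uint16_t units,
                                         uint16_t min_units) {
  if (units <= min_units) return DBL_MAX;  // never disable below the floor
  return static_cast<double>(visit_cnt);   // placeholder monotone score
}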
+ // benefit_heap_.heap_check(true); + // cost_heap_.heap_check(false); + // for (uint32_t &segment_id : segment_ids) { + // assert(heap_visit_cnt_recorder_.find(segment_id) != heap_visit_cnt_recorder_.end()); + // assert(units_num_limit_recorder_.find(segment_id) != units_num_limit_recorder_.end()); + // } + + assert(benefit_heap_.heap_size() == heap_visit_cnt_recorder_.size()); + assert(benefit_heap_.heap_size() == units_num_limit_recorder_.size()); + assert(benefit_heap_.heap_size() == cost_heap_.heap_size()); + manager_mutex_.unlock(); } @@ -280,12 +414,13 @@ bool FilterCacheHeapManager::try_modify(FilterCacheModifyResult& result) { FilterCacheHeapNode benefit_node = benefit_heap_.heap_top(); FilterCacheHeapNode cost_node = cost_heap_.heap_top(); // if benefit heap or cost heap empty, no need to modify - if (benefit_node == nullptr || cost_node == nullptr) { + if (UNLIKELY(benefit_node == nullptr || cost_node == nullptr)) { manager_mutex_.unlock(); // remember to unlock, or we will cause deadlock return false; } - if (benefit_node->is_alive == false || cost_node->is_alive == false) { + if (UNLIKELY(benefit_node->is_alive == false || cost_node->is_alive == false)) { + // std::cout << "one node is not alive, stop modification." << std::endl; manager_mutex_.unlock(); // remember to unlock, or we will cause deadlock return false; } @@ -293,7 +428,10 @@ bool FilterCacheHeapManager::try_modify(FilterCacheModifyResult& result) { const double benefit = benefit_node->benefit_or_cost; const double cost = cost_node->benefit_or_cost; // if benefit of enable one unit <= cost of disable one unit, no need to modify - if (benefit <= cost) { + if (benefit - cost < double(PURE_BENEFIT_BOUND)) { + // std::cout << std::endl; + // std::cout << "failed to modify, because benefit of modification do not hit threshold." << std::endl; + // std::cout << "benefit: " << benefit << ", cost: " << cost << std::endl; manager_mutex_.unlock(); // remember to unlock, or we will cause deadlock return false; } @@ -301,15 +439,29 @@ bool FilterCacheHeapManager::try_modify(FilterCacheModifyResult& result) { const uint32_t benefit_segment_id = benefit_node->segment_id; const uint32_t cost_segment_id = cost_node->segment_id; // if we will enable and disable one unit of the same segment, ignore it - if (benefit_segment_id == cost_segment_id) { + if (UNLIKELY(benefit_segment_id == cost_segment_id)) { + // std::cout << "cannot modify the same segment!" << std::endl; manager_mutex_.unlock(); // remember to unlock, or we will cause deadlock return false; } + if (UNLIKELY(heap_visit_cnt_recorder_.find(benefit_segment_id) == heap_visit_cnt_recorder_.end() + || heap_visit_cnt_recorder_.find(cost_segment_id) == heap_visit_cnt_recorder_.end())) { + // std::cout << "target segment merged, stop modification." << std::endl; + manager_mutex_.unlock(); + return false; + } + if (UNLIKELY(units_num_limit_recorder_.find(benefit_segment_id) == units_num_limit_recorder_.end() + || units_num_limit_recorder_.find(cost_segment_id) == units_num_limit_recorder_.end())) { + // std::cout << "target segment merged, stop modification." 
<< std::endl; + manager_mutex_.unlock(); + return false; + } // FilterCacheHeapItem(const uint32_t& id, const uint32_t& cnt, const uint16_t& units, const double& heap_value) // we can try filter unit modification, reminded that this modification will modify units num of two segments // so we need to upsert new nodes of these two segments into benefit heap and cost heap std::vector new_benefit_nodes, new_cost_nodes; + // std::vector segment_ids; /* if (benefit_node->current_units_num + 1 < benefit_node->units_num_limit) { @@ -399,11 +551,21 @@ bool FilterCacheHeapManager::try_modify(FilterCacheModifyResult& result) { cost_node->units_num_limit ) ); - // already make ready for upsert - benefit_heap_.batch_upsert(new_benefit_nodes); - cost_heap_.batch_upsert(new_cost_nodes); - // write result + // segment_ids.emplace_back(benefit_node->segment_id); + // segment_ids.emplace_back(cost_node->segment_id); + + // // print nodes + // std::cout << std::endl; + // std::cout << "before modification, print nodes." << std::endl; + // std::cout << "benefit nodes: " << std::endl; + // benefit_heap_.heap_print(segment_ids, true); + // std::cout << "cost nodes: " << std::endl; + // cost_heap_.heap_print(segment_ids, true); + + // write result before real upsert, + // noticed that batch_upsert will also modify content of these nodes, + // so we need to save contents right now result.enable_segment_id = benefit_node->segment_id; result.disable_segment_id = cost_node->segment_id; result.enable_segment_units_num = benefit_node->current_units_num; @@ -413,6 +575,37 @@ bool FilterCacheHeapManager::try_modify(FilterCacheModifyResult& result) { result.enable_benefit = benefit; result.disable_cost = cost; + // std::cout << std::endl; + // std::cout << "enable one unit for segment " << result.enable_segment_id + // << ", disable one unit for segment " << result.disable_segment_id + // << ", enable from " << result.enable_segment_units_num << " to " << result.enable_segment_next_units_num + // << ", disable from " << result.disable_segment_units_num << " to " << result.disable_segment_next_units_num + // << ", benefit: " << result.enable_benefit << ", cost: " << result.disable_cost << std::endl; + + // already make ready for upsert + benefit_heap_.batch_upsert(new_benefit_nodes); + cost_heap_.batch_upsert(new_cost_nodes); + + // // print nodes + // std::cout << std::endl; + // std::cout << "after modification, print nodes." << std::endl; + // std::cout << "benefit nodes: " << std::endl; + // benefit_heap_.heap_print(segment_ids, true); + // std::cout << "cost nodes: " << std::endl; + // cost_heap_.heap_print(segment_ids, true); + + // check whether is heap? 
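// [editor's sketch -- not part of the patch] The gate that try_modify()
// applies above before transferring one filter unit, reduced to its core:
// both heap tops must exist, be alive, and belong to different segments, and
// the benefit of enabling one unit must beat the cost of disabling one by at
// least PURE_BENEFIT_BOUND (stand-in types; assumes <cstdint>):
struct HeapTopSketch {
  uint32_t segment_id;
  double benefit_or_cost;
  bool is_alive;
};
inline bool should_modify_sketch(const HeapTopSketch* benefit_top,
                                 const HeapTopSketch* cost_top,
                                 double pure_benefit_bound) {
  if (benefit_top == nullptr || cost_top == nullptr) return false;
  if (!benefit_top->is_alive || !cost_top->is_alive) return false;
  if (benefit_top->segment_id == cost_top->segment_id) return false;
  return benefit_top->benefit_or_cost - cost_top->benefit_or_cost >=
         pure_benefit_bound;  // net gain must clear the threshold
}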
+ // benefit_heap_.heap_check(true); + // cost_heap_.heap_check(false); + // for (uint32_t &segment_id : segment_ids) { + // assert(heap_visit_cnt_recorder_.find(segment_id) != heap_visit_cnt_recorder_.end()); + // assert(units_num_limit_recorder_.find(segment_id) != units_num_limit_recorder_.end()); + // } + + assert(benefit_heap_.heap_size() == heap_visit_cnt_recorder_.size()); + assert(benefit_heap_.heap_size() == units_num_limit_recorder_.size()); + assert(benefit_heap_.heap_size() == cost_heap_.heap_size()); + // return nothing, result already written into var result manager_mutex_.unlock(); @@ -420,34 +613,49 @@ bool FilterCacheHeapManager::try_modify(FilterCacheModifyResult& result) { return true; } -void FilterCacheHeapManager::sync_visit_cnt(std::map& current_visit_cnt_recorder) { +void FilterCacheHeapManager::sync_visit_cnt(std::map& recent_visit_cnt_recorder) { manager_mutex_.lock(); std::vector sync_nodes; std::vector sync_segment_ids; auto heap_it = heap_visit_cnt_recorder_.begin(); - auto current_it = current_visit_cnt_recorder.begin(); + auto recent_it = recent_visit_cnt_recorder.begin(); while (heap_it != heap_visit_cnt_recorder_.end() && - current_it != current_visit_cnt_recorder.end()) { - if (heap_it->first < current_it->first) { + recent_it != recent_visit_cnt_recorder.end()) { + if (heap_it->first < recent_it->first) { heap_it ++; - } else if (heap_it->first > current_it->first) { - current_it ++; + } else if (heap_it->first > recent_it->first) { + recent_it ++; } else { - // heap_it->first == current_it->first - assert(heap_it->first == current_it->first); + // heap_it->first == recent_it->first + assert(heap_it->first == recent_it->first); int64_t old_visit_cnt = heap_it->second; - int64_t cur_visit_cnt = current_it->second; - if (std::abs(cur_visit_cnt-old_visit_cnt) > VISIT_CNT_UPDATE_BOUND) { - heap_it->second = current_it->second; // remember to update heap visit cnt recorder - sync_segment_ids.emplace_back(current_it->first); + int64_t rec_visit_cnt = recent_it->second; + if (std::abs(rec_visit_cnt-old_visit_cnt) > VISIT_CNT_UPDATE_BOUND) { + heap_it->second = recent_it->second; // remember to update heap visit cnt recorder + sync_segment_ids.emplace_back(recent_it->first); + // std::cout << "segment " << heap_it->first << " cnt diff is " << std::abs(rec_visit_cnt-old_visit_cnt) + // << ", bigger than " << VISIT_CNT_UPDATE_BOUND << ", start to sync." << std::endl; + } + else { + // std::cout << "segment " << heap_it->first << " cnt diff is " << std::abs(rec_visit_cnt-old_visit_cnt) + // << ", smaller than / euqal to " << VISIT_CNT_UPDATE_BOUND << ", do not sync." 
<< std::endl; } - // heap_it ++; - current_it ++; + heap_it ++; + recent_it ++; } } + // // print sync nodes before + // std::cout << std::endl; + // std::cout << "before sync visit count, print sync nodes: " << std::endl; + // std::cout << "benefit heap:" << std::endl; + // benefit_heap_.heap_print(sync_segment_ids, true); + // std::cout << "cost heap:" << std::endl; + // cost_heap_.heap_print(sync_segment_ids, true); + // std::cout << std::endl; + // query nodes in heap std::vector sync_benefit_nodes, sync_cost_nodes; benefit_heap_.batch_query(sync_segment_ids, sync_benefit_nodes); @@ -455,16 +663,18 @@ void FilterCacheHeapManager::sync_visit_cnt(std::map& curren // update visit cnt and benefit/cost in these nodes for (FilterCacheHeapNode& sync_benefit_node : sync_benefit_nodes) { - if (sync_benefit_node != nullptr) { - sync_benefit_node->approx_visit_cnt = current_visit_cnt_recorder[sync_benefit_node->segment_id]; + assert(sync_benefit_node != nullptr); + if (LIKELY(sync_benefit_node != nullptr)) { + sync_benefit_node->approx_visit_cnt = recent_visit_cnt_recorder[sync_benefit_node->segment_id]; sync_benefit_node->benefit_or_cost = StandardBenefitWithMaxBound(sync_benefit_node->approx_visit_cnt, sync_benefit_node->current_units_num, sync_benefit_node->units_num_limit); } } for (FilterCacheHeapNode& sync_cost_node : sync_cost_nodes) { - if (sync_cost_node != nullptr) { - sync_cost_node->approx_visit_cnt = current_visit_cnt_recorder[sync_cost_node->segment_id]; + assert(sync_cost_node != nullptr); + if (LIKELY(sync_cost_node != nullptr)) { + sync_cost_node->approx_visit_cnt = recent_visit_cnt_recorder[sync_cost_node->segment_id]; sync_cost_node->benefit_or_cost = StandardCostWithMinBound(sync_cost_node->approx_visit_cnt, sync_cost_node->current_units_num, MIN_UNITS_NUM); @@ -474,7 +684,6 @@ void FilterCacheHeapManager::sync_visit_cnt(std::map& curren // upsert nodes into benefit heap and cost heap // benefit_heap_.batch_upsert(sync_benefit_nodes); // cost_heap_.batch_upsert(sync_cost_nodes); - // notice that we already updated these nodes in heap, we only need to rebuild heap // but heap.upsert include the step of checking whether these segments already in heap @@ -482,6 +691,28 @@ void FilterCacheHeapManager::sync_visit_cnt(std::map& curren benefit_heap_.rebuild_heap(); cost_heap_.rebuild_heap(); + // check whether is heap? 
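// [editor's sketch -- not part of the patch] The hysteresis rule that
// sync_visit_cnt() applies above: a heap node's visit count is refreshed (and
// the heaps rebuilt) only when the recent estimate drifts more than
// VISIT_CNT_UPDATE_BOUND away from the stored value, which bounds how often
// rebuild_heap() has to run (assumes <cstdint>, <cstdlib>):
inline bool needs_sync_sketch(int64_t heap_cnt, int64_t recent_cnt,
                              int64_t update_bound) {
  return std::abs(recent_cnt - heap_cnt) > update_bound;
}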
+ // benefit_heap_.heap_check(true); + // cost_heap_.heap_check(false); + // for (uint32_t &segment_id : sync_segment_ids) { + // assert(heap_visit_cnt_recorder_.find(segment_id) != heap_visit_cnt_recorder_.end()); + // assert(heap_visit_cnt_recorder_[segment_id] == recent_visit_cnt_recorder[segment_id]); + // assert(units_num_limit_recorder_.find(segment_id) != units_num_limit_recorder_.end()); + // } + + assert(benefit_heap_.heap_size() == heap_visit_cnt_recorder_.size()); + assert(benefit_heap_.heap_size() == units_num_limit_recorder_.size()); + assert(benefit_heap_.heap_size() == cost_heap_.heap_size()); + + // // print sync nodes after + // std::cout << std::endl; + // std::cout << "after sync visit count, print sync nodes: " << std::endl; + // std::cout << "benefit heap:" << std::endl; + // benefit_heap_.heap_print(sync_segment_ids, true); + // std::cout << "cost heap:" << std::endl; + // cost_heap_.heap_print(sync_segment_ids, true); + // std::cout << std::endl; + manager_mutex_.unlock(); } @@ -502,15 +733,27 @@ void FilterCacheHeapManager::sync_units_num_limit(std::map& } else { // origin_it->first == current_it->first assert(origin_it->first == current_it->first); - assert(current_it->second <= MAX_UNITS_NUM); + assert(current_it->second <= MAX_UNITS_NUM && current_it->second >= MIN_UNITS_NUM); if (origin_it->second != current_it->second) { origin_it->second = current_it->second; sync_segment_ids.emplace_back(current_it->first); } + uint32_t segment_id = origin_it->first; + assert(units_num_limit_recorder_[segment_id] == current_it->second); + origin_it ++; current_it ++; } } + // // print sync nodes before + // std::cout << std::endl; + // std::cout << "before sync units limit, print sync nodes: " << std::endl; + // std::cout << "benefit heap:" << std::endl; + // benefit_heap_.heap_print(sync_segment_ids, true); + // std::cout << "cost heap:" << std::endl; + // cost_heap_.heap_print(sync_segment_ids, true); + // std::cout << std::endl; + // query nodes in heap std::vector sync_benefit_nodes, sync_cost_nodes; benefit_heap_.batch_query(sync_segment_ids, sync_benefit_nodes); @@ -518,20 +761,24 @@ void FilterCacheHeapManager::sync_units_num_limit(std::map& // update units num limit, units num and benefit/cost in these nodes for (FilterCacheHeapNode& sync_benefit_node : sync_benefit_nodes) { - if (sync_benefit_node != nullptr) { + assert(sync_benefit_node != nullptr); + if (LIKELY(sync_benefit_node != nullptr)) { sync_benefit_node->units_num_limit = current_units_num_limit_recorder[sync_benefit_node->segment_id]; sync_benefit_node->current_units_num = std::min(sync_benefit_node->units_num_limit, sync_benefit_node->current_units_num); + assert(sync_benefit_node->units_num_limit >= sync_benefit_node->current_units_num); sync_benefit_node->benefit_or_cost = StandardBenefitWithMaxBound(sync_benefit_node->approx_visit_cnt, sync_benefit_node->current_units_num, sync_benefit_node->units_num_limit); } } for (FilterCacheHeapNode& sync_cost_node : sync_cost_nodes) { - if (sync_cost_node != nullptr) { + assert(sync_cost_node != nullptr); + if (LIKELY(sync_cost_node != nullptr)) { sync_cost_node->units_num_limit = current_units_num_limit_recorder[sync_cost_node->segment_id]; sync_cost_node->current_units_num = std::min(sync_cost_node->units_num_limit, sync_cost_node->current_units_num); + assert(sync_cost_node->units_num_limit >= sync_cost_node->current_units_num); sync_cost_node->benefit_or_cost = StandardCostWithMinBound(sync_cost_node->approx_visit_cnt, sync_cost_node->current_units_num, 
MIN_UNITS_NUM); @@ -549,488 +796,509 @@ void FilterCacheHeapManager::sync_units_num_limit(std::map& benefit_heap_.rebuild_heap(); cost_heap_.rebuild_heap(); + // check whether is heap? + // benefit_heap_.heap_check(true); + // cost_heap_.heap_check(false); + // for (uint32_t &segment_id : sync_segment_ids) { + // assert(heap_visit_cnt_recorder_.find(segment_id) != heap_visit_cnt_recorder_.end()); + // assert(units_num_limit_recorder_.find(segment_id) != units_num_limit_recorder_.end()); + // } + + assert(benefit_heap_.heap_size() == heap_visit_cnt_recorder_.size()); + assert(benefit_heap_.heap_size() == units_num_limit_recorder_.size()); + assert(benefit_heap_.heap_size() == cost_heap_.heap_size()); + + // // print sync nodes after + // std::cout << std::endl; + // std::cout << "after sync units limit, print sync nodes: " << std::endl; + // std::cout << "benefit heap:" << std::endl; + // benefit_heap_.heap_print(sync_segment_ids, true); + // std::cout << "cost heap:" << std::endl; + // cost_heap_.heap_print(sync_segment_ids, true); + // std::cout << std::endl; + manager_mutex_.unlock(); } -void FilterCacheHeapManager::debug() { - std::vector items; - std::vector segment_ids; - std::map current_visit_cnt_recorder; - std::map current_units_num_limit_recorder; - std::map b_heap_index; - std::vector b_heap; - std::map c_heap_index; - std::vector c_heap; - std::fstream f_heap; - f_heap.open("/pg_wal/ycc/heap.log", std::ios::out | std::ios::app); - // FilterCacheHeapItem(const uint32_t& id, const uint32_t& cnt, const uint16_t& units, - // const double& heap_value, const uint16_t& limit) - // 1. try to insert some new data - f_heap << "[DEBUG] debug step 1 : batch insert" << std::endl << std::endl; - for (uint32_t id = 0; id < 70; id++) { - items.emplace_back(id % 70, (id % 70) * 10, (id % 70) / 10, 0, MAX_UNITS_NUM); - } - batch_upsert(items); - benefit_heap_.heap_index(b_heap_index); - benefit_heap_.heap(b_heap); - cost_heap_.heap_index(c_heap_index); - cost_heap_.heap(c_heap); - f_heap << "[DEBUG] step1 b_heap_index : " << std::endl; - for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step1 b_heap : " << std::endl; - for (FilterCacheHeapNode& node : b_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step1 c_heap_index : " << std::endl; - for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step1 c_heap : " << std::endl; - for (FilterCacheHeapNode& node : c_heap) { - 
f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step1 visit_cnt_recorder : " << std::endl; - for (auto it = heap_visit_cnt_recorder_.begin(); - it != heap_visit_cnt_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - f_heap << "[DEBUG] step1 units_limit_recorder : " << std::endl; - for (auto it = units_num_limit_recorder_.begin(); - it != units_num_limit_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - - // 2. try to update old data - f_heap << std::endl << std::endl<< "[DEBUG] debug step 2 : batch update (using upsert)" << std::endl << std::endl; - items.clear(); - for (uint32_t id = 0; id < 70; id++) { - items.emplace_back(id % 70, (id % 70) * std::pow(10, (id % 70) / 10), (id % 70) / 10, 0, MAX_UNITS_NUM); - } - batch_upsert(items); - benefit_heap_.heap_index(b_heap_index); - benefit_heap_.heap(b_heap); - cost_heap_.heap_index(c_heap_index); - cost_heap_.heap(c_heap); - f_heap << "[DEBUG] step2 b_heap_index : " << std::endl; - for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step2 b_heap : " << std::endl; - for (FilterCacheHeapNode& node : b_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step2 c_heap_index : " << std::endl; - for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step2 c_heap : " << std::endl; - for (FilterCacheHeapNode& node : c_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step2 visit_cnt_recorder : " << std::endl; - for (auto it = heap_visit_cnt_recorder_.begin(); - it != heap_visit_cnt_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - f_heap << "[DEBUG] step2 units_limit_recorder : " << std::endl; - for (auto it = units_num_limit_recorder_.begin(); - it != units_num_limit_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << 
std::endl; - } - - // 3. try to delete some data - f_heap << std::endl << std::endl<< "[DEBUG] debug step 3 : batch delete" << std::endl << std::endl; - items.clear(); - segment_ids.clear(); - for (uint32_t i = 0; i < 10; i++) { - segment_ids.emplace_back(i); - } - for (uint32_t i = 60; i < 100; i++) { - segment_ids.emplace_back(i); - } - batch_delete(segment_ids); - benefit_heap_.heap_index(b_heap_index); - benefit_heap_.heap(b_heap); - cost_heap_.heap_index(c_heap_index); - cost_heap_.heap(c_heap); - f_heap << "[DEBUG] step3 b_heap_index : " << std::endl; - for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step3 b_heap : " << std::endl; - for (FilterCacheHeapNode& node : b_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step3 c_heap_index : " << std::endl; - for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step3 c_heap : " << std::endl; - for (FilterCacheHeapNode& node : c_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step3 visit_cnt_recorder : " << std::endl; - for (auto it = heap_visit_cnt_recorder_.begin(); - it != heap_visit_cnt_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - f_heap << "[DEBUG] step3 units_limit_recorder : " << std::endl; - for (auto it = units_num_limit_recorder_.begin(); - it != units_num_limit_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - - // 4. 
try to sync visit cnt - f_heap << std::endl << std::endl<< "[DEBUG] debug step 4 : sync visit cnt " << std::endl << std::endl; - for (uint32_t id = 0; id < 40; id++) { - if (id % 2 == 0) { - current_visit_cnt_recorder.insert(std::make_pair(id, (id % 70) * std::pow(10, (id % 70) / 10) + 101010)); - } - } - for (uint32_t id = 40; id < 60; id++) { - current_visit_cnt_recorder.insert(std::make_pair(id, (id % 70) * std::pow(10, (id % 70) / 10) + 101010)); - } - sync_visit_cnt(current_visit_cnt_recorder); - benefit_heap_.heap_index(b_heap_index); - benefit_heap_.heap(b_heap); - cost_heap_.heap_index(c_heap_index); - cost_heap_.heap(c_heap); - f_heap << "[DEBUG] step4 b_heap_index : " << std::endl; - for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step4 b_heap : " << std::endl; - for (FilterCacheHeapNode& node : b_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step4 c_heap_index : " << std::endl; - for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step4 c_heap : " << std::endl; - for (FilterCacheHeapNode& node : c_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step4 visit_cnt_recorder : " << std::endl; - for (auto it = heap_visit_cnt_recorder_.begin(); - it != heap_visit_cnt_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - f_heap << "[DEBUG] step4 units_limit_recorder : " << std::endl; - for (auto it = units_num_limit_recorder_.begin(); - it != units_num_limit_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - - // 5. 
try to decrease units limit - f_heap << std::endl << std::endl<< "[DEBUG] debug step 5 : decrease units limit " << std::endl << std::endl; - for (uint32_t id = 0; id < 40; id++) { - if (id % 2 == 0) { - current_units_num_limit_recorder.insert(std::make_pair(id, 0)); - } else { - current_units_num_limit_recorder.insert(std::make_pair(id, 1)); - } - } - for (uint32_t id = 40; id < 50; id++) { - current_units_num_limit_recorder.insert(std::make_pair(id, 3)); - } - for (uint32_t id = 50; id < 70; id++) { - current_units_num_limit_recorder.insert(std::make_pair(id, 5)); - } - sync_units_num_limit(current_units_num_limit_recorder); - benefit_heap_.heap_index(b_heap_index); - benefit_heap_.heap(b_heap); - cost_heap_.heap_index(c_heap_index); - cost_heap_.heap(c_heap); - f_heap << "[DEBUG] step5 b_heap_index : " << std::endl; - for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step5 b_heap : " << std::endl; - for (FilterCacheHeapNode& node : b_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step5 c_heap_index : " << std::endl; - for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step5 c_heap : " << std::endl; - for (FilterCacheHeapNode& node : c_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step5 visit_cnt_recorder : " << std::endl; - for (auto it = heap_visit_cnt_recorder_.begin(); - it != heap_visit_cnt_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - f_heap << "[DEBUG] step5 units_limit_recorder : " << std::endl; - for (auto it = units_num_limit_recorder_.begin(); - it != units_num_limit_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - - // 6. 
try to increase units limit - f_heap << std::endl << std::endl<< "[DEBUG] debug step 6 : increase units limit " << std::endl << std::endl; - for (uint32_t id = 0; id < 40; id++) { - if (id % 2 == 0) { - current_units_num_limit_recorder[id] = 3; - } else { - current_units_num_limit_recorder[id] = 4; - } - } - for (uint32_t id = 40; id < 50; id++) { - current_units_num_limit_recorder[id] = 5; - } - for (uint32_t id = 50; id < 70; id++) { - current_units_num_limit_recorder[id] = 6; - } - sync_units_num_limit(current_units_num_limit_recorder); - benefit_heap_.heap_index(b_heap_index); - benefit_heap_.heap(b_heap); - cost_heap_.heap_index(c_heap_index); - cost_heap_.heap(c_heap); - f_heap << "[DEBUG] step6 b_heap_index : " << std::endl; - for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step6 b_heap : " << std::endl; - for (FilterCacheHeapNode& node : b_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step6 c_heap_index : " << std::endl; - for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step6 c_heap : " << std::endl; - for (FilterCacheHeapNode& node : c_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step6 visit_cnt_recorder : " << std::endl; - for (auto it = heap_visit_cnt_recorder_.begin(); - it != heap_visit_cnt_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - f_heap << "[DEBUG] step6 units_limit_recorder : " << std::endl; - for (auto it = units_num_limit_recorder_.begin(); - it != units_num_limit_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - - // 7. 
try to loop modification - f_heap << std::endl << std::endl<< "[DEBUG] debug step 7 : loop try_modify " << std::endl << std::endl; - f_heap << "[DEBUG] step7 loop start : " << std::endl; - FilterCacheModifyResult result; - while (try_modify(result)) { - f_heap << "enable segment -> " << "id : " << result.enable_segment_id; - f_heap << " , prev units num : " << result.enable_segment_units_num; - f_heap << " , benefit : " << result.enable_benefit << std::endl; - f_heap << "disable segment -> " << "id : " << result.disable_segment_id; - f_heap << " , prev units num : " << result.disable_segment_units_num; - f_heap << " , cost : " << result.disable_cost << std::endl; - } - // write final indexs and heaps - benefit_heap_.heap_index(b_heap_index); - benefit_heap_.heap(b_heap); - cost_heap_.heap_index(c_heap_index); - cost_heap_.heap(c_heap); - f_heap << "[DEBUG] step7 b_heap_index : " << std::endl; - for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step7 b_heap : " << std::endl; - for (FilterCacheHeapNode& node : b_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step7 c_heap_index : " << std::endl; - for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { - FilterCacheHeapNode node = it->second; - f_heap << it->first << " -> "; - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step7 c_heap : " << std::endl; - for (FilterCacheHeapNode& node : c_heap) { - f_heap << " id : " << node->segment_id; - f_heap << " , cnt : " << node->approx_visit_cnt; - f_heap << " , units : " << node->current_units_num; - f_heap << " , value : " << node->benefit_or_cost; - f_heap << " , limit : " << node->units_num_limit; - f_heap << " , alive : " << node->is_alive << std::endl; - } - f_heap << "[DEBUG] step7 visit_cnt_recorder : " << std::endl; - for (auto it = heap_visit_cnt_recorder_.begin(); - it != heap_visit_cnt_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - f_heap << "[DEBUG] step7 units_limit_recorder : " << std::endl; - for (auto it = units_num_limit_recorder_.begin(); - it != units_num_limit_recorder_.end(); it++) { - f_heap << it->first << " -> " << it->second << std::endl; - } - - f_heap.close(); -} +// void FilterCacheHeapManager::debug() { +// std::vector items; +// std::vector segment_ids; +// std::map current_visit_cnt_recorder; +// std::map current_units_num_limit_recorder; +// std::map b_heap_index; +// std::vector b_heap; +// std::map c_heap_index; +// std::vector c_heap; +// std::fstream f_heap; +// 
f_heap.open("/home/guoteng_20241228_135/WaLSM+/log/heap.log", std::ios::out | std::ios::app); +// // FilterCacheHeapItem(const uint32_t& id, const uint32_t& cnt, const uint16_t& units, +// // const double& heap_value, const uint16_t& limit) +// // 1. try to insert some new data +// f_heap << "[DEBUG] debug step 1 : batch insert" << std::endl << std::endl; +// for (uint32_t id = 0; id < 70; id++) { +// items.emplace_back(id % 70, (id % 70) * 10, (id % 70) / 10, 0, MAX_UNITS_NUM); +// } +// batch_upsert(items); +// benefit_heap_.heap_index(b_heap_index); +// benefit_heap_.heap(b_heap); +// cost_heap_.heap_index(c_heap_index); +// cost_heap_.heap(c_heap); +// f_heap << "[DEBUG] step1 b_heap_index : " << std::endl; +// for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step1 b_heap : " << std::endl; +// for (FilterCacheHeapNode& node : b_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step1 c_heap_index : " << std::endl; +// for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step1 c_heap : " << std::endl; +// for (FilterCacheHeapNode& node : c_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step1 visit_cnt_recorder : " << std::endl; +// for (auto it = heap_visit_cnt_recorder_.begin(); +// it != heap_visit_cnt_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } +// f_heap << "[DEBUG] step1 units_limit_recorder : " << std::endl; +// for (auto it = units_num_limit_recorder_.begin(); +// it != units_num_limit_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } + +// // 2. 
try to update old data +// f_heap << std::endl << std::endl<< "[DEBUG] debug step 2 : batch update (using upsert)" << std::endl << std::endl; +// items.clear(); +// for (uint32_t id = 0; id < 70; id++) { +// items.emplace_back(id % 70, (id % 70) * std::pow(10, (id % 70) / 10), (id % 70) / 10, 0, MAX_UNITS_NUM); +// } +// batch_upsert(items); +// benefit_heap_.heap_index(b_heap_index); +// benefit_heap_.heap(b_heap); +// cost_heap_.heap_index(c_heap_index); +// cost_heap_.heap(c_heap); +// f_heap << "[DEBUG] step2 b_heap_index : " << std::endl; +// for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step2 b_heap : " << std::endl; +// for (FilterCacheHeapNode& node : b_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step2 c_heap_index : " << std::endl; +// for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step2 c_heap : " << std::endl; +// for (FilterCacheHeapNode& node : c_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step2 visit_cnt_recorder : " << std::endl; +// for (auto it = heap_visit_cnt_recorder_.begin(); +// it != heap_visit_cnt_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } +// f_heap << "[DEBUG] step2 units_limit_recorder : " << std::endl; +// for (auto it = units_num_limit_recorder_.begin(); +// it != units_num_limit_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } + +// // 3. 
try to delete some data +// f_heap << std::endl << std::endl<< "[DEBUG] debug step 3 : batch delete" << std::endl << std::endl; +// items.clear(); +// segment_ids.clear(); +// for (uint32_t i = 0; i < 10; i++) { +// segment_ids.emplace_back(i); +// } +// for (uint32_t i = 60; i < 100; i++) { +// segment_ids.emplace_back(i); +// } +// batch_delete(segment_ids); +// benefit_heap_.heap_index(b_heap_index); +// benefit_heap_.heap(b_heap); +// cost_heap_.heap_index(c_heap_index); +// cost_heap_.heap(c_heap); +// f_heap << "[DEBUG] step3 b_heap_index : " << std::endl; +// for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step3 b_heap : " << std::endl; +// for (FilterCacheHeapNode& node : b_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step3 c_heap_index : " << std::endl; +// for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step3 c_heap : " << std::endl; +// for (FilterCacheHeapNode& node : c_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step3 visit_cnt_recorder : " << std::endl; +// for (auto it = heap_visit_cnt_recorder_.begin(); +// it != heap_visit_cnt_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } +// f_heap << "[DEBUG] step3 units_limit_recorder : " << std::endl; +// for (auto it = units_num_limit_recorder_.begin(); +// it != units_num_limit_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } + +// // 4. 
try to sync visit cnt +// f_heap << std::endl << std::endl<< "[DEBUG] debug step 4 : sync visit cnt " << std::endl << std::endl; +// for (uint32_t id = 0; id < 40; id++) { +// if (id % 2 == 0) { +// current_visit_cnt_recorder.insert(std::make_pair(id, (id % 70) * std::pow(10, (id % 70) / 10) + 101010)); +// } +// } +// for (uint32_t id = 40; id < 60; id++) { +// current_visit_cnt_recorder.insert(std::make_pair(id, (id % 70) * std::pow(10, (id % 70) / 10) + 101010)); +// } +// sync_visit_cnt(current_visit_cnt_recorder); +// benefit_heap_.heap_index(b_heap_index); +// benefit_heap_.heap(b_heap); +// cost_heap_.heap_index(c_heap_index); +// cost_heap_.heap(c_heap); +// f_heap << "[DEBUG] step4 b_heap_index : " << std::endl; +// for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step4 b_heap : " << std::endl; +// for (FilterCacheHeapNode& node : b_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step4 c_heap_index : " << std::endl; +// for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step4 c_heap : " << std::endl; +// for (FilterCacheHeapNode& node : c_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step4 visit_cnt_recorder : " << std::endl; +// for (auto it = heap_visit_cnt_recorder_.begin(); +// it != heap_visit_cnt_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } +// f_heap << "[DEBUG] step4 units_limit_recorder : " << std::endl; +// for (auto it = units_num_limit_recorder_.begin(); +// it != units_num_limit_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } + +// // 5. 
try to decrease units limit +// f_heap << std::endl << std::endl<< "[DEBUG] debug step 5 : decrease units limit " << std::endl << std::endl; +// for (uint32_t id = 0; id < 40; id++) { +// if (id % 2 == 0) { +// current_units_num_limit_recorder.insert(std::make_pair(id, 0)); +// } else { +// current_units_num_limit_recorder.insert(std::make_pair(id, 1)); +// } +// } +// for (uint32_t id = 40; id < 50; id++) { +// current_units_num_limit_recorder.insert(std::make_pair(id, 3)); +// } +// for (uint32_t id = 50; id < 70; id++) { +// current_units_num_limit_recorder.insert(std::make_pair(id, 5)); +// } +// sync_units_num_limit(current_units_num_limit_recorder); +// benefit_heap_.heap_index(b_heap_index); +// benefit_heap_.heap(b_heap); +// cost_heap_.heap_index(c_heap_index); +// cost_heap_.heap(c_heap); +// f_heap << "[DEBUG] step5 b_heap_index : " << std::endl; +// for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step5 b_heap : " << std::endl; +// for (FilterCacheHeapNode& node : b_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step5 c_heap_index : " << std::endl; +// for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step5 c_heap : " << std::endl; +// for (FilterCacheHeapNode& node : c_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step5 visit_cnt_recorder : " << std::endl; +// for (auto it = heap_visit_cnt_recorder_.begin(); +// it != heap_visit_cnt_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } +// f_heap << "[DEBUG] step5 units_limit_recorder : " << std::endl; +// for (auto it = units_num_limit_recorder_.begin(); +// it != units_num_limit_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } + +// // 6. 
try to increase units limit +// f_heap << std::endl << std::endl<< "[DEBUG] debug step 6 : increase units limit " << std::endl << std::endl; +// for (uint32_t id = 0; id < 40; id++) { +// if (id % 2 == 0) { +// current_units_num_limit_recorder[id] = 3; +// } else { +// current_units_num_limit_recorder[id] = 4; +// } +// } +// for (uint32_t id = 40; id < 50; id++) { +// current_units_num_limit_recorder[id] = 5; +// } +// for (uint32_t id = 50; id < 70; id++) { +// current_units_num_limit_recorder[id] = 6; +// } +// sync_units_num_limit(current_units_num_limit_recorder); +// benefit_heap_.heap_index(b_heap_index); +// benefit_heap_.heap(b_heap); +// cost_heap_.heap_index(c_heap_index); +// cost_heap_.heap(c_heap); +// f_heap << "[DEBUG] step6 b_heap_index : " << std::endl; +// for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step6 b_heap : " << std::endl; +// for (FilterCacheHeapNode& node : b_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step6 c_heap_index : " << std::endl; +// for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step6 c_heap : " << std::endl; +// for (FilterCacheHeapNode& node : c_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step6 visit_cnt_recorder : " << std::endl; +// for (auto it = heap_visit_cnt_recorder_.begin(); +// it != heap_visit_cnt_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } +// f_heap << "[DEBUG] step6 units_limit_recorder : " << std::endl; +// for (auto it = units_num_limit_recorder_.begin(); +// it != units_num_limit_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } + +// // 7. 
try to loop modification +// f_heap << std::endl << std::endl<< "[DEBUG] debug step 7 : loop try_modify " << std::endl << std::endl; +// f_heap << "[DEBUG] step7 loop start : " << std::endl; +// FilterCacheModifyResult result; +// while (try_modify(result)) { +// f_heap << "enable segment -> " << "id : " << result.enable_segment_id; +// f_heap << " , prev units num : " << result.enable_segment_units_num; +// f_heap << " , benefit : " << result.enable_benefit << std::endl; +// f_heap << "disable segment -> " << "id : " << result.disable_segment_id; +// f_heap << " , prev units num : " << result.disable_segment_units_num; +// f_heap << " , cost : " << result.disable_cost << std::endl; +// } +// // write final indexs and heaps +// benefit_heap_.heap_index(b_heap_index); +// benefit_heap_.heap(b_heap); +// cost_heap_.heap_index(c_heap_index); +// cost_heap_.heap(c_heap); +// f_heap << "[DEBUG] step7 b_heap_index : " << std::endl; +// for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step7 b_heap : " << std::endl; +// for (FilterCacheHeapNode& node : b_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step7 c_heap_index : " << std::endl; +// for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { +// FilterCacheHeapNode node = it->second; +// f_heap << it->first << " -> "; +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step7 c_heap : " << std::endl; +// for (FilterCacheHeapNode& node : c_heap) { +// f_heap << " id : " << node->segment_id; +// f_heap << " , cnt : " << node->approx_visit_cnt; +// f_heap << " , units : " << node->current_units_num; +// f_heap << " , value : " << node->benefit_or_cost; +// f_heap << " , limit : " << node->units_num_limit; +// f_heap << " , alive : " << node->is_alive << std::endl; +// } +// f_heap << "[DEBUG] step7 visit_cnt_recorder : " << std::endl; +// for (auto it = heap_visit_cnt_recorder_.begin(); +// it != heap_visit_cnt_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } +// f_heap << "[DEBUG] step7 units_limit_recorder : " << std::endl; +// for (auto it = units_num_limit_recorder_.begin(); +// it != units_num_limit_recorder_.end(); it++) { +// f_heap << it->first << " -> " << it->second << std::endl; +// } + +// f_heap.close(); +// } } \ No newline at end of file diff --git a/db/art/filter_cache_heap.h b/db/art/filter_cache_heap.h index ca9aea1aa..ab299ca9e 100644 --- a/db/art/filter_cache_heap.h +++ b/db/art/filter_cache_heap.h @@ -4,6 +4,7 @@ #include #include 
#include +#include #include #include #include @@ -182,6 +183,14 @@ class FilterCacheHeap { // return heap top FilterCacheHeapNode heap_top(); + // check heap_top's benefit/cost is smaller/greater than other items + void heap_check(bool max_heap); + + // print heap items of selected segments' ids + void heap_print(std::vector& needed_segment_ids, const bool should_exist); + + size_t heap_size() { assert(heap_.size() == heap_index_.size()); return heap_.size(); } + // pop one node with deleting node from heap_index_ // void pop(); @@ -206,12 +215,14 @@ class FilterCacheHeap { // only used in debug !!! void heap_index(std::map& heap_index) { + assert(false); heap_index.clear(); heap_index.insert(heap_index_.begin(), heap_index_.end()); } // only used in debug !!! void heap(std::vector& heap) { + assert(false); heap.clear(); heap.assign(heap_.begin(), heap_.end()); } @@ -219,18 +230,18 @@ class FilterCacheHeap { class FilterCacheHeapManager { private: - static FilterCacheHeap benefit_heap_; - static FilterCacheHeap cost_heap_; + FilterCacheHeap benefit_heap_; + FilterCacheHeap cost_heap_; // set heap node visit cnt = c_1, real estimated visit cnt = c_2 // we only update c_1 when | c_1 - c_2 | >= VISIT_CNT_UPDATE_BOUND // update c_1 means we need to update this recorder and heap // heap_visit_cnt_recorder: map // when filter cache call delete, this recorder will automately delete these merged segment ids // when filter cache call upsert, this recorder will automately upsert these segment ids - static std::map heap_visit_cnt_recorder_; - static std::map units_num_limit_recorder_; + std::map heap_visit_cnt_recorder_; + std::map units_num_limit_recorder_; // TODO: mutex can be optimized - static std::mutex manager_mutex_; + std::mutex manager_mutex_; public: FilterCacheHeapManager() { @@ -252,7 +263,7 @@ class FilterCacheHeapManager { // sync visit cnt in heap and real estimated visit cnt // reminded that we will not insert or delete nodes in this method // we only update these nodes that already exist in two heaps - void sync_visit_cnt(std::map& current_visit_cnt_recorder); + void sync_visit_cnt(std::map& recent_visit_cnt_recorder); // try to read benefit_heap top and cost_heap top, then judge whether we need to modify units num in filter cache // return true when we can modify units num of several segments, return false when we cannot @@ -268,9 +279,9 @@ class FilterCacheHeapManager { // because we need to keep heap visit cnt and recorder visit cnt the same void batch_upsert(std::vector& items); - // 1. try debug batch insert - // 2. try debug batch update(use batch_upsert) - void debug(); + // // 1. try debug batch insert + // // 2. 
try debug batch update(use batch_upsert)
+ // void debug();
 };

 }
diff --git a/db/art/filter_cache_item.cc b/db/art/filter_cache_item.cc
deleted file mode 100644
index 6f5cb1163..000000000
--- a/db/art/filter_cache_item.cc
+++ /dev/null
@@ -1,5 +0,0 @@
-#include "filter_cache_item.h"
-
-namespace ROCKSDB_NAMESPACE {
-
-} \ No newline at end of file
diff --git a/db/art/filter_cache_item.h b/db/art/filter_cache_item.h
deleted file mode 100644
index 8a591b88c..000000000
--- a/db/art/filter_cache_item.h
+++ /dev/null
@@ -1,53 +0,0 @@
-#pragma once
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include "macros.h"
-
-namespace ROCKSDB_NAMESPACE {
-
-// By default every segment starts with a total bits-per-key = 8 in the filter cache; as written segments accumulate,
-// once a certain threshold of the filter cache's max capacity is occupied (e.g. 80%), run GreedyAlgo on the planning problem and train the model.
-// Once the filter cache is full, switch to the filter cache's double-heap adjustment: a new segment only needs a model prediction,
-// then its node is inserted into the two heaps; a background thread keeps adjusting the two heaps on its own and continuously returns the adjustment results.
-// Given a result, we can either adjust the enabled filter units immediately, or save it and apply the adjustments in batches later.
-// See the design doc for details.
-
-
-// Remember to add the necessary English comments.
-// The filter cache is essentially a map whose key is the segment id (uint32_t) and whose value is this FilterCacheItem class.
-// Member functions should be defined in filter_cache_item.cc.
-class FilterCacheItem {
-private:
-    // Define the necessary member variables here and keep them private where possible.
-    // They can store the handle, the segment id, etc.
-    // STL classes such as vector and map are allowed.
-    // Do we need a mutex so that enabling/disabling filter units cannot conflict with checking keys against the units?
-public:
-    // Constructor; may initialize the member variables.
-    FilterCacheItem(const uint32_t& segment_id);
-
-    // Clean up member variables to avoid memory leaks; anything new-ed may need to be released here.
-    ~FilterCacheItem();
-
-    // Occupied memory: estimating the total space taken by the filter units in use is enough.
-    // Note that the returned size is the number of bits occupied, not bytes.
-    uint32_t approximate_size();
-
-    // Enable or disable filter units based on how many units are currently enabled:
-    // given the target number of units, decide whether to enable, disable, or do nothing.
-    // units_num : [MIN_UNITS_NUM, MAX_UNITS_NUM]
-    void enable_units(const uint32_t& units_num);
-
-    // Given a key, judge whether it may exist: check the units one by one starting from the first,
-    // and stop as soon as one unit reports the key absent.
-    // Return true only if every unit reports the key present, otherwise return false.
-    // If zero units are enabled, return true by default.
-    bool check_key(const std::string& key);
-};
-
-} \ No newline at end of file
diff --git a/db/art/global_filter_cache_context.cc b/db/art/global_filter_cache_context.cc
new file mode 100644
index 000000000..207a81757
--- /dev/null
+++ b/db/art/global_filter_cache_context.cc
@@ -0,0 +1,66 @@
+#include
+#include
+#include
+#include
+
+#include "db/art/filter_cache_client.h"
+
+namespace ROCKSDB_NAMESPACE {
+// TODO: add necessary filter cache info structures
+rocksdb::FilterCacheClient
+    global_filter_cache;  // already contains FilterCacheManager
+
+// TODO: mutex for updating these recorders below
+// will be locked when updating these recorders below, and unlocked after
+// updating ends.
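+// a minimal usage sketch (segment_id / level are illustrative names only):
+//   {
+//     std::lock_guard<std::mutex> guard(global_filter_cache_recorders_mutex);
+//     global_level_recorder[segment_id] = level;  // update under the lock
+//   }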
+// Should not be used when you just want to call filtercache's function +std::mutex global_filter_cache_recorders_mutex; + +// these global recorders need to be latest after every flush or compaction: +// std::map* level_recorder_ +// std::map>* segment_ranges_recorder_ +// std::map* unit_size_recorder_ +// you may need filter_cache_.range_seperators() to receive key range seperators +// exactly, if key k < seperators[i+1] and key k >= seperators[i], then key k +// hit key range i HeatBuckets::locate(const std::string& key) will tell you how +// to binary search corresponding key range for one key + +// segment_info_recorder save every segments' min key and max key +// but we only need to pass empty segment_info_recorder now +// TODO: it should contain all levels segments' min key and max key, then pass +// to filter cache client, but not used now this recorder will help decide the +// key ranges' num, but it dont work in current work you can try to modify macro +// APPROXIMATE_BUCKETS_NUM to decide the key ranges' num +std::unordered_map> + global_segment_info_recorder; + +// record every alive segments' level +// TODO: need to be latest all the time +std::map global_level_recorder; + +// record features num of every segments +// we choose max features num to define model feature num +// if you want to use a default features num, set MAX_FEATURES_NUM to non-zero +// value then do not insert any entry into this vector later +// TODO: we dont use this vector, so we set MAX_FEATURES_NUM to non-zero value +std::vector global_features_nums_except_level_0; + +// should be based level 0 visit cnt in a total long period +// simply we set level_0_base_count to 0, and use macro INIT_LEVEL_0_COUNT +// we can set this macro to ( PERIOD_COUNT * TRAIN_PERIODS ) * ( level 0 sorted +// runs num ) / ( max level 0 segments num ) +// TODO: modify INIT_LEVEL_0_COUNT to proper value +uint32_t global_level_0_base_count; + +// record interacting ranges and their rates of alive segments +// TODO: should be latest all the time +std::map> global_segment_ranges_recorder; + +// every segment's filter unit size is the same +// this recorder should hold all alive segment +// simply, you can also use default macro DEFAULT_UNIT_SIZE for all segments, +// just leave this recorder empty +// TODO: modify DEFAULT_UNIT_SIZE +std::map global_unit_size_recorder; + +} // namespace ROCKSDB_NAMESPACE \ No newline at end of file diff --git a/db/art/global_filter_cache_context.h b/db/art/global_filter_cache_context.h new file mode 100644 index 000000000..762bbfef9 --- /dev/null +++ b/db/art/global_filter_cache_context.h @@ -0,0 +1,68 @@ + +#include +#include +#include +#include +#include + +#include "db/art/filter_cache_client.h" + +namespace ROCKSDB_NAMESPACE { +// TODO: add necessary filter cache info structures +extern rocksdb::FilterCacheClient + global_filter_cache; // already contain FilterCacheManager + +// TODO: mutex for updating these recorders below +// will be locked when updating these recorders below, and unlock after +// updating ends +extern std::mutex global_filter_cache_recorders_mutex; + +// these global recorders need to be latest after every flush or compaction: +// std::map* level_recorder_ +// std::map>* segment_ranges_recorder_ +// std::map* unit_size_recorder_ +// you may need filter_cache_.range_seperators() to receive key range seperators +// exactly, if key k < seperators[i+1] and key k >= seperators[i], then key k +// hit key range i HeatBuckets::locate(const std::string& key) will tell you how +// to 
binary search corresponding key range for one key + +// segment_info_recorder save every segments' min key and max key +// but we only need to pass empty segment_info_recorder now +// TODO: it should contain all levels segments' min key and max key, then pass +// to filter cache client, but not used now this recorder will help decide the +// key ranges' num, but it dont work in current work you can try to modify macro +// APPROXIMATE_BUCKETS_NUM to decide the key ranges' num +extern std::unordered_map> + global_segment_info_recorder; + +// record every alive segments' level +// TODO: need to be latest all the time +extern std::map global_level_recorder; + +// record features num of every segments +// we choose max features num to define model feature num +// if you want to use a default features num, set MAX_FEATURES_NUM to non-zero +// value then do not insert any entry into this vector later +// TODO: we dont use this vector, so we set MAX_FEATURES_NUM to non-zero value +extern std::vector global_features_nums_except_level_0; + +// should be based level 0 visit cnt in a total long period +// simply we set level_0_base_count to 0, and use macro INIT_LEVEL_0_COUNT +// we can set this macro to ( PERIOD_COUNT * TRAIN_PERIODS ) * ( level 0 sorted +// runs num ) / ( max level 0 segments num ) +// TODO: modify INIT_LEVEL_0_COUNT to proper value +extern uint32_t global_level_0_base_count; + +// record interacting ranges and their rates of alive segments +// TODO: should be latest all the time +extern std::map> + global_segment_ranges_recorder; + +// every segment's filter unit size is the same +// this recorder should hold all alive segment +// simply, you can also use default macro DEFAULT_UNIT_SIZE for all segments, +// just leave this recorder empty +// TODO: modify DEFAULT_UNIT_SIZE +extern std::map global_unit_size_recorder; + +} // namespace ROCKSDB_NAMESPACE \ No newline at end of file diff --git a/db/art/greedy_algo.cc b/db/art/greedy_algo.cc index 9aff8beed..bf2157316 100644 --- a/db/art/greedy_algo.cc +++ b/db/art/greedy_algo.cc @@ -27,9 +27,9 @@ void GreedyAlgo::solve(std::map& segment_algo_infos, segment_algo_helper_heap.end(), CompareSegmentAlgoHelper); - std::fstream f_algo; - f_algo.open("/pg_wal/ycc/algo.log", std::ios::out | std::ios::app); - f_algo << "[DEBUG] start to record algo : " << std::endl; + // std::fstream f_algo; + // f_algo.open("/home/guoteng_20241228_135/WaLSM+/log/algo.log", std::ios::out | std::ios::app); + // f_algo << "[DEBUG] start to record algo : " << std::endl; // current used space size (bits) of filter cache uint32_t current_cache_size = 0; @@ -44,11 +44,9 @@ void GreedyAlgo::solve(std::map& segment_algo_infos, SegmentAlgoHelper segment_algo_helper_top = segment_algo_helper_heap[size-1]; // check whether free space (in filter cache) is enough uint32_t size_needed = segment_algo_helper_top.size_per_unit; - // if not enough, remove this segment helper from heap - // that means we will not consider this segment any longer + // if not enough, exit. we allocate the same size to all units. 
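+ // (every filter unit has the same size, so once the top candidate's next
+ //  unit no longer fits, no other candidate's unit can fit either and it is
+ //  safe to stop)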
if (current_cache_size + size_needed > cache_size) {
-     segment_algo_helper_heap.pop_back();
-     continue;
+     break;
 }
 // SegmentAlgoHelper(const uint32_t& id, const uint32_t& cnt, const uint32_t& size, const uint16_t& units)
 SegmentAlgoHelper segment_algo_helper_needed(segment_algo_helper_top.segment_id,
@@ -56,20 +54,21 @@ void GreedyAlgo::solve(std::map& segment_algo_infos,
                        segment_algo_helper_top.size_per_unit,
                        segment_algo_helper_top.units_num + 1);
 // update enabled units
- // noticed that if one segment visit cnt == 0, it still enable one unit
+ // note that if a segment's visit cnt == 0, it enables zero units
 // so check the visit cnt before updating algo_solution
 if (segment_algo_helper_needed.visit_cnt > 0) {
     algo_solution[segment_algo_helper_needed.segment_id] = segment_algo_helper_needed.units_num;
     current_cache_size += size_needed;
-     f_algo << "[DEBUG] segment " << segment_algo_helper_needed.segment_id
-            << " : " << segment_algo_helper_needed.units_num - 1 << " -> "
-            << segment_algo_helper_needed.units_num << " , cache space left : "
-            << cache_size - current_cache_size << " , recv benefit : "
-            << segment_algo_helper_top.enable_benifit << " , next benefit : "
-            << segment_algo_helper_needed.enable_benifit << std::endl;
+     // f_algo << "[DEBUG] segment " << segment_algo_helper_needed.segment_id
+     //        << " : " << segment_algo_helper_needed.units_num - 1 << " -> "
+     //        << segment_algo_helper_needed.units_num << " , cache space left : "
+     //        << cache_size - current_cache_size << " , recv benefit : "
+     //        << segment_algo_helper_top.enable_benifit << " , next benefit : "
+     //        << segment_algo_helper_needed.enable_benifit << " , visit count: "
+     //        << segment_algo_helper_needed.visit_cnt << std::endl;
 }
 assert(algo_solution[segment_algo_helper_needed.segment_id] <= MAX_UNITS_NUM);
- // enable benefit == 0 means units_num == MAX_UNITS_NUM
+ // enable benefit == 0 means units_num == MAX_UNITS_NUM or its visit cnt == 0
 // that means we cannot enable one more unit for this segment; all its units are already enabled
 if (segment_algo_helper_needed.enable_benifit == 0) {
     // assert(segment_algo_helper_needed.units_num >= MAX_UNITS_NUM);
@@ -83,9 +82,72 @@ void GreedyAlgo::solve(std::map& segment_algo_infos,
                        CompareSegmentAlgoHelper);
 }

- f_algo << std::endl;
- f_algo.close();
+ // f_algo << std::endl;
+ // f_algo.close();

 // return nothing, all results should be written into algo_solution
 }

-} \ No newline at end of file
+void GreedyAlgo::verify(std::map& segment_algo_infos,
+                        std::map& algo_solution, const uint32_t& cache_size) {
+    assert(!segment_algo_infos.empty());
+    assert(algo_solution.size() == segment_algo_infos.size());
+
+    std::fstream f_algo;
+    f_algo.open("/home/guoteng_20241228_135/WaLSM+/log/algo.log", std::ios::out | std::ios::app);
+    f_algo << "[DEBUG] start to verify algo : " << std::endl;
+
+    f_algo << "[DEBUG] segment_algo_infos size : " << segment_algo_infos.size() << std::endl;
+    f_algo << "[DEBUG] algo_solution size : " << algo_solution.size() << std::endl;
+    f_algo << "[DEBUG] cache size : " << cache_size << std::endl;
+    assert(segment_algo_infos.size() == algo_solution.size());
+
+    std::map min_cnt_recorder, max_cnt_recorder;
+    for (uint16_t i = 0; i <= MAX_UNITS_NUM; i++) {
+        min_cnt_recorder[i] = 0xFFFFFFFFU; max_cnt_recorder[i] = 0;
+    }
+
+    // recheck that we already computed for all segments in segment_algo_infos
+    auto infos_it = segment_algo_infos.begin();
+    auto solution_it = algo_solution.begin();
+    std::vector segment_ids;
+    uint32_t current_cache_size = 0;
+    double ideal_cost = 0;
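+    // ideal_cost sums visit_cnt * fpr^units_num over all segments (see
+    // StandardCostForDebug), i.e. the expected number of false-positive
+    // I/Os under this solution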
+ while (infos_it != segment_algo_infos.end() && solution_it != algo_solution.end()) { + assert(infos_it->first == solution_it->first); + segment_ids.emplace_back(infos_it->first); + current_cache_size += (infos_it->second.size_per_unit * solution_it->second); + f_algo << "[DEBUG] segment " << infos_it->first << " , visit cnt : " + << infos_it->second.visit_cnt << " , size of each unit : " + << infos_it->second.size_per_unit << " , units num : " + << solution_it->second << std::endl; + min_cnt_recorder[solution_it->second] = std::min(min_cnt_recorder[solution_it->second], infos_it->second.visit_cnt); + max_cnt_recorder[solution_it->second] = std::max(max_cnt_recorder[solution_it->second], infos_it->second.visit_cnt); + ideal_cost += StandardCostForDebug(infos_it->second.visit_cnt, solution_it->second); + infos_it++; solution_it++; + } + assert(current_cache_size <= cache_size); + f_algo << "[DEBUG] current cache size : " << current_cache_size << std::endl; + for (uint16_t i = 0; i <= MAX_UNITS_NUM; i++) { + f_algo << "[DEBUG] " << i << " units, min cnt: " << min_cnt_recorder[i] << ", max cnt: " << max_cnt_recorder[i] << std::endl; + } + f_algo << "[DEBUG] ideal I/O cost : " << ideal_cost << std::endl; + + // if visit cnt of segment i > visit cnt of segment j, then segment i should enable more units than segment j + const size_t segment_size = segment_ids.size(); + for (size_t i = 0; i < segment_size; i++) { + for (size_t j = i + 1; j < segment_size; j++) { + const uint32_t segment_id_i = segment_ids[i]; + const uint32_t segment_id_j = segment_ids[j]; + if ((segment_algo_infos.find(segment_id_i)->second).visit_cnt >= (segment_algo_infos.find(segment_id_j)->second).visit_cnt) { + assert(algo_solution[segment_id_i] >= algo_solution[segment_id_j]); + } else { + assert(algo_solution[segment_id_i] <= algo_solution[segment_id_j]); + } + } + } + + f_algo << std::endl; + f_algo.close(); +} + +} // namespace ROCKSDB_NAMESPACE \ No newline at end of file diff --git a/db/art/greedy_algo.h b/db/art/greedy_algo.h index a1d03acc6..81a9488fb 100644 --- a/db/art/greedy_algo.h +++ b/db/art/greedy_algo.h @@ -15,7 +15,7 @@ struct SegmentAlgoHelper; class GreedyAlgo; inline double StandardBenefit(const uint32_t& visit_cnt, const uint16_t& units_num); -inline double StandardCost(const uint32_t& visit_cnt, const uint16_t& units_num); +inline double StandardCostForDebug(const uint32_t& visit_cnt, const uint16_t& units_num); inline bool CompareSegmentAlgoHelper(const SegmentAlgoHelper& helper_1, const SegmentAlgoHelper& helper_2); // contain visit counter of every segment in last long period @@ -81,7 +81,7 @@ inline double StandardBenefit(const uint32_t& visit_cnt, const uint16_t& units_n return benefit; } -inline double StandardCost(const uint32_t& visit_cnt, const uint16_t& units_num) { +inline double StandardCostForDebug(const uint32_t& visit_cnt, const uint16_t& units_num) { int bits_per_key = BITS_PER_KEY_PER_UNIT; // We intentionally round down to reduce probing cost a little bit int num_probes = static_cast<int>(bits_per_key * 0.69); // 0.69 =~ ln(2) @@ -91,22 +91,9 @@ inline double StandardCost(const uint32_t& visit_cnt, const uint16_t& units_num) // compute false positive rate of one filter unit double rate_per_unit = std::pow(1.0 - std::exp(-double(num_probes) / double(bits_per_key)), num_probes); - if (units_num <= MIN_UNITS_NUM) { - return __DBL_MAX__; - } - - uint16_t next_units_num = units_num - 1; double rate = std::pow(rate_per_unit, units_num); - double next_rate = std::pow(rate_per_unit, next_units_num); - double cost = double(visit_cnt) * (next_rate - rate); - /* - std::cout << "visit_cnt : " << visit_cnt - << " , rate : " << rate - << " , next_rate : " << next_rate - << " .
rate_per_unit : " << rate_per_unit - << std::endl; - */ + double cost = double(visit_cnt) * rate; assert(cost >= 0); return cost; } @@ -125,6 +112,14 @@ class GreedyAlgo { // so make sure that only called by one thread void solve(std::map& segment_algo_infos, std::map& algo_solution, const uint32_t& cache_size); + + // simple check results + // noticed that if segment a visit_cnt >= segment b visit_cnt + // then segment a units_num >= segment b units_num + // and check whether usage exceeds cache size + void verify(std::map& segment_algo_infos, + std::map& algo_solution, const uint32_t& cache_size); + // full debug process of GreedyAlgo, not thread-secured // so make sure that only called by one thread void debug(std::map& algo_solution, const uint32_t& cache_size) { diff --git a/db/art/heat_buckets.cc b/db/art/heat_buckets.cc index 5b83dff2a..b99f5bcfb 100644 --- a/db/art/heat_buckets.cc +++ b/db/art/heat_buckets.cc @@ -3,16 +3,6 @@ #include namespace ROCKSDB_NAMESPACE { -std::vector HeatBuckets::seperators_; -std::vector HeatBuckets::buckets_; -uint32_t HeatBuckets::current_cnt_; // current get count in this period -std::vector> HeatBuckets::mutex_ptrs_; -std::mutex HeatBuckets::cnt_mutex_; -std::mutex HeatBuckets::sample_mutex_; -bool HeatBuckets::is_ready_; // identify whether HeatBuckets ready for hit -SamplesPool HeatBuckets::samples_; -bool HeatBuckets::updated_; // prevent from updating hotness more than once in a short time - Bucket::Bucket() { hit_cnt_ = 0; @@ -44,10 +34,8 @@ HeatBuckets::HeatBuckets() { seperators_.resize(0); buckets_.resize(0); current_cnt_ = 0; - mutex_ptrs_.resize(0); is_ready_ = false; samples_.clear(); - updated_ = false; } HeatBuckets::~HeatBuckets() { @@ -66,23 +54,24 @@ void HeatBuckets::debug() { } void HeatBuckets::update() { - // mark already updated, after current_cnt_ more than PERIOD_COUNT / MAGIC_FACTOR, updated_ will be reset to false; - // we need guarantee that in one period (one constant time span), db gets are much larger than PERIOD_COUNT / MAGIC_FACTOR; - // usually in server, exec get requests PERIOD_COUNT / MAGIC_FACTOR times only account for a very very short time. 
- updated_ = true; - - assert(mutex_ptrs_.size() == buckets_.size()); - for (size_t i=0; i<mutex_ptrs_.size(); i++) { - mutex_ptrs_[i]->lock(); + uint32_t current_cnt = 0; + + // remember to reset current_cnt_ counter + if (current_cnt_ < PERIOD_COUNT) return; + hit_mutex_.WriteLock(); + if (current_cnt_ >= PERIOD_COUNT) { + current_cnt = current_cnt_; + current_cnt_ = 0; } + hit_mutex_.WriteUnlock(); + + if (current_cnt == 0) return; + // debug(); // TODO: use multiple threads to update hotness of all buckets for (size_t i=0; i<buckets_.size(); i++) { - mutex_ptrs_[i]->unlock(); + buckets_[i].update(BUCKETS_ALPHA, current_cnt); } - // remember to reset current_cnt_ counter - current_cnt_ = 0; } uint32_t HeatBuckets::locate(const std::string& key) { @@ -102,7 +91,7 @@ uint32_t HeatBuckets::locate(const std::string& key) { return left; } -void HeatBuckets::hit(const std::string& key, const bool& signal) { +void HeatBuckets::hit(const std::string& key, bool& signal) { assert(is_ready_); // use binary search to find index i, making seperators_[i] <= key and seperators_[i+1] > key // reminding we have set border guard, so don't worry about out of bounds error @@ -128,39 +117,26 @@ // std::cout << "debug mutex_ptrs_ size : " << mutex_ptrs_.size() << std::endl; // std::cout << "debug period_cnt_ : " << period_cnt_ << std::endl; // std::cout << "debug alpha_ : " << alpha_ << std::endl; - assert(buckets_.size() == mutex_ptrs_.size()); assert(idx >= 0 && idx < buckets_.size()); assert(seperators_[idx] <= key && key < seperators_[idx+1]); - mutex_ptrs_[idx]->lock(); + hit_mutex_.ReadLock(); buckets_[idx].hit(); // mutex only permits one write op to one bucket - mutex_ptrs_[idx]->unlock(); - - cnt_mutex_.lock(); current_cnt_ += 1; + if (current_cnt_ >= PERIOD_COUNT) { + signal = true; + } + hit_mutex_.ReadUnlock(); - // use updated_ to prevent from updating hotness in a very short time span (due to multi-threads operation) - if (signal && !updated_) { - // debug(); + if (signal) { update(); } - cnt_mutex_.unlock(); - - // remember to reset updated_ to false - if (updated_ && current_cnt_ >= PERIOD_COUNT / MAGIC_FACTOR) { - updated_ = false; - } } SamplesPool::SamplesPool() { samples_cnt_ = 0; pool_.resize(0); filter_.clear(); - - // because put opt will input duplicated keys, we need to guarantee SAMPLES_MAXCNT much larger than SAMPLES_LIMIT - // however std::set only remain deduplicated keys - // to collect good samples for previous put keys, we need a larger SAMPLES_MAXCNT - assert(SAMPLES_MAXCNT >= MAGIC_FACTOR * SAMPLES_LIMIT); } void SamplesPool::clear() { @@ -207,13 +183,14 @@ void SamplesPool::sample(const std::string& key) { } void SamplesPool::prepare() { - std::string key_min = "user"; // defined min key for YCSB - std::string key_max = pool_[pool_.size()-1] + pool_[pool_.size()-1]; if (!is_ready()) { return; } sort(pool_.begin(), pool_.end()); // add border guard + std::string key_min = "user"; // defined min key for YCSB + // std::string key_max = pool_[pool_.size()-1] + pool_[pool_.size()-1]; + std::string key_max = "user" + std::string(512, '9'); // this key must exceed every key of the requests pool_.emplace(pool_.begin(), key_min); pool_.emplace_back(key_max); } @@ -259,9 +236,9 @@ uint32_t SamplesPool::determine_k(std::vector>& segment uint32_t k = pool_.size() - 2; // if segments is empty, use default k to debug if (segments.empty()) { - k = (pool_.size() - 2) / DEFAULT_BUCKETS_NUM; + k = (pool_.size() - 2) / APPROXIMATE_BUCKETS_NUM; } - assert(k > 1); + assert(k >= 1); for (auto& segment : segments) {
assert(segment.size() == 2); assert(segment[0] < segment[1]); @@ -295,22 +272,23 @@ void HeatBuckets::init(std::vector>& segments) { samples_.divide(k, seperators_); // std::cout << "[DEBUG] show key ranges below: " << std::endl; + // for (size_t i=0; i(new std::mutex())); - } - assert(mutex_ptrs_.size() == buckets_.size()); assert(seperators_.size() == buckets_.size()+1); is_ready_ = true; @@ -318,5 +296,7 @@ void HeatBuckets::init(std::vector>& segments) { // debug // std::cout << "[DEBUG] heat buckets size: " << buckets_.size() << std::endl; // std::cout << "[DEBUG] key ranges init" << std::endl; + std::cout << "[RANGE] seperators_ size : " << seperators_.size() << std::endl; + std::cout << "[RANGE] buckets_ size : " << buckets_.size() << std::endl; } } \ No newline at end of file diff --git a/db/art/heat_buckets.h b/db/art/heat_buckets.h index 68a8277fe..41adfd683 100644 --- a/db/art/heat_buckets.h +++ b/db/art/heat_buckets.h @@ -8,6 +8,7 @@ #include #include #include +#include "port/port_posix.h" namespace ROCKSDB_NAMESPACE { @@ -28,6 +29,36 @@ class Bucket { void hit(); }; +class SamplesPool { +private: + std::vector pool_; // using set to guarantee only store deduplicated samples + std::set filter_; // used to check whether new key already exist in pool + uint32_t samples_cnt_; // current sample tries num, need to update after every try +public: + SamplesPool(); + + ~SamplesPool() { return; } + + void clear(); + + // we can modify SAMPLES_MAXCNT to control the moment that starts init heat buckets + bool is_ready() { return samples_cnt_ >= SAMPLES_MAXCNT; } + bool is_full() { return pool_.size() >= SAMPLES_LIMIT; } + bool is_sampled(const std::string& key) { return filter_.count(key) > 0; } + + void sample(const std::string& key); + + void prepare(); + + // need call prepare() before + // generate seperators + void divide(const uint32_t& k, std::vector& dst); + + // determine k based on low-level segments' key range + uint32_t determine_k(std::vector>& segments); + uint32_t locate(const std::string& key); // helper func when determine k +}; + /* first sample put keys using reservoir sampling. 
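The SamplesPool declared in this header collects its pool with reservoir sampling, as the comment above says: the first keys fill the pool up to its limit, and each later key replaces a random slot with probability limit / keys-seen. A self-contained illustration of that policy (class and member names here are illustrative; the real pool additionally deduplicates keys through filter_ and caps tries at SAMPLES_MAXCNT):

#include <cstdint>
#include <random>
#include <string>
#include <vector>

// Keep a uniform random sample of at most `limit` keys from an unbounded stream.
class ReservoirSampler {
 public:
  explicit ReservoirSampler(size_t limit) : limit_(limit), rng_(std::random_device{}()) {}

  void Offer(const std::string& key) {
    ++seen_;
    if (pool_.size() < limit_) {  // pool not full yet: always keep the key
      pool_.push_back(key);
      return;
    }
    // otherwise keep it with probability limit_ / seen_, evicting a random slot
    std::uniform_int_distribution<uint64_t> dist(0, seen_ - 1);
    uint64_t slot = dist(rng_);
    if (slot < limit_) pool_[slot] = key;
  }

  const std::vector<std::string>& pool() const { return pool_; }

 private:
  size_t limit_;
  uint64_t seen_ = 0;
  std::vector<std::string> pool_;
  std::mt19937_64 rng_;
};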
@@ -42,15 +73,13 @@ class Bucket { class HeatBuckets { private: // TODO: mutex can be optimized - static std::vector<std::string> seperators_; - static std::vector<Bucket> buckets_; - static uint32_t current_cnt_; // current get count in this period - static std::vector<std::shared_ptr<std::mutex>> mutex_ptrs_; - static std::mutex cnt_mutex_; - static std::mutex sample_mutex_; - static bool is_ready_; // identify whether HeatBuckets ready for hit - static SamplesPool samples_; - static bool updated_; + std::vector<std::string> seperators_; + std::vector<Bucket> buckets_; + uint32_t current_cnt_; // current get count in this period + mutable port::RWMutex hit_mutex_; + std::mutex sample_mutex_; + bool is_ready_; // identify whether HeatBuckets ready for hit + SamplesPool samples_; public: HeatBuckets(); @@ -67,39 +96,9 @@ class HeatBuckets { void init(std::vector<std::vector<std::string>>& segments); // if sample enough keys, ready to init heatbuckets void update(); // update hotness value of all buckets - void hit(const std::string& key, const bool& signal); // one key only hit one bucket (also mean only hit one key range) + void hit(const std::string& key, bool& signal); // one key only hits one bucket (also means it only hits one key range) // if signal is true, update hotness void debug(); // output debug message in standard output }; -class SamplesPool { -private: - std::vector<std::string> pool_; // using set to guarantee only store deduplicated samples - std::set<std::string> filter_; // used to check whether new key already exists in pool - uint32_t samples_cnt_; // current sample tries num, need to update after every try -public: - SamplesPool(); - - ~SamplesPool() { return; } - - void clear(); - - // we can modify SAMPLES_MAXCNT to control the moment that starts init heat buckets - bool is_ready() { return samples_cnt_ >= SAMPLES_MAXCNT; } - bool is_full() { return pool_.size() >= SAMPLES_LIMIT; } - bool is_sampled(const std::string& key) { return filter_.count(key) > 0; } - - void sample(const std::string& key); - - void prepare(); - - // need call prepare() before - // generate seperators - void divide(const uint32_t& k, std::vector<std::string>& dst); - - // determine k based on low-level segments' key range - uint32_t determine_k(std::vector<std::vector<std::string>>& segments); - uint32_t locate(const std::string& key); // helper func when determine k -}; - } \ No newline at end of file diff --git a/db/art/macros.h b/db/art/macros.h index 9d8a3e8e3..57fcef147 100644 --- a/db/art/macros.h +++ b/db/art/macros.h @@ -140,39 +140,53 @@ namespace ROCKSDB_NAMESPACE { // macros for HeatBuckets // hotness update formula -#define BUCKETS_ALPHA 0.2 +#define BUCKETS_ALPHA 0.4 // samples pool max size, using reservoir sampling -#define SAMPLES_LIMIT 10000 -// if recv samples exceed SAMPLES_MAXCNT, end reservoir sampling and init Heat Buckets -#define SAMPLES_MAXCNT 5000000 -// short period get count, if get count equal to or exceed PERIOD_COUNT, -// end this short period and start next short period -#define PERIOD_COUNT 50000 -// number of heat buckets (number of key ranges, see hotness estimating in the paper) -#define DEFAULT_BUCKETS_NUM 500 +#define SAMPLES_LIMIT 1000000 +#define SAMPLES_MAXCNT 10000000 // short period get count +#define PERIOD_COUNT 2000000 // key sample file // in order to init key ranges before first flush, // we need to read keys in a file, then init key ranges first. +#define SAMPLES_FILE "/home/guoteng_20241228_135/WaLSM+/key_sample.txt" // determine number of heat buckets, its value approximately equals to (this number - 1) // TODO: how to control the number of key ranges correctly?
For example, if we set to 999, this number is still 1001. +#define APPROXIMATE_BUCKETS_NUM 100000 // magic number in class HeatBuckets -#define MAGIC_FACTOR 500 +#define MAGIC_FACTOR 10 -// micros for Model Train + +// key number of each segment +#define KV_NUM_OF_SEGMENT 420 +// default size of one filter unit (bits) +// bits-per-key for every filter unit of every segment, +// found default bits-per-key = DEFAULT_UNITS_NUM * BITS_PER_KEY_PER_UNIT = 10 +// equal to primary value of paper benchmark config value +#define BITS_PER_KEY_PER_UNIT 2 +// TODO: needs to be set based on size of KV pairs +#define DEFAULT_UNIT_SIZE (KV_NUM_OF_SEGMENT * BITS_PER_KEY_PER_UNIT) + +// macros for Model Train // long period = TRAIN_PERIODS * short period. if one long period ends, evaluate model and retrain model if necessary -#define TRAIN_PERIODS 10 +#define TRAIN_PERIODS 15 // dataset csv file name #define DATASET_NAME "dataset.csv" // the path to save model txt file and train dataset csv file -#define MODEL_PATH "/pg_wal/ycc/" +#define MODEL_PATH "/home/guoteng_20241228_135/WaLSM+/log/" // we cannot send hotness value (double) to model side, // so we multiply hotness value by HOTNESS_SIGNIFICANT_DIGITS_FACTOR, then send its integer part to model // also we need to multiply key range rate by RATE_SIGNIFICANT_DIGITS_FACTOR #define HOTNESS_SIGNIFICANT_DIGITS_FACTOR 1e6 -#define RATE_SIGNIFICANT_DIGITS_FACTOR 1e3 +#define RATE_SIGNIFICANT_DIGITS_FACTOR 1e6 // model feature num max limit : 2 * 45 + 1 -#define MAX_FEATURES_NUM 91 +#define MAX_FEATURES_NUM 21 // config macros connecting to LightGBM server // we use Inet socket to connect server #define HOST "127.0.0.1" +// #define PORT "10090" #define PORT "9090" // max size of socket receive buffer size #define BUFFER_SIZE 1024 @@ -183,34 +197,35 @@ // macros for filter cache // before the model works, we enable DEFAULT_UNITS_NUM units for every segment -#define DEFAULT_UNITS_NUM 4 -// bits-per-key for every filter unit of every segment, -// found default bits-per-key = DEFAULT_UNITS_NUM * BITS_PER_KEY_PER_UNIT = 10 -// equal to primary value of paper benchmark config value -#define BITS_PER_KEY_PER_UNIT 4 +#define DEFAULT_UNITS_NUM 2 // max unit nums for every segment, we only generate MAX_UNITS_NUM units for every segment -#define MAX_UNITS_NUM 8 +#define MAX_UNITS_NUM 6 // we enable 0 units for the coldest segments #define MIN_UNITS_NUM 0 // default max size of cache space : 8 * 1024 * 1024 * 128 = 1073741824 bit = 128 MB -#define CACHE_SPACE_SIZE 1073741824 +#define CACHE_SPACE_SIZE (32 * 1024 * 1024 * 8) // filter cache helper heap type #define BENEFIT_HEAP 0 #define COST_HEAP 1 #define UNKNOWN_HEAP 2 // visit cnt update bound -#define VISIT_CNT_UPDATE_BOUND 10 +#define VISIT_CNT_UPDATE_BOUND 500 +// adjustment benefit bound +#define PURE_BENEFIT_BOUND 500 // filter cache map threshold -#define FULL_RATE 0.95 -#define READY_RATE 0.60 +#define FULL_RATE 1.00 +#define READY_RATE 0.80 // default init L0 counts #define INIT_LEVEL_0_COUNT 0 -// default size of one filter unit (bits) -#define DEFAULT_UNIT_SIZE 0 // inherit remain factor -#define INHERIT_REMAIN_FACTOR 0.5 +#define INHERIT_REMAIN_FACTOR 1 // filter cache client background threads num -#define FILTER_CACHE_THREADS_NUM 10 +#define FILTER_CACHE_THREADS_NUM 6 + +// #define KV_SIZE = 1024 // data block size for a segment + // #define SEGMENT_DATA_BLOCK_SIZE 32 * 1024 + // #define KEYS_PER_SEGMENT 4096 } // namespace ROCKSDB_NAMESPACE \ No newline at end of file diff --git
a/db/art/nvm_manager.cc b/db/art/nvm_manager.cc index 5de2dff71..36492814a 100644 --- a/db/art/nvm_manager.cc +++ b/db/art/nvm_manager.cc @@ -46,7 +46,9 @@ bool InitializeMemory(std::unordered_map& memory_usages, size_t mapped_len; base_memptr = (char*)pmem_map_file( nvm_path.c_str(), TotalSize, PMEM_FILE_CREATE, 0666, &mapped_len, &is_pmem); - //assert(is_pmem && mapped_len == (size_t)TotalSize); + // base_memptr = (char*)mmap( + // nvm_path.c_str(), TotalSize, PMEM_FILE_CREATE, 0666, &mapped_len, &is_pmem); + assert(is_pmem && mapped_len == (size_t)TotalSize); aligned_ptr = reinterpret_cast(ALIGN_UP(reinterpret_cast(base_memptr), 256)); close(fd); diff --git a/db/builder.cc b/db/builder.cc index a5deebff9..2140c3144 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -455,6 +455,8 @@ Status BuildTableFromArt( if (table_properties) { *table_properties = tp; } + // store SegmentBuilderResult + job->segment_builder_result = builder->GetSegmentBuilderResult(); } delete builder; @@ -516,6 +518,12 @@ Status BuildTableFromArt( s = Status::Corruption("Paranoid checksums do not match"); } } + + // // init table_reader for later use + if (s.ok()) { + s = table_cache->InitFileTableReader( + read_options, internal_comparator, *meta); + } } if (!s.ok() || meta->fd.GetFileSize() == 0) { diff --git a/db/column_family.cc b/db/column_family.cc index d9344f4bb..241b7fc87 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -26,6 +26,7 @@ #include "db/job_context.h" #include "db/range_del_aggregator.h" #include "db/table_properties_collector.h" +#include "db/version_edit.h" #include "db/version_set.h" #include "db/write_controller.h" #include "file/sst_file_manager_impl.h" diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc index 4f2d70ba7..7c893b972 100644 --- a/db/compaction/compaction.cc +++ b/db/compaction/compaction.cc @@ -284,65 +284,68 @@ bool Compaction::InputCompressionMatchesOutput() const { } bool Compaction::IsTrivialMove() const { - // Avoid a move if there is lots of overlapping grandparent data. - // Otherwise, the move could create a parent file that will require - // a very expensive merge later on. - // If start_level_== output_level_, the purpose is to force compaction - // filter to be applied to that level, and thus cannot be a trivial move. 
- - // Check if start level have files with overlapping ranges - if (start_level_ == 0 && input_vstorage_->level0_non_overlapping() == false) { - // We cannot move files from L0 to L1 if the files are overlapping - return false; - } - - if (is_manual_compaction_ && - (immutable_cf_options_.compaction_filter != nullptr || - immutable_cf_options_.compaction_filter_factory != nullptr)) { - // This is a manual compaction and we have a compaction filter that should - // be executed, we cannot do a trivial move - return false; - } - - // Used in universal compaction, where trivial move can be done if the - // input files are non overlapping - if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) && - (output_level_ != 0)) { - return is_trivial_move_; - } - - if (!(start_level_ != output_level_ && num_input_levels() == 1 && - input(0, 0)->fd.GetPathId() == output_path_id() && - InputCompressionMatchesOutput())) { - return false; - } - - // assert inputs_.size() == 1 - - std::unique_ptr partitioner = CreateSstPartitioner(); - - for (const auto& file : inputs_.front().files) { - std::vector file_grand_parents; - if (output_level_ + 1 >= number_levels_) { - continue; - } - input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest, - &file->largest, &file_grand_parents); - const auto compaction_size = - file->fd.GetFileSize() + TotalFileSize(file_grand_parents); - if (compaction_size > max_compaction_bytes_) { - return false; - } - - if (partitioner.get() != nullptr) { - if (!partitioner->CanDoTrivialMove(file->smallest.user_key(), - file->largest.user_key())) { - return false; - } - } - } - - return true; + // // Avoid a move if there is lots of overlapping grandparent data. + // // Otherwise, the move could create a parent file that will require + // // a very expensive merge later on. + // // If start_level_== output_level_, the purpose is to force compaction + // // filter to be applied to that level, and thus cannot be a trivial move. 
+ + // // Check if start level have files with overlapping ranges + // if (start_level_ == 0 && input_vstorage_->level0_non_overlapping() == false) { + // // We cannot move files from L0 to L1 if the files are overlapping + // return false; + // } + + // if (is_manual_compaction_ && + // (immutable_cf_options_.compaction_filter != nullptr || + // immutable_cf_options_.compaction_filter_factory != nullptr)) { + // // This is a manual compaction and we have a compaction filter that should + // // be executed, we cannot do a trivial move + // return false; + // } + + // // Used in universal compaction, where trivial move can be done if the + // // input files are non overlapping + // if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) && + // (output_level_ != 0)) { + // return is_trivial_move_; + // } + + // if (!(start_level_ != output_level_ && num_input_levels() == 1 && + // input(0, 0)->fd.GetPathId() == output_path_id() && + // InputCompressionMatchesOutput())) { + // return false; + // } + + // // assert inputs_.size() == 1 + + // std::unique_ptr partitioner = CreateSstPartitioner(); + + // for (const auto& file : inputs_.front().files) { + // std::vector file_grand_parents; + // if (output_level_ + 1 >= number_levels_) { + // continue; + // } + // input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest, + // &file->largest, &file_grand_parents); + // const auto compaction_size = + // file->fd.GetFileSize() + TotalFileSize(file_grand_parents); + // if (compaction_size > max_compaction_bytes_) { + // return false; + // } + + // if (partitioner.get() != nullptr) { + // if (!partitioner->CanDoTrivialMove(file->smallest.user_key(), + // file->largest.user_key())) { + // return false; + // } + // } + // } + + // return true; + + // disallow trivial move compaction in WaLSM+ + return false; } void Compaction::AddInputDeletions(VersionEdit* out_edit) { @@ -389,8 +392,8 @@ bool Compaction::KeyNotExistsBeyondOutputLevel( void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) { for (size_t i = 0; i < num_input_levels(); i++) { for (size_t j = 0; j < inputs_[i].size(); j++) { - assert(mark_as_compacted ? !inputs_[i][j]->being_compacted - : inputs_[i][j]->being_compacted); + // assert(mark_as_compacted ? !inputs_[i][j]->being_compacted + // : inputs_[i][j]->being_compacted); inputs_[i][j]->being_compacted = mark_as_compacted; } } diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc index 4555ec568..a1a2829db 100644 --- a/db/compaction/compaction_iterator.cc +++ b/db/compaction/compaction_iterator.cc @@ -151,6 +151,7 @@ void CompactionIterator::Next() { if (merge_out_iter_.Valid()) { key_ = merge_out_iter_.key(); value_ = merge_out_iter_.value(); + segment_id_ = merge_out_iter_.segment_id(); Status s = ParseInternalKey(key_, &ikey_); // MergeUntil stops when it encounters a corrupt key and does not // include them in the result, so we expect the keys here to be valid. @@ -268,6 +269,7 @@ void CompactionIterator::NextFromInput() { !IsShuttingDown()) { key_ = input_->key(); value_ = input_->value(); + segment_id_ = input_->segment_id(); iter_stats_.num_input_records++; Status pikStatus = ParseInternalKey(key_, &ikey_); @@ -625,6 +627,7 @@ void CompactionIterator::NextFromInput() { // These will be correctly set below. 
key_ = merge_out_iter_.key(); value_ = merge_out_iter_.value(); + segment_id_ = merge_out_iter_.segment_id(); pikStatus = ParseInternalKey(key_, &ikey_); // MergeUntil stops when it encounters a corrupt key and does not // include them in the result, so we expect the keys here to valid. diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h index 29dedd3c7..c2e385588 100644 --- a/db/compaction/compaction_iterator.h +++ b/db/compaction/compaction_iterator.h @@ -12,6 +12,7 @@ #include "db/compaction/compaction.h" #include "db/compaction/compaction_iteration_stats.h" +#include "db/dbformat.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "db/range_del_aggregator.h" @@ -116,6 +117,7 @@ class CompactionIterator { bool Valid() const { return valid_; } const Slice& user_key() const { return current_user_key_; } const CompactionIterationStats& iter_stats() const { return iter_stats_; } + uint32_t segment_id() { return segment_id_; } private: // Processes the input stream to find the next output @@ -206,6 +208,7 @@ class CompactionIterator { Slice current_user_key_; SequenceNumber current_user_key_sequence_; SequenceNumber current_user_key_snapshot_; + uint32_t segment_id_ = INVALID_SEGMENT_ID; // True if the iterator has already returned a record for the current key. bool has_outputted_key_ = false; diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 3b1ee2ae0..a30ccaa99 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -10,18 +10,24 @@ #include "db/compaction/compaction_job.h" #include +#include +#include #include #include +#include #include #include +#include #include #include +#include #include #include #include #include "db/art/compactor.h" #include "db/art/logger.h" +#include "db/art/art_metric.h" #include "db/builder.h" #include "db/db_impl/db_impl.h" #include "db/db_iter.h" @@ -49,12 +55,15 @@ #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" #include "rocksdb/sst_partitioner.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_factory.h" +#include "table/block_based/filter_block.h" #include "table/merging_iterator.h" #include "table/table_builder.h" #include "test_util/sync_point.h" @@ -67,6 +76,8 @@ namespace ROCKSDB_NAMESPACE { +static SSDWriteMetric writeMetric_; + const char* GetCompactionReasonString(CompactionReason compaction_reason) { switch (compaction_reason) { case CompactionReason::kUnknown: @@ -171,6 +182,9 @@ struct CompactionJob::SubcompactionState { // A flag determine whether the key has been seen in ShouldStopBefore() bool seen_key = false; + // stores segment_builder_result for each subcompaction + SegmentBuilderResult segment_builder_result; + SubcompactionState(Compaction* c, Slice* _start, Slice* _end, uint64_t size) : compaction(c), start(_start), end(_end), approx_size(size) { assert(compaction != nullptr); @@ -178,7 +192,7 @@ struct CompactionJob::SubcompactionState { // Adds the key and value to the builder // If paranoid is true, adds the key-value to the paranoid hash - Status AddToBuilder(const Slice& key, const Slice& value) { + Status AddToBuilder(const Slice& key, const Slice& value, uint32_t segment_id) { auto curr = current_output(); assert(builder != nullptr); assert(curr != nullptr); @@ -186,7 +200,7 @@ struct 
CompactionJob::SubcompactionState { if (!s.ok()) { return s; } - builder->Add(key, value); + builder->Add(key, value, segment_id); return Status::OK(); } @@ -560,6 +574,71 @@ void CompactionJob::GenSubcompactionBoundaries() { } } +void CompactionJob::CollectDataAndPrefetch() { + // aggregate SegmentBuilderResult from subcompactions + for (auto& state : compact_->sub_compact_states) { + auto& sub_result = state.segment_builder_result; + assert(!sub_result.merged_segment_ids.empty()); + assert(!sub_result.new_segment_ids.empty()); + segment_builder_result_.new_segment_ids.insert( + sub_result.new_segment_ids.begin(), + sub_result.new_segment_ids.end()); + + segment_builder_result_.merged_segment_ids.insert( + sub_result.merged_segment_ids.begin(), + sub_result.merged_segment_ids.end()); + + for (auto& per_segment_result : sub_result.per_segment_results) { + segment_builder_result_.per_segment_results.push_back( + std::move(per_segment_result)); + } + } + segment_builder_result_.output_level = compact_->compaction->output_level(); + + // static std::mutex debug_mutex; + // { + // std::lock_guard<std::mutex> lock_guard(debug_mutex); + // for (auto& segment_result : segment_builder_result_.per_segment_results) { + // double rate_sum = 0; + // std::cout << "segment_id=" << segment_result.segment_id << ": " << segment_result.range_rate_pairs.size() << " ranges, level=" << compact_->compaction->output_level() << ", count=" << segment_result.key_count; + // // std::cout << std::endl; + // for (const auto& range_pair : segment_result.range_rate_pairs) { + // rate_sum += range_pair.rate_in_segment; + // // std::cout << range_pair.range_id << "-" << range_pair.rate_in_segment << " "; + // } + // assert(rate_sum >= 0.98 && rate_sum <= 1.02); + // std::cout << std::endl; + // } + // } + + + // WaLSM+ debug + // for (auto& state : compact_->sub_compact_states) { + // for (auto& output : state.outputs) { + // std::cout << "c, filename=" << output.meta.fd.GetNumber() + // << ", smallest=" << output.meta.smallest.user_key().ToString() + // << ", largest=" << output.meta.largest.user_key().ToString() + // << std::endl; + // } + // } + + // insert all filter block handles to FilterCache + for (auto& state : compact_->sub_compact_states) { + for (auto& output : state.outputs) { + assert(output.meta.fd.table_reader != nullptr); + const auto* table = output.meta.fd.table_reader; + auto block_handles_map = table->GetSegmentBlockHandles(); + assert(block_handles_map.size() > 0); + for (const auto& segment_id_and_block_handles : block_handles_map) { + auto segment_id = segment_id_and_block_handles.first; + const auto& block_handles = segment_id_and_block_handles.second; + // dangerous cast, but we know that the table is a BlockBasedTable + filter_cache_client_->init_segment(segment_id, (BlockBasedTable*) table, block_handles); + } + } + } +} + // TODO(WaLSM+): pass temp recorders ptr and update Status CompactionJob::Run() { AutoThreadOperationStageUpdater stage_updater( @@ -627,9 +706,10 @@ Status CompactionJob::Run() { } if (status.ok()) { thread_pool.clear(); - std::vector files_output; - for (const auto& state : compact_->sub_compact_states) { - for (const auto& output : state.outputs) { + // WaLSM+: remove const qualifier to init file table reader + std::vector files_output; + for (auto& state : compact_->sub_compact_states) { + for (auto& output : state.outputs) { files_output.emplace_back(&output); } } @@ -685,6 +765,12 @@ Status CompactionJob::Run() { } } + // // init table_reader for later use + if (s.ok()) { + s =
cfd->table_cache()->InitFileTableReader( + read_options, cfd->internal_comparator(), files_output[file_idx]->meta); + } + delete iter; if (!s.ok()) { @@ -781,6 +867,9 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { (GetStartTime() - stats.micros) * 1e-6, compact_->compaction->output_level()); + // update WaLSM write metric + writeMetric_.updateMetric(stats.bytes_written); + ROCKS_LOG_BUFFER( log_buffer_, "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, " @@ -843,7 +932,6 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { stream.EndArray(); } - CleanupCompaction(); return status; } @@ -965,6 +1053,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { // returns true. const Slice& key = c_iter->key(); const Slice& value = c_iter->value(); + const uint32_t segment_id = c_iter->segment_id(); // If an end key (exclusive) is specified, check if the current key is // >= than it and exit if it is because the iterator is out of its range @@ -987,7 +1076,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { } } // TODO(WaLSM+): pass temp recorders ptr and update - status = sub_compact->AddToBuilder(key, value); + status = sub_compact->AddToBuilder(key, value, segment_id); if (!status.ok()) { break; } @@ -1487,6 +1576,21 @@ Status CompactionJob::FinishCompactionOutputFile( } #endif + // WaLSM+: collect data before resetting builder pointer + auto current_segment_builder_result = sub_compact->builder->GetSegmentBuilderResult(); + // merge segment_builder_result into sub_compact->segment_builder_result + for (const auto id : current_segment_builder_result.merged_segment_ids) { + sub_compact->segment_builder_result.merged_segment_ids.insert(id); + } + for (const auto id : current_segment_builder_result.new_segment_ids) { + sub_compact->segment_builder_result.new_segment_ids.insert(id); + } + for (auto& per_segment_result : current_segment_builder_result.per_segment_results) { + sub_compact->segment_builder_result.per_segment_results.emplace_back(std::move(per_segment_result)); + } + assert(!sub_compact->segment_builder_result.merged_segment_ids.empty()); + assert(!sub_compact->segment_builder_result.new_segment_ids.empty()); + sub_compact->builder.reset(); sub_compact->current_output_file_size = 0; return s; diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h index aafad8d3a..dbd9c0520 100644 --- a/db/compaction/compaction_job.h +++ b/db/compaction/compaction_job.h @@ -39,6 +39,7 @@ #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "rocksdb/transaction_log.h" +#include "table/block_based/filter_block.h" #include "table/scoped_arena_iterator.h" #include "util/autovector.h" #include "util/stop_watch.h" @@ -103,6 +104,21 @@ class CompactionJob { // Return the IO status IOStatus io_status() const { return io_status_; } + SegmentBuilderResult GetSegmentBuilderResult() const { + return segment_builder_result_; + } + + // should be called before Run() + void SetFilterCacheClient(FilterCacheClient* filter_cache_client) { + filter_cache_client_ = filter_cache_client; + } + + // collect data for WaLSM+ + void CollectDataAndPrefetch(); + + // call cleanup after CollectDataAndPrefetch() + void CleanupCompaction(); + private: struct SubcompactionState; @@ -130,7 +146,6 @@ class CompactionJob { Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options); void RecordCompactionIOStats(); Status 
OpenCompactionOutputFile(SubcompactionState* sub_compact); - void CleanupCompaction(); void UpdateCompactionJobStats( const InternalStats::CompactionStats& stats) const; void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats, @@ -200,6 +215,9 @@ class CompactionJob { Env::WriteLifeTimeHint write_hint_; Env::Priority thread_pri_; IOStatus io_status_; + + SegmentBuilderResult segment_builder_result_; + FilterCacheClient* filter_cache_client_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc index 0768f1958..e78f0a71b 100644 --- a/db/compaction/compaction_picker_universal.cc +++ b/db/compaction/compaction_picker_universal.cc @@ -8,6 +8,10 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/compaction/compaction_picker_universal.h" +#include +#include +#include +#include "db/version_edit.h" #ifndef ROCKSDB_LITE #include @@ -659,12 +663,17 @@ Compaction* UniversalCompactionBuilder::PickCompactionForQLearning() { if (!partition->is_tier[i] && !partition->is_compaction_work[i] && partition->files_[i].size() > 1) { bool ok = true; + std::set<FileMetaData*> file_metadata_pointers; for (FileMetaData* f : partition->files_[i]) { if (f->being_compacted) { ok = false; break; } + if (file_metadata_pointers.count(f)) { + std::cout << "duplicate FileMetaData pointer in compaction inputs" << std::endl; + } inputs[i].files.push_back(f); + file_metadata_pointers.insert(f); } if (!ok) { inputs[i].files.clear(); diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 18ed841ae..b892d970e 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -9,6 +9,7 @@ #include "db/db_impl/db_impl.h" #include +#include #ifdef OS_SOLARIS #include #endif @@ -109,6 +110,7 @@ #include "util/mutexlock.h" #include "util/stop_watch.h" #include "util/string_util.h" +#include "db/art/global_filter_cache_context.h" namespace ROCKSDB_NAMESPACE { @@ -250,25 +252,34 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, period_cnt_ = 0; last_train_period_ = 0; */ - segment_info_recorder_ = new std::unordered_map>; - level_recorder_ = new std::map; - level_0_base_count_ = 0; - - features_nums_except_level_0_ = new std::vector; - uint16_t features_num = MAX_FEATURES_NUM; - if (features_num > 0) { - features_nums_except_level_0_->emplace_back(features_num); + { + std::lock_guard<std::mutex> global_filter_cache_lock_guard(global_filter_cache_recorders_mutex); + global_level_0_base_count = 0; + uint16_t features_num = MAX_FEATURES_NUM; + if (features_num > 0) { + global_features_nums_except_level_0.emplace_back(features_num); + } + global_filter_cache.periods_work(); + global_filter_cache.retrain_or_keep_model( + &global_features_nums_except_level_0, &global_level_recorder, + &global_segment_ranges_recorder, &global_unit_size_recorder); + // global_filter_cache.make_adjustment(); + #ifdef SAMPLES_FILE + std::ifstream input(SAMPLES_FILE); + assert(input.is_open()); + + std::string art_key; + uint32_t key_count = 0; + while (std::getline(input, art_key)) { + global_filter_cache.prepare_heat_buckets(art_key, &global_segment_info_recorder); + } + assert(global_filter_cache.range_seperators().size() > 0); // heat buckets must be ready + // std::cout << "seperators size: " << global_filter_cache.range_seperators().size() << std::endl; + // for (std::string &seperator : global_filter_cache.range_seperators()) { + // std::cout << seperator << std::endl; + // } + #endif } - - segment_ranges_recorder_ = new std::map>; - - unit_size_recorder_ =
new std::map; - - filter_cache_.retrain_or_keep_model(features_nums_except_level_0_, - level_recorder_, - segment_ranges_recorder_, - unit_size_recorder_); - filter_cache_.make_adjustment(); #endif // !batch_per_trx_ implies seq_per_batch_ because it is only unset for // WriteUnprepared, which should use seq_per_batch_. @@ -1691,6 +1702,9 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, get_impl_options.column_family); auto cfd = cfh->cfd(); + // WaLSM+: update cfd pointer for future use + global_filter_cache.update_cfd_ptr_if_needed(cfd); + if (tracer_) { // TODO: This mutex should be removed later, to improve performance when // tracing is enabled. @@ -1777,7 +1791,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, #ifdef ART #ifdef ART_PLUS std::string art_key(key.data(), key.size()); - filter_cache_.get_updating_work(art_key); + global_filter_cache.hit_heat_buckets(art_key); // ready to estimate hotness, update heat buckets /* if (heat_buckets_.is_ready()) { @@ -1803,7 +1817,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, // only one thread can train model. if (need_train) { std::fstream f_model; - f_model.open("/pg_wal/ycc/model.log", std::ios::out | std::ios::app); + f_model.open("/home/guoteng_20241228_135/WaLSM+/log/model.log", std::ios::out | std::ios::app); f_model << "[DEBUG] try to train models" << std::endl; f_model << "[DEBUG] period_cnt_ : " << period_cnt_ << std::endl; f_model << "[DEBUG] PERIOD_COUNT : " << PERIOD_COUNT << std::endl; @@ -1908,7 +1922,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, get_impl_options.get_value); #else sv->current->Get( - filter_cache_, + global_filter_cache, read_options, lkey, get_impl_options.value, timestamp, &s, &merge_context, &max_covering_tombstone_seq, get_impl_options.get_value ? 
get_impl_options.value_found : nullptr, diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 32209ea75..7d411ba62 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -26,7 +26,6 @@ #include "db/art/vlog_manager.h" #include "db/art/heat_buckets.h" #include "db/art/clf_model.h" -#include "db/art/filter_cache_item.h" #include "db/art/filter_cache_heap.h" #include "db/art/filter_cache.h" #include "db/art/filter_cache_client.h" @@ -1907,54 +1906,7 @@ class DBImpl : public DB { HeatGroupManager* group_manager_; #ifdef ART_PLUS - // TODO: add necessary filter cache info structures - FilterCacheClient filter_cache_; // already contain FilterCacheManager - - // TODO: mutex for updating these recorders below - // will be locked when updating these recorders below, and unlock after updating ends - std::mutex filter_cache_mutex_; - - // these global recorders need to be latest after every flush or compaction: - // std::map* level_recorder_ - // std::map>* segment_ranges_recorder_ - // std::map* unit_size_recorder_ - // you may need filter_cache_.range_seperators() to receive key range seperators - // exactly, if key k < seperators[i+1] and key k >= seperators[i], then key k hit key range i - // HeatBuckets::locate(const std::string& key) will tell you how to binary search corresponding key range for one key - - // segment_info_recorder save every segments' min key and max key - // but we only need to pass empty segment_info_recorder now - // TODO: it should contain all levels segments' min key and max key, then pass to filter cache client, but not used now - // this recorder will help decide the key ranges' num, but it dont work in current work - // you can try to modify macro DEFAULT_BUCKETS_NUM to decide the key ranges' num - std::unordered_map>* segment_info_recorder_; - - // record every alive segments' level - // TODO: need to be latest all the time - std::map* level_recorder_; - - // record features num of every segments - // we choose max features num to define model feature num - // if you want to use a default features num, set MAX_FEATURES_NUM to non-zero value - // then do not insert any entry into this vector later - // TODO: we dont use this vector, so we set MAX_FEATURES_NUM to non-zero value - std::vector* features_nums_except_level_0_; - - // should be based level 0 visit cnt in a total long period - // simply we set level_0_base_count to 0, and use macro INIT_LEVEL_0_COUNT - // we can set this macro to ( PERIOD_COUNT * TRAIN_PERIODS ) * ( level 0 sorted runs num ) / ( max level 0 segments num ) - // TODO: modify INIT_LEVEL_0_COUNT to proper value - uint32_t level_0_base_count_; - - // record interacting ranges and their rates of alive segments - // TODO: should be latest all the time - std::map>* segment_ranges_recorder_; - - // every segment's filter unit size is the same - // this recorder should hold all alive segment - // simply, you can also use default macro DEFAULT_UNIT_SIZE for all segments, just leave this recorder empty - // TODO: modify DEFAULT_UNIT_SIZE - std::map* unit_size_recorder_; + // filter_cache context (moved to filter_cache_client.h/cc) /* HeatBuckets heat_buckets_; diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 846c262e6..df537005c 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -7,6 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
See the AUTHORS file for names of contributors. #include +#include +#include #include "db/art/logger.h" #include "db/builder.h" @@ -14,14 +16,18 @@ #include "db/error_handler.h" #include "db/event_helpers.h" #include "db/nvm_flush_job.h" +#include "db/table_cache.h" #include "file/sst_file_manager_impl.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" +#include "rocksdb/options.h" +#include "table/block_based/filter_block.h" #include "test_util/sync_point.h" #include "util/cast_util.h" #include "util/concurrent_task_limiter_impl.h" +#include "db/art/global_filter_cache_context.h" namespace ROCKSDB_NAMESPACE { @@ -2546,13 +2552,13 @@ void DBImpl::SyncCallFlush(std::vector& jobs) { // you may need filter_cache_.range_seperators() to receive key range seperators // exactly, if key k < seperators[i+1] and key k >= seperators[i], then key k hit key range i // HeatBuckets::locate(const std::string& key) will tell you how to binary search corresponding key range for one key - std::set* merged_segment_ids = new std::set; // the merged segments' id, we need to delete them from these 3 global recorders - std::map* new_level_recorder = new std::map; - std::map>* new_segment_ranges_recorder = new std::map>; - std::map* new_unit_size_recorder = new std::map; - std::vector& key_range_seperators = filter_cache_.range_seperators(); - std::set* new_segment_ids = new std::set; - std::map>* inherit_infos_recorder = new std::map>; + std::unique_ptr> merged_segment_ids (new std::set); // the merged segments' id, we need to delete them from these 3 global recorders + std::unique_ptr> new_level_recorder (new std::map); + std::unique_ptr>> new_segment_ranges_recorder (new std::map>); + std::unique_ptr> new_unit_size_recorder (new std::map); + std::vector& key_range_seperators = global_filter_cache.range_seperators(); + std::unique_ptr> new_segment_ids(new std::set); + std::unique_ptr>> inherit_infos_recorder (new std::map>); // TODO(WaLSM+): you can pass these var into NVMFlushJob and update them when flushing #endif @@ -2632,6 +2638,15 @@ void DBImpl::SyncCallFlush(std::vector& jobs) { } } + // // WaLSM+ debug + // for (auto& db_job : db_jobs) { + // auto& meta = db_job.nvm_flush_job->meta_; + // std::cout << "f, filename=" << meta.fd.GetNumber() + // << ", smallest=" << meta.smallest.user_key().ToString() + // << ", largest=" << meta.largest.user_key().ToString() + // << std::endl; + // } + TEST_SYNC_POINT("DBImpl::SyncCallFlush:FlushFinish:0"); ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); @@ -2657,80 +2672,114 @@ void DBImpl::SyncCallFlush(std::vector& jobs) { } TEST_SYNC_POINT("DBImpl::SyncCallFlush:ContextCleanedUp"); + atomic_flush_install_cv_.SignalAll(); + bg_cv_.SignalAll(); + + // sync first, we leave other data collection work at last. 
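The temp recorders above switch from raw new/delete pairs to std::unique_ptr, which is why the matching delete block disappears at the end of this function. The ownership pattern, reduced to its essentials (value types simplified to the ones that are certain from usage; the real recorders also carry range and inherit info):

#include <cstdint>
#include <map>
#include <memory>
#include <set>

void BuildTempRecordersSketch() {
  // owned for the duration of one flush; released automatically on every
  // early-return and error path, so no trailing delete block is needed
  std::unique_ptr<std::set<uint32_t>> merged_segment_ids(new std::set<uint32_t>());
  std::unique_ptr<std::map<uint32_t, uint16_t>> new_level_recorder(
      new std::map<uint32_t, uint16_t>());

  new_level_recorder->insert(std::make_pair(uint32_t{42}, uint16_t{0}));  // e.g. segment 42 flushed to L0
  // ... pass *merged_segment_ids / *new_level_recorder by reference as the patch does ...
}  // both recorders freed here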
+ // std::vector<SegmentBuilderResult> segment_builder_results; + SegmentBuilderResult agg_segment_builder_result; + for (auto& db_job : db_jobs) { + // segment_builder_results.emplace_back(std::move( + // db_job.nvm_flush_job->segment_builder_result_)); + auto& sub_result = db_job.nvm_flush_job->segment_builder_result_; + agg_segment_builder_result.new_segment_ids.insert( + sub_result.new_segment_ids.begin(), + sub_result.new_segment_ids.end()); + + agg_segment_builder_result.merged_segment_ids.insert( + sub_result.merged_segment_ids.begin(), + sub_result.merged_segment_ids.end()); + + for (auto& per_segment_result : sub_result.per_segment_results) { + agg_segment_builder_result.per_segment_results.push_back( + std::move(per_segment_result)); + } + } + agg_segment_builder_result.output_level = 0; // flushed + assert(agg_segment_builder_result.new_segment_ids.size() > 0); + // insert all filter block handles to FilterCache + // TableCache* table_cache = (TableCache*) table_cache_.get(); + for (auto& db_job : db_jobs) { + auto& meta = db_job.nvm_flush_job->meta_; + assert(meta.fd.table_reader != nullptr); + const auto* table = meta.fd.table_reader; + auto block_handles_map = table->GetSegmentBlockHandles(); + assert(block_handles_map.size() > 0); + for (const auto& segment_id_and_block_handles : block_handles_map) { + auto segment_id = segment_id_and_block_handles.first; + const auto& block_handles = segment_id_and_block_handles.second; + // dangerous cast, but we know that the table is a BlockBasedTable + global_filter_cache.init_segment(segment_id, (BlockBasedTable*) table, block_handles); + } + } + for (auto& db_job : db_jobs) { num_running_flushes_--; delete db_job.nvm_flush_job; } - atomic_flush_install_cv_.SignalAll(); - bg_cv_.SignalAll(); #ifdef ART_PLUS + // transfer agg_segment_builder_result to temp recorders + + // update merged_segment_ids and new_segment_ids + // agg_segment_builder_result.merged_segment_ids should only have one element - INVALID_SEGMENT_ID + assert(agg_segment_builder_result.merged_segment_ids.size() <= 1); + assert(merged_segment_ids->empty()); + + for (const auto& id : agg_segment_builder_result.new_segment_ids) { + new_segment_ids->insert(id); + } + + // update new_level_recorder + for (const auto id : agg_segment_builder_result.new_segment_ids) { + new_level_recorder->insert(std::make_pair(id, agg_segment_builder_result.output_level)); + } + + // update new_segment_ranges_recorder and inherit_infos_recorder + for (const auto& per_segment_result : agg_segment_builder_result.per_segment_results) { + const auto segment_id = per_segment_result.segment_id; + (*new_segment_ranges_recorder)[segment_id] = per_segment_result.range_rate_pairs; + // no inherit_info when flushing + } + // do the new SSTs already exist in the latest version? // TODO(WaLSM+): if all ok, merge temp recorders into global DBImpl recorders.
// we need a mutex to guarantee these recorders are modified by only one background thread at a time - filter_cache_mutex_.lock(); // std::map merged_level_recorder; // actually when flushing, there is no merged segment // remove merged segments - assert(merged_segment_ids->empty()); - /* - auto level_it = level_recorder_->begin(); - auto range_it = segment_ranges_recorder_->begin(); - auto units_it = unit_size_recorder_->begin(); - while (level_it != level_recorder_->end()) { - if (merged_segment_ids->count(level_it->first) > 0) { - merged_level_recorder.insert(std::make_pair(level_it->first, level_it->second)) - level_it = level_recorder_->erase(level_it); - } else { - level_it ++; + // lock and update global recorders + { + std::lock_guard<std::mutex> lock_guard(global_filter_cache_recorders_mutex); + assert(new_level_recorder->size() == new_segment_ranges_recorder->size()); + auto new_level_it = new_level_recorder->begin(); + auto new_range_it = new_segment_ranges_recorder->begin(); + auto new_units_it = new_unit_size_recorder->begin(); + while (new_level_it != new_level_recorder->end()) { + global_level_recorder.insert( + std::make_pair(new_level_it->first, new_level_it->second)); + new_level_it++; } - } - while (range_it != segment_ranges_recorder_->end()) { - if (merged_segment_ids->count(range_it->first) > 0) { - range_it = segment_ranges_recorder_->erase(range_it); - } else { - range_it ++; + while (new_range_it != new_segment_ranges_recorder->end()) { + global_segment_ranges_recorder.insert( + std::make_pair(new_range_it->first, new_range_it->second)); + new_range_it++; } - } - while (units_it != unit_size_recorder_->end()) { - if (merged_segment_ids->count(units_it->first) > 0) { - units_it = unit_size_recorder_->erase(units_it); - } else { - units_it ++; + while (new_units_it != new_unit_size_recorder->end()) { + // unit_size_recorder_.insert(std::make_pair(new_units_it->first, + // new_units_it->second)); we only use DEFAULT_UNIT_SIZE + new_units_it++; } } - */ + // recorder's lock released - // lock and update global recorders - global_recorder_mutex_.lock(); - // merge temp recorders into global DBImpl recorders.
- assert(new_level_recorder->size() == new_segment_ranges_recorder->size()); - auto new_level_it = new_level_recorder->begin(); - auto new_range_it = new_segment_ranges_recorder->begin(); - auto new_units_it = new_unit_size_recorder->begin(); - while (new_level_it != new_level_recorder->end()) { - level_recorder_.insert(std::make_pair(new_level_it->first, new_level_it->second)); - new_level_it ++; - } - while (new_range_it != new_segment_ranges_recorder->end()) { - segment_ranges_recorder_.insert(std::make_pair(new_range_it->first, new_range_it->second)); - new_range_it ++; - } - while (new_units_it != new_unit_size_recorder->end()) { - // unit_size_recorder_.insert(std::make_pair(new_units_it->first, new_units_it->second)); - // we only use DEFAULT_UNIT_SIZE - new_units_it ++; - } - global_recorder_mutex_.unlock(); - // call filter cache client DBImpl::filter_cache_ update work assert(merged_segment_ids->empty()); assert(inherit_infos_recorder->empty()); std::vector merged_segment_ids_vec, new_segment_ids_vec; - merged_segment_ids_vec.assign(merged_segment_ids.begin(), merged_segment_ids.end()); - new_segment_ids_vec.assign(new_segment_ids.begin(), new_segment_ids.end()); - filter_cache_.batch_insert_segments(merged_segment_ids_vec, new_segment_ids_vec, *inherit_infos_recorder, + merged_segment_ids_vec.assign(merged_segment_ids->begin(), merged_segment_ids->end()); + new_segment_ids_vec.assign(new_segment_ids->begin(), new_segment_ids->end()); + global_filter_cache.batch_insert_segments(merged_segment_ids_vec, new_segment_ids_vec, *inherit_infos_recorder.get(), *new_level_recorder, 0, *new_segment_ranges_recorder); // temp recorders below: @@ -2745,16 +2794,6 @@ void DBImpl::SyncCallFlush(std::vector& jobs) { // std::map* level_recorder_ // std::map>* segment_ranges_recorder_ // std::map* unit_size_recorder_ - - // release temp recorders? 
- delete merged_segment_ids; - delete new_level_recorder; - delete new_segment_ranges_recorder; - delete new_unit_size_recorder; - delete new_segment_ids; - delete inherit_infos_recorder; - - filter_cache_mutex_.unlock(); #endif } } @@ -3080,24 +3119,38 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, // you may need filter_cache_.range_seperators() to receive key range seperators // exactly, if key k < seperators[i+1] and key k >= seperators[i], then key k hit key range i // HeatBuckets::locate(const std::string& key) will tell you how to binary search corresponding key range for one key - std::set* merged_segment_ids = new std::set; // the merged segments' id, we need to delete them from these 3 global recorders - std::map* new_level_recorder = new std::map; - std::map>* new_segment_ranges_recorder = new std::map>; - std::map* new_unit_size_recorder = new std::map; - std::vector& key_range_seperators = filter_cache_.range_seperators(); - std::set* new_segment_ids = new std::set; - std::map>* inherit_infos_recorder = new std::map>; + std::unique_ptr> merged_segment_ids( + new std::set); // the merged segments' id, we need to delete + // them from these 3 global recorders + std::unique_ptr> new_level_recorder( + new std::map); + std::unique_ptr>> + new_segment_ranges_recorder( + new std::map>); + std::unique_ptr> new_unit_size_recorder( + new std::map); + const std::vector& key_range_seperators = + global_filter_cache.range_seperators(); + std::unique_ptr> new_segment_ids(new std::set); + std::unique_ptr>> + inherit_infos_recorder( + new std::map>); // TODO(WaLSM+): you can pass these var into NVMFlushJob and update them when compacting int compaction_flag = 0; // 0 = not defined, 1 = delete compaction, 2 = trivial compaction, 3 = other #endif + // WaLSM+: result from compaction + SegmentBuilderResult segment_builder_result; + IOStatus io_s; if (!c) { // Nothing to do ROCKS_LOG_BUFFER(log_buffer, "Compaction nothing to do"); - } else if (c->deletion_compaction()) { + } else if (UNLIKELY(c->deletion_compaction())) { // TODO(icanadi) Do we want to honor snapshots here? i.e. 
not delete old
     // file if there is alive snapshot pointing to it
+    assert(false);  // cannot get here
+    exit(1);
     TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
                              c->column_family_data());
     assert(c->num_input_files(1) == 0);
@@ -3115,10 +3168,17 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
 #ifdef ART_PLUS
     compaction_flag = 1;
 #endif
-    /*
-    std::set* merged_segment_ids = new std::set;
-    // the merged segments' id, we need to delete them from these 3 global recorders
-    */
+
+    // the merged segments' ids; we need to delete them from these 3 global recorders
+    std::unique_ptr> merged_segment_ids_f1(new std::set);
+    for (const auto& f : *c->inputs(0)) {
+      auto segment_handles_map = f->fd.table_reader->GetSegmentBlockHandles();
+      for (const auto& segment_id_and_block_handles : segment_handles_map) {
+        auto segment_id = segment_id_and_block_handles.first;
+        merged_segment_ids_f1->insert(segment_id);
+      }
+    }
+
     for (const auto& f : *c->inputs(0)) {
       c->edit()->DeleteFile(c->level(), f->fd.GetNumber());
     }
@@ -3135,7 +3195,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
     *made_progress = true;
     TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
                              c->column_family_data());
-  } else if (!trivial_move_disallowed && c->IsTrivialMove()) {
+  } else if (UNLIKELY(!trivial_move_disallowed && c->IsTrivialMove())) {
+    assert(false);  // cannot get here
+    exit(1);
     TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove");
     TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
                              c->column_family_data());
@@ -3156,21 +3218,39 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
     int64_t moved_bytes = 0;
 #ifdef ART_PLUS
     compaction_flag = 2;  // sign for TrivialMove
+
+    // TODO(WaLSM+): no new SST is generated and no SST is merged; we just move segments (from different levels) to their target levels
+    // we can copy the moved segment ids into merged_segment_ids,
+    // then record these moved segments' new level in new_level_recorder
+    // maybe we need to record segment ids for every SST for convenience?
+
+    std::unique_ptr> merged_segment_ids_f2(new std::set);
+    // the merged segments' ids; we need to delete them from these 3 global recorders
+    std::unique_ptr> new_level_recorder_f2(new std::map);
+    auto output_level_f2 = c->output_level();
+
+    for (unsigned int l = 0; l < c->num_input_levels(); l++) {
+      if (c->level(l) == c->output_level()) {
+        continue;
+      }
+      for (size_t i = 0; i < c->num_input_files(l); i++) {
+        FileMetaData* f = c->input(l, i);
+        auto segment_handles_map = f->fd.table_reader->GetSegmentBlockHandles();
+        for (const auto& segment_id_and_block_handles : segment_handles_map) {
+          auto segment_id = segment_id_and_block_handles.first;
+          merged_segment_ids_f2->insert(segment_id);
+          (*new_level_recorder_f2)[segment_id] = output_level_f2;
+        }
+      }
+    }
+
 #endif
+
     for (unsigned int l = 0; l < c->num_input_levels(); l++) {
       if (c->level(l) == c->output_level()) {
         continue;
       }
       for (size_t i = 0; i < c->num_input_files(l); i++) {
-        // TODO(WaLSM+): no new SST generated and no SST merged, just move segments(from different levels) to target levels
-        // we can copy moved segment ids into merged_segment_ids.
-        // then record these moved segments' new level to new_level_recorder
-        // maybe we need to record segment ids for every SST for convience?
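
The trivial-move bookkeeping added above never rewrites data: a moved segment keeps its id and only its level changes. A hedged sketch of that contract, where GetSegmentIds() is a hypothetical stand-in for the table reader's GetSegmentBlockHandles() call:

    #include <cstdint>
    #include <map>
    #include <set>
    #include <vector>

    // Hypothetical helper: ids of the segments stored in one SST file.
    // Dummy body for illustration only.
    std::vector<uint32_t> GetSegmentIds(uint64_t file_number) {
      return {static_cast<uint32_t>(file_number * 10),
              static_cast<uint32_t>(file_number * 10 + 1)};
    }

    void RecordTrivialMove(const std::vector<uint64_t>& moved_files, int output_level,
                           std::set<uint32_t>* moved_segment_ids,
                           std::map<uint32_t, int>* new_level_recorder) {
      for (uint64_t file_number : moved_files) {
        for (uint32_t segment_id : GetSegmentIds(file_number)) {
          // Same segment id before and after the move; only the level changes.
          moved_segment_ids->insert(segment_id);
          (*new_level_recorder)[segment_id] = output_level;
        }
      }
    }
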
- /* - std::set* merged_segment_ids; - // the merged segments' id, we need to delete them from these 3 global recorders - std::map* new_level_recorder = new std::map; - */ FileMetaData* f = c->input(l, i); c->edit()->DeleteFile(c->level(l), f->fd.GetNumber()); c->edit()->AddFile(c->output_level(), f->fd.GetNumber(), @@ -3293,6 +3373,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, TEST_SYNC_POINT_CALLBACK( "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr); // Should handle erorr? + compaction_job.SetFilterCacheClient(&global_filter_cache); compaction_job.Run().PermitUncheckedError(); TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun"); mutex_.Lock(); @@ -3308,9 +3389,40 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction", c->column_family_data()); - + #ifdef ART_PLUS + if (status.ok() && !io_s.ok()) { + status = io_s; + } else { + io_s.PermitUncheckedError(); + } + + if (c != nullptr) { + c->ReleaseCompactionFiles(status); + *made_progress = true; + + #ifndef ROCKSDB_LITE + // Need to make sure SstFileManager does its bookkeeping + auto sfm = static_cast( + immutable_db_options_.sst_file_manager.get()); + if (sfm && sfm_reserved_compact_space) { + sfm->OnCompactionCompletion(c.get()); + } + #endif // ROCKSDB_LITE + + NotifyOnCompactionCompleted(c->column_family_data(), c.get(), status, + compaction_job_stats, job_context->job_id); + } + #endif + + compaction_job.CollectDataAndPrefetch(); + segment_builder_result = compaction_job.GetSegmentBuilderResult(); + compaction_job.CleanupCompaction(); + assert(segment_builder_result.new_segment_ids.size() > 0); } +#ifdef ART_PLUS +if (compaction_flag != 3) { +#endif if (status.ok() && !io_s.ok()) { status = io_s; } else { @@ -3333,7 +3445,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, NotifyOnCompactionCompleted(c->column_family_data(), c.get(), status, compaction_job_stats, job_context->job_id); } - +#ifdef ART_PLUS +} +#endif if (status.ok() || status.IsCompactionTooLarge() || status.IsManualCompactionPaused()) { // Done @@ -3424,39 +3538,40 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, // do new SSTs already exist in latest version? // TODO(WaLSM+): if all ok, merge temp recorders into global DBImpl recorders. 
// we need a mutex to guarantee these recorders modified by only one background thread at one time - filter_cache_mutex_.lock(); assert(compaction_flag >= 0 && compaction_flag <= 3); - if (compaction_flag == 1) { + if (UNLIKELY(compaction_flag == 1)) { + assert(false); // cannot get here + exit(1); // lock and update global recorders - global_recorder_mutex_.lock(); + global_filter_cache_recorders_mutex.lock(); // remove merged segments - auto level_it = level_recorder_->begin(); - auto range_it = segment_ranges_recorder_->begin(); - auto units_it = unit_size_recorder_->begin(); + auto level_it = global_level_recorder.begin(); + auto range_it = global_segment_ranges_recorder.begin(); + auto units_it = global_unit_size_recorder.begin(); std::map merged_level_recorder; - while (level_it != level_recorder_->end()) { + while (level_it != global_level_recorder.end()) { if (merged_segment_ids->count(level_it->first) > 0) { merged_level_recorder.insert(std::make_pair(level_it->first, level_it->second)); - level_it = level_recorder_->erase(level_it); + level_it = global_level_recorder.erase(level_it); } else { level_it ++; } } - while (range_it != segment_ranges_recorder_->end()) { + while (range_it != global_segment_ranges_recorder.end()) { if (merged_segment_ids->count(range_it->first) > 0) { - range_it = segment_ranges_recorder_->erase(range_it); + range_it = global_segment_ranges_recorder.erase(range_it); } else { range_it ++; } } - while (units_it != unit_size_recorder_->end()) { + while (units_it != global_unit_size_recorder.end()) { if (merged_segment_ids->count(units_it->first) > 0) { - units_it = unit_size_recorder_->erase(units_it); + units_it = global_unit_size_recorder.erase(units_it); } else { units_it ++; } } - global_recorder_mutex_.unlock(); + global_filter_cache_recorders_mutex.unlock(); // merge merge temp recorders into global DBImpl recorders. assert(new_level_recorder->empty()); @@ -3488,8 +3603,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, // new segments id empty, that will not fit in batch_insert_segments // we need a new method batch_delete_segments to only delete merge segments std::vector merged_segment_ids_vec; - merged_segment_ids_vec.assign(merged_segment_ids.begin(), merged_segment_ids.end()); - filter_cache_.batch_delete_segments(merged_segment_ids_vec, merged_level_recorder); + merged_segment_ids_vec.assign(merged_segment_ids->begin(), merged_segment_ids->end()); + global_filter_cache.batch_delete_segments(merged_segment_ids_vec); // temp recorders below: // std::set* merged_segment_ids = new std::set; // the merged segments' id, we need to delete them from these 3 global recorders @@ -3503,42 +3618,36 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, // std::map* level_recorder_ // std::map>* segment_ranges_recorder_ // std::map* unit_size_recorder_ - - // release temp recorders? 
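
The recorder-pruning loops above all lean on the same idiom: std::map::erase(iterator) returns the iterator to the element after the erased one, which keeps the traversal valid while elements are removed. A minimal sketch, assuming plain uint32_t segment ids:

    #include <cstdint>
    #include <map>
    #include <set>

    void PruneMergedSegments(std::map<uint32_t, int>& level_recorder,
                             const std::set<uint32_t>& merged_segment_ids) {
      for (auto it = level_recorder.begin(); it != level_recorder.end();) {
        if (merged_segment_ids.count(it->first) > 0) {
          it = level_recorder.erase(it);  // erase() hands back the next valid iterator
        } else {
          ++it;
        }
      }
    }
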
- delete merged_segment_ids; - delete new_level_recorder; - delete new_segment_ranges_recorder; - delete new_unit_size_recorder; - delete new_segment_ids; - delete inherit_infos_recorder; - - } else if (compaction_flag == 2) { + } else if (UNLIKELY(compaction_flag == 2)) { + assert(false); // cannot get here + exit(1); // lock and update global recorders - global_recorder_mutex_.lock(); + global_filter_cache_recorders_mutex.lock(); // modify segments' level - auto level_it = level_recorder_->begin(); - auto range_it = segment_ranges_recorder_->begin(); + auto level_it = global_level_recorder.begin(); + auto range_it = global_segment_ranges_recorder.begin(); assert(new_level_recorder->size() > 0); assert(merged_segment_ids->size() == new_level_recorder->size()); std::map old_level_recorder; - while (level_it != level_recorder_->end()) { + while (level_it != global_level_recorder.end()) { if (merged_segment_ids->count(level_it->first) > 0) { old_level_recorder.insert(std::make_pair(level_it->first, level_it->second)); - level_it = level_recorder_->erase(level_it); + level_it = global_level_recorder.erase(level_it); } else { level_it ++; } } - while (range_it != segment_ranges_recorder_->end()) { + while (range_it != global_segment_ranges_recorder.end()) { if (merged_segment_ids->count(range_it->first) > 0) { - new_segment_ranges_recorder->insert(std::make_pair(range_it->first, range->second)); - range_it = segment_ranges_recorder_->erase(range_it); + new_segment_ranges_recorder->insert(std::make_pair(range_it->first, range_it->second)); + // no need to erase + // range_it = global_segment_ranges_recorder.erase(range_it); } else { range_it ++; } } - assert(unit_size_recorder_->empty()); + assert(new_unit_size_recorder->empty()); /* while (units_it != unit_size_recorder_->end()) { if (merged_segment_ids->count(units_it->first) > 0) { @@ -3551,27 +3660,27 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, assert(new_level_recorder->size() == new_segment_ranges_recorder->size()); auto new_level_it = new_level_recorder->begin(); - auto new_range_it = new_segment_ranges_recorder->begin(); + // auto new_range_it = new_segment_ranges_recorder->begin(); auto new_units_it = new_unit_size_recorder->begin(); while (new_level_it != new_level_recorder->end()) { - level_recorder_->insert(std::make_pair(new_level_it->first, new_level_it->second)); + global_level_recorder.insert(std::make_pair(new_level_it->first, new_level_it->second)); new_level_it ++; } - while (new_range_it != new_segment_ranges_recorder->end()) { - segment_ranges_recorder_->insert(std::make_pair(new_range_it->first, new_range_it->second)); - new_range_it ++; - } + // while (new_range_it != new_segment_ranges_recorder->end()) { + // global_segment_ranges_recorder.insert(std::make_pair(new_range_it->first, new_range_it->second)); + // new_range_it ++; + // } while (new_units_it != new_unit_size_recorder->end()) { // unit_size_recorder_.insert(std::make_pair(new_units_it->first, new_units_it->second)); new_units_it ++; } - global_recorder_mutex_.unlock(); + global_filter_cache_recorders_mutex.unlock(); // call filter cache client DBImpl::filter_cache_ update work // we need a new filter cache operation to support moving segments to a new level std::vector merged_segment_ids_vec; - merged_segment_ids_vec.assign(merged_segment_ids.begin(), merged_segment_ids.end()); - filter_cache_.batch_move_segments(merged_segment_ids_vec, old_level_recorder, *new_level_recorder, *new_segment_ranges_recorder); + 
merged_segment_ids_vec.assign(merged_segment_ids->begin(), merged_segment_ids->end()); + global_filter_cache.batch_move_segments(merged_segment_ids_vec, old_level_recorder, *new_level_recorder, *new_segment_ranges_recorder); // temp recorders below: // std::set* merged_segment_ids = new std::set; // the merged segments' id, we need to delete them from these 3 global recorders @@ -3585,44 +3694,58 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, // std::map* level_recorder_ // std::map>* segment_ranges_recorder_ // std::map* unit_size_recorder_ + } else if (LIKELY(compaction_flag == 3)) { + // get SegmentBuilderResult from compaction job + + // update merged_segment_ids and new_segment_ids + for (const auto& id : segment_builder_result.merged_segment_ids) { + merged_segment_ids->insert(id); + } + for (const auto& id : segment_builder_result.new_segment_ids) { + new_segment_ids->insert(id); + } + + // update new_level_recorder + for (const auto id : segment_builder_result.new_segment_ids) { + new_level_recorder->insert(std::make_pair(id, segment_builder_result.output_level)); + } + + // update new_segment_ranges_recorder and inherit_infos_recorder + for (const auto& per_segment_result : segment_builder_result.per_segment_results) { + const auto segment_id = per_segment_result.segment_id; + (*new_segment_ranges_recorder)[segment_id] = per_segment_result.range_rate_pairs; + (*inherit_infos_recorder)[segment_id] = per_segment_result.inherit_recorder; + } - // release temp recorders? - delete merged_segment_ids; - delete new_level_recorder; - delete new_segment_ranges_recorder; - delete new_unit_size_recorder; - delete new_segment_ids; - delete inherit_infos_recorder; - } else if (compaction_flag == 3) { // it is normal compaction (merge->split) std::map merged_level_recorder; // lock and update global recorders - global_recorder_mutex_.lock(); + global_filter_cache_recorders_mutex.lock(); // remove merged segments assert(!(merged_segment_ids->empty())); - auto level_it = level_recorder_->begin(); - auto range_it = segment_ranges_recorder_->begin(); - auto units_it = unit_size_recorder_->begin(); - while (level_it != level_recorder_->end()) { + auto level_it = global_level_recorder.begin(); + auto range_it = global_segment_ranges_recorder.begin(); + auto units_it = global_unit_size_recorder.begin(); + while (level_it != global_level_recorder.end()) { if (merged_segment_ids->count(level_it->first) > 0) { - merged_level_recorder.insert(std::make_pair(level_it->first, level_it->second)) - level_it = level_recorder_->erase(level_it); + merged_level_recorder.insert(std::make_pair(level_it->first, level_it->second)); + level_it = global_level_recorder.erase(level_it); } else { level_it ++; } } - while (range_it != segment_ranges_recorder_->end()) { + while (range_it != global_segment_ranges_recorder.end()) { if (merged_segment_ids->count(range_it->first) > 0) { - range_it = segment_ranges_recorder_->erase(range_it); + range_it = global_segment_ranges_recorder.erase(range_it); } else { range_it ++; } } - while (units_it != unit_size_recorder_->end()) { + while (units_it != global_unit_size_recorder.end()) { if (merged_segment_ids->count(units_it->first) > 0) { - units_it = unit_size_recorder_->erase(units_it); + units_it = global_unit_size_recorder.erase(units_it); } else { units_it ++; } @@ -3635,11 +3758,11 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, auto new_range_it = new_segment_ranges_recorder->begin(); auto new_units_it = new_unit_size_recorder->begin(); while 
(new_level_it != new_level_recorder->end()) {
-        level_recorder_.insert(std::make_pair(new_level_it->first, new_level_it->second));
+        global_level_recorder.insert(std::make_pair(new_level_it->first, new_level_it->second));
         new_level_it ++;
       }
       while (new_range_it != new_segment_ranges_recorder->end()) {
-        segment_ranges_recorder_.insert(std::make_pair(new_range_it->first, new_range_it->second));
+        global_segment_ranges_recorder.insert(std::make_pair(new_range_it->first, new_range_it->second));
         new_range_it ++;
       }
       while (new_units_it != new_unit_size_recorder->end()) {
@@ -3647,24 +3770,16 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
         // we only use DEFAULT_UNIT_SIZE
         new_units_it ++;
       }
-      global_recorder_mutex_.unlock();
-
-      // make sure that we also input merged segments' level
-      // batch_insert_segments argument need both merged and new segments' level
-      auto merged_it = merged_level_recorder.begin();
-      while (merged_it != merged_level_recorder.end()) {
-        assert(new_level_recorder->find(merged_it->first) == new_level_recorder.end());
-        new_level_recorder->insert(std::make_pair(merged_it->first, merged_it->second));
-        merged_it ++;
-      }
-      assert(new_level_recorder->size() == new_segment_ids->size() + merged_segment_ids->size());
+      global_filter_cache_recorders_mutex.unlock();
+      // there are no merged segments' ids in new_level_recorder
+      assert(new_level_recorder->size() == new_segment_ids->size());
       // call filter cache client DBImpl::filter_cache_ update work
       assert(inherit_infos_recorder->size() == new_segment_ids->size());
       std::vector merged_segment_ids_vec, new_segment_ids_vec;
-      merged_segment_ids_vec.assign(merged_segment_ids.begin(), merged_segment_ids.end());
-      new_segment_ids_vec.assign(new_segment_ids.begin(), new_segment_ids.end());
-      filter_cache_.batch_insert_segments(merged_segment_ids_vec, new_segment_ids_vec, *inherit_infos_recorder,
+      merged_segment_ids_vec.assign(merged_segment_ids->begin(), merged_segment_ids->end());
+      new_segment_ids_vec.assign(new_segment_ids->begin(), new_segment_ids->end());
+      global_filter_cache.batch_insert_segments(merged_segment_ids_vec, new_segment_ids_vec, *inherit_infos_recorder,
                                                 *new_level_recorder, 0, *new_segment_ranges_recorder);
       // temp recorders below:
@@ -3679,19 +3794,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
       // std::map* level_recorder_
       // std::map>* segment_ranges_recorder_
       // std::map* unit_size_recorder_
-
-      // release temp recorders?
-      delete merged_segment_ids;
-      delete new_level_recorder;
-      delete new_segment_ranges_recorder;
-      delete new_unit_size_recorder;
-      delete new_segment_ids;
-      delete inherit_infos_recorder;
-
     } else {
       assert(compaction_flag == 0);
     }
-    filter_cache_mutex_.unlock();
 #endif
   return status;
 }
diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc
index 7838c62b2..938f36418 100644
--- a/db/db_impl/db_impl_write.cc
+++ b/db/db_impl/db_impl_write.cc
@@ -15,6 +15,7 @@
 #include "options/options_helper.h"
 #include "test_util/sync_point.h"
 #include "util/cast_util.h"
+#include "db/art/global_filter_cache_context.h"

 namespace ROCKSDB_NAMESPACE {
 // Convenience methods
@@ -24,8 +25,13 @@ Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family,
 #ifdef ART_PLUS
   // heat_buckets not ready, still sample into pool
   // if ready, prepare func auto return and do nothing
+  #ifndef SAMPLES_FILE
+  // if a key sample file is used, do not use this code path.
+ assert(false); std::string art_key(key.data(), key.size()); - filter_cache_.prepare_heat_buckets(art_key, segment_info_recorder_); + global_filter_cache.prepare_heat_buckets(art_key, &global_segment_info_recorder); + + #endif #endif return DB::Put(o, column_family, key, val); } diff --git a/db/db_test3.cc b/db/db_test3.cc index c616a1b0f..7e11659fa 100644 --- a/db/db_test3.cc +++ b/db/db_test3.cc @@ -373,7 +373,7 @@ void DoTest(std::string test_name) { options.use_direct_io_for_flush_and_compaction = true; options.use_direct_reads = true; options.enable_pipelined_write = true; - options.nvm_path = "/pg_wal/ycc/memory_art"; + options.nvm_path = "/mnt/pmem0.7/guoteng/nodememory"; options.compression = rocksdb::kNoCompression; options.IncreaseParallelism(16); @@ -382,7 +382,7 @@ void DoTest(std::string test_name) { zipf->Prepare(); DB* db; - DB::Open(options, "/tmp/db_old_custom", &db); + DB::Open(options, "/mnt/nvme0n1/guoteng/walsmtest/tmp/db_old_custom", &db); std::thread read_threads[thread_num]; std::thread write_threads[thread_num]; diff --git a/db/dbformat.h b/db/dbformat.h index 38fea61ed..7aa425836 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -9,6 +9,7 @@ #pragma once #include +#include #include #include #include @@ -16,6 +17,7 @@ #include "db/merge_context.h" #include "logging/logging.h" #include "monitoring/perf_context_imp.h" +#include "port/port_posix.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/filter_policy.h" @@ -98,6 +100,8 @@ static const SequenceNumber kDisableGlobalSequenceNumber = port::kMaxUint64; constexpr uint64_t kNumInternalBytes = 8; +constexpr uint32_t INVALID_SEGMENT_ID = port::kMaxUint32; + // The data structure that represents an internal key in the way that user_key, // sequence number and type are stored in separated forms. struct ParsedInternalKey { @@ -174,11 +178,21 @@ inline Slice ExtractUserKey(const Slice& internal_key) { return Slice(internal_key.data(), internal_key.size() - kNumInternalBytes); } +#ifdef ART_PLUS +Slice generate_modified_internal_key(std::unique_ptr& buf, + Slice original_internal_key, + int filter_index, int segment_id); + +Slice generate_modified_user_key(std::unique_ptr& buf, + Slice original_user_key, int filter_index, + int segment_id); +#endif + #ifdef ART_PLUS // Returns the internal bytes portion of an internal key. 
(WaLSM+) inline Slice ExtractInternalBytes(const Slice& internal_key) { assert(internal_key.size() >= kNumInternalBytes); - return Slice(internal_key.data() + internal_key.size(), kNumInternalBytes); + return Slice(internal_key.data() + internal_key.size() - kNumInternalBytes, kNumInternalBytes); } #endif diff --git a/db/merge_helper.cc b/db/merge_helper.cc index 61ff22506..fff01e981 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -5,6 +5,7 @@ #include "db/merge_helper.h" +#include #include #include "db/dbformat.h" @@ -122,6 +123,7 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, assert(HasOperator()); keys_.clear(); merge_context_.Clear(); + segment_ids_.clear(); has_compaction_filter_skip_until_ = false; assert(user_merge_operator_); bool first_key = true; @@ -134,6 +136,7 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, // original_key_is_iter == (iter->key().ToString() == original_key) bool original_key_is_iter = true; std::string original_key = iter->key().ToString(); + uint32_t original_segment_id = iter->segment_id(); // Important: // orig_ikey is backed by original_key if keys_.empty() // orig_ikey is backed by keys_.back() if !keys_.empty() @@ -220,12 +223,15 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, if (s.ok()) { // The original key encountered original_key = std::move(keys_.back()); + original_segment_id = segment_ids_.back(); orig_ikey.type = kTypeValue; UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type); keys_.clear(); merge_context_.Clear(); + segment_ids_.clear(); keys_.emplace_front(std::move(original_key)); merge_context_.PushOperand(merge_result); + segment_ids_.push_front(original_segment_id); } // move iter to the next entry @@ -262,8 +268,10 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, if (original_key_is_iter) { // this is just an optimization that saves us one memcpy keys_.push_front(std::move(original_key)); + segment_ids_.push_front(original_segment_id); } else { keys_.push_front(iter->key().ToString()); + segment_ids_.push_front(iter->segment_id()); } if (keys_.size() == 1) { // we need to re-anchor the orig_ikey because it was anchored by @@ -285,6 +293,7 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, // (not just this operand), along with some keys following it. keys_.clear(); merge_context_.Clear(); + segment_ids_.clear(); has_compaction_filter_skip_until_ = true; return s; } @@ -329,11 +338,14 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, // We are certain that keys_ is not empty here (see assertions couple of // lines before). 
original_key = std::move(keys_.back()); + original_segment_id = segment_ids_.back(); orig_ikey.type = kTypeValue; UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type); keys_.clear(); merge_context_.Clear(); + segment_ids_.clear(); keys_.emplace_front(std::move(original_key)); + segment_ids_.push_front(original_segment_id); merge_context_.PushOperand(merge_result); } } else { @@ -362,6 +374,7 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, merge_context_.Clear(); merge_context_.PushOperand(merge_result); keys_.erase(keys_.begin(), keys_.end() - 1); + segment_ids_.erase(segment_ids_.begin(), segment_ids_.end() - 1); } } } @@ -373,19 +386,24 @@ MergeOutputIterator::MergeOutputIterator(const MergeHelper* merge_helper) : merge_helper_(merge_helper) { it_keys_ = merge_helper_->keys().rend(); it_values_ = merge_helper_->values().rend(); + it_segment_ids_ = merge_helper_->segment_ids().rend(); } void MergeOutputIterator::SeekToFirst() { const auto& keys = merge_helper_->keys(); const auto& values = merge_helper_->values(); + const auto& segment_ids = merge_helper_->segment_ids(); assert(keys.size() == values.size()); + assert(keys.size() == segment_ids.size()); it_keys_ = keys.rbegin(); it_values_ = values.rbegin(); + it_segment_ids_ = segment_ids.rbegin(); } void MergeOutputIterator::Next() { ++it_keys_; ++it_values_; + ++it_segment_ids_; } CompactionFilter::Decision MergeHelper::FilterMerge(const Slice& user_key, diff --git a/db/merge_helper.h b/db/merge_helper.h index c0534f08b..89b002cc6 100644 --- a/db/merge_helper.h +++ b/db/merge_helper.h @@ -120,6 +120,7 @@ class MergeHelper { } uint64_t TotalFilterTime() const { return total_filter_time_; } bool HasOperator() const { return user_merge_operator_ != nullptr; } + const std::deque& segment_ids() const { return segment_ids_; } // If compaction filter returned REMOVE_AND_SKIP_UNTIL, this method will // return true and fill *until with the key to which we should skip. 
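
The segment_ids_ deque introduced above is a third container kept strictly parallel with keys_ and the merge operands: every clear, push_front, and erase applied to keys_ is mirrored on segment_ids_, and MergeOutputIterator advances all of its reverse iterators together. A minimal sketch of that lockstep invariant (element types simplified; not the fork's actual classes):

    #include <cassert>
    #include <cstdint>
    #include <deque>
    #include <string>

    struct ParallelMergeState {
      std::deque<std::string> keys;
      std::deque<uint32_t> segment_ids;  // parallel with keys

      void PushFront(std::string key, uint32_t segment_id) {
        keys.push_front(std::move(key));
        segment_ids.push_front(segment_id);  // mirror every mutation of keys
      }

      void Clear() {
        keys.clear();
        segment_ids.clear();
      }
    };

    int main() {
      ParallelMergeState state;
      state.PushFront("k2", 7);
      state.PushFront("k1", 3);
      assert(state.keys.size() == state.segment_ids.size());
      // Emit entries the way MergeOutputIterator does: walk both
      // containers in reverse, advancing the iterators together.
      auto kit = state.keys.rbegin();
      auto sit = state.segment_ids.rbegin();
      for (; kit != state.keys.rend(); ++kit, ++sit) {
        // *kit is paired with *sit
      }
      return 0;
    }
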
@@ -155,6 +156,7 @@ class MergeHelper {
   std::deque keys_;
   // Parallel with keys_; stores the operands
   mutable MergeContext merge_context_;
+  std::deque segment_ids_;

   StopWatchNano filter_timer_;
   uint64_t total_filter_time_;
@@ -183,12 +185,14 @@ class MergeOutputIterator {
   Slice key() { return Slice(*it_keys_); }
   Slice value() { return Slice(*it_values_); }
+  uint32_t segment_id() { return *it_segment_ids_; }
   bool Valid() { return it_keys_ != merge_helper_->keys().rend(); }

 private:
   const MergeHelper* merge_helper_;
   std::deque::const_reverse_iterator it_keys_;
   std::vector::const_reverse_iterator it_values_;
+  std::deque::const_reverse_iterator it_segment_ids_;
 };

 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/nvm_flush_job.cc b/db/nvm_flush_job.cc
index 2059f2afb..ce592f008 100644
--- a/db/nvm_flush_job.cc
+++ b/db/nvm_flush_job.cc
@@ -15,6 +15,7 @@
 #include

 #include "db/art/logger.h"
+#include "db/art/art_metric.h"
 #include "db/builder.h"
 #include "db/db_iter.h"
 #include "db/dbformat.h"
@@ -50,6 +51,9 @@

 namespace ROCKSDB_NAMESPACE {

+static FlushMetric flushMetric_;
+// static NVMWriteMetric writeMetric_;
+
 // WaLSM+ Note: copy of FlushJob, add nvm reading
 NVMFlushJob::NVMFlushJob(SingleCompactionJob* job, const std::string& dbname,
                          ColumnFamilyData* cfd,
@@ -200,7 +204,12 @@ void NVMFlushJob::Build() {
       io_status_ = io_s;
     }
     LogFlush(db_options_.info_log);
+    segment_builder_result_ = std::move(job_->segment_builder_result);
   }
+
+  // update WaLSM Flush Metric
+  flushMetric_.updateMetric(job_->out_file_size);
+
   ROCKS_LOG_INFO(db_options_.info_log,
                  "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64
                  " bytes %s"
@@ -298,6 +307,9 @@ void NVMFlushJob::WriteResult(InternalStats::CompactionStats& stats) {
     stream << "file_cpu_read_nanos"
            << (IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos);
   }
+
+  // // update WaLSM write metric
+  // writeMetric_.updateMetric(stats.bytes_written);
 }

 void NVMFlushJob::Cancel() {
diff --git a/db/nvm_flush_job.h b/db/nvm_flush_job.h
index eadbc07ea..7795b8dbb 100644
--- a/db/nvm_flush_job.h
+++ b/db/nvm_flush_job.h
@@ -39,6 +39,7 @@
 #include "rocksdb/listener.h"
 #include "rocksdb/memtablerep.h"
 #include "rocksdb/transaction_log.h"
+#include "table/block_based/filter_block.h"
 #include "table/scoped_arena_iterator.h"
 #include "util/autovector.h"
 #include "util/stop_watch.h"
@@ -93,6 +94,7 @@ class NVMFlushJob {
   LogsWithPrepTracker* logs_with_prep_tracker_;

   FileMetaData meta_;
+  SegmentBuilderResult segment_builder_result_;

 private:
   void ReportStartedFlush();
@@ -162,7 +164,6 @@ class NVMFlushJob {

   const std::shared_ptr io_tracer_;
-
 };

 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 663ce8a94..6c1865a90 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -8,6 +8,7 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

 #include "db/table_cache.h"
+#include

 #include "db/dbformat.h"
 #include "db/range_tombstone_fragmenter.h"
@@ -514,6 +515,11 @@ Status TableCache::Get(FilterCacheClient& filter_cache,
       t = GetTableReaderFromHandle(handle);
     }
   }
+
+  BlockBasedTable* block_based_table = nullptr;
+  block_based_table = static_cast(t);
+  assert(block_based_table != nullptr);
+
   SequenceNumber* max_covering_tombstone_seq =
       get_context->max_covering_tombstone_seq();
   if (s.ok() && max_covering_tombstone_seq != nullptr &&
@@ -529,7 +535,7 @@ Status TableCache::Get(FilterCacheClient& filter_cache,
   if (s.ok()) {
     get_context->SetReplayLog(row_cache_entry);  // nullptr if no cache.
// only add filter_cache argument - s = t->Get(filter_cache, options, k, get_context, prefix_extractor, skip_filters); + s = block_based_table->Get(filter_cache, options, k, get_context, prefix_extractor, skip_filters); get_context->SetReplayLog(nullptr); } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) { // Couldn't find Table in cache but treat as kFound if no_io set diff --git a/db/table_cache.h b/db/table_cache.h index 6a1ed0e90..ec7b25ae5 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -14,7 +14,6 @@ #include #include -#include "db/art/filter_cache_client.h" #include "db/dbformat.h" #include "db/range_del_aggregator.h" #include "options/cf_options.h" @@ -33,6 +32,7 @@ class Arena; struct FileDescriptor; class GetContext; class HistogramImpl; +class FilterCacheClient; // Manages caching for TableReader objects for a column family. The actual // cache is allocated separately and passed to the constructor. TableCache diff --git a/db/version_set.cc b/db/version_set.cc index 799e1ca2d..47587a800 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -5447,8 +5447,8 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, for (int i = idx_start + 1; i < idx_end; ++i) { uint64_t file_size = files_brief.files[i].fd.GetFileSize(); // The entire file falls into the range, so we can just take its size. - assert(file_size == - ApproximateSize(v, files_brief.files[i], start, end, caller)); + // assert(file_size == + // ApproximateSize(v, files_brief.files[i], start, end, caller)); total_full_size += file_size; } diff --git a/env/io_posix.cc b/env/io_posix.cc index 689d89812..a144e8c86 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -37,6 +37,7 @@ #include "util/autovector.h" #include "util/coding.h" #include "util/string_util.h" +#include "db/art/art_metric.h" #if defined(OS_LINUX) && !defined(F_SET_RW_HINT) #define F_LINUX_SPECIFIC_BASE 1024 @@ -45,6 +46,8 @@ namespace ROCKSDB_NAMESPACE { +static ReadMetric readMetric_; + std::string IOErrorMsg(const std::string& context, const std::string& file_name) { if (file_name.empty()) { @@ -601,6 +604,8 @@ IOStatus PosixRandomAccessFile::Read(uint64_t offset, size_t n, filename_, errno); } *result = Slice(scratch, (r < 0) ? 0 : n - left); + // update WaLSM Read Metric + readMetric_.updateMetric(offset, offset + ((r < 0) ? 0 : n - left)); return s; } @@ -705,6 +710,10 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, "PosixRandomAccessFile::MultiRead:io_uring_result", &bytes_read); if (bytes_read == req_wrap->iov.iov_len) { req->result = Slice(req->scratch, req->len); + + // update WaLSM Read Metric + readMetric_.updateMetric(req->offset, req->offset + req->len); + req->status = IOStatus::OK(); } else if (bytes_read == 0) { // cqe->res == 0 can means EOF, or can mean partial results. See @@ -717,6 +726,10 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, // Bytes reads don't fill sectors. Should only happen at the end // of the file. 
req->result = Slice(req->scratch, req_wrap->finished_len); + + // update WaLSM Read Metric + readMetric_.updateMetric(req->offset, req->offset + req_wrap->finished_len); + req->status = IOStatus::OK(); } else { Slice tmp_slice; @@ -726,6 +739,10 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, req->scratch + req_wrap->finished_len, dbg); req->result = Slice(req->scratch, req_wrap->finished_len + tmp_slice.size()); + + // update WaLSM Read Metric + readMetric_.updateMetric(req->offset, + req->offset + req_wrap->finished_len + tmp_slice.size()); } } else if (bytes_read < req_wrap->iov.iov_len) { assert(bytes_read > 0); @@ -865,6 +882,8 @@ IOStatus PosixMmapReadableFile::Read(uint64_t offset, size_t n, n = static_cast(length_ - offset); } *result = Slice(reinterpret_cast(mmapped_region_) + offset, n); + // update WaLSM Read Metric + readMetric_.updateMetric(offset, offset + n); return s; } @@ -1497,4 +1516,4 @@ IOStatus PosixDirectory::Fsync(const IOOptions& /*opts*/, return IOStatus::OK(); } } // namespace ROCKSDB_NAMESPACE -#endif +#endif \ No newline at end of file diff --git a/examples/Makefile b/examples/Makefile index 90f8cf9a8..9720b11e2 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -19,7 +19,7 @@ all: write_example simple_example column_families_example compact_files_example write_example: librocksdb write_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) -simple_example: librocksdb simple_example.cc +simple_example: simple_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) custom: librocksdb custom.cc diff --git a/examples/custom.cc b/examples/custom.cc index 4c509686f..770968a21 100644 --- a/examples/custom.cc +++ b/examples/custom.cc @@ -548,10 +548,10 @@ void DoTest(double zipf) { options.use_direct_reads = true; options.enable_pipelined_write = true; options.compression = rocksdb::kNoCompression; - options.nvm_path = "/pg_wal/ycc/memory_art"; + options.nvm_path = "/mnt/pmem0.7/guoteng/nodememory"; options.IncreaseParallelism(16); - std::string db_path = "/tmp/db_old_custom"; + std::string db_path = "/mnt/nvme0n1/guoteng/walsmtest/tmp/db_old_custom"; DB* db; DB::Open(options, db_path, &db); diff --git a/examples/mini_benchmark.cc b/examples/mini_benchmark.cc index c841fd6dd..8579f87de 100644 --- a/examples/mini_benchmark.cc +++ b/examples/mini_benchmark.cc @@ -593,12 +593,12 @@ int main(int argc, char* argv[]) { options.use_direct_reads = true; options.enable_pipelined_write = true; options.compression = rocksdb::kNoCompression; - options.nvm_path = "/pg_wal/ycc/memory_art"; + options.nvm_path = "/mnt/pmem0.7/guoteng/nodememory"; options.IncreaseParallelism(16); std::remove(options.nvm_path.c_str()); - std::string db_path = "/tmp/tmp_data/db_art"; + std::string db_path = "/mnt/nvme0n1/guoteng/walsmtest/tmp/db_old_custom"; DB* db; DB::Open(options, db_path, &db); diff --git a/examples/run.sh b/examples/run.sh index 5e736eb33..96e431ff5 100755 --- a/examples/run.sh +++ b/examples/run.sh @@ -1,8 +1,8 @@ #!/bin/bash SingleTest() { - sudo rm -rf /mnt/chen/* - sudo rm -rf /tmp/db_old_custom + sudo rm -rf /mnt/pmem0.7/guoteng/* + sudo rm -rf /mnt/nvme0n1/guoteng/walsmtest/tmp/* numactl -N 1 ./ycsb $1 $2 #mv /tmp/db_old_custom/compaction_art.txt /home/chen/result/art_reset_$1_$2.txt } diff --git a/examples/rw_example.cc b/examples/rw_example.cc index 97ac580ee..5119cfe35 100644 
--- a/examples/rw_example.cc +++ b/examples/rw_example.cc @@ -258,11 +258,11 @@ int main() { options.use_direct_io_for_flush_and_compaction = true; options.use_direct_reads = true; options.enable_pipelined_write = true; - options.nvm_path = "/pg_wal/ycc/memory_art"; + options.nvm_path = "/mnt/pmem0.7/guoteng/nodememory"; options.compression = rocksdb::kNoCompression; DB* db; - DB::Open(options, "/tmp/tmp_data/db_test_art", &db); + DB::Open(options, "/mnt/nvme0n1/guoteng/walsmtest/tmp/db_test_art", &db); std::thread read_threads[thread_num]; std::thread write_threads[thread_num]; diff --git a/examples/simple_example.cc b/examples/simple_example.cc index 6bca594e5..bf90d7c78 100644 --- a/examples/simple_example.cc +++ b/examples/simple_example.cc @@ -10,8 +10,10 @@ #include #include "rocksdb/db.h" +#include "rocksdb/filter_policy.h" #include "rocksdb/slice.h" #include "rocksdb/options.h" +#include "rocksdb/table.h" #include #include @@ -416,7 +418,7 @@ void ParseOptions(Options& options) { } void DoTest(std::string test_name) { - int thread_num = 8; + int thread_num = 1; int total_count = 320000000; int sample_range = 1000000000; @@ -426,20 +428,43 @@ void DoTest(std::string test_name) { options.use_direct_reads = true; options.enable_pipelined_write = true; options.compression = rocksdb::kNoCompression; - options.nvm_path = "/mnt/walsm/node_memory"; - options.IncreaseParallelism(16); + options.nvm_path = "/mnt/pmem0.7/guoteng/nodememory"; + // options.IncreaseParallelism(16); - std::string db_path = "/tmp/tmp_data/db_test_" + test_name; + options.create_if_missing = true; + options.use_direct_io_for_flush_and_compaction = true; + options.use_direct_reads = true; + options.compression = rocksdb::kNoCompression; + options.compaction_style = rocksdb::kCompactionStyleUniversal; + options.IncreaseParallelism(1); + options.statistics = rocksdb::CreateDBStatistics(); + + rocksdb::BlockBasedTableOptions block_based_options; + block_based_options.pin_top_level_index_and_filter = false; + block_based_options.pin_l0_filter_and_index_blocks_in_cache = false; + block_based_options.cache_index_and_filter_blocks_with_high_priority = false; + block_based_options.index_type = rocksdb::BlockBasedTableOptions::kTwoLevelIndexSearch; + block_based_options.partition_filters = true; + block_based_options.cache_index_and_filter_blocks = true; + block_based_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false)); + block_based_options.block_cache = + rocksdb::NewLRUCache(static_cast(128 * 1024 * 1024)); + options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(block_based_options)); + options.memtable_prefix_bloom_size_ratio = 0.02; + std::string db_path = "/mnt/nvme0n1/guoteng/walsmtest/tmp/db_test_" + test_name; DB* db; DB::Open(options, db_path, &db); + std::cout << " Open OK" << std::endl; Inserter inserter(thread_num, db); inserter.SetGenerator( new YCSBZipfianGenerator(total_count, sample_range, 0.98, 26.49)); inserter.DoInsert(); + std::cout << "Insert OK" << std::endl; db->Close(); + std::cout << "Close OK" << std::endl; delete db; } diff --git a/examples/walsm_benchmark.cc b/examples/walsm_benchmark.cc index 447cca829..c113ed604 100644 --- a/examples/walsm_benchmark.cc +++ b/examples/walsm_benchmark.cc @@ -545,11 +545,11 @@ int main(int argc, char* argv[]) { options.IncreaseParallelism(16); options.OptimizeForPointLookup(512); options.statistics = CreateDBStatistics(); - // options.nvm_path = "/mnt/pmem1/crh/nodememory"; + // options.nvm_path = "/mnt/pmem0.7/guoteng/nodememory"; // 
std::remove(options.nvm_path.c_str()); - std::string db_path = "/home/crh/db_test_nvm_l0"; + std::string db_path = "/mnt/nvme0n1/guoteng/walsmtest/tmp/db_test_nvm_l0"; DB* db; DB::Open(options, db_path, &db); diff --git a/examples/write_example.cc b/examples/write_example.cc index 71a9a0183..72b6ef3e1 100644 --- a/examples/write_example.cc +++ b/examples/write_example.cc @@ -85,7 +85,7 @@ int main() { DB* db; - assert(DB::Open(options, "/tmp/db_test", &db).ok()); + assert(DB::Open(options, "/mnt/nvme0n1/guoteng/walsmtest/tmp/db_nvm_l0", &db).ok()); int n = 0; diff --git a/examples/ycsb.cc b/examples/ycsb.cc index 5bca080dd..e54bc9333 100644 --- a/examples/ycsb.cc +++ b/examples/ycsb.cc @@ -419,14 +419,14 @@ void DoTest(double zipf, double read_ratio) { options.use_direct_reads = true; options.enable_pipelined_write = true; options.compression = rocksdb::kNoCompression; - options.nvm_path = "/pg_wal/ycc/memory_art"; + options.nvm_path = "/mnt/pmem0.7/guoteng/nodememory"; options.IncreaseParallelism(16); BlockBasedTableOptions table_options; table_options.filter_policy.reset(NewBloomFilterPolicy(10)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - std::string db_path = "/tmp/db_old_custom"; + std::string db_path = "/mnt/nvme0n1/guoteng/walsmtest/tmp/db_old_custom"; DB* db; DB::Open(options, db_path, &db); diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 9a11c0220..595959628 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1224,7 +1224,7 @@ struct DBOptions { bool enable_rewrite = true; // Path for nvm file, don't pass directory. - std::string nvm_path = "/pg_wal/ycc/memory_art"; + std::string nvm_path = "/mnt/pmem0.7/guoteng/nodememory"; }; // Options to control the behavior of a database (passed to DB::Open) diff --git a/include/task_thread_pool.hpp b/include/task_thread_pool.hpp new file mode 100644 index 000000000..6029e8572 --- /dev/null +++ b/include/task_thread_pool.hpp @@ -0,0 +1,486 @@ +// SPDX-License-Identifier: BSD-2-Clause OR MIT OR BSL-1.0 +/** + * @brief A fast and lightweight thread pool for C++11 and newer. + * @see https://github.com/alugowski/task-thread-pool + * @author Adam Lugowski + * @copyright Copyright (C) 2023 Adam Lugowski. + * Licensed under any of the following open-source licenses: + * BSD-2-Clause license, MIT license, Boost Software License 1.0 + * + * + * BSD-2-Clause license: + * + * Copyright (C) 2023 Adam Lugowski + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + * + * + * + * MIT License: + * + * Copyright (c) 2023 Adam Lugowski + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * + * + * Boost Software License 1.0: + * + * Permission is hereby granted, free of charge, to any person or organization + * obtaining a copy of the software and accompanying documentation covered by + * this license (the "Software") to use, reproduce, display, distribute, execute, + * and transmit the Software, and to prepare derivative works of the Software, + * and to permit third-parties to whom the Software is furnished to do so, + * all subject to the following: + * + * The copyright notices in the Software and this entire statement, including + * the above license grant, this restriction and the following disclaimer, must + * be included in all copies of the Software, in whole or in part, and all + * derivative works of the Software, unless such copies or derivative works + * are solely in the form of machine-executable object code generated by a + * source language processor. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT + * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE + * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef AL_TASK_THREAD_POOL_HPP +#define AL_TASK_THREAD_POOL_HPP + +// Version macros. 
+#define TASK_THREAD_POOL_VERSION_MAJOR 1 +#define TASK_THREAD_POOL_VERSION_MINOR 0 +#define TASK_THREAD_POOL_VERSION_PATCH 10 + +#include +#include +#include +#include +#include +#include +#include + +// MSVC does not correctly set the __cplusplus macro by default, so we must read it from _MSVC_LANG +// See https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/ +#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) +#define TTP_CXX17 1 +#else +#define TTP_CXX17 0 +#endif + +#if TTP_CXX17 +#define TTP_NODISCARD [[nodiscard]] +#else +#define TTP_NODISCARD +#endif + +namespace task_thread_pool { + +#if !TTP_CXX17 + /** + * A reimplementation of std::decay_t, which is only available since C++14. + */ + template + using decay_t = typename std::decay::type; +#endif + + /** + * A fast and lightweight thread pool that uses C++11 threads. + */ + class task_thread_pool { + public: + /** + * Create a task_thread_pool and start worker threads. + * + * @param num_threads Number of worker threads. If 0 then number of threads is equal to the + * number of physical cores on the machine, as given by std::thread::hardware_concurrency(). + */ + explicit task_thread_pool(unsigned int num_threads = 0) { + if (num_threads < 1) { + num_threads = std::thread::hardware_concurrency(); + if (num_threads < 1) { num_threads = 1; } + } + start_threads(num_threads); + } + + /** + * Finish all tasks left in the queue then shut down worker threads. + * If the pool is currently paused then it is resumed. + */ + ~task_thread_pool() { + unpause(); + wait_for_queued_tasks(); + stop_all_threads(); + } + + /** + * Drop all tasks that have been submitted but not yet started by a worker. + * + * Tasks already in progress continue executing. + */ + void clear_task_queue() { + const std::lock_guard tasks_lock(task_mutex); + tasks = {}; + } + + /** + * Get number of enqueued tasks. + * + * @return Number of tasks that have been enqueued but not yet started. + */ + TTP_NODISCARD size_t get_num_queued_tasks() const { + const std::lock_guard tasks_lock(task_mutex); + return tasks.size(); + } + + /** + * Get number of in-progress tasks. + * + * @return Approximate number of tasks currently being processed by worker threads. + */ + TTP_NODISCARD size_t get_num_running_tasks() const { + const std::lock_guard tasks_lock(task_mutex); + return num_inflight_tasks; + } + + /** + * Get total number of tasks in the pool. + * + * @return Approximate number of tasks both enqueued and running. + */ + TTP_NODISCARD size_t get_num_tasks() const { + const std::lock_guard tasks_lock(task_mutex); + return tasks.size() + num_inflight_tasks; + } + + /** + * Get number of worker threads. + * + * @return Number of worker threads. + */ + TTP_NODISCARD unsigned int get_num_threads() const { + const std::lock_guard threads_lock(thread_mutex); + return static_cast(threads.size()); + } + + /** + * Set number of worker threads. Will start or stop worker threads as necessary. + * + * @param num_threads Number of worker threads. If 0 then number of threads is equal to the + * number of physical cores on the machine, as given by std::thread::hardware_concurrency(). + * @return Previous number of worker threads. 
+ */ + unsigned int set_num_threads(unsigned int num_threads) { + const std::lock_guard threads_lock(thread_mutex); + unsigned int previous_num_threads = get_num_threads(); + + if (num_threads < 1) { + num_threads = std::thread::hardware_concurrency(); + if (num_threads < 1) { num_threads = 1; } + } + + if (previous_num_threads <= num_threads) { + // expanding the thread pool + start_threads(num_threads - previous_num_threads); + } else { + // contracting the thread pool + stop_all_threads(); + { + const std::lock_guard tasks_lock(task_mutex); + pool_running = true; + } + start_threads(num_threads); + } + + return previous_num_threads; + } + + /** + * Stop executing queued tasks. Use `unpause()` to resume. Note: Destroying the pool will implicitly unpause. + * + * Any in-progress tasks continue executing. + */ + void pause() { + const std::lock_guard tasks_lock(task_mutex); + pool_paused = true; + } + + /** + * Resume executing queued tasks. + */ + void unpause() { + const std::lock_guard tasks_lock(task_mutex); + pool_paused = false; + task_cv.notify_all(); + } + + /** + * Check whether the pool is paused. + * + * @return true if pause() has been called without an intervening unpause(). + */ + TTP_NODISCARD bool is_paused() const { + const std::lock_guard tasks_lock(task_mutex); + return pool_paused; + } + + /** + * Submit a Callable for the pool to execute and return a std::future. + * + * @param func The Callable to execute. Can be a function, a lambda, std::packaged_task, std::function, etc. + * @param args Arguments for func. Optional. + * @return std::future that can be used to get func's return value or thrown exception. + */ + template , std::decay_t...> +#else + typename R = typename std::result_of(decay_t...)>::type +#endif + > + TTP_NODISCARD std::future submit(F&& func, A&&... args) { +#if defined(_MSC_VER) + // MSVC's packaged_task is not movable even though it should be. + // Discussion about this bug and its future fix: + // https://developercommunity.visualstudio.com/t/unable-to-move-stdpackaged-task-into-any-stl-conta/108672 + std::shared_ptr> ptask = + std::make_shared>(std::bind(std::forward(func), std::forward(args)...)); + submit_detach([ptask] { (*ptask)(); }); + return ptask->get_future(); +#else + std::packaged_task task(std::bind(std::forward(func), std::forward(args)...)); + auto ret = task.get_future(); + submit_detach(std::move(task)); + return ret; +#endif + } + + /** + * Submit a zero-argument Callable for the pool to execute. + * + * @param func The Callable to execute. Can be a function, a lambda, std::packaged_task, std::function, etc. + */ + template + void submit_detach(F&& func) { + const std::lock_guard tasks_lock(task_mutex); + tasks.emplace(std::forward(func)); + task_cv.notify_one(); + } + + /** + * Submit a Callable with arguments for the pool to execute. + * + * @param func The Callable to execute. Can be a function, a lambda, std::packaged_task, std::function, etc. + */ + template + void submit_detach(F&& func, A&&... args) { + const std::lock_guard tasks_lock(task_mutex); + tasks.emplace(std::bind(std::forward(func), std::forward(args)...)); + task_cv.notify_one(); + } + + /** + * Block until the task queue is empty. Some tasks may be in-progress when this method returns. + */ + void wait_for_queued_tasks() { + std::unique_lock tasks_lock(task_mutex); + notify_task_finish = true; + task_finished_cv.wait(tasks_lock, [&] { return tasks.empty(); }); + notify_task_finish = false; + } + + /** + * Block until all tasks have finished. 
+ */ + void wait_for_tasks() { + std::unique_lock tasks_lock(task_mutex); + notify_task_finish = true; + task_finished_cv.wait(tasks_lock, [&] { return tasks.empty() && num_inflight_tasks == 0; }); + notify_task_finish = false; + } + + protected: + + /** + * Main function for worker threads. + */ + void worker_main() { + bool finished_task = false; + + while (true) { + std::unique_lock tasks_lock(task_mutex); + + if (finished_task) { + --num_inflight_tasks; + if (notify_task_finish) { + task_finished_cv.notify_all(); + } + } + + task_cv.wait(tasks_lock, [&]() { return !pool_running || (!pool_paused && !tasks.empty()); }); + + if (!pool_running) { + break; + } + + // Must mean that (!pool_paused && !tasks.empty()) is true + + std::packaged_task task{std::move(tasks.front())}; + tasks.pop(); + ++num_inflight_tasks; + tasks_lock.unlock(); + + try { + task(); + } catch (...) { + // std::packaged_task::operator() may throw in some error conditions, such as if the task + // had already been run. Nothing that the pool can do anything about. + } + + finished_task = true; + } + } + + /** + * Start worker threads. + * + * @param num_threads How many threads to start. + */ + void start_threads(const unsigned int num_threads) { + const std::lock_guard threads_lock(thread_mutex); + + for (unsigned int i = 0; i < num_threads; ++i) { + threads.emplace_back(&task_thread_pool::worker_main, this); + } + } + + /** + * Stop, join, and destroy all worker threads. + */ + void stop_all_threads() { + const std::lock_guard threads_lock(thread_mutex); + + { + const std::lock_guard tasks_lock(task_mutex); + pool_running = false; + task_cv.notify_all(); + } + + for (auto& thread : threads) { + if (thread.joinable()) { + thread.join(); + } + } + threads.clear(); + } + + /** + * The worker threads. + * + * Access protected by thread_mutex + */ + std::vector threads; + + /** + * A mutex for methods that start/stop threads. + */ + mutable std::recursive_mutex thread_mutex; + + /** + * The task queue. + * + * Access protected by task_mutex. + */ + std::queue> tasks = {}; + + /** + * A mutex for all variables related to tasks. + */ + mutable std::mutex task_mutex; + + /** + * Used to notify changes to the task queue, such as a new task added, pause/unpause, etc. + */ + std::condition_variable task_cv; + + /** + * Used to notify of finished tasks. + */ + std::condition_variable task_finished_cv; + + /** + * A signal for worker threads that the pool is either running or shutting down. + * + * Access protected by task_mutex. + */ + bool pool_running = true; + + /** + * A signal for worker threads to not pull new tasks from the queue. + * + * Access protected by task_mutex. + */ + bool pool_paused = false; + + /** + * A signal for worker threads that they should notify task_finished_cv when they finish a task. + * + * Access protected by task_mutex. + */ + bool notify_task_finish = false; + + /** + * A counter of the number of tasks in-progress by worker threads. + * Incremented when a task is popped off the task queue and decremented when that task is complete. + * + * Access protected by task_mutex. 
+     */
+    int num_inflight_tasks = 0;
+  };
+}

+// clean up
+#undef TTP_NODISCARD
+#undef TTP_CXX17
+
+#endif
diff --git a/lgb_server/model.py b/lgb_server/model.py
index b8a235a0e..f2e258732 100644
--- a/lgb_server/model.py
+++ b/lgb_server/model.py
@@ -3,7 +3,7 @@
 import numpy
 import math

-model_path = '/pg_wal/ycc/'
+model_path = '/home/guoteng_20241228_135/WaLSM+/log/'
 # model_path = ''

 class LGBModel():
@@ -11,11 +11,14 @@ def __init__(self) -> None:
         self.__model = None
         # one unit is 4 bits-per-key, class = 2 mean bits-per-key = 4 * 2 = 8
         # the default bits-per-key value of previous benchmark is 10
-        self.__default_class = 4
-        self.__bits_per_key = 4 # bits_per_key for one filter unit
+        self.__min_class = 0
+        self.__max_class = 6
+        self.__num_classes = (self.__max_class - self.__min_class) + 1
+        self.__default_class = 2
+        self.__bits_per_key = 2 # bits_per_key for one filter unit; must be larger than 1
         self.__num_probes = math.floor(self.__bits_per_key * 0.69) # 4 * 0.69 = 2.76 -> 2
         self.__rate_per_unit = math.pow(1.0 - math.exp(-self.__num_probes/self.__bits_per_key), self.__num_probes) # false positive rate of one unit
-        self.__cost_rate_line = 0.10 # we can torelate deviation that is no more than self.__cost_rate_line * (best I/O cost) (compared to best I/O cost)
+        self.__cost_threshold = 1.2 # we can tolerate a deviation of no more than self.__cost_threshold * (best I/O cost)
         self.__model_name = 'model.txt'
         # self.__host = '127.0.0.1'
         # self.__port = '6666'
@@ -37,29 +40,37 @@ def __evaluate_model(self, X: pd.DataFrame, y: pd.Series, c: pd.Series) -> bool:
         assert len(count_list) == len(class_list)
         assert len(preds_list) == len(class_list)
+        assert self.__bits_per_key > 1
         best_cost = 0.0
         pred_cost = 0.0
         for i in range(0, len(class_list)):
             best_cost += math.pow(self.__rate_per_unit, class_list[i]) * count_list[i]
             pred_cost += math.pow(self.__rate_per_unit, preds_list[i]) * count_list[i]

+        # if pred_cost < best_cost, the predicted classes would use more memory than the limit,
+        # so we force the model to be retrained
+        if pred_cost < best_cost:
+            print("pred_cost smaller than best_cost, forced to retrain")
+            pred_cost += best_cost + best_cost * self.__cost_threshold

-        # print("best cost : " + str(best_cost) + ", pred cost: " + str(pred_cost))
-        return math.fabs((pred_cost-best_cost)/best_cost) < self.__cost_rate_line
+        print("best cost : " + str(best_cost) + ", pred cost: " + str(pred_cost))
+        return math.fabs((pred_cost-best_cost)/best_cost) < self.__cost_threshold

     def train(self, dataset: str) -> str:
         df = pd.read_csv(dataset)
         y = df['Target']
         c = df['Count'] # used to check I/O cost metric
         X = df.drop(columns=['Target', 'Count'])
+        # print("Length of X:", X.shape[1])
         if self.__model is not None and self.__evaluate_model(X, y, c):
             # still work well
-            return
+            return 'no need to train'

         # clf = lightgbm.LGBMClassifier(min_child_samples=1, n_estimators=1, objective="multiclass")
-        clf = lightgbm.LGBMClassifier()
+        clf = lightgbm.LGBMClassifier(verbosity=-1, n_estimators=3, objective="multiclass", num_class=self.__num_classes)
         clf.fit(X, y)

         # if we directly set self.__model = clf, then self.__model always predict class 0
         # we need save clf to txt file, then read this model to init self.__model
+        print('train a new model')
         clf.booster_.save_model(model_path + self.__model_name)
         self.__model = lightgbm.Booster(model_file=model_path+self.__model_name)
         # print('load a new model')
@@ -72,6 +83,8 @@ def predict(self, datas: pd.DataFrame) -> str:
             result = 
self.__model.predict(datas) return str(numpy.argmax(result[0])) else: + assert self.__default_class <= self.__max_class + assert self.__default_class >= self.__min_class return str(self.__default_class) ''' diff --git a/lgb_server/utils.py b/lgb_server/utils.py index 3d61825cb..84bdb5862 100644 --- a/lgb_server/utils.py +++ b/lgb_server/utils.py @@ -2,7 +2,7 @@ import pandas as pd import sys -dataset_path = '/pg_wal/ycc/' +dataset_path = '/home/guoteng_20241228_135/WaLSM+/log/' # dataset_path = '' # msg should be like 'dataset1.csv' @@ -25,6 +25,7 @@ def parse_pred_msg(msg: str) -> list[int]: # build predict data row from list[int] def prepare_data(data: list[int]) -> pd.DataFrame: assert type(data) is list and type(data[0]) is int + assert data[0] > 0 datas = pd.DataFrame([data]) return datas diff --git a/notes.txt b/notes.txt new file mode 100644 index 000000000..6b98fa62a --- /dev/null +++ b/notes.txt @@ -0,0 +1,5 @@ +sudo mount /dev/nvme0n1 /mnt/nvme0n1 +sudo mount -o dax,noatime /dev/pmem0.7 /mnt/pmem0.7 +sudo mount -o dax,noatime /dev/pmem0.8 /mnt/pmem0.8 +rm /mnt/nvme0n1/guoteng/walsmtest/tmp/db_nvm_l0/* -rf ; rm /mnt/pmem0.7/guoteng/nodememory +./ycsb -load -run -db rocksdb -P workloads/workload_test -P rocksdb/rocksdb.properties -p threadcount=8 -s \ No newline at end of file diff --git a/src.mk b/src.mk index f39f6809c..036988551 100644 --- a/src.mk +++ b/src.mk @@ -33,7 +33,8 @@ LIB_SOURCES = \ db/art/heat_buckets.cc \ db/art/clf_model.cc \ db/art/filter_cache_heap.cc \ - db/art/filter_cache_item.cc \ + db/art/filter_cache_entry.cc \ + db/art/global_filter_cache_context.cc \ db/art/filter_cache.cc \ db/art/filter_cache_client.cc \ db/art/greedy_algo.cc \ @@ -164,6 +165,7 @@ LIB_SOURCES = \ table/block_based/block_based_table_factory.cc \ table/block_based/block_based_table_iterator.cc \ table/block_based/block_based_table_reader.cc \ + table/block_based/block_based_table_segment_aware_iterator.cc \ table/block_based/block_builder.cc \ table/block_based/block_prefetcher.cc \ table/block_based/block_prefix_index.cc \ diff --git a/table/block_based/block.cc b/table/block_based/block.cc index 7b3ddb1c7..b07f6854a 100644 --- a/table/block_based/block.cc +++ b/table/block_based/block.cc @@ -11,6 +11,7 @@ #include "table/block_based/block.h" #include +#include #include #include #include diff --git a/table/block_based/block_based_filter_block.cc b/table/block_based/block_based_filter_block.cc index 2e457e32f..77e6ebf1f 100644 --- a/table/block_based/block_based_filter_block.cc +++ b/table/block_based/block_based_filter_block.cc @@ -80,7 +80,7 @@ void BlockBasedFilterBlockBuilder::StartBlock(uint64_t block_offset) { } } -void BlockBasedFilterBlockBuilder::Add(const Slice& key) { +void BlockBasedFilterBlockBuilder::Add(const Slice& key, uint32_t /*segment_id*/) { if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { AddPrefix(key); } diff --git a/table/block_based/block_based_filter_block.h b/table/block_based/block_based_filter_block.h index 67ded1ee3..3f89966dc 100644 --- a/table/block_based/block_based_filter_block.h +++ b/table/block_based/block_based_filter_block.h @@ -44,7 +44,7 @@ class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { virtual bool IsBlockBased() override { return true; } virtual void StartBlock(uint64_t block_offset) override; - virtual void Add(const Slice& key) override; + virtual void Add(const Slice& key, uint32_t segment_id) override; virtual size_t NumAdded() const override { return num_added_; } virtual Slice Finish(const BlockHandle& tmp, 
Status* status) override; using FilterBlockBuilder::Finish; diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index 7bd6eb3ca..246a906f9 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -20,6 +20,7 @@ #include #include "db/dbformat.h" +#include "db/art/global_filter_cache_context.h" #include "index_builder.h" #include "port/lang.h" @@ -66,7 +67,8 @@ FilterBlockBuilder* CreateFilterBlockBuilder( const ImmutableCFOptions& /*opt*/, const MutableCFOptions& mopt, const FilterBuildingContext& context, const bool use_delta_encoding_for_index_values, - PartitionedIndexBuilder* const p_index_builder) { + PartitionedIndexBuilder* const p_index_builder, + const InternalKeyComparator* internal_comparator) { const BlockBasedTableOptions& table_opt = context.table_options; if (table_opt.filter_policy == nullptr) return nullptr; @@ -91,7 +93,8 @@ FilterBlockBuilder* CreateFilterBlockBuilder( return new PartitionedFilterBlockBuilder( mopt.prefix_extractor.get(), table_opt.whole_key_filtering, filter_bits_builder, table_opt.index_block_restart_interval, - use_delta_encoding_for_index_values, p_index_builder, partition_size); + use_delta_encoding_for_index_values, p_index_builder, partition_size, + global_filter_cache.range_seperators(), internal_comparator); } else { return new FullFilterBlockBuilder(mopt.prefix_extractor.get(), table_opt.whole_key_filtering, @@ -265,7 +268,7 @@ struct BlockBasedTableBuilder::Rep { // compressing any data blocks. // TODO(ajkr): ideally we don't buffer all keys and all uncompressed data // blocks as it's redundant, but it's easier to implement for now. - std::vector>> + std::vector, std::vector>> data_block_and_keys_buffers; BlockBuilder range_del_block; @@ -476,7 +479,7 @@ struct BlockBasedTableBuilder::Rep { context.info_log = ioptions.info_log; filter_builder.reset(CreateFilterBlockBuilder( ioptions, moptions, context, use_delta_encoding_for_index_values, - p_index_builder_)); + p_index_builder_, &internal_comparator)); } for (auto& collector_factories : *int_tbl_prop_collector_factories) { @@ -711,7 +714,7 @@ BlockBasedTableBuilder::~BlockBasedTableBuilder() { delete rep_; } -void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { +void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value, uint32_t segment_id) { Rep* r = rep_; assert(rep_->state != Rep::State::kClosed); if (!ok()) return; @@ -761,7 +764,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { if (r->filter_builder != nullptr) { size_t ts_sz = r->internal_comparator.user_comparator()->timestamp_size(); - r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz), segment_id); } } } @@ -774,7 +777,9 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { if (r->data_block_and_keys_buffers.empty() || should_flush) { r->data_block_and_keys_buffers.emplace_back(); } - r->data_block_and_keys_buffers.back().second.emplace_back(key.ToString()); + // r->data_block_and_keys_buffers.back().second.emplace_back(key.ToString()); + std::get<1>(r->data_block_and_keys_buffers.back()).emplace_back(key.ToString()); + std::get<2>(r->data_block_and_keys_buffers.back()).emplace_back(segment_id); } else { if (r->compression_opts.parallel_threads == 1) { r->index_builder->OnKeyAdded(key); @@ -889,8 +894,10 @@ void BlockBasedTableBuilder::WriteBlock(const 
Slice& raw_block_contents, if (r->state == Rep::State::kBuffered) { assert(is_data_block); assert(!r->data_block_and_keys_buffers.empty()); - r->data_block_and_keys_buffers.back().first = raw_block_contents.ToString(); - r->data_begin_offset += r->data_block_and_keys_buffers.back().first.size(); + // r->data_block_and_keys_buffers.back().first = raw_block_contents.ToString(); + // r->data_begin_offset += r->data_block_and_keys_buffers.back().first.size(); + std::get<0>(r->data_block_and_keys_buffers.back()) = raw_block_contents.ToString(); + r->data_begin_offset += std::get<0>(r->data_block_and_keys_buffers.back()).size(); return; } Status compress_status; @@ -1156,6 +1163,7 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, } } +// WaLSM+: only used in parallel compaction, which is not supported in WaLSM+ void BlockBasedTableBuilder::BGWorkWriteRawBlock() { Rep* r = rep_; ParallelCompressionRep::BlockRepSlot* slot; @@ -1520,11 +1528,16 @@ void BlockBasedTableBuilder::EnterUnbuffered() { size_t rand_idx = static_cast( generator.Uniform(r->data_block_and_keys_buffers.size())); + // size_t copy_len = + // std::min(kSampleBytes - compression_dict_samples.size(), + // r->data_block_and_keys_buffers[rand_idx].first.size()); size_t copy_len = std::min(kSampleBytes - compression_dict_samples.size(), - r->data_block_and_keys_buffers[rand_idx].first.size()); + std::get<0>(r->data_block_and_keys_buffers[rand_idx]).size()); + // compression_dict_samples.append( + // r->data_block_and_keys_buffers[rand_idx].first, 0, copy_len); compression_dict_samples.append( - r->data_block_and_keys_buffers[rand_idx].first, 0, copy_len); + std::get<0>(r->data_block_and_keys_buffers[rand_idx]), 0, copy_len); compression_dict_sample_lens.emplace_back(copy_len); } } @@ -1546,11 +1559,16 @@ void BlockBasedTableBuilder::EnterUnbuffered() { r->compression_type == kZSTDNotFinalCompression)); for (size_t i = 0; ok() && i < r->data_block_and_keys_buffers.size(); ++i) { - auto& data_block = r->data_block_and_keys_buffers[i].first; - auto& keys = r->data_block_and_keys_buffers[i].second; + // auto& data_block = r->data_block_and_keys_buffers[i].first; + // auto& keys = r->data_block_and_keys_buffers[i].second; + auto& data_block = std::get<0>(r->data_block_and_keys_buffers[i]); + auto& keys = std::get<1>(r->data_block_and_keys_buffers[i]); + auto& segment_ids = std::get<2>(r->data_block_and_keys_buffers[i]); assert(!data_block.empty()); assert(!keys.empty()); + assert(!segment_ids.empty()); + // WaLSM+: no parallel compression for now, so no need to modify? 
if (r->compression_opts.parallel_threads > 1) { ParallelCompressionRep::BlockRep* block_rep = nullptr; r->pc_rep->block_rep_pool.pop(block_rep); @@ -1561,10 +1579,13 @@ void BlockBasedTableBuilder::EnterUnbuffered() { block_rep->compression_type = r->compression_type; block_rep->keys->SwapAssign(keys); + // assign segment_ids here if needed if (i + 1 < r->data_block_and_keys_buffers.size()) { + // block_rep->first_key_in_next_block->assign( + // r->data_block_and_keys_buffers[i + 1].second.front()); block_rep->first_key_in_next_block->assign( - r->data_block_and_keys_buffers[i + 1].second.front()); + std::get<1>(r->data_block_and_keys_buffers[i + 1]).front()); } else { if (r->first_key_in_next_block == nullptr) { block_rep->first_key_in_next_block.reset(nullptr); @@ -1608,19 +1629,24 @@ void BlockBasedTableBuilder::EnterUnbuffered() { lock, [r] { return !r->pc_rep->first_block; }); } } else { - for (const auto& key : keys) { + assert(keys.size() == segment_ids.size()); + for (size_t j = 0; j < keys.size(); ++j) { + const auto& key = keys[j]; + const auto segment_id = segment_ids[j]; if (r->filter_builder != nullptr) { size_t ts_sz = r->internal_comparator.user_comparator()->timestamp_size(); - r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz), segment_id); } r->index_builder->OnKeyAdded(key); } WriteBlock(Slice(data_block), &r->pending_handle, true /* is_data_block */); if (ok() && i + 1 < r->data_block_and_keys_buffers.size()) { + // Slice first_key_in_next_block = + // r->data_block_and_keys_buffers[i + 1].second.front(); Slice first_key_in_next_block = - r->data_block_and_keys_buffers[i + 1].second.front(); + std::get<1>(r->data_block_and_keys_buffers[i + 1]).front(); Slice* first_key_in_next_block_ptr = &first_key_in_next_block; r->index_builder->AddIndexEntry( &keys.back(), first_key_in_next_block_ptr, r->pending_handle); @@ -1762,6 +1788,10 @@ const char* BlockBasedTableBuilder::GetFileChecksumFuncName() const { } } +SegmentBuilderResult BlockBasedTableBuilder::GetSegmentBuilderResult() { + return rep_->filter_builder->GetSegmentBuilderResult(); +} + const std::string BlockBasedTable::kFilterBlockPrefix = "filter."; const std::string BlockBasedTable::kFullFilterBlockPrefix = "fullfilter."; const std::string BlockBasedTable::kPartitionedFilterBlockPrefix = diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h index 38ad948af..728e8100d 100644 --- a/table/block_based/block_based_table_builder.h +++ b/table/block_based/block_based_table_builder.h @@ -19,6 +19,7 @@ #include "rocksdb/listener.h" #include "rocksdb/options.h" #include "rocksdb/status.h" +#include "table/block_based/filter_block.h" #include "table/meta_blocks.h" #include "table/table_builder.h" #include "util/compression.h" @@ -65,7 +66,7 @@ class BlockBasedTableBuilder : public TableBuilder { // REQUIRES: key is after any previously added key according to comparator. // REQUIRES: Finish(), Abandon() have not been called // WaLSM+ Note: call filter_builder->add() - void Add(const Slice& key, const Slice& value) override; + void Add(const Slice& key, const Slice& value, uint32_t segment_id) override; // Return non-ok iff some error has been detected. 
Status status() const override; @@ -111,6 +112,8 @@ class BlockBasedTableBuilder : public TableBuilder { // Get file checksum function name const char* GetFileChecksumFuncName() const override; + SegmentBuilderResult GetSegmentBuilderResult() override; + private: bool ok() const { return status().ok(); } diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 2257d10ea..2521d856d 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -32,6 +33,7 @@ #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/statistics.h" +#include "rocksdb/status.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "table/block_based/binary_search_index_reader.h" @@ -39,7 +41,9 @@ #include "table/block_based/block_based_filter_block.h" #include "table/block_based/block_based_table_factory.h" #include "table/block_based/block_based_table_iterator.h" +#include "table/block_based/block_based_table_segment_aware_iterator.h" #include "table/block_based/block_prefix_index.h" +#include "table/block_based/cachable_entry.h" #include "table/block_based/filter_block.h" #include "table/block_based/full_filter_block.h" #include "table/block_based/hash_index_reader.h" @@ -53,6 +57,7 @@ #include "table/multiget_context.h" #include "table/persistent_cache_helper.h" #include "table/sst_file_writer_collectors.h" +#include "table/table_reader_caller.h" #include "table/two_level_iterator.h" #include "monitoring/perf_context_imp.h" @@ -1040,9 +1045,9 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( lookup_context); if (filter) { // Refer to the comment above about paritioned indexes always being cached - if (prefetch_all) { - filter->CacheDependencies(ro, pin_all); - } + // if (prefetch_all) { + // filter->CacheDependencies(ro, pin_all); + // } rep_->filter = std::move(filter); } @@ -2128,6 +2133,49 @@ InternalIterator* BlockBasedTable::NewIterator( need_upper_bound_check && rep_->index_type == BlockBasedTableOptions::kHashSearch, /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context)); + + // WaLSM+ behavior: when compaction, return a SegmentAwareIterator + if (caller == TableReaderCaller::kCompaction) { + std::unique_ptr data_iter; + std::unique_ptr filter_index_iter; + CachableEntry filter_block; + Status s = GetFilterIndexBlock(read_options, true, nullptr, &lookup_context, + &filter_block); + assert(s.ok()); + filter_index_iter.reset(filter_block.GetValue()->NewIndexIterator( + rep_->segment_id_removing_comparator.get(), + get_rep()->get_global_seqno(BlockType::kFilter), nullptr, nullptr, + true /* total_order_seek */, false /* have_first_key */, + rep_->index_key_includes_seq, rep_->index_value_is_full)); + if (arena == nullptr) { + data_iter.reset(new BlockBasedTableIterator( + this, read_options, rep_->internal_comparator, std::move(index_iter), + !skip_filters && !read_options.total_order_seek && + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, caller, + compaction_readahead_size, allow_unprepared_value)); + + return new BlockBasedTableSegmentAwareIterator( + std::move(data_iter), std::move(filter_block), std::move(filter_index_iter), + rep_->internal_comparator, caller); + } else { + auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator)); + data_iter.reset(new (mem) BlockBasedTableIterator( + this, read_options, 
rep_->internal_comparator, std::move(index_iter), + !skip_filters && !read_options.total_order_seek && + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, caller, + compaction_readahead_size, allow_unprepared_value)); + + mem = arena->AllocateAligned(sizeof(BlockBasedTableSegmentAwareIterator)); + return new BlockBasedTableSegmentAwareIterator( + std::move(data_iter), std::move(filter_block), std::move(filter_index_iter), + rep_->internal_comparator, caller); + } + + // unreachable + } + if (arena == nullptr) { return new BlockBasedTableIterator( this, read_options, rep_->internal_comparator, std::move(index_iter), @@ -3431,10 +3479,19 @@ std::string BlockBasedTable::ApproximateMiddleKey(const Slice& start, uint64_t middle_offset = (end_offset + start_offset) / 2; index_iter->SeekToFirst(); + if (!index_iter->Valid()) { + return std::string(); + } + while (index_iter->Valid() && index_iter->value().handle.offset() < middle_offset) { index_iter->Next(); } + + if (!index_iter->Valid()) { + index_iter->SeekToLast(); + } + std::string prefix = index_iter->user_key().ToString(); return prefix; @@ -3796,4 +3853,53 @@ void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, out_stream << " ------\n"; } +Status BlockBasedTable::GetFilterIndexBlock( + const ReadOptions& read_options, bool use_cache, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const { + assert(filter_block); + assert(filter_block->IsEmpty()); + + const BlockBasedTable::Rep* const rep = get_rep(); + assert(rep); + bool for_compaction = lookup_context != nullptr + && lookup_context->caller == TableReaderCaller::kCompaction; + + Status s = RetrieveBlock( + nullptr /* prefetch_buffer */, read_options, rep->filter_handle, + UncompressionDict::GetEmptyDict(), filter_block, BlockType::kFilter, + get_context, lookup_context, for_compaction, use_cache); + + return s; +} + +std::map> +BlockBasedTable::GetSegmentBlockHandles() const { + CachableEntry filter_block; + Status s = + GetFilterIndexBlock(ReadOptions(), true, nullptr, nullptr, &filter_block); + assert(s.ok()); + + std::unique_ptr filter_index_iter; + filter_index_iter.reset(filter_block.GetValue()->NewIndexIterator( + rep_->segment_id_removing_comparator.get(), + get_rep()->get_global_seqno(BlockType::kFilter), nullptr, nullptr, + true /* total_order_seek */, false /* have_first_key */, + rep_->index_key_includes_seq, rep_->index_value_is_full)); + + std::map> segment_block_handles; + + filter_index_iter->SeekToFirst(); + while (filter_index_iter->Valid()) { + BlockHandle block_handle = filter_index_iter->value().handle; + const auto filter_key = filter_index_iter->user_key(); + uint32_t segment_id = DecodeFixed32R(filter_key.data() + filter_key.size() - 4); + segment_block_handles[segment_id].push_back(block_handle); + + filter_index_iter->Next(); + } + + return segment_block_handles; +} + } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index ce7c4ed8a..0a61e273d 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -9,8 +9,8 @@ #pragma once +#include #include -#include "db/art/filter_cache_client.h" #include "db/range_tombstone_fragmenter.h" #include "file/filename.h" #include "rocksdb/comparator.h" @@ -42,6 +42,8 @@ struct BlockBasedTableOptions; struct EnvOptions; struct ReadOptions; class GetContext; +class FilterCacheClient; +class 
FilterCacheEntry; typedef std::vector> KVPairBlock; @@ -142,7 +144,9 @@ class BlockBasedTable : public TableReader { Status Get(FilterCacheClient& filter_cache, const ReadOptions& readOptions, const Slice& key, GetContext* get_context, const SliceTransform* prefix_extractor, - bool skip_filters = false) override; + bool skip_filters = false); + + std::map> GetSegmentBlockHandles() const override; #endif // WaLSM+ Note: call FullFilterKeyMayMatch() method in this file @@ -261,6 +265,20 @@ class BlockBasedTable : public TableReader { CachableEntry& block, TBlockIter* input_iter, Status s) const; +#ifdef ART_PLUS + // Similar to the above, with one crucial difference: it will retrieve the + // block from the file even if there are no caches configured (assuming the + // read options allow I/O). + template + Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& ro, const BlockHandle& handle, + const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, + BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const; +#endif + class PartitionedIndexIteratorState; template @@ -270,6 +288,8 @@ class BlockBasedTable : public TableReader { friend class UncompressionDictReader; + friend class FilterCacheEntry; + protected: Rep* rep_; explicit BlockBasedTable(Rep* rep, BlockCacheTracer* const block_cache_tracer) @@ -320,6 +340,7 @@ class BlockBasedTable : public TableReader { GetContext* get_context, BlockCacheLookupContext* lookup_context, BlockContents* contents) const; +#ifndef ART_PLUS // Similar to the above, with one crucial difference: it will retrieve the // block from the file even if there are no caches configured (assuming the // read options allow I/O). @@ -331,6 +352,7 @@ class BlockBasedTable : public TableReader { BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, bool for_compaction, bool use_cache) const; +#endif void RetrieveMultipleBlocks( const ReadOptions& options, const MultiGetRange* batch, @@ -511,6 +533,11 @@ class BlockBasedTable : public TableReader { void DumpKeyValue(const Slice& key, const Slice& value, std::ostream& out_stream); + Status GetFilterIndexBlock(const ReadOptions& read_options, bool use_cache, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const; + // A cumulative data block file read in MultiGet lower than this size will // use a stack buffer static constexpr size_t kMultiGetReadStackBufSize = 8192; diff --git a/table/block_based/block_based_table_segment_aware_iterator.cc b/table/block_based/block_based_table_segment_aware_iterator.cc new file mode 100644 index 000000000..aaaf34c06 --- /dev/null +++ b/table/block_based/block_based_table_segment_aware_iterator.cc @@ -0,0 +1,149 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "table/block_based/block_based_table_segment_aware_iterator.h" +#include +#include +#include "db/dbformat.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { +bool BlockBasedTableSegmentAwareIterator::Valid() const { + return data_iter_ && data_iter_->Valid(); +} + +void BlockBasedTableSegmentAwareIterator::SeekToFirst() { + data_iter_->SeekToFirst(); + SeekFilterAndUpdateSegmentID(); +} + +void BlockBasedTableSegmentAwareIterator::SeekToLast() { + data_iter_->SeekToLast(); + SeekFilterAndUpdateSegmentID(); +} + +void BlockBasedTableSegmentAwareIterator::Seek(const Slice& target) { + // data_iter receives internal key, while filter_index_iter receives modified_key (maybe internal key or just internal key) + data_iter_->Seek(target); + SeekFilterAndUpdateSegmentID(); +} + +void BlockBasedTableSegmentAwareIterator::SeekForPrev(const Slice& target) { + data_iter_->SeekForPrev(target); + SeekFilterAndUpdateSegmentID(); +} + +void BlockBasedTableSegmentAwareIterator::Next() { + data_iter_->Next(); + UpdateSegmentID(); +} + +void BlockBasedTableSegmentAwareIterator::Prev() { + data_iter_->Prev(); + // degraded performance + SeekFilterAndUpdateSegmentID(); +} + +Slice BlockBasedTableSegmentAwareIterator::key() const { + return data_iter_->key(); +} + +Slice BlockBasedTableSegmentAwareIterator::user_key() const { + return data_iter_->user_key(); +} + +Slice BlockBasedTableSegmentAwareIterator::value() const { + return data_iter_->value(); +} + +Status BlockBasedTableSegmentAwareIterator::status() const { + Status data_iter_status = data_iter_->status(); + if (!data_iter_status.ok()) { + return data_iter_status; + } + return status_; +} + +uint32_t BlockBasedTableSegmentAwareIterator::segment_id() const { + return current_segment_id_; +} + +void BlockBasedTableSegmentAwareIterator::SeekFilterAndUpdateSegmentID() { + if (!data_iter_->Valid()) { + status_ = data_iter_->status(); + current_segment_id_ = INVALID_SEGMENT_ID; + return; + } + + // we should return segment_id corresponding to the user_key() when called segment_id(), + // so here we use data_iter_->user_key() to get the user_key for filter_index_iter + Slice current_user_key = data_iter_->user_key(); + + std::unique_ptr current_modified_key_buf; + Slice current_modified_key = generate_modified_user_key( + current_modified_key_buf, current_user_key, 0, 0); + + filter_index_iter_->Seek(current_modified_key); + UpdateSegmentID(); +} + +// assumes we will get the entire filter partition key (including user_key, seq_num, segment_id) +// may iterate over the filter_index_iter to find correct filter_index, then extract segment_id +void BlockBasedTableSegmentAwareIterator::UpdateSegmentID() { + if (!data_iter_->Valid()) { + // TODO: (TODO: how to handle this situation?) + current_segment_id_ = INVALID_SEGMENT_ID; + return; + } + + if (!filter_index_iter_ || !filter_index_iter_->Valid()) { + // TODO: (TODO: how to handle this situation?) 
+// assumes we get the entire filter partition key (including user_key, seq_num, segment_id);
+// may iterate filter_index_iter_ forward to find the matching filter index entry, then
+// extract segment_id from it
+void BlockBasedTableSegmentAwareIterator::UpdateSegmentID() {
+  if (!data_iter_->Valid()) {
+    // TODO: how should this situation be handled?
+    current_segment_id_ = INVALID_SEGMENT_ID;
+    return;
+  }
+
+  if (!filter_index_iter_ || !filter_index_iter_->Valid()) {
+    // TODO: how should this situation be handled?
+    current_segment_id_ = INVALID_SEGMENT_ID;
+    return;
+  }
+
+  Slice current_user_key = data_iter_->user_key();
+
+  std::unique_ptr<const char[]> current_modified_key_buf;
+  Slice current_modified_key = generate_modified_user_key(
+      current_modified_key_buf, current_user_key, 0, 0);
+
+  Slice filter_key = filter_index_iter_->user_key();
+  // forward lookup
+  while (segment_id_removing_comparator_->Compare(current_modified_key, filter_key) > 0) {
+    filter_index_iter_->Next();
+    if (!filter_index_iter_->Valid()) {
+      // TODO: how should this situation be handled?
+      current_segment_id_ = INVALID_SEGMENT_ID;
+      return;
+    }
+    filter_key = filter_index_iter_->user_key();
+  }
+  // backward lookup is not implemented: we already seeked when the position moved
+  // backward, and frequent backward seeks would hurt performance
+
+  uint32_t filter_index = DecodeFixed32R(filter_key.data());
+  if (filter_index > 0) {
+    // we always seek with filter_index 0, so a non-zero index here is unexpected
+    current_segment_id_ = INVALID_SEGMENT_ID;
+    return;
+  }
+
+  uint32_t segment_id = INVALID_SEGMENT_ID;
+  if (filter_key.size() >= 8) {
+    segment_id = DecodeFixed32R(filter_key.data() + filter_key.size() - 4);
+  }
+
+  current_segment_id_ = segment_id;
+}
+
+} // namespace ROCKSDB_NAMESPACE
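To make the intent of this new iterator concrete, here is a hedged sketch of how a compaction-side consumer might drive it. segment_id() is only meaningful while Valid(), and the merging iterator forwards it from its current child (see the merging_iterator.cc hunk later in this diff); nothing below is part of the patch itself.

// Sketch only: draining a table iterator during compaction while tracking which
// source segment each key belongs to. `it` is assumed to be the iterator returned
// by BlockBasedTable::NewIterator() with TableReaderCaller::kCompaction, which
// this diff wraps in a BlockBasedTableSegmentAwareIterator.
#include <cstdint>
#include "table/internal_iterator.h"

void DrainForCompaction(ROCKSDB_NAMESPACE::InternalIterator* it) {
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    const uint32_t source_segment = it->segment_id();  // INVALID_SEGMENT_ID when unknown
    // A compaction's table builder would forward this id, e.g.:
    // builder->Add(it->key(), it->value(), source_segment);
    (void)source_segment;
  }
}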
diff --git a/table/block_based/block_based_table_segment_aware_iterator.h b/table/block_based/block_based_table_segment_aware_iterator.h
new file mode 100644
index 000000000..926f96ea0
--- /dev/null
+++ b/table/block_based/block_based_table_segment_aware_iterator.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include
+#include
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_reader.h"
+
+#include "table/block_based/block_based_table_reader_impl.h"
+#include "table/block_based/block_prefetcher.h"
+#include "table/block_based/cachable_entry.h"
+#include "table/block_based/reader_common.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Iterates over the contents of a BlockBasedTable and also provides segment_id
+// information by iterating over the filter index in step.
+class BlockBasedTableSegmentAwareIterator : public InternalIteratorBase<Slice> {
+ public:
+  BlockBasedTableSegmentAwareIterator(
+      std::unique_ptr<InternalIteratorBase<Slice>> data_iter,
+      CachableEntry<Block> filter_index_block_entry,
+      std::unique_ptr<IndexBlockIter> filter_index_iter,
+      const InternalKeyComparator& icomp,
+      TableReaderCaller caller)
+      : data_iter_(std::move(data_iter)),
+        filter_index_block_entry_(std::move(filter_index_block_entry)),
+        filter_index_iter_(std::move(filter_index_iter)),
+        icmp_(&icomp),
+        segment_id_removing_comparator_(new SegmentIdRemovingComparator(icomp.user_comparator())),
+        lookup_context_(caller),
+        user_comparator_(icomp.user_comparator()) {}
+
+  ~BlockBasedTableSegmentAwareIterator() {}
+
+  // Assuming that `target` is the original user key, not the modified key.
+  void Seek(const Slice& target) override;
+  // Assuming that `target` is the original user key, not the modified key.
+  void SeekForPrev(const Slice& target) override;
+  void SeekToFirst() override;
+  void SeekToLast() override;
+  void Next() final override;
+  void Prev() override;
+  bool Valid() const override;
+  Slice key() const override;
+  Slice user_key() const override;
+  Slice value() const override;
+  Status status() const override;
+  uint32_t segment_id() const override;
+
+ private:
+  std::unique_ptr<InternalIteratorBase<Slice>> data_iter_;
+  CachableEntry<Block> filter_index_block_entry_;
+  std::unique_ptr<IndexBlockIter> filter_index_iter_;
+  const InternalKeyComparator* icmp_;
+  std::unique_ptr<const Comparator> segment_id_removing_comparator_;
+  const SliceTransform* prefix_extractor_;
+  TableReaderCaller lookup_context_;
+  InternalKeyComparator user_comparator_;
+  HistogramImpl* file_read_hist_;
+  uint32_t current_segment_id_ = 0;
+  InternalKey current_partition_key_;
+  Status status_;
+  int level_;
+
+  void UpdateSegmentID();
+  void SeekFilterAndUpdateSegmentID();
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/cachable_entry.h b/table/block_based/cachable_entry.h
index 8b34ada54..71116f5f6 100644
--- a/table/block_based/cachable_entry.h
+++ b/table/block_based/cachable_entry.h
@@ -186,6 +186,15 @@ class CachableEntry {
     assert(!own_value_);
   }
 
+#ifdef ART_PLUS
+  T* ReleaseValue() {
+    assert(own_value_ && cache_ == nullptr && cache_handle_ == nullptr);
+    T* value = value_;
+    ResetFields();
+    return value;
+  }
+#endif
+
  private:
   // release cache entry in cache or release owned value
   void ReleaseResource() {
diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h
index d94c7e606..8a9c4cc63 100644
--- a/table/block_based/filter_block.h
+++ b/table/block_based/filter_block.h
@@ -20,9 +20,13 @@
 #include
 #include
+#include
+#include
 #include
 #include
 #include
+#include "db/art/clf_model.h"
+#include "db/art/filter_cache_client.h"
 #include "db/dbformat.h"
 #include "rocksdb/options.h"
 #include "rocksdb/slice.h"
@@ -41,6 +45,23 @@ class FilterPolicy;
 class GetContext;
 using MultiGetRange = MultiGetContext::Range;
 
+struct SegmentBuilderResult {
+  struct PerSegmentResult {
+    uint32_t segment_id;
+    std::vector<RangeRatePair> range_rate_pairs;
+    std::unordered_map<uint32_t, double> inherit_recorder;
+
+    std::string smallest_key;
+    std::string largest_key;
+    uint32_t key_count;
+  };
+
+  std::set<uint32_t> new_segment_ids;
+  std::vector<PerSegmentResult> per_segment_results;
+  std::set<uint32_t> merged_segment_ids;
+  int output_level;
+};
+
 // A FilterBlockBuilder is used to construct all of the filters for a
 // particular Table. It generates a single string which is stored as
 // a special block in the Table.
@@ -60,7 +81,7 @@ class FilterBlockBuilder {
   virtual bool IsBlockBased() = 0;                    // If is blockbased filter
   virtual void StartBlock(uint64_t block_offset) = 0;  // Start new block filter
-  virtual void Add(const Slice& key) = 0;  // Add a key to current filter
+  virtual void Add(const Slice& key, uint32_t segment_id = INVALID_SEGMENT_ID) = 0;  // Add a key to current filter
   virtual size_t NumAdded() const = 0;     // Number of keys added
   Slice Finish() {  // Generate Filter
     const BlockHandle empty_handle;
@@ -70,6 +91,8 @@
     return ret;
   }
   virtual Slice Finish(const BlockHandle& tmp, Status* status) = 0;
+  // default invalid method to keep the compiler happy; must not be called
+  virtual SegmentBuilderResult GetSegmentBuilderResult() { assert(false); return SegmentBuilderResult{}; }
 };
 
 // A FilterBlockReader is used to parse filter from SST table.
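The new SegmentBuilderResult is the hand-off from table building back to the filter cache. A hedged sketch of how its fields can be read once PartitionedFilterBlockBuilder::GetSegmentBuilderResult() (later in this diff) has normalized the recorders; names are those of the struct above, the function itself is illustrative only.

// Sketch only: invariants a consumer may assume after normalization.
#include <cassert>
#include "table/block_based/filter_block.h"

void InspectResult(const ROCKSDB_NAMESPACE::SegmentBuilderResult& result) {
  for (const auto& seg : result.per_segment_results) {
    // inherit_recorder maps source segment id -> fraction of this segment's
    // keys inherited from that source; after normalization the fractions sum to ~1.
    double sum = 0.0;
    for (const auto& kv : seg.inherit_recorder) sum += kv.second;
    assert(seg.inherit_recorder.empty() || (sum > 0.99 && sum < 1.01));
    // range_rate_pairs analogously records (key-range index, share of keys).
  }
}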
@@ -103,6 +126,19 @@ class FilterBlockReader { GetContext* get_context, BlockCacheLookupContext* lookup_context) = 0; +#ifdef ART_PLUS + virtual bool KeyMayMatch(FilterCacheClient& filter_cache, + const Slice& key, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) { + assert(false); + return false; + } +#endif + virtual void KeysMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, @@ -174,4 +210,5 @@ class FilterBlockReader { } }; + } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc index da03f1d5f..27ed3a192 100644 --- a/table/block_based/filter_policy.cc +++ b/table/block_based/filter_policy.cc @@ -16,6 +16,7 @@ #include "rocksdb/filter_policy.h" +#include "db/art/macros.h" #include "rocksdb/slice.h" #include "table/block_based/block_based_filter_block.h" #include "table/block_based/full_filter_block.h" @@ -571,7 +572,7 @@ class MultiLegacyBloomBitsBuilder : public FilterBitsBuilder { virtual Slice Finish(std::unique_ptr* buf) override; virtual Slice FinishWithId(std::unique_ptr* buf, const int hash_id) override; - + virtual int CalculateNumEntry(const uint32_t bytes) override; private: std::vector bits_builders_; @@ -616,6 +617,10 @@ Slice MultiLegacyBloomBitsBuilder::FinishWithId(std::unique_ptr* b int hash_id) { return bits_builders_[hash_id]->Finish(buf); } + +int MultiLegacyBloomBitsBuilder::CalculateNumEntry(const uint32_t bytes) { + return bits_builders_[0]->CalculateNumEntry(bytes); +} #endif class LegacyBloomBitsReader : public FilterBitsReader { @@ -875,8 +880,8 @@ FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext( #else // TODO: determine filter_count, // and maybe move this property to some kind of options (WaLSM+) - const int filter_count = 10; - return new MultiLegacyBloomBitsBuilder(filter_count, whole_bits_per_key_, context.info_log); + const int filter_count = MAX_UNITS_NUM; + return new MultiLegacyBloomBitsBuilder(filter_count, BITS_PER_KEY_PER_UNIT, context.info_log); #endif } } diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc index f6ecbeb1c..7912461a6 100644 --- a/table/block_based/full_filter_block.cc +++ b/table/block_based/full_filter_block.cc @@ -27,7 +27,7 @@ FullFilterBlockBuilder::FullFilterBlockBuilder( filter_bits_builder_.reset(filter_bits_builder); } -void FullFilterBlockBuilder::Add(const Slice& key) { +void FullFilterBlockBuilder::Add(const Slice& key, uint32_t segment_id) { const bool add_prefix = prefix_extractor_ && prefix_extractor_->InDomain(key); if (whole_key_filtering_) { if (!add_prefix) { diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h index deda30c6f..a91404ba0 100644 --- a/table/block_based/full_filter_block.h +++ b/table/block_based/full_filter_block.h @@ -55,7 +55,7 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { // not implemented in FullFilterBlock virtual void StartBlock(uint64_t /*block_offset*/) override {} // if not use prefix bloom, only call AddKey(key) - virtual void Add(const Slice& key) override; + virtual void Add(const Slice& key, uint32_t segment_id) override; // return num_added_, num of keys virtual size_t NumAdded() const override { return num_added_; } // only return the slice from LegacyBloomBitsBuilder(format version < 5) diff --git 
a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc
index e077603f4..73397b55a 100644
--- a/table/block_based/partitioned_filter_block.cc
+++ b/table/block_based/partitioned_filter_block.cc
@@ -4,12 +4,27 @@
 // (found in the LICENSE.Apache file in the root directory).
 
 #include "table/block_based/partitioned_filter_block.h"
+#include
+#include
+#include
 #include
+#include
+#include
+#include
+#include
 #include
+#include
+#include
 #include
+#include
+#include
+#include
 #include
+#include "db/art/clf_model.h"
+#include "db/art/logger.h"
+#include "db/art/macros.h"
 #include "db/dbformat.h"
 #include "file/file_util.h"
 #include "monitoring/perf_context_imp.h"
@@ -21,21 +36,20 @@
 #include "rocksdb/status.h"
 #include "table/block_based/block.h"
 #include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/filter_block.h"
+#include "table/block_based/index_builder.h"
+#include "table/format.h"
 #include "util/coding.h"
 
 namespace ROCKSDB_NAMESPACE {
 
-#ifdef ART_PLUS
-Slice generate_modified_internal_key(std::unique_ptr<const char[]>& buf,
-                                     Slice original_internal_key,
-                                     int filter_index, int segment_id);
-#endif
 PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder(
     const SliceTransform* _prefix_extractor, bool whole_key_filtering,
     FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval,
     const bool use_value_delta_encoding,
     PartitionedIndexBuilder* const p_index_builder,
-    const uint32_t partition_size)
+    const uint32_t partition_size, const std::vector& range_separators,
+    const InternalKeyComparator* const internal_comparator)
     : FullFilterBlockBuilder(_prefix_extractor, whole_key_filtering,
                              filter_bits_builder),
       index_on_filter_block_builder_(index_block_restart_interval,
@@ -45,11 +59,14 @@ PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder(
                                      true /*use_delta_encoding*/,
                                      use_value_delta_encoding),
       p_index_builder_(p_index_builder),
-      keys_added_to_partition_(0) {
+      keys_added_to_partition_(0),
+      range_separators_(range_separators),
+      internal_comparator_(internal_comparator),
+      user_comparator_(internal_comparator->user_comparator()) {
   keys_per_partition_ = filter_bits_builder_->CalculateNumEntry(partition_size);
   if (keys_per_partition_ < 1) {
     // partition_size (minus buffer, ~10%) might be smaller than minimum
     // filter size, sometimes based on cache line size. Try to find that
     // minimum size without CalculateSpace (not necessarily available).
     uint32_t larger = std::max(partition_size + 4, uint32_t{16});
@@ -68,18 +85,25 @@
     }
   }
 
+  // keys_per_partition_ = std::min(keys_per_partition_, (uint32_t) KEYS_PER_SEGMENT);
+  // keys_per_partition_ = KEYS_PER_SEGMENT;
+
 #ifdef ART_PLUS
   filter_count_ = filter_bits_builder->filter_count_;
   filter_gc.resize(filter_count_);
   filters.resize(filter_count_);
   finishing_filter_index_ = 0;
+
+  keys_in_current_segment_.reserve(keys_per_partition_);
+  segment_ids_in_current_segment_.reserve(keys_per_partition_);
+  current_range_index_ = 0;
 #endif
 }
 
 PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {}
 
 void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock(
-    const Slice* next_key) {
+    const Slice* next_key, uint32_t next_key_segment_id) {
   // Use == to send the request only once
   if (keys_added_to_partition_ == keys_per_partition_) {
     // Currently only index builder is in charge of cutting a partition. We keep
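Both hunks around this point center on the partition cut: when keys_added_to_partition_ reaches keys_per_partition_, every filter unit finishes its bits for the current partition, and all of them are stamped with one freshly allocated segment id. A minimal sketch of just the id allocation, assuming nothing beyond the atomic counter visible in the hunk below:

// Sketch only: ids must be unique across all builders, so a single shared
// atomic counter is bumped once per cut, mirroring MaybeCutAFilterBlock();
// relaxed ordering suffices because only uniqueness matters, not ordering.
#include <atomic>
#include <cstdint>

uint32_t AllocateSegmentId(std::atomic<uint32_t>& segment_id_base) {
  return segment_id_base.fetch_add(1, std::memory_order_relaxed);
}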
@@ -103,24 +127,141 @@ void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock(
   }
 
 #ifdef ART_PLUS
+  const uint32_t new_segment_id = segment_id_base_.fetch_add(1, std::memory_order_relaxed);
   for (int i = 0; i < filter_count_; ++i) {
     filter_gc[i].push_back(std::unique_ptr<const char[]>(nullptr));
     Slice filter = filter_bits_builder_->FinishWithId(&filter_gc[i].back(), i);
     std::string& index_key = p_index_builder_->GetPartitionKey();
-    filters[i].push_back({index_key, filter, segment_id_base_.fetch_add(1, std::memory_order_relaxed)});
+    filters[i].push_back({index_key, filter, new_segment_id});
   }
+  ProcessSegmentCut(new_segment_id);
 #else
   Slice filter = filter_bits_builder_->Finish(&filter_gc.back());
   std::string& index_key = p_index_builder_->GetPartitionKey();
   filters.push_back({index_key, filter});
 #endif
+  // std::cerr << "keys_added_to_partition = " << keys_per_partition_ << "\n";
   keys_added_to_partition_ = 0;
   Reset();
 }
 
-void PartitionedFilterBlockBuilder::Add(const Slice& key) {
-  MaybeCutAFilterBlock(&key);
-  FullFilterBlockBuilder::Add(key);
+void PartitionedFilterBlockBuilder::ProcessSegmentCut(uint32_t new_segment_id) {
+  assert(keys_in_current_segment_.size() ==
+         segment_ids_in_current_segment_.size());
+  size_t siz = keys_in_current_segment_.size();
+
+  for (size_t i = 0; i + 1 < siz; ++i) {
+    if (user_comparator_->Compare(keys_in_current_segment_[i],
+                                  keys_in_current_segment_[i + 1]) > 0) {
+      assert(false);
+      std::cout << std::endl;
+      std::cout << "segment_id: " << new_segment_id << ", key count: " << siz
+                << std::endl;
+      for (size_t k = 0; k < siz; ++k) {
+        std::cout << std::setw(5) << k << " " << keys_in_current_segment_[k]
+                  << std::endl;
+      }
+      std::cout << std::endl;
+    }
+  }
+
+  // for range_recorder
+  std::vector<RangeRatePair> range_rate_pairs;
+  uint32_t cnt_in_current_range = 0;
+
+  // for inherit_infos_recorders
+  std::unordered_map<uint32_t, double> inherit_counts;
+
+  // for the first key in the segment, perform a binary search to find its key range
+  {
+    auto it = std::upper_bound(
+        range_separators_.begin(), range_separators_.end(),
+        keys_in_current_segment_[0], [this](const Slice& a, const Slice& b) {
+          return this->user_comparator_->Compare(a, b) < 0;
+        });
+    if (it != range_separators_.begin()) {
+      it--;
+      current_range_index_ = std::distance(range_separators_.begin(), it);
+    } else {
+      // the first key is smaller than all the range separators,
+      // which should only happen when range_separators_ is empty
+      assert(range_separators_.empty());
+    }
+  }
+
+  // perform a linear scan to find the range for each key
+  for (size_t i = 0; i < siz; ++i) {
+    const Slice& key = keys_in_current_segment_[i];
+    const uint32_t source_segment_id = segment_ids_in_current_segment_[i];
+
+    // for merged_segment_ids
+    source_segment_ids_count[source_segment_id]++;
+
+    // for range_recorder
+    while (current_range_index_ + 1 < range_separators_.size() &&
+           user_comparator_->Compare(
+               key, range_separators_[current_range_index_ + 1]) >= 0) {
+      current_range_index_++;
+      if (cnt_in_current_range > 0) {
+        range_rate_pairs.emplace_back(
+            RangeRatePair{uint32_t(current_range_index_),
+                          double(cnt_in_current_range) / siz});
+        cnt_in_current_range = 0;
+      }
+    }
+
+    // for inherit_infos_recorders
+    inherit_counts[source_segment_id]++;
+    cnt_in_current_range++;
+  }
+
+  // process the last range
+  if (current_range_index_ + 1 < range_separators_.size()) {
+    if (cnt_in_current_range > 0) {
+      range_rate_pairs.emplace_back(RangeRatePair{
+          uint32_t(current_range_index_), double(cnt_in_current_range) / siz});
+    }
+
cnt_in_current_range = 0; + } + + // update smallest & largest key + std::string smallest_key = keys_in_current_segment_[0]; + std::string largest_key = keys_in_current_segment_.back(); + + // update result + // inherit_counts should be updated when GetSegmentBuilderResult() is called + segment_builder_result_.new_segment_ids.insert(new_segment_id); + segment_builder_result_.per_segment_results.push_back(SegmentBuilderResult::PerSegmentResult{ + new_segment_id, range_rate_pairs, inherit_counts, smallest_key, + largest_key, (uint32_t) keys_in_current_segment_.size()}); + + // clear + keys_in_current_segment_.clear(); + segment_ids_in_current_segment_.clear(); + +} + +void PartitionedFilterBlockBuilder::Add(const Slice& key, uint32_t segment_id) { + const std::string key_str = key.ToString(); + for (int i = 0; i < key_str.size(); i++) { + char c = key_str[i]; + if (!std::isprint(c)) { + std::cout << "error" << std::endl; + } + } + if (!keys_in_current_segment_.empty() && user_comparator_->Compare(keys_in_current_segment_.back(), key) > 0) { + std::cout << "error" << std::endl; + } + if (key.ends_with(Slice("\1", 1))) { + std::cout << "error" << std::endl; + } + MaybeCutAFilterBlock(&key, segment_id); + FullFilterBlockBuilder::Add(key, segment_id); + + keys_in_current_segment_.emplace_back(key.ToString()); + segment_ids_in_current_segment_.emplace_back(segment_id); } void PartitionedFilterBlockBuilder::AddKey(const Slice& key) { @@ -170,7 +311,7 @@ Slice PartitionedFilterBlockBuilder::Finish( filters.pop_front(); #endif } else { - MaybeCutAFilterBlock(nullptr); + MaybeCutAFilterBlock(nullptr, INVALID_SEGMENT_ID); } // If there is no filter partition left, then return the index on filter // partitions @@ -350,6 +491,33 @@ BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( return fltr_blk_handle; } +#ifdef ART_PLUS +std::pair PartitionedFilterBlockReader::GetFilterPartitionKeyAndHandle( + const CachableEntry& filter_block, const Slice& entry) const { + IndexBlockIter iter; + const Comparator* const segment_id_removing_comparator = table()->get_rep()->segment_id_removing_comparator.get(); + Statistics* kNullStats = nullptr; + filter_block.GetValue()->NewIndexIterator( + segment_id_removing_comparator, + table()->get_rep()->get_global_seqno(BlockType::kFilter), &iter, + kNullStats, true /* total_order_seek */, false /* have_first_key */, + index_key_includes_seq(), index_value_is_full()); + iter.Seek(entry); + if (UNLIKELY(!iter.Valid())) { + // entry is larger than all the keys. However its prefix might still be + // present in the last partition. If this is called by PrefixMayMatch this + // is necessary for correct behavior. Otherwise it is unnecessary but safe. + // Assuming this is an unlikely case for full key search, the performance + // overhead should be negligible. + iter.SeekToLast(); + } + assert(iter.Valid()); + Slice fltr_block_key = iter.key(); + BlockHandle fltr_blk_handle = iter.value().handle; + return {fltr_block_key, fltr_blk_handle}; +} +#endif + // TODO: retrieve filter block from filter cache (WaLSM+) Status PartitionedFilterBlockReader::GetFilterPartitionBlock( FilePrefetchBuffer* prefetch_buffer, const BlockHandle& fltr_blk_handle, @@ -464,40 +632,50 @@ bool PartitionedFilterBlockReader::MayMatch( return true; } - #ifdef ART_PLUS // find key "0 original_internal key". filter_index=segment_id=0. (WaLSM+) // segment_id itself is useless in comparison, // but must be appended otherwise the extracted user key will be incorrect. 
  std::unique_ptr<const char[]> modified_key_buf;
  Slice modified_key = generate_modified_internal_key(modified_key_buf, *const_ikey_ptr, 0, 0);
-  auto filter_handle = GetFilterPartitionHandle(filter_block, modified_key);
-  #else
-  auto filter_handle = GetFilterPartitionHandle(filter_block, *const_ikey_ptr);
-  #endif
+  // auto filter_handle = GetFilterPartitionHandle(filter_block, modified_key);
+  auto key_and_handle = GetFilterPartitionKeyAndHandle(filter_block, modified_key);
+  Slice filter_key = key_and_handle.first;
+  auto filter_handle = key_and_handle.second;
 
  if (UNLIKELY(filter_handle.size() == 0)) {  // key is out of range
    return false;
  }
 
+  assert(filter_key.size() >= 8);
+  // TODO: validate that the useless internal key suffix has been stripped (WaLSM+)
+  uint32_t segment_id = DecodeFixed32R(filter_key.data() + filter_key.size() - 4);
+
  // TODO: get some filter blocks from the filter cache and check (WaLSM+)
-  CachableEntry<ParsedFullFilterBlock> filter_partition_block;
-  s = GetFilterPartitionBlock(nullptr /* prefetch_buffer */, filter_handle,
-                              no_io, get_context, lookup_context,
-                              &filter_partition_block);
-  if (UNLIKELY(!s.ok())) {
-    IGNORE_STATUS_IF_ERROR(s);
-    return true;
+  std::vector<CachableEntry<ParsedFullFilterBlock>> filter_partition_blocks =
+      filter_cache.get_filter_blocks(segment_id);
+
+  // static std::array, MAX_UNITS_NUM+1> filter_unit_num_hits;
+  // static std::atomic maymatch_calls{0};
+  // filter_unit_num_hits[filter_partition_blocks.size()]++;
+  // if (maymatch_calls.fetch_add(1) % 500000 == 0) {
+  //   std::cout << "maymatch_calls: " << maymatch_calls.load() << std::endl;
+  //   for (size_t i = 0; i < filter_unit_num_hits.size(); ++i) {
+  //     std::cout << "filter_unit_num_hits[" << i << "]: " << filter_unit_num_hits[i].load() << std::endl;
+  //   }
+  // }
+
+  for (size_t hash_id = 0; hash_id < filter_partition_blocks.size(); ++hash_id) {
+    FullFilterBlockReader filter_partition(
+        table(), std::move(filter_partition_blocks[hash_id]), hash_id);
+    bool may_exist = (filter_partition.*filter_function)(
+        slice, prefix_extractor, block_offset, no_io, const_ikey_ptr,
+        get_context, lookup_context);
+    if (!may_exist) {
+      return false;
+    }
  }
-  FullFilterBlockReader filter_partition(table(),
-                                         std::move(filter_partition_block));
-  // initialize the reader with hash_id (WaLSM+)
-  // FullFilterBlockReader filter_partition(table(),
-  //                                        std::move(filter_partition_block),
-  //                                        1);
-  return (filter_partition.*filter_function)(
-      slice, prefix_extractor, block_offset, no_io, const_ikey_ptr, get_context,
-      lookup_context);
+  return true;
 }
 #endif
@@ -714,6 +892,33 @@ bool PartitionedFilterBlockReader::index_value_is_full() const {
 }
 
 #ifdef ART_PLUS
+
+// should be called only once
+SegmentBuilderResult PartitionedFilterBlockBuilder::GetSegmentBuilderResult() {
+  // update inherit_recorders
+  for (auto& segment_result : segment_builder_result_.per_segment_results) {
+    auto& inherit_counts = segment_result.inherit_recorder;
+    int segment_size = 0;
+    for (auto& inherit_count : inherit_counts) {
+      segment_size += (int) inherit_count.second;
+    }
+    for (auto& inherit_count : inherit_counts) {
+      inherit_count.second /= segment_size;
+    }
+  }
+
+  // update merged_segment_ids
+  for (const auto& source_segment_id_count : source_segment_ids_count) {
+    const auto segment_id = source_segment_id_count.first;
+    segment_builder_result_.merged_segment_ids.insert(segment_id);
+  }
+
+  assert(!segment_builder_result_.merged_segment_ids.empty());
+  assert(!segment_builder_result_.new_segment_ids.empty());
+
+  return segment_builder_result_;
+}
+
+std::atomic<uint32_t> PartitionedFilterBlockBuilder::segment_id_base_{0};
 #endif
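To make the normalization above concrete: a segment whose 200 keys split as 150 inherited from segment 7 and 50 from segment 9 ends up with inherit_recorder = {7: 0.75, 9: 0.25}. A standalone sketch of that count-to-fraction step (with a divide-by-zero guard that the in-tree version omits; not part of the patch):

// Sketch only: normalize inherit counts into fractions, in place.
#include <cstdint>
#include <unordered_map>

void NormalizeInheritCounts(std::unordered_map<uint32_t, double>& counts) {
  double total = 0.0;
  for (const auto& kv : counts) total += kv.second;
  if (total <= 0.0) return;  // nothing recorded for this segment
  for (auto& kv : counts) kv.second /= total;
}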
@@ -737,6 +942,23 @@ Slice generate_modified_internal_key(std::unique_ptr<const char[]>& buf, Slice original_internal_key, int filter_index, int segment_id)
   buf.reset(modified_key_buf);
   return modified_key;
 }
+
+Slice generate_modified_user_key(std::unique_ptr<const char[]>& buf, Slice original_user_key, int filter_index, int segment_id) {
+  // calculate modified_key (WaLSM+)
+  // +--------------+--------------------------------+------------+
+  // | filter_index | original_user_key              | segment_id |
+  // | 4 bytes      | original_user_key.size() bytes | 4 bytes    |
+  // +--------------+--------------------------------+------------+
+  size_t modified_key_buf_size = 4 + original_user_key.size() + 4;
+  char *modified_key_buf = new char[modified_key_buf_size];
+  EncodeFixed32R(modified_key_buf, filter_index);
+  std::memcpy(modified_key_buf + 4, original_user_key.data(), original_user_key.size());
+  EncodeFixed32R(modified_key_buf + 4 + original_user_key.size(), segment_id);
+  Slice modified_key = Slice(modified_key_buf, modified_key_buf_size);
+
+  buf.reset(modified_key_buf);
+  return modified_key;
+}
 #endif
 
 } // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h
index 0d970a7a6..4c74f8082 100644
--- a/table/block_based/partitioned_filter_block.h
+++ b/table/block_based/partitioned_filter_block.h
@@ -6,6 +6,7 @@
 #pragma once
 
 #include
+#include
 #include
 #include
 #include
@@ -14,10 +15,12 @@
 #include "db/dbformat.h"
 #include "db/art/filter_cache_client.h"
 #include "index_builder.h"
+#include "rocksdb/comparator.h"
 #include "rocksdb/options.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
 #include "table/block_based/block.h"
+#include "table/block_based/filter_block.h"
 #include "table/block_based/filter_block_reader_common.h"
 #include "table/block_based/full_filter_block.h"
 #include "util/autovector.h"
@@ -31,16 +34,20 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
       FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval,
       const bool use_value_delta_encoding,
       PartitionedIndexBuilder* const p_index_builder,
-      const uint32_t partition_size);
+      const uint32_t partition_size,
+      const std::vector& range_separators,
+      const InternalKeyComparator* const internal_comparator);
 
   virtual ~PartitionedFilterBlockBuilder();
 
   void AddKey(const Slice& key) override;
-  void Add(const Slice& key) override;
+  void Add(const Slice& key, uint32_t segment_id) override;
 
   virtual Slice Finish(const BlockHandle& last_partition_block_handle,
                        Status* status) override;
 
+  virtual SegmentBuilderResult GetSegmentBuilderResult() override;
+
  private:
   // Filter data
   BlockBuilder index_on_filter_block_builder_;  // top-level index builder
@@ -65,7 +72,8 @@
   bool finishing_filters = false;  // true if Finish is called once but not complete yet.
 
   // The policy of when cut a filter block and Finish it
-  void MaybeCutAFilterBlock(const Slice* next_key);
+  void MaybeCutAFilterBlock(const Slice* next_key, uint32_t next_key_segment_id);
+  void ProcessSegmentCut(uint32_t new_segment_id);
 
   // Currently we keep the same number of partitions for filters and indexes.
   // This would allow for some potentioal optimizations in future.
If such // optimizations did not realize we can use different number of partitions and @@ -83,6 +91,14 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { // When Finish() is called, return filters[filter_index].front() (WaLSM+) int finishing_filter_index_; static std::atomic segment_id_base_; + std::vector keys_in_current_segment_; + std::vector segment_ids_in_current_segment_; + std::map source_segment_ids_count; + std::size_t current_range_index_; + const std::vector& range_separators_; + const InternalKeyComparator* const internal_comparator_; + const Comparator* user_comparator_; + SegmentBuilderResult segment_builder_result_; #endif }; @@ -106,7 +122,7 @@ class PartitionedFilterBlockReader : public FilterBlockReaderCommon { const Slice& key, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, GetContext* get_context, - BlockCacheLookupContext* lookup_context); + BlockCacheLookupContext* lookup_context) override; #endif // TODO: not used in WaLSM+ Benchmark, meybe used in MultiGet interface ? void KeysMayMatch(MultiGetRange* range, @@ -129,6 +145,11 @@ class PartitionedFilterBlockReader : public FilterBlockReaderCommon { size_t ApproximateMemoryUsage() const override; private: + #ifdef ART_PLUS + std::pair GetFilterPartitionKeyAndHandle( + const CachableEntry& filter_block, const Slice& entry) const; + #endif + BlockHandle GetFilterPartitionHandle(const CachableEntry& filter_block, const Slice& entry) const; Status GetFilterPartitionBlock( diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index b0880d516..edf56d446 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -214,6 +214,7 @@ inline void BlockFetcher::GetBlockContents() { } Status BlockFetcher::ReadBlockContents() { + if (file_ == nullptr) return IOStatus::NotFound(Status::SubCode::kNone); // handle to special error of enable_units if (TryGetUncompressBlockFromPersistentCache()) { compression_type_ = kNoCompression; #ifndef NDEBUG diff --git a/table/cuckoo/cuckoo_table_builder.cc b/table/cuckoo/cuckoo_table_builder.cc index f42e87bdf..1ed444772 100644 --- a/table/cuckoo/cuckoo_table_builder.cc +++ b/table/cuckoo/cuckoo_table_builder.cc @@ -3,6 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#include #ifndef ROCKSDB_LITE #include "table/cuckoo/cuckoo_table_builder.h" @@ -84,7 +85,7 @@ CuckooTableBuilder::CuckooTableBuilder( properties_.db_session_id = db_session_id; } -void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { +void CuckooTableBuilder::Add(const Slice& key, const Slice& value, uint32_t /*segment_id*/) { if (num_entries_ >= kMaxVectorIdx - 1) { status_ = Status::NotSupported("Number of keys in a file must be < 2^32-1"); return; diff --git a/table/cuckoo/cuckoo_table_builder.h b/table/cuckoo/cuckoo_table_builder.h index 8e8026487..4a13830e9 100644 --- a/table/cuckoo/cuckoo_table_builder.h +++ b/table/cuckoo/cuckoo_table_builder.h @@ -40,7 +40,7 @@ class CuckooTableBuilder: public TableBuilder { // Add key,value to the table being constructed. // REQUIRES: key is after any previously added key according to comparator. // REQUIRES: Finish(), Abandon() have not been called - void Add(const Slice& key, const Slice& value) override; + void Add(const Slice& key, const Slice& value, uint32_t segment_id) override; // Return non-ok iff some error has been detected. 
diff --git a/table/cuckoo/cuckoo_table_builder.h b/table/cuckoo/cuckoo_table_builder.h
index 8e8026487..4a13830e9 100644
--- a/table/cuckoo/cuckoo_table_builder.h
+++ b/table/cuckoo/cuckoo_table_builder.h
@@ -40,7 +40,7 @@ class CuckooTableBuilder: public TableBuilder {
   // Add key,value to the table being constructed.
   // REQUIRES: key is after any previously added key according to comparator.
   // REQUIRES: Finish(), Abandon() have not been called
-  void Add(const Slice& key, const Slice& value) override;
+  void Add(const Slice& key, const Slice& value, uint32_t segment_id) override;
 
   // Return non-ok iff some error has been detected.
   Status status() const override { return status_; }
diff --git a/table/internal_iterator.h b/table/internal_iterator.h
index c4382a54e..7ee4dda52 100644
--- a/table/internal_iterator.h
+++ b/table/internal_iterator.h
@@ -6,6 +6,7 @@
 #pragma once
 
+#include <cstdint>
 #include <string>
 
 #include "db/dbformat.h"
 #include "rocksdb/comparator.h"
@@ -172,6 +173,8 @@ class InternalIteratorBase : public Cleanable {
     return Status::NotSupported("");
   }
 
+  virtual uint32_t segment_id() const { return INVALID_SEGMENT_ID; }
+
  protected:
   void SeekForPrevImpl(const Slice& target, const Comparator* cmp) {
     Seek(target);
diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h
index ff46f2536..a38923ea5 100644
--- a/table/iterator_wrapper.h
+++ b/table/iterator_wrapper.h
@@ -9,6 +9,7 @@
 #pragma once
 
+#include <cstdint>
 #include <set>
 
 #include "table/internal_iterator.h"
@@ -65,6 +66,10 @@ class IteratorWrapperBase {
     assert(Valid());
     return iter_->value();
   }
+  uint32_t segment_id() const {
+    assert(Valid());
+    return iter_->segment_id();
+  }
   // Methods below require iter() != nullptr
   Status status() const {
     assert(iter_);
diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc
index fdd1a4910..2a406d0a9 100644
--- a/table/merging_iterator.cc
+++ b/table/merging_iterator.cc
@@ -242,6 +242,11 @@ class MergingIterator : public InternalIterator {
     return current_->value();
   }
 
+  uint32_t segment_id() const override {
+    assert(Valid());
+    return current_->segment_id();
+  }
+
   bool PrepareValue() override {
     assert(Valid());
     if (current_->PrepareValue()) {
diff --git a/table/plain/plain_table_builder.cc b/table/plain/plain_table_builder.cc
index faebcfe2f..d83506343 100644
--- a/table/plain/plain_table_builder.cc
+++ b/table/plain/plain_table_builder.cc
@@ -122,7 +122,7 @@ PlainTableBuilder::~PlainTableBuilder() {
   io_status_.PermitUncheckedError();
 }
 
-void PlainTableBuilder::Add(const Slice& key, const Slice& value) {
+void PlainTableBuilder::Add(const Slice& key, const Slice& value, uint32_t /*segment_id*/) {
   // temp buffer for metadata bytes between key and value.
   char meta_bytes_buf[6];
   size_t meta_bytes_buf_size = 0;
diff --git a/table/plain/plain_table_builder.h b/table/plain/plain_table_builder.h
index 6ab5d59e3..3d9ed23db 100644
--- a/table/plain/plain_table_builder.h
+++ b/table/plain/plain_table_builder.h
@@ -57,7 +57,7 @@ class PlainTableBuilder: public TableBuilder {
   // Add key,value to the table being constructed.
   // REQUIRES: key is after any previously added key according to comparator.
   // REQUIRES: Finish(), Abandon() have not been called
-  void Add(const Slice& key, const Slice& value) override;
+  void Add(const Slice& key, const Slice& value, uint32_t segment_id) override;
 
   // Return non-ok iff some error has been detected.
   Status status() const override { return status_; }
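The three iterator changes above share one shape: InternalIteratorBase grows a virtual segment_id() defaulting to an invalid id, IteratorWrapperBase forwards it, and MergingIterator reports the id of whichever child currently supplies the key. A compressed, self-contained restatement of that shape, with made-up names (kInvalidSegmentId stands in for INVALID_SEGMENT_ID):

    // Minimal sketch of the segment-id plumbing pattern; not RocksDB code.
    #include <cstdint>
    #include <vector>

    constexpr uint32_t kInvalidSegmentId = UINT32_MAX;

    struct Iter {
      virtual ~Iter() = default;
      virtual bool Valid() const = 0;
      // Default: iterators that know nothing about segments report "invalid".
      virtual uint32_t segment_id() const { return kInvalidSegmentId; }
    };

    struct LeafIter : Iter {
      uint32_t id;
      explicit LeafIter(uint32_t i) : id(i) {}
      bool Valid() const override { return true; }
      uint32_t segment_id() const override { return id; }
    };

    struct MergeIter : Iter {
      std::vector<Iter*> children;
      Iter* current = nullptr;  // child owning the smallest key right now
      bool Valid() const override { return current && current->Valid(); }
      // Forward to the winning child, like MergingIterator above.
      uint32_t segment_id() const override { return current->segment_id(); }
    };

    int main() {
      LeafIter a(3), b(7);
      MergeIter m;
      m.children = {&a, &b};
      m.current = &b;  // pretend b holds the smallest key
      return m.segment_id() == 7 ? 0 : 1;
    }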
diff --git a/table/table_builder.h b/table/table_builder.h
index 36475c143..2f5d2779e 100644
--- a/table/table_builder.h
+++ b/table/table_builder.h
@@ -19,6 +19,7 @@
 #include "options/cf_options.h"
 #include "rocksdb/options.h"
 #include "rocksdb/table_properties.h"
+#include "table/block_based/filter_block.h"
 #include "trace_replay/block_cache_tracer.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -150,7 +151,7 @@ class TableBuilder {
   // Add key,value to the table being constructed.
   // REQUIRES: key is after any previously added key according to comparator.
   // REQUIRES: Finish(), Abandon() have not been called
-  virtual void Add(const Slice& key, const Slice& value) = 0;
+  virtual void Add(const Slice& key, const Slice& value, uint32_t segment_id = INVALID_SEGMENT_ID) = 0;
 
   // Return non-ok iff some error has been detected.
   virtual Status status() const = 0;
@@ -198,6 +199,9 @@
   // Return file checksum function name
   virtual const char* GetFileChecksumFuncName() const = 0;
+
+  // Default body so non-block-based builders still compile; not yet implemented (WaLSM+)
+  virtual SegmentBuilderResult GetSegmentBuilderResult() { assert(false); return SegmentBuilderResult(); }
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/table_reader.h b/table/table_reader.h
index b011790b9..9c5939840 100644
--- a/table/table_reader.h
+++ b/table/table_reader.h
@@ -8,9 +8,12 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #pragma once
+#include <map>
 #include <memory>
+#include <vector>
 
 #include "db/range_tombstone_fragmenter.h"
 #include "rocksdb/slice_transform.h"
+#include "table/format.h"
 #include "table/get_context.h"
 #include "table/internal_iterator.h"
 #include "table/multiget_context.h"
@@ -146,6 +149,10 @@ class TableReader {
                                 TableReaderCaller /*caller*/) {
     return Status::NotSupported("VerifyChecksum() not supported");
   }
+
+  virtual std::map<uint32_t, std::vector<BlockHandle>> GetSegmentBlockHandles() const {
+    return {};
+  }
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/test_sh/test.sh b/test_sh/test.sh
index ab28f37c1..15bf7d76a 100644
--- a/test_sh/test.sh
+++ b/test_sh/test.sh
@@ -3,7 +3,7 @@
 value_array=(32 64 128)
 
 test_all_size=81920000000 #8G
-pmem_path="/mnt/chen/test"
+pmem_path="/mnt/pmem0.7/guoteng/nodememory"
 
 bench_benchmarks="fillrandom, stats, wait, clean_cache, stats, readrandom, stats, clean_cache"
 bench_readnum="1000000"
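The GetSegmentBlockHandles() hook added to table_reader.h above exposes a per-segment map of block handles, with an empty map as the default for readers that do not track segments. One plausible consumer, sketched with a stand-in BlockHandle and a hypothetical BytesForSegments helper, would size a warm-up pass over only the hot segments' blocks; this is illustrative, not code from the patch.

    // Hypothetical consumer of a per-segment block-handle map: total the
    // bytes a cache-warming pass would touch for a set of hot segments.
    #include <cstdint>
    #include <map>
    #include <vector>

    struct BlockHandle {  // stand-in with just the fields the sketch needs
      uint64_t offset = 0;
      uint64_t size = 0;
    };

    uint64_t BytesForSegments(
        const std::map<uint32_t, std::vector<BlockHandle>>& handles,
        const std::vector<uint32_t>& hot_segments) {
      uint64_t total = 0;
      for (uint32_t seg : hot_segments) {
        auto it = handles.find(seg);
        if (it == handles.end()) continue;  // reader may not know this segment
        for (const BlockHandle& h : it->second) total += h.size;
      }
      return total;
    }

    int main() {
      std::map<uint32_t, std::vector<BlockHandle>> handles;
      handles[0] = {{0, 4096}, {4096, 4096}};
      handles[1] = {{8192, 4096}};
      // Segment 2 is unknown and is skipped; expect 8192 bytes for segment 0.
      return BytesForSegments(handles, {0, 2}) == 8192 ? 0 : 1;
    }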
diff --git a/util/bloom_test.cc b/util/bloom_test.cc
index 0fea9c662..f64c72a09 100644
--- a/util/bloom_test.cc
+++ b/util/bloom_test.cc
@@ -7,6 +7,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#include "table/block_based/full_filter_block.h"
 #ifndef GFLAGS
 #include <cstdio>
 int main() {
@@ -28,6 +29,9 @@
 #include "test_util/testutil.h"
 #include "util/gflags_compat.h"
 #include "util/hash.h"
+#include "table/block_based/partitioned_filter_block.h"
+#include "include/rocksdb/filter_policy.h"
+#include "rocksdb/slice.h"
 
 using GFLAGS_NAMESPACE::ParseCommandLineFlags;
@@ -977,6 +981,111 @@
 INSTANTIATE_TEST_CASE_P(Full, FullBloomTest,
                         testing::Values(BloomFilterPolicy::kLegacyBloom,
                                         BloomFilterPolicy::kFastLocalBloom));
+
+#ifdef ART_PLUS
+class MultiUnitBloomEffectTest : public testing::Test {
+ protected:
+  size_t filter_count_ = 8;
+  int bits_per_key_per_unit_ = 2;
+  int num_keys_ = 1000;
+  std::vector<std::string> keys_;
+  std::vector<std::unique_ptr<const char[]>> filter_bufs_;
+  std::vector<Slice> filter_slices_;
+  std::shared_ptr<BloomFilterPolicy> policy_;
+  std::unique_ptr<FilterBitsBuilder> bits_builder_;
+
+  void SetUp() override {
+    policy_.reset(new BloomFilterPolicy(bits_per_key_per_unit_,
+                                        BloomFilterPolicy::kLegacyBloom));
+
+    BlockBasedTableOptions table_options;
+    table_options.filter_policy = policy_;
+
+    FilterBuildingContext context(table_options);
+
+    // Obtain the bits builder directly from the policy
+    bits_builder_.reset(policy_->GetBuilderWithContext(context));
+
+    // Generate the keys
+    for (int i = 0; i < num_keys_; ++i) {
+      keys_.push_back("key" + std::to_string(i));
+    }
+
+    // Add every key to the filter
+    for (const auto& k : keys_) {
+      bits_builder_->AddKey(Slice(k));
+    }
+
+    // Build each unit's filter separately via FinishWithId
+    filter_bufs_.resize(filter_count_);
+    filter_slices_.resize(filter_count_);
+    for (size_t i = 0; i < filter_count_; ++i) {
+      filter_slices_[i] = bits_builder_->FinishWithId(&filter_bufs_[i], i);
+      std::cout << "slice " << i << " size: " << filter_slices_[i].size()
+                << std::endl;
+    }
+  }
+
+  // AND-query across the first n units
+  bool MayMatchWithNUnits(const std::string& key, int n) {
+    for (int i = 0; i < n; ++i) {
+      std::unique_ptr<FilterBitsReader> reader(
+          policy_->GetFilterBitsReader(filter_slices_[i]));
+      if (!reader->MayMatchWithId(Slice(key), i)) {
+        return false;
+      }
+    }
+    return true;
+  }
+};
+
+TEST_F(MultiUnitBloomEffectTest, FalsePositiveRateDecreasesWithMoreUnits) {
+  const int test_total = 100000000;
+  const int unit_counts[] = {1, 2, 4, 8};
+  double fp_rates[4];
+  for (int u = 0; u < 4; ++u) {
+    int test_fp = 0;
+    // Query keys that were never inserted; any match is a false positive
+    for (int i = num_keys_; i < num_keys_ + test_total; ++i) {
+      if (MayMatchWithNUnits("key" + std::to_string(i), unit_counts[u])) {
+        test_fp++;
+      }
+    }
+    fp_rates[u] = test_fp / double(test_total);
+    printf("FP rate with %d unit(s): %f\n", unit_counts[u], fp_rates[u]);
+  }
+
+  // AND-ing in more units must strictly lower the false positive rate
+  ASSERT_GT(fp_rates[0], fp_rates[1]);
+  ASSERT_GT(fp_rates[1], fp_rates[2]);
+  ASSERT_GT(fp_rates[2], fp_rates[3]);
+}
+
+TEST_F(MultiUnitBloomEffectTest, AllKeysAlwaysMatch) {
+  // Every inserted key must still match with all units enabled
+  for (const auto& k : keys_) {
+    ASSERT_TRUE(MayMatchWithNUnits(k, 8));
+  }
+}
+#endif  // ART_PLUS
 
 }  // namespace ROCKSDB_NAMESPACE
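A sanity check on the ordering that FalsePositiveRateDecreasesWithMoreUnits asserts: if the filter units hash independently, AND-ing n units turns a per-unit false-positive rate p into roughly p^n. The standalone sketch below plugs in the textbook Bloom approximation, assuming a single probe per unit at 2 bits per key (the legacy policy may pick a different probe count), just to show the expected direction and rough scale.

    // Back-of-the-envelope estimate, not RocksDB code. Classic Bloom
    // approximation: p = (1 - e^{-k*n/m})^k with k probes and m/n bits per
    // key; for 2 bits/key and k = 1 this gives p ~ 0.39 per unit.
    #include <cmath>
    #include <cstdio>

    int main() {
      double bits_per_key = 2.0, k = 1.0;
      double p = std::pow(1.0 - std::exp(-k / bits_per_key), k);
      for (int units : {1, 2, 4, 8}) {
        // Independence assumption: AND of n units => p^n
        std::printf("%d unit(s): expected fp ~ %.4f\n", units,
                    std::pow(p, units));
      }
    }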
 int main(int argc, char** argv) {
diff --git a/util/comparator.cc b/util/comparator.cc
index f82a6dd14..b7dfa40f0 100644
--- a/util/comparator.cc
+++ b/util/comparator.cc
@@ -232,11 +232,13 @@ class SegmentIdRemovingComparatorImpl : public Comparator {
 
   void FindShortestSeparator(std::string* start,
                              const Slice& limit) const override {
-    real_comparator->FindShortestSeparator(start, limit);
+    // real_comparator->FindShortestSeparator(start, limit);
+    // do nothing, keeping the key shortening feature disabled
   }
 
   void FindShortSuccessor(std::string* key) const override {
-    real_comparator->FindShortSuccessor(key);
+    // real_comparator->FindShortSuccessor(key);
+    // do nothing, keeping the key shortening feature disabled
   }
 
   bool IsSameLengthImmediateSuccessor(const Slice& s,