From 1b6146b7c8b3a3363b96f2a8eab225ba176876df Mon Sep 17 00:00:00 2001 From: Omnish_Kumar Date: Thu, 12 Mar 2026 10:03:18 +0530 Subject: [PATCH 1/8] clang-format and ci pipeline --- .github/workflows/ci.yml | 73 ++ install.sh | 15 +- src/core/ndd.hpp | 222 +++--- src/filter/category_index.hpp | 8 +- src/filter/filter.hpp | 74 +- src/filter/numeric_index.hpp | 540 +++++++------- src/main.cpp | 335 ++++++--- src/quant/binary.hpp | 4 +- src/quant/float16.hpp | 36 +- src/quant/float32.hpp | 12 +- src/quant/int16.hpp | 22 +- src/quant/int8.hpp | 9 +- src/sparse/inverted_index.cpp | 1273 +++++++++++++++++--------------- src/sparse/inverted_index.hpp | 126 ++-- src/sparse/sparse_storage.hpp | 38 +- src/sparse/sparse_vector.hpp | 12 +- src/storage/backup_store.hpp | 34 +- src/storage/index_meta.hpp | 46 +- src/storage/vector_storage.hpp | 31 +- src/utils/log.hpp | 97 +-- src/utils/settings.hpp | 12 +- tests/filter_test.cpp | 97 +-- 22 files changed, 1725 insertions(+), 1391 deletions(-) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 000000000..47ab914a2 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,73 @@ +# CONTINUOUS INTEGRATION WORKFLOW +# LINT TEST -> UNIT TEST -> BUILD + + +name: Continuous Integration + +# ========== TRIGGER EVENT ========== +on: + # PUSH EVENT + push: + branches: [ feature/linting_tests ] + # PULL REQUEST EVENT + pull_request: + branches: [ feature/continuous_integration] + + +# ========== JOBS ========== +jobs: + + # ----- JOB: 1 -> LINT AND CODE QUALITY + lint: + name: Lint and Code Quality Check + runs-on: ubuntu-latest + + steps: + + #---------------- STEP 1: CHECKOUT TO SOURCE CODE + - name: Checkout Source Code + uses: actions/checkout@v4 + + #---------------- STEP 2: CACHE FIRST, THEN INSTALL + - name: Cache apt packages + uses: awalsh128/cache-apt-pkgs-action@latest + with: + packages: clang-format clang-tidy shellcheck cmake libcurl4-openssl-dev + version: 1.0 + + # - name: Cache apt packages + # uses: awalsh128/cache-apt-pkgs-action@latest + # with: + # packages: clang-format clang-tidy shellcheck cmake libcurl4-openssl-dev + # version: 1.0 + + # RUN CLANG-FORMAT FOR CODE STYLE, INDENTATIONS, SPACING + # DRY RUN MAKE SURE IT WON'T CHANGE CODE ONLY IT WILL CHECK FORMATTING + - name: C++ and HPP formatting + run: | + find src/ \( -name "*.cpp" -o -name "*.hpp" \) -not -path "*/third_party/*" | xargs --no-run-if-empty clang-format --dry-run --Werror + + # CHECK FOR BUGS AND BAD PRACTICES USING CLANG-TIDY + - name: C++ Static Analysis + run: | + rm -rf build + mkdir -p build + cd build + cmake .. \ + -DCMAKE_BUILD_TYPE=Release \ + -DUSE_AVX2=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON + cd .. + find src/ -name "*.cpp" \ + -not -path "*/third_party/*" | \ + xargs --no-run-if-empty clang-tidy \ + -p build/compile_commands.json \ + --header-filter='^.*(src)/.*' + + #---------------- STEP 3: CHECK SHELL SCRIPTS + - name: Lint Shell Scripts + run: | + find . -name "*.sh" \ + -not -path "*/build/*" \ + -not -path "*/third_party/*" | \ + xargs --no-run-if-empty shellcheck -x \ No newline at end of file diff --git a/install.sh b/install.sh index 062789b1a..990862f15 100755 --- a/install.sh +++ b/install.sh @@ -163,6 +163,7 @@ distro_factory() { OS_FAMILY="mac" ;; *) + uname_s=$(uname -s) error "Unsupported kernel: $uname_s" exit 1 ;; @@ -179,10 +180,11 @@ distro_factory() { fi if [ -f /etc/os-release ]; then + # shellcheck disable=SC1091 . /etc/os-release DISTRO_ID="${ID:-unknown}" DISTRO_VERSION_ID="${VERSION_ID:-unknown}" - DISTRO_CODENAME="${VERSION_CODENAME:-unknown}" + export DISTRO_CODENAME="${VERSION_CODENAME:-unknown}" case $DISTRO_ID in ubuntu) INSTALLER_FUNC="install_dependencies_ubuntu_family $DISTRO_VERSION_ID" ;; @@ -197,14 +199,13 @@ distro_factory() { # **************************************** add_frontend() { - VERSION="v1.2.0" - log "Pulling frontend version ${VERSION}" - mkdir -p $script_dir/frontend - cd $script_dir/frontend - curl -fL -o react-dist.zip https://github.com/EndeeLabs/endee-web-ui/releases/download/${VERSION}/dist.zip + log "pulling frontend" + mkdir -p "$script_dir/frontend" + cd "$script_dir/frontend" + curl -L -o react-dist.zip https://github.com/EndeeLabs/endee-web-ui/releases/download/v1.0.2/endee-web-ui.zip unzip -o react-dist.zip rm react-dist.zip - log "Frontend version ${VERSION} added" + log "frontend added" } # **************************************** diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index 439425204..700c13791 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -45,8 +45,7 @@ struct IndexInfo { size_t dimension; size_t sparse_dim; std::string space_type_str; - ndd::quant::QuantizationLevel - quant_level; // Selected quantization level + ndd::quant::QuantizationLevel quant_level; // Selected quantization level int32_t checksum; size_t M; size_t ef_con; @@ -234,9 +233,9 @@ class IndexManager { if(!failed_vector_add_ids.empty()) { entry.id_mapper->reclaim_failed_ids(failed_vector_add_ids); LOG_INFO(2010, - index_id, - "Reclaimed " << failed_vector_add_ids.size() - << " failed VECTOR_ADD ids for reuse"); + index_id, + "Reclaimed " << failed_vector_add_ids.size() + << " failed VECTOR_ADD ids for reuse"); } // Mark as updated to trigger a save @@ -354,7 +353,7 @@ class IndexManager { if(remainingCapacity < settings::MAX_ELEMENTS_INCREMENT_TRIGGER) { size_t newMaxElements = maxElements + settings::MAX_ELEMENTS_INCREMENT; LOG_DEBUG("Auto-resizing index " << entry.index_id << " from " << maxElements << " to " - << newMaxElements << " elements"); + << newMaxElements << " elements"); try { entry.alg->resizeIndex(newMaxElements); @@ -377,8 +376,7 @@ class IndexManager { // Update element count in metadata if(!metadata_manager_->updateElementCount(entry.index_id, entry.alg->getElementsCount())) { - LOG_WARN( - 2014, entry.index_id, "Failed to update element count in metadata"); + LOG_WARN(2014, entry.index_id, "Failed to update element count in metadata"); } entry.updated = false; } @@ -406,7 +404,8 @@ class IndexManager { // Only evict if the index is not dirty (hasn't been updated) if(it->second.updated) { - LOG_WARN(2015, to_evict, "Cannot evict dirty index; it must be saved first"); + LOG_WARN( + 2015, to_evict, "Cannot evict dirty index; it must be saved first"); // Put it back at the front to try other indices indices_list_.push_front(to_evict); continue; @@ -500,9 +499,8 @@ class IndexManager { saveIndex(pair.first); } } catch(const std::exception& e) { - LOG_ERROR(2017, - pair.first, - "Failed to save index during shutdown: " << e.what()); + LOG_ERROR( + 2017, pair.first, "Failed to save index during shutdown: " << e.what()); } } LOG_DEBUG("Shutdown complete"); @@ -556,8 +554,6 @@ class IndexManager { return true; } - - bool createIndex(const std::string& index_id, const IndexConfig& config, UserType user_type = UserType::Admin, @@ -607,23 +603,23 @@ class IndexManager { //create the directory and initialize sequence for IDMapper LOG_INFO(2021, - index_id, - "Creating ID mapper with user type " << userTypeToString(user_type)); + index_id, + "Creating ID mapper with user type " << userTypeToString(user_type)); // IDMapper now uses tier-based fixed bloom filter sizing based on user_type auto id_mapper = std::make_shared(lmdb_dir, true, user_type); - // Create HNSW directly with all necessary parameters ndd::quant::QuantizationLevel quant_level = config.quant_level; - auto vector_storage = - std::make_shared(index_dir, index_id, config.dim, config.quant_level); + auto vector_storage = std::make_shared( + index_dir, index_id, config.dim, config.quant_level); // Initialize Sparse Storage if needed std::unique_ptr sparse_storage = nullptr; if(config.sparse_dim > 0) { std::string sparse_storage_dir = index_dir + "/sparse"; - sparse_storage = std::make_unique(sparse_storage_dir, index_id); + sparse_storage = + std::make_unique(sparse_storage_dir, index_id); if(!sparse_storage->initialize()) { throw std::runtime_error("Failed to initialize sparse storage"); } @@ -642,7 +638,10 @@ class IndexManager { return vs->get_vector(label, buffer); }); - alg->setVectorFetcherBatch([vs = vector_storage](const ndd::idInt* labels, uint8_t* buffers, bool* success, size_t count) -> size_t { + alg->setVectorFetcherBatch([vs = vector_storage](const ndd::idInt* labels, + uint8_t* buffers, + bool* success, + size_t count) -> size_t { return vs->get_vectors_batch_into(labels, buffers, success, count); }); @@ -737,7 +736,8 @@ class IndexManager { std::unique_ptr sparse_storage; if(sparse_dim > 0) { std::string sparse_storage_dir = index_dir + "/sparse"; - sparse_storage = std::make_unique(sparse_storage_dir, index_id); + sparse_storage = + std::make_unique(sparse_storage_dir, index_id); if(!sparse_storage->initialize()) { throw std::runtime_error("Failed to initialize sparse storage for index: " + index_id); @@ -749,7 +749,10 @@ class IndexManager { return vs->get_vector(label, buffer); }); - alg->setVectorFetcherBatch([vs = vector_storage](const ndd::idInt* labels, uint8_t* buffers, bool* success, size_t count) -> size_t { + alg->setVectorFetcherBatch([vs = vector_storage](const ndd::idInt* labels, + uint8_t* buffers, + bool* success, + size_t count) -> size_t { return vs->get_vectors_batch_into(labels, buffers, success, count); }); @@ -815,9 +818,9 @@ class IndexManager { if(it != indices_.end()) { // Cache removed LOG_INFO(2025, - index_id, - "Reloaded index with " - << it->second.alg->getElementsCount() << " elements"); + index_id, + "Reloaded index with " << it->second.alg->getElementsCount() + << " elements"); } } @@ -850,7 +853,10 @@ class IndexManager { return vs->get_vector(label, buffer); }); - new_alg->setVectorFetcherBatch([vs = entry.vector_storage](const ndd::idInt* labels, uint8_t* buffers, bool* success, size_t count) -> size_t { + new_alg->setVectorFetcherBatch([vs = entry.vector_storage](const ndd::idInt* labels, + uint8_t* buffers, + bool* success, + size_t count) -> size_t { return vs->get_vectors_batch_into(labels, buffers, success, count); }); @@ -980,8 +986,8 @@ class IndexManager { // Calculate start and end indices for this thread size_t start_idx = t * chunk_size; size_t end_idx = (start_idx + chunk_size < quantized_vectors.size()) - ? (start_idx + chunk_size) - : quantized_vectors.size(); + ? (start_idx + chunk_size) + : quantized_vectors.size(); // Process assigned chunk of vectors for(size_t i = start_idx; i < end_idx; i++) { @@ -1116,8 +1122,9 @@ class IndexManager { if(empty_vector_count.load() > 0) { LOG_WARN(2032, - index_id, - "Skipped " << empty_vector_count.load() << " vectors during recovery because they were empty"); + index_id, + "Skipped " << empty_vector_count.load() + << " vectors during recovery because they were empty"); } LOG_INFO(2033, index_id, "Recovered " << batch.size() << " vectors"); @@ -1334,8 +1341,9 @@ class IndexManager { // 0. Compute Filter Bitmap (Shared) std::optional active_filter_bitmap; - if (!filter_array.empty()) { - active_filter_bitmap = entry.vector_storage->filter_store_->computeFilterBitmap(filter_array); + if(!filter_array.empty()) { + active_filter_bitmap = + entry.vector_storage->filter_store_->computeFilterBitmap(filter_array); } // 1. Sparse Search (Async) @@ -1360,7 +1368,8 @@ class IndexManager { sparse_query.values.push_back(p.second); } - const ndd::RoaringBitmap* filter_ptr = active_filter_bitmap.has_value() ? &(*active_filter_bitmap) : nullptr; + const ndd::RoaringBitmap* filter_ptr = + active_filter_bitmap.has_value() ? &(*active_filter_bitmap) : nullptr; return entry.sparse_storage->search(sparse_query, k, filter_ptr); }); } @@ -1375,48 +1384,59 @@ class IndexManager { std::vector query_bytes = ndd::quant::get_quantizer_dispatch(quant_level).quantize(query); - if (!active_filter_bitmap) { + if(!active_filter_bitmap) { dense_results = entry.alg->searchKnn(query_bytes.data(), k, ef); } else { // Smart Filter Execution Strategy auto& bitmap = *active_filter_bitmap; size_t card = bitmap.cardinality(); - if (card == 0) { + if(card == 0) { // No results match filter - } else if (card < params.prefilter_threshold) { - // Strategy A: Brute Force on Small Subset - std::vector valid_ids; - valid_ids.reserve(card); - bitmap.iterate([](ndd::idInt id, void* ptr){ - static_cast*>(ptr)->push_back(id); - return true; - }, &valid_ids); - - // Fetch vectors - auto vector_batch = entry.vector_storage->get_vectors_batch(valid_ids); - - // Prepare subset for bruteforce search - std::vector>> vector_subset; - vector_subset.reserve(vector_batch.size()); - for(const auto& [nid, vbytes] : vector_batch) { - vector_subset.emplace_back(nid, vbytes); - } - - dense_results = hnswlib::searchKnnSubset( - query_bytes.data(), vector_subset, k, space); - + } else if(card < params.prefilter_threshold) { + // Strategy A: Brute Force on Small Subset + std::vector valid_ids; + valid_ids.reserve(card); + bitmap.iterate( + [](ndd::idInt id, void* ptr) { + static_cast*>(ptr)->push_back(id); + return true; + }, + &valid_ids); + + // Fetch vectors + auto vector_batch = entry.vector_storage->get_vectors_batch(valid_ids); + + // Prepare subset for bruteforce search + std::vector>> vector_subset; + vector_subset.reserve(vector_batch.size()); + for(const auto& [nid, vbytes] : vector_batch) { + vector_subset.emplace_back(nid, vbytes); + } + + dense_results = hnswlib::searchKnnSubset( + query_bytes.data(), vector_subset, k, space); + } else { // Strategy B: Filtered HNSW Search BitMapFilterFunctor functor(bitmap); size_t effective_ef = ef > 0 ? ef : settings::DEFAULT_EF_SEARCH; // Try to use optimized templated search if algorithm matches - auto* hnsw_alg = dynamic_cast*>(entry.alg.get()); - if (hnsw_alg) { - dense_results = hnsw_alg->searchKnn(query_bytes.data(), k, effective_ef, &functor, params.boost_percentage); + auto* hnsw_alg = + dynamic_cast*>(entry.alg.get()); + if(hnsw_alg) { + dense_results = hnsw_alg->searchKnn(query_bytes.data(), + k, + effective_ef, + &functor, + params.boost_percentage); } else { - dense_results = entry.alg->searchKnn(query_bytes.data(), k, effective_ef, &functor, params.boost_percentage); + dense_results = entry.alg->searchKnn(query_bytes.data(), + k, + effective_ef, + &functor, + params.boost_percentage); } } } @@ -1605,8 +1625,7 @@ class IndexManager { } } else { LOG_DEBUG("Filter cardinality too high for pre-filtering (" - << filter_cardinality - << " >= " << params.prefilter_threshold + << filter_cardinality << " >= " << params.prefilter_threshold << "), returning post-filter results"); } } @@ -1664,8 +1683,7 @@ class IndexManager { return true; } } catch(const std::filesystem::filesystem_error& e) { - LOG_ERROR( - 2040, index_id, "Failed to move index to deleted directory: " << e.what()); + LOG_ERROR(2040, index_id, "Failed to move index to deleted directory: " << e.what()); return false; } @@ -1744,11 +1762,11 @@ class IndexManager { // Orchestration methods (defined below after class) std::pair createBackupAsync(const std::string& index_id, - const std::string& backup_name); + const std::string& backup_name); std::pair restoreBackup(const std::string& backup_name, - const std::string& target_index_name, - const std::string& username); + const std::string& target_index_name, + const std::string& username); // Forwarding methods (no IndexManager internals needed) std::vector listBackups(const std::string& username) { @@ -1756,7 +1774,7 @@ class IndexManager { } std::pair deleteBackup(const std::string& backup_name, - const std::string& username) { + const std::string& username) { return backup_store_.deleteBackup(backup_name, username); } @@ -1764,8 +1782,7 @@ class IndexManager { return backup_store_.getActiveBackup(username); } - nlohmann::json getBackupInfo(const std::string& backup_name, - const std::string& username) { + nlohmann::json getBackupInfo(const std::string& backup_name, const std::string& username) { return backup_store_.getBackupInfo(backup_name, username); } @@ -1776,16 +1793,17 @@ class IndexManager { // ========== IndexManager backup implementations ========== -inline void IndexManager::executeBackupJob(const std::string& index_id, const std::string& backup_name) { +inline void IndexManager::executeBackupJob(const std::string& index_id, + const std::string& backup_name) { std::string username; size_t upos = index_id.find('/'); - if (upos != std::string::npos) { + if(upos != std::string::npos) { username = index_id.substr(0, upos); } try { std::string index_name; - if (upos != std::string::npos) { + if(upos != std::string::npos) { index_name = index_id.substr(upos + 1); } else { throw std::runtime_error("Invalid index ID format"); @@ -1812,24 +1830,25 @@ inline void IndexManager::executeBackupJob(const std::string& index_id, const st auto space_info = std::filesystem::space(user_backup_dir); if(space_info.available < index_size * 2) { - throw std::runtime_error("Insufficient disk space: need " + - std::to_string(index_size * 2 / MB) + " MB"); + throw std::runtime_error("Insufficient disk space: need " + + std::to_string(index_size * 2 / MB) + " MB"); } auto meta = metadata_manager_->getMetadata(index_id); nlohmann::json metadata_json; if(meta) { metadata_json["original_index"] = index_name; - metadata_json["timestamp"] = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + metadata_json["timestamp"] = + std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); metadata_json["size_mb"] = index_size / MB; metadata_json["params"] = {{"M", meta->M}, - {"ef_construction", meta->ef_con}, - {"dim", meta->dimension}, - {"sparse_dim", meta->sparse_dim}, - {"space_type", meta->space_type_str}, - {"quant_level", static_cast(meta->quant_level)}, - {"total_elements", meta->total_elements}, - {"checksum", meta->checksum}}; + {"ef_construction", meta->ef_con}, + {"dim", meta->dimension}, + {"sparse_dim", meta->sparse_dim}, + {"space_type", meta->space_type_str}, + {"quant_level", static_cast(meta->quant_level)}, + {"total_elements", meta->total_elements}, + {"checksum", meta->checksum}}; LOG_DEBUG("Metadata prepared for backup: " << metadata_json.dump()); } else { LOG_ERROR(2041, index_id, "Failed to get metadata for backup"); @@ -1846,16 +1865,20 @@ inline void IndexManager::executeBackupJob(const std::string& index_id, const st if(!metadata_json.empty()) { std::ofstream meta_file(metadata_file_in_index, std::ios::binary); if(!meta_file) { - throw std::runtime_error("Failed to create metadata file: " + metadata_file_in_index); + throw std::runtime_error("Failed to create metadata file: " + + metadata_file_in_index); } meta_file << metadata_json.dump(4); meta_file.flush(); meta_file.close(); if(!std::filesystem::exists(metadata_file_in_index)) { - throw std::runtime_error("Metadata file was not created: " + metadata_file_in_index); + throw std::runtime_error("Metadata file was not created: " + + metadata_file_in_index); } - LOG_DEBUG("Metadata file created: " << metadata_file_in_index << " (size: " << std::filesystem::file_size(metadata_file_in_index) << " bytes)"); + LOG_DEBUG("Metadata file created: " + << metadata_file_in_index << " (size: " + << std::filesystem::file_size(metadata_file_in_index) << " bytes)"); } std::string error_msg; @@ -1870,7 +1893,9 @@ inline void IndexManager::executeBackupJob(const std::string& index_id, const st if(!std::filesystem::exists(backup_tar_temp)) { throw std::runtime_error("Tar archive was not created: " + backup_tar_temp); } - LOG_DEBUG("Tar archive created successfully: " << backup_tar_temp << " (size: " << std::filesystem::file_size(backup_tar_temp) << " bytes)"); + LOG_DEBUG("Tar archive created successfully: " + << backup_tar_temp + << " (size: " << std::filesystem::file_size(backup_tar_temp) << " bytes)"); if(std::filesystem::exists(metadata_file_in_index)) { std::filesystem::remove(metadata_file_in_index); @@ -1889,7 +1914,7 @@ inline void IndexManager::executeBackupJob(const std::string& index_id, const st LOG_INFO(2043, index_id, "Backup completed: " << backup_name << " -> " << backup_tar_final); - } catch (const std::exception& e) { + } catch(const std::exception& e) { std::string user_backup_dir = backup_store_.getUserBackupDir(username); std::string user_temp_dir = backup_store_.getUserTempDir(username); std::string source_dir = data_dir_ + "/" + index_id; @@ -1913,9 +1938,10 @@ inline void IndexManager::executeBackupJob(const std::string& index_id, const st } } -inline std::pair IndexManager::restoreBackup(const std::string& backup_name, - const std::string& target_index_name, - const std::string& username) { +inline std::pair +IndexManager::restoreBackup(const std::string& backup_name, + const std::string& target_index_name, + const std::string& username) { std::pair result = backup_store_.validateBackupName(backup_name); if(!result.first) { return result; @@ -1999,8 +2025,8 @@ inline std::pair IndexManager::restoreBackup(const std::strin } } -inline std::pair IndexManager::createBackupAsync(const std::string& index_id, - const std::string& backup_name) { +inline std::pair +IndexManager::createBackupAsync(const std::string& index_id, const std::string& backup_name) { std::pair result = backup_store_.validateBackupName(backup_name); if(!result.first) { return result; @@ -2008,20 +2034,20 @@ inline std::pair IndexManager::createBackupAsync(const std::s std::string username; size_t pos = index_id.find('/'); - if (pos != std::string::npos) { + if(pos != std::string::npos) { username = index_id.substr(0, pos); } else { return {false, "Invalid index ID format"}; } - if (backup_store_.hasActiveBackup(username)) { + if(backup_store_.hasActiveBackup(username)) { return {false, "Backup already in progress for user: " + username}; } std::string user_backup_dir = backup_store_.getUserBackupDir(username); std::filesystem::create_directories(user_backup_dir); std::string backup_tar = user_backup_dir + "/" + backup_name + ".tar"; - if (std::filesystem::exists(backup_tar)) { + if(std::filesystem::exists(backup_tar)) { return {false, "Backup already exists: " + backup_name}; } diff --git a/src/filter/category_index.hpp b/src/filter/category_index.hpp index 70e969f28..1873c3ded 100644 --- a/src/filter/category_index.hpp +++ b/src/filter/category_index.hpp @@ -123,7 +123,9 @@ namespace ndd { std::vector scan_values(const std::string& field) const { std::vector values; MDBX_txn* txn; - if (mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn) != MDBX_SUCCESS) return values; + if(mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn) != MDBX_SUCCESS) { + return values; + } MDBX_cursor* cursor; mdbx_cursor_open(txn, dbi_, &cursor); @@ -135,7 +137,9 @@ namespace ndd { int rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); while(rc == MDBX_SUCCESS) { std::string found_key((char*)key.iov_base, key.iov_len); - if(found_key.rfind(prefix, 0) != 0) break; + if(found_key.rfind(prefix, 0) != 0) { + break; + } values.push_back(found_key.substr(prefix.size())); rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT); diff --git a/src/filter/filter.hpp b/src/filter/filter.hpp index a6e1c4ef8..e36fbcc3c 100644 --- a/src/filter/filter.hpp +++ b/src/filter/filter.hpp @@ -16,7 +16,7 @@ #include "mdbx/mdbx.h" #include "../utils/log.hpp" #include "../core/types.hpp" -#include "../hnsw/hnswlib.h" // For BaseFilterFunctor +#include "../hnsw/hnswlib.h" // For BaseFilterFunctor #include "numeric_index.hpp" #include "category_index.hpp" @@ -31,11 +31,11 @@ enum class FieldType : uint8_t { // Filter Functor for HNSW class BitMapFilterFunctor : public hnswlib::BaseFilterFunctor { const ndd::RoaringBitmap& bitmap_; + public: - BitMapFilterFunctor(const ndd::RoaringBitmap& bitmap) : bitmap_(bitmap) {} - bool operator()(ndd::idInt id) override { - return bitmap_.contains(id); - } + BitMapFilterFunctor(const ndd::RoaringBitmap& bitmap) : + bitmap_(bitmap) {} + bool operator()(ndd::idInt id) override { return bitmap_.contains(id); } }; class Filter { @@ -55,8 +55,9 @@ class Filter { MDBX_txn* txn; int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn); if(rc != MDBX_SUCCESS) { - LOG_ERROR( - 1210, index_id_, "Failed to begin schema read transaction: " << mdbx_strerror(rc)); + LOG_ERROR(1210, + index_id_, + "Failed to begin schema read transaction: " << mdbx_strerror(rc)); return; } @@ -89,8 +90,9 @@ class Filter { MDBX_txn* txn; int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); if(rc != MDBX_SUCCESS) { - LOG_ERROR( - 1208, index_id_, "Failed to begin schema write transaction: " << mdbx_strerror(rc)); + LOG_ERROR(1208, + index_id_, + "Failed to begin schema write transaction: " << mdbx_strerror(rc)); return; } @@ -101,8 +103,9 @@ class Filter { if(rc == MDBX_SUCCESS) { rc = mdbx_txn_commit(txn); if(rc != MDBX_SUCCESS) { - LOG_ERROR( - 1209, index_id_, "Failed to commit filter schema update: " << mdbx_strerror(rc)); + LOG_ERROR(1209, + index_id_, + "Failed to commit filter schema update: " << mdbx_strerror(rc)); } } else { mdbx_txn_abort(txn); @@ -131,14 +134,13 @@ class Filter { mdbx_env_set_maxdbs(env_, 10); // Set geometry for auto-grow using the filter map size settings - rc = mdbx_env_set_geometry( - env_, - -1, // lower size bound (use default) - 1ULL << settings::FILTER_MAP_SIZE_BITS, // current/now size - 1ULL << settings::FILTER_MAP_SIZE_MAX_BITS, // upper size bound - 1ULL << settings::FILTER_MAP_SIZE_BITS, // growth step - -1, // shrink threshold (use default) - -1); // pagesize (use default) + rc = mdbx_env_set_geometry(env_, + -1, // lower size bound (use default) + 1ULL << settings::FILTER_MAP_SIZE_BITS, // current/now size + 1ULL << settings::FILTER_MAP_SIZE_MAX_BITS, // upper size bound + 1ULL << settings::FILTER_MAP_SIZE_BITS, // growth step + -1, // shrink threshold (use default) + -1); // pagesize (use default) if(rc != MDBX_SUCCESS) { throw std::runtime_error("Failed to set geometry for filters"); } @@ -256,7 +258,9 @@ class Filter { str_val = val.get() ? "1" : "0"; } else { str_val = std::to_string(val.get()); - if (str_val.size() > 255) throw std::runtime_error("Category value too long"); + if(str_val.size() > 255) { + throw std::runtime_error("Category value too long"); + } } std::string key = format_filter_key(field, str_val); or_result = category_index_->get_bitmap_by_key(key); @@ -294,7 +298,9 @@ class Filter { str_val = std::to_string(v.get()); } if(!str_val.empty()) { - if (str_val.size() > 255) throw std::runtime_error("Category value too long"); + if(str_val.size() > 255) { + throw std::runtime_error("Category value too long"); + } std::string key = format_filter_key(field, str_val); or_result |= category_index_->get_bitmap_by_key(key); } @@ -338,23 +344,28 @@ class Filter { } else { throw std::runtime_error("Unsupported operator: " + op); } - + partial_results.push_back(std::move(or_result)); } // Optimization: Sort by cardinality (smallest first) - std::sort(partial_results.begin(), partial_results.end(), - [](const ndd::RoaringBitmap& a, const ndd::RoaringBitmap& b) { - return a.cardinality() < b.cardinality(); - }); + std::sort(partial_results.begin(), + partial_results.end(), + [](const ndd::RoaringBitmap& a, const ndd::RoaringBitmap& b) { + return a.cardinality() < b.cardinality(); + }); - if (partial_results.empty()) return ndd::RoaringBitmap(); + if(partial_results.empty()) { + return ndd::RoaringBitmap(); + } ndd::RoaringBitmap final_result = partial_results[0]; for(size_t i = 1; i < partial_results.size(); ++i) { final_result &= partial_results[i]; // If result becomes empty, stop early - if(final_result.isEmpty()) return final_result; + if(final_result.isEmpty()) { + return final_result; + } } return final_result; @@ -444,10 +455,9 @@ class Filter { filter_to_ids[filter_key].push_back(numeric_id); } else { LOG_WARN(1203, - index_id_, - "Unsupported filter type for field '" << field - << "' in filter: " - << value.dump()); + index_id_, + "Unsupported filter type for field '" + << field << "' in filter: " << value.dump()); } } } catch(const std::exception& e) { diff --git a/src/filter/numeric_index.hpp b/src/filter/numeric_index.hpp index c00265213..bc8284ad4 100644 --- a/src/filter/numeric_index.hpp +++ b/src/filter/numeric_index.hpp @@ -56,41 +56,39 @@ namespace ndd { bool is_dirty = false; // Helper to get actual value - uint32_t get_value(size_t index) const { - return base_value + deltas[index]; - } + uint32_t get_value(size_t index) const { return base_value + deltas[index]; } void add(uint32_t val, ndd::idInt id) { - if (val < base_value) { - // Should not happen if Key logic is correct - throw std::runtime_error("Insert value < Base Value"); + if(val < base_value) { + // Should not happen if Key logic is correct + throw std::runtime_error("Insert value < Base Value"); } uint32_t delta_32 = val - base_value; - if (delta_32 > MAX_DELTA) { + if(delta_32 > MAX_DELTA) { throw std::runtime_error("Delta overflow"); } - + // Maintain sorted order by Value (Delta) uint16_t delta = static_cast(delta_32); - + // Find insertion point auto it = std::lower_bound(deltas.begin(), deltas.end(), delta); size_t index = std::distance(deltas.begin(), it); deltas.insert(it, delta); ids.insert(ids.begin() + index, id); - + summary_bitmap.add(id); is_dirty = true; } bool remove(ndd::idInt id) { // Find index by ID (linear scan needed as ids are not sorted) - for (size_t i = 0; i < ids.size(); ++i) { - if (ids[i] == id) { + for(size_t i = 0; i < ids.size(); ++i) { + if(ids[i] == id) { ids.erase(ids.begin() + i); deltas.erase(deltas.begin() + i); - + // Rebuild or update bitmap? Roaring remove is fast summary_bitmap.remove(id); is_dirty = true; @@ -109,95 +107,108 @@ namespace ndd { std::vector serialize() const { // Optimize bitmap const_cast(summary_bitmap).runOptimize(); - + size_t bm_size = summary_bitmap.getSizeInBytes(); uint16_t count = static_cast(ids.size()); - + size_t total_size = 4 + bm_size + 2 + (count * 2) + (count * sizeof(ndd::idInt)); std::vector buffer(total_size); uint8_t* ptr = buffer.data(); // 1. Bitmap Header uint32_t bm_size_32 = static_cast(bm_size); - std::memcpy(ptr, &bm_size_32, 4); ptr += 4; + std::memcpy(ptr, &bm_size_32, 4); + ptr += 4; // 2. Bitmap Data - if (bm_size > 0) { + if(bm_size > 0) { summary_bitmap.write(reinterpret_cast(ptr)); ptr += bm_size; } // 3. Count - std::memcpy(ptr, &count, 2); ptr += 2; + std::memcpy(ptr, &count, 2); + ptr += 2; // 4. Deltas - if (count > 0) { - std::memcpy(ptr, deltas.data(), count * 2); ptr += count * 2; + if(count > 0) { + std::memcpy(ptr, deltas.data(), count * 2); + ptr += count * 2; } // 5. IDs - if (count > 0) { - std::memcpy(ptr, ids.data(), count * sizeof(ndd::idInt)); + if(count > 0) { + std::memcpy(ptr, ids.data(), count * sizeof(ndd::idInt)); } - + return buffer; } static Bucket deserialize(const void* data, size_t len, uint32_t base_val) { Bucket b; b.base_value = base_val; - - if (len < 6) return b; // Min valid size + + if(len < 6) { + return b; // Min valid size + } const uint8_t* ptr = static_cast(data); const uint8_t* end = ptr + len; - + // 1. Bitmap Size uint32_t bm_size; - std::memcpy(&bm_size, ptr, 4); ptr += 4; + std::memcpy(&bm_size, ptr, 4); + ptr += 4; - if (ptr + bm_size > end) { + if(ptr + bm_size > end) { throw std::runtime_error("Bucket corrupt: invalid bitmap size"); } // 2. Bitmap - if (bm_size > 0) { - b.summary_bitmap = ndd::RoaringBitmap::read(reinterpret_cast(ptr)); - ptr += bm_size; + if(bm_size > 0) { + b.summary_bitmap = ndd::RoaringBitmap::read(reinterpret_cast(ptr)); + ptr += bm_size; } - if (ptr + 2 > end) throw std::runtime_error("Bucket corrupt: truncated count"); + if(ptr + 2 > end) { + throw std::runtime_error("Bucket corrupt: truncated count"); + } // 3. Count uint16_t count; - std::memcpy(&count, ptr, 2); ptr += 2; + std::memcpy(&count, ptr, 2); + ptr += 2; // 4. Deltas & IDs - if (count > 0) { + if(count > 0) { size_t delta_size = count * 2; size_t id_size = count * sizeof(ndd::idInt); - - if (ptr + delta_size + id_size > end) { - throw std::runtime_error("Bucket corrupt: truncated Data"); + + if(ptr + delta_size + id_size > end) { + throw std::runtime_error("Bucket corrupt: truncated Data"); } b.deltas.resize(count); - std::memcpy(b.deltas.data(), ptr, delta_size); ptr += delta_size; + std::memcpy(b.deltas.data(), ptr, delta_size); + ptr += delta_size; b.ids.resize(count); - std::memcpy(b.ids.data(), ptr, id_size); + std::memcpy(b.ids.data(), ptr, id_size); } - + return b; } // Fast access to just the bitmap (for middle buckets) static ndd::RoaringBitmap read_summary_bitmap(const void* data, size_t len) { - const uint8_t* ptr = static_cast(data); - uint32_t bm_size; - std::memcpy(&bm_size, ptr, 4); ptr += 4; - if(bm_size == 0) return ndd::RoaringBitmap(); - return ndd::RoaringBitmap::read(reinterpret_cast(ptr)); + const uint8_t* ptr = static_cast(data); + uint32_t bm_size; + std::memcpy(&bm_size, ptr, 4); + ptr += 4; + if(bm_size == 0) { + return ndd::RoaringBitmap(); + } + return ndd::RoaringBitmap::read(reinterpret_cast(ptr)); } bool is_full() const { return ids.size() >= MAX_SIZE; } @@ -229,7 +240,9 @@ namespace ndd { } uint32_t parse_bucket_key_val(const std::string& key) { - if (key.size() < 4) return 0; + if(key.size() < 4) { + return 0; + } uint32_t be_val; std::memcpy(&be_val, key.data() + key.size() - 4, 4); #if defined(__GNUC__) || defined(__clang__) @@ -241,9 +254,10 @@ namespace ndd { } public: - NumericIndex(MDBX_env* env) : env_(env) { + NumericIndex(MDBX_env* env) : + env_(env) { MDBX_txn* txn; - if (mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn) == MDBX_SUCCESS) { + if(mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn) == MDBX_SUCCESS) { mdbx_dbi_open(txn, "numeric_forward", MDBX_CREATE, &forward_dbi_); mdbx_dbi_open(txn, "numeric_inverted", MDBX_CREATE, &inverted_dbi_); mdbx_txn_commit(txn); @@ -285,16 +299,19 @@ namespace ndd { } private: - void put_internal(MDBX_txn* txn, const std::string& field, ndd::idInt id, uint32_t value) { + void + put_internal(MDBX_txn* txn, const std::string& field, ndd::idInt id, uint32_t value) { // 1. Check Forward Index std::string fwd_key_str = make_forward_key(field, id); MDBX_val fwd_key{const_cast(fwd_key_str.data()), fwd_key_str.size()}; MDBX_val fwd_val; - if (mdbx_get(txn, forward_dbi_, &fwd_key, &fwd_val) == MDBX_SUCCESS) { + if(mdbx_get(txn, forward_dbi_, &fwd_key, &fwd_val) == MDBX_SUCCESS) { uint32_t old_val; std::memcpy(&old_val, fwd_val.iov_base, 4); - if (old_val == value) return; + if(old_val == value) { + return; + } remove_from_buckets(txn, field, old_val, id); } @@ -306,7 +323,10 @@ namespace ndd { add_to_buckets(txn, field, value, id); } - void remove_from_buckets(MDBX_txn* txn, const std::string& field, uint32_t value, ndd::idInt id) { + void remove_from_buckets(MDBX_txn* txn, + const std::string& field, + uint32_t value, + ndd::idInt id) { // Find bucket std::string bkey_str = make_bucket_key(field, value); MDBX_val key{const_cast(bkey_str.data()), bkey_str.size()}; @@ -316,44 +336,47 @@ namespace ndd { // Scan backward to find bucket covering 'value' int rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); - + // Logic to find correct bucket: std::string found_key; - if (rc == MDBX_SUCCESS) { + if(rc == MDBX_SUCCESS) { found_key = std::string((char*)key.iov_base, key.iov_len); // Check if we are in right field & range - if (found_key.rfind(field + ":", 0) != 0 || parse_bucket_key_val(found_key) > value) { - rc = mdbx_cursor_get(cursor, &key, &data, MDBX_PREV); + if(found_key.rfind(field + ":", 0) != 0 + || parse_bucket_key_val(found_key) > value) { + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_PREV); } - } else if (rc == MDBX_NOTFOUND) { - rc = mdbx_cursor_get(cursor, &key, &data, MDBX_LAST); + } else if(rc == MDBX_NOTFOUND) { + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_LAST); } // Should be at correct bucket now - if (rc == MDBX_SUCCESS) { - found_key = std::string((char*)key.iov_base, key.iov_len); - if (found_key.rfind(field + ":", 0) == 0) { - uint32_t bucket_base = parse_bucket_key_val(found_key); - if (value >= bucket_base) { - Bucket b = Bucket::deserialize(data.iov_base, data.iov_len, bucket_base); - if (b.remove(id)) { - // Save back or Delete if empty - if (b.is_empty()) { - mdbx_cursor_del(cursor, static_cast(0)); - } else { - auto bytes = b.serialize(); - MDBX_val new_data{bytes.data(), bytes.size()}; - mdbx_cursor_put(cursor, &key, &new_data, MDBX_CURRENT); - } - } - } - } + if(rc == MDBX_SUCCESS) { + found_key = std::string((char*)key.iov_base, key.iov_len); + if(found_key.rfind(field + ":", 0) == 0) { + uint32_t bucket_base = parse_bucket_key_val(found_key); + if(value >= bucket_base) { + Bucket b = + Bucket::deserialize(data.iov_base, data.iov_len, bucket_base); + if(b.remove(id)) { + // Save back or Delete if empty + if(b.is_empty()) { + mdbx_cursor_del(cursor, static_cast(0)); + } else { + auto bytes = b.serialize(); + MDBX_val new_data{bytes.data(), bytes.size()}; + mdbx_cursor_put(cursor, &key, &new_data, MDBX_CURRENT); + } + } + } + } } mdbx_cursor_close(cursor); } - void add_to_buckets(MDBX_txn* txn, const std::string& field, uint32_t value, ndd::idInt id) { + void + add_to_buckets(MDBX_txn* txn, const std::string& field, uint32_t value, ndd::idInt id) { MDBX_cursor* cursor; mdbx_cursor_open(txn, inverted_dbi_, &cursor); @@ -363,28 +386,30 @@ namespace ndd { MDBX_val data; int rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); - + bool create_new = false; std::string target_key_str; uint32_t target_base = 0; // Move logic to find predecessor - if (rc == MDBX_SUCCESS) { - std::string found_key((char*)key.iov_base, key.iov_len); - if (found_key.rfind(field + ":", 0) != 0 || parse_bucket_key_val(found_key) > value) { - rc = mdbx_cursor_get(cursor, &key, &data, MDBX_PREV); - } + if(rc == MDBX_SUCCESS) { + std::string found_key((char*)key.iov_base, key.iov_len); + if(found_key.rfind(field + ":", 0) != 0 + || parse_bucket_key_val(found_key) > value) { + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_PREV); + } } else { rc = mdbx_cursor_get(cursor, &key, &data, MDBX_LAST); } - if (rc == MDBX_SUCCESS) { + if(rc == MDBX_SUCCESS) { std::string found_key((char*)key.iov_base, key.iov_len); - if (found_key.rfind(field + ":", 0) == 0) { + if(found_key.rfind(field + ":", 0) == 0) { target_base = parse_bucket_key_val(found_key); // Check range condition - if (value >= target_base && (static_cast(value) - target_base) <= Bucket::MAX_DELTA) { - target_key_str = found_key; + if(value >= target_base + && (static_cast(value) - target_base) <= Bucket::MAX_DELTA) { + target_key_str = found_key; } else { create_new = true; } @@ -395,128 +420,135 @@ namespace ndd { create_new = true; } - if (create_new) { + if(create_new) { // Create new bucket at exact value Bucket b; b.base_value = value; b.add(value, id); auto bytes = b.serialize(); - + target_key_str = make_bucket_key(field, value); MDBX_val k{const_cast(target_key_str.data()), target_key_str.size()}; MDBX_val v{bytes.data(), bytes.size()}; mdbx_put(txn, inverted_dbi_, &k, &v, MDBX_UPSERT); - + } else { // Update existing - // We must re-fetch current key/data because cursor move might have updated key/data - MDBX_val k{const_cast(target_key_str.data()), target_key_str.size()}; - MDBX_val v; - if(mdbx_cursor_get(cursor, &k, &v, MDBX_SET) != MDBX_SUCCESS) { - // Should not happen if logic is correct - throw std::runtime_error("Cursor sync fail"); - } + // We must re-fetch current key/data because cursor move might have updated + // key/data + MDBX_val k{const_cast(target_key_str.data()), target_key_str.size()}; + MDBX_val v; + if(mdbx_cursor_get(cursor, &k, &v, MDBX_SET) != MDBX_SUCCESS) { + // Should not happen if logic is correct + throw std::runtime_error("Cursor sync fail"); + } Bucket b = Bucket::deserialize(v.iov_base, v.iov_len, target_base); - + // Capacity Check - if (b.ids.size() >= Bucket::MAX_SIZE) { - // SPLIT LOGIC - // Sort is maintained by arrays. - // "Slide Split": Scan right from median - size_t mid_idx = b.ids.size() / 2; - - // Ensure we don't split a group of identical values - size_t probe_right = mid_idx; - while (probe_right < b.deltas.size() && probe_right > 0 && b.deltas[probe_right] == b.deltas[probe_right - 1]) { - probe_right++; - } - - if (probe_right < b.deltas.size()) { - mid_idx = probe_right; - } else { - // Fallback: Try scanning left - size_t probe_left = mid_idx; - while (probe_left > 0 && b.deltas[probe_left] == b.deltas[probe_left - 1]) { - probe_left--; - } - - if (probe_left > 0) { - mid_idx = probe_left; - } else { - // All identical - mid_idx = b.deltas.size(); - } - } - - // If we hit end, we can't split by value uniqueness - if (mid_idx == b.deltas.size()) { - // Fallback: Just append (overfill) or implement logic to handle identicals. - // For now: Append - b.add(value, id); - auto bytes = b.serialize(); - MDBX_val k2{const_cast(target_key_str.data()), target_key_str.size()}; - MDBX_val v2{bytes.data(), bytes.size()}; - mdbx_cursor_put(cursor, &k2, &v2, MDBX_CURRENT); - mdbx_cursor_close(cursor); - return; - } - - // Standard Slide Split - Bucket right_b; - right_b.base_value = b.base_value + b.deltas[mid_idx]; // New base - - // Move entries - for(size_t i=mid_idx; i= right_b.base_value) { - right_b.add(value, id); - } else { - // If value < right, goes to left. - // But wait, split point was determined by existing items. - // If new value is >= base+split_delta, it goes right. - // BUT we just cleared right from b. - // Correct logic: - b.add(value, id); // Add to left if it fits range (logic handles delta) - // Oh wait, if we added to left, we might overflow again or break order? - // Simply: Check which bucket covers it. - // Left covers [Base, RightBase-1] - // Right covers [RightBase, ...] - } - - // Save Left - auto left_bytes = b.serialize(); - MDBX_val left_v{left_bytes.data(), left_bytes.size()}; - MDBX_val left_k{const_cast(target_key_str.data()), target_key_str.size()}; - mdbx_cursor_put(cursor, &left_k, &left_v, MDBX_CURRENT); - - // Save Right - auto right_bytes = right_b.serialize(); - std::string right_k_str = make_bucket_key(field, right_b.base_value); - MDBX_val right_k{const_cast(right_k_str.data()), right_k_str.size()}; - MDBX_val right_v{right_bytes.data(), right_bytes.size()}; - - // Use put for new key - mdbx_put(txn, inverted_dbi_, &right_k, &right_v, MDBX_UPSERT); + if(b.ids.size() >= Bucket::MAX_SIZE) { + // SPLIT LOGIC + // Sort is maintained by arrays. + // "Slide Split": Scan right from median + size_t mid_idx = b.ids.size() / 2; + + // Ensure we don't split a group of identical values + size_t probe_right = mid_idx; + while(probe_right < b.deltas.size() && probe_right > 0 + && b.deltas[probe_right] == b.deltas[probe_right - 1]) { + probe_right++; + } + + if(probe_right < b.deltas.size()) { + mid_idx = probe_right; + } else { + // Fallback: Try scanning left + size_t probe_left = mid_idx; + while(probe_left > 0 + && b.deltas[probe_left] == b.deltas[probe_left - 1]) { + probe_left--; + } + + if(probe_left > 0) { + mid_idx = probe_left; + } else { + // All identical + mid_idx = b.deltas.size(); + } + } + + // If we hit end, we can't split by value uniqueness + if(mid_idx == b.deltas.size()) { + // Fallback: Just append (overfill) or implement logic to handle + // identicals. For now: Append + b.add(value, id); + auto bytes = b.serialize(); + MDBX_val k2{const_cast(target_key_str.data()), + target_key_str.size()}; + MDBX_val v2{bytes.data(), bytes.size()}; + mdbx_cursor_put(cursor, &k2, &v2, MDBX_CURRENT); + mdbx_cursor_close(cursor); + return; + } + + // Standard Slide Split + Bucket right_b; + right_b.base_value = b.base_value + b.deltas[mid_idx]; // New base + + // Move entries + for(size_t i = mid_idx; i < b.deltas.size(); ++i) { + right_b.add(b.base_value + b.deltas[i], b.ids[i]); + } + + // Truncate left + b.deltas.resize(mid_idx); + b.ids.resize(mid_idx); + // Rebuild left bitmap + b.summary_bitmap = ndd::RoaringBitmap(); + for(auto pid : b.ids) { + b.summary_bitmap.add(pid); + } + + // Now add new value to correct bucket + if(value >= right_b.base_value) { + right_b.add(value, id); + } else { + // If value < right, goes to left. + // But wait, split point was determined by existing items. + // If new value is >= base+split_delta, it goes right. + // BUT we just cleared right from b. + // Correct logic: + b.add(value, id); // Add to left if it fits range (logic handles delta) + // Oh wait, if we added to left, we might overflow again or break order? + // Simply: Check which bucket covers it. + // Left covers [Base, RightBase-1] + // Right covers [RightBase, ...] + } + + // Save Left + auto left_bytes = b.serialize(); + MDBX_val left_v{left_bytes.data(), left_bytes.size()}; + MDBX_val left_k{const_cast(target_key_str.data()), + target_key_str.size()}; + mdbx_cursor_put(cursor, &left_k, &left_v, MDBX_CURRENT); + + // Save Right + auto right_bytes = right_b.serialize(); + std::string right_k_str = make_bucket_key(field, right_b.base_value); + MDBX_val right_k{const_cast(right_k_str.data()), right_k_str.size()}; + MDBX_val right_v{right_bytes.data(), right_bytes.size()}; + + // Use put for new key + mdbx_put(txn, inverted_dbi_, &right_k, &right_v, MDBX_UPSERT); } else { // Normal Insert b.add(value, id); auto bytes = b.serialize(); MDBX_val new_data{bytes.data(), bytes.size()}; - + // Use cursor put to update current - mdbx_cursor_put(cursor, &k, &new_data, MDBX_CURRENT); + mdbx_cursor_put(cursor, &k, &new_data, MDBX_CURRENT); } } mdbx_cursor_close(cursor); @@ -526,7 +558,9 @@ namespace ndd { ndd::RoaringBitmap range(const std::string& field, uint32_t min_val, uint32_t max_val) { ndd::RoaringBitmap result; MDBX_txn* txn; - if (mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn) != MDBX_SUCCESS) return result; + if(mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn) != MDBX_SUCCESS) { + return result; + } MDBX_cursor* cursor; mdbx_cursor_open(txn, inverted_dbi_, &cursor); @@ -537,83 +571,88 @@ namespace ndd { MDBX_val data; int rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); - if (rc == MDBX_SUCCESS) { + if(rc == MDBX_SUCCESS) { // Check if we need to back up - std::string fkey((char*)key.iov_base, key.iov_len); - if (fkey.rfind(field + ":", 0) != 0 || parse_bucket_key_val(fkey) > min_val) { - // Check prev - MDBX_val p_key = key; - MDBX_val p_data; - if (mdbx_cursor_get(cursor, &p_key, &p_data, MDBX_PREV) == MDBX_SUCCESS) { - std::string pkey_str((char*)p_key.iov_base, p_key.iov_len); - if (pkey_str.rfind(field + ":", 0) == 0) { - // Prev is valid start - key = p_key; data = p_data; - rc = MDBX_SUCCESS; - } - } - } - } else if (rc == MDBX_NOTFOUND) { - rc = mdbx_cursor_get(cursor, &key, &data, MDBX_LAST); - if (rc == MDBX_SUCCESS && data.iov_len > 0) { - std::string fkey((char*)key.iov_base, key.iov_len); - if (fkey.rfind(field + ":", 0) == 0) { - rc = MDBX_SUCCESS; - } else { - rc = MDBX_NOTFOUND; - } - } else { - rc = MDBX_NOTFOUND; - } + std::string fkey((char*)key.iov_base, key.iov_len); + if(fkey.rfind(field + ":", 0) != 0 || parse_bucket_key_val(fkey) > min_val) { + // Check prev + MDBX_val p_key = key; + MDBX_val p_data; + if(mdbx_cursor_get(cursor, &p_key, &p_data, MDBX_PREV) == MDBX_SUCCESS) { + std::string pkey_str((char*)p_key.iov_base, p_key.iov_len); + if(pkey_str.rfind(field + ":", 0) == 0) { + // Prev is valid start + key = p_key; + data = p_data; + rc = MDBX_SUCCESS; + } + } + } + } else if(rc == MDBX_NOTFOUND) { + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_LAST); + if(rc == MDBX_SUCCESS && data.iov_len > 0) { + std::string fkey((char*)key.iov_base, key.iov_len); + if(fkey.rfind(field + ":", 0) == 0) { + rc = MDBX_SUCCESS; + } else { + rc = MDBX_NOTFOUND; + } + } else { + rc = MDBX_NOTFOUND; + } } // Iterate forward - while (rc == MDBX_SUCCESS) { + while(rc == MDBX_SUCCESS) { std::string cur_key((char*)key.iov_base, key.iov_len); - if (cur_key.rfind(field + ":", 0) != 0) break; // End of field + if(cur_key.rfind(field + ":", 0) != 0) { + break; // End of field + } uint32_t bucket_base = parse_bucket_key_val(cur_key); - - if (bucket_base > max_val) break; // Past the end + + if(bucket_base > max_val) { + break; // Past the end + } // Peek Strategy: // If bucket_base >= min_val, we know the start is covered. // If we could know NEXT bucket start, we'd know overlap. // Since we iterate, we can be greedy on read. - - // For now, always deserialize. - // Potential optimization: Read only bitmap if we are "deep" in the range. + + // For now, always deserialize. + // Potential optimization: Read only bitmap if we are "deep" in the range. // e.g. min_val=10, max_val=100. Bucket=20. // If bucket=20. Next Bucket=30. // Then Bucket 20 covers [20..30). // Range [10..100] covers [20..30] fully. - // So we need lookahead. - + // So we need lookahead. + // Simple logic without lookahead: - // Just read full bucket. It's 8KB max (2 pages). + // Just read full bucket. It's 8KB max (2 pages). // It's fast unless we have millions of buckets. - + Bucket b = Bucket::deserialize(data.iov_base, data.iov_len, bucket_base); - - if (b.ids.empty()) { + + if(b.ids.empty()) { rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT); continue; } uint32_t b_min = b.get_value(0); - uint32_t b_max = b.get_value(b.ids.size()-1); + uint32_t b_max = b.get_value(b.ids.size() - 1); - if (b_min >= min_val && b_max <= max_val) { - // Full overlap - result |= b.summary_bitmap; + if(b_min >= min_val && b_max <= max_val) { + // Full overlap + result |= b.summary_bitmap; } else { // Partial overlap - for(size_t i=0; i= min_val && v <= max_val) { - result.add(b.ids[i]); - } - } + for(size_t i = 0; i < b.ids.size(); ++i) { + uint32_t v = b.get_value(i); + if(v >= min_val && v <= max_val) { + result.add(b.ids[i]); + } + } } rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT); @@ -624,25 +663,32 @@ namespace ndd { return result; } - bool check_range(const std::string& field, ndd::idInt id, uint32_t min_val, uint32_t max_val) { + bool check_range(const std::string& field, + ndd::idInt id, + uint32_t min_val, + uint32_t max_val) { MDBX_txn* txn; - if(mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn) != MDBX_SUCCESS) return false; - + if(mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn) != MDBX_SUCCESS) { + return false; + } + std::string fwd_key_str = make_forward_key(field, id); MDBX_val fwd_key{const_cast(fwd_key_str.data()), fwd_key_str.size()}; MDBX_val fwd_val; - + bool match = false; if(mdbx_get(txn, forward_dbi_, &fwd_key, &fwd_val) == MDBX_SUCCESS) { uint32_t val; std::memcpy(&val, fwd_val.iov_base, 4); - if(val >= min_val && val <= max_val) match = true; + if(val >= min_val && val <= max_val) { + match = true; + } } - + mdbx_txn_abort(txn); return match; } }; - } // namespace filter -} // namespace ndd + } // namespace filter +} // namespace ndd diff --git a/src/main.cpp b/src/main.cpp index c51ec54cc..9e726b073 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -209,7 +209,7 @@ int main(int argc, char** argv) { LOG_INFO("DEFAULT_MAX_ELEMENTS: " << settings::DEFAULT_MAX_ELEMENTS); LOG_INFO("DEFAULT_MAX_ELEMENTS_INCREMENT: " << settings::DEFAULT_MAX_ELEMENTS_INCREMENT); LOG_INFO("DEFAULT_MAX_ELEMENTS_INCREMENT_TRIGGER: " - << settings::DEFAULT_MAX_ELEMENTS_INCREMENT_TRIGGER); + << settings::DEFAULT_MAX_ELEMENTS_INCREMENT_TRIGGER); // Path to React build directory // Get the executable's directory and resolve frontend/dist relative to it @@ -314,7 +314,9 @@ int main(int argc, char** argv) { } if(!body.has("index_name") || !body.has("dim") || !body.has("space_type")) { - LOG_WARN(1012, ctx.username, "Create-index request is missing required parameters"); + LOG_WARN(1012, + ctx.username, + "Create-index request is missing required parameters"); return json_error(400, "Missing required parameters"); } @@ -356,7 +358,8 @@ int main(int argc, char** argv) { } // Get quantization level (default to INT16) - std::string precision = body.has("precision") ? std::string(body["precision"].s()) : "int16"; + std::string precision = + body.has("precision") ? std::string(body["precision"].s()) : "int16"; if(precision == "int8d") { precision = "int8"; @@ -386,11 +389,14 @@ int main(int argc, char** argv) { size_in_millions = static_cast(body["size_in_millions"].i()); if(size_in_millions == 0 || size_in_millions > 10000) { // Cap at 10B vectors LOG_WARN(1017, - index_id, - "Invalid custom size_in_millions: " << size_in_millions); + index_id, + "Invalid custom size_in_millions: " << size_in_millions); return json_error(400, "size_in_millions must be between 1 and 10000"); } - LOG_INFO(1018, index_id, "Creating index with custom size: " << size_in_millions << "M vectors"); + LOG_INFO(1018, + index_id, + "Creating index with custom size: " << size_in_millions + << "M vectors"); } size_t sparse_dim = body.has("sparse_dim") ? (size_t)body["sparse_dim"].i() : 0; @@ -412,8 +418,10 @@ int main(int argc, char** argv) { LOG_WARN(1019, index_id, "Create-index request failed: " << e.what()); return json_error(409, e.what()); } catch(const std::exception& e) { - return json_error_500( - ctx.username, body["index_name"].s(), req.url, std::string("Error: ") + e.what()); + return json_error_500(ctx.username, + body["index_name"].s(), + req.url, + std::string("Error: ") + e.what()); } }); @@ -426,7 +434,10 @@ int main(int argc, char** argv) { auto body = crow::json::load(req.body); if(!body || !body.has("name")) { - LOG_WARN(1020, ctx.username, index_name, "Create-backup request missing backup name"); + LOG_WARN(1020, + ctx.username, + index_name, + "Create-backup request missing backup name"); return json_error(400, "Missing backup name"); } @@ -437,7 +448,10 @@ int main(int argc, char** argv) { std::pair result = index_manager.createBackupAsync(index_id, backup_name); if(!result.first) { - LOG_WARN(1021, ctx.username, index_name, "Create-backup request rejected: " << result.second); + LOG_WARN(1021, + ctx.username, + index_name, + "Create-backup request rejected: " << result.second); return json_error(400, result.second); } @@ -478,17 +492,21 @@ int main(int argc, char** argv) { auto body = crow::json::load(req.body); if(!body || !body.has("target_index_name")) { - LOG_WARN(1022, ctx.username, "Restore-backup request missing target index name"); + LOG_WARN( + 1022, ctx.username, "Restore-backup request missing target index name"); return json_error(400, "Missing target_index_name"); } std::string target_index_name = body["target_index_name"].s(); try { - std::pair result = - index_manager.restoreBackup(backup_name, target_index_name, ctx.username); + std::pair result = index_manager.restoreBackup( + backup_name, target_index_name, ctx.username); if(!result.first) { - LOG_WARN(1023, ctx.username, target_index_name, "Restore-backup request rejected: " << result.second); + LOG_WARN(1023, + ctx.username, + target_index_name, + "Restore-backup request rejected: " << result.second); return json_error(400, result.second); } return crow::response(201, "Backup restored successfully"); @@ -504,9 +522,12 @@ int main(int argc, char** argv) { const std::string& backup_name) { auto& ctx = app.get_context(req); try { - std::pair result = index_manager.deleteBackup(backup_name, ctx.username); + std::pair result = + index_manager.deleteBackup(backup_name, ctx.username); if(!result.first) { - LOG_WARN(1024, ctx.username, "Delete-backup request rejected: " << result.second); + LOG_WARN(1024, + ctx.username, + "Delete-backup request rejected: " << result.second); return json_error(400, result.second); } return crow::response(204, "Backup deleted successfully"); @@ -528,15 +549,17 @@ int main(int argc, char** argv) { } } - std::string backup_file = - settings::DATA_DIR + "/backups/" + settings::DEFAULT_USERNAME + "/" + backup_name + ".tar"; + std::string backup_file = settings::DATA_DIR + "/backups/" + + settings::DEFAULT_USERNAME + "/" + backup_name + + ".tar"; if(!std::filesystem::exists(backup_file)) { - LOG_WARN(1058, settings::DEFAULT_USERNAME, "Backup download requested for missing backup " << backup_name); + LOG_WARN(1058, + settings::DEFAULT_USERNAME, + "Backup download requested for missing backup " << backup_name); return json_error(404, "Backup not found"); } - crow::response response; response.set_static_file_info_unsafe(backup_file); response.set_header("Content-Type", "application/x-tar"); @@ -576,8 +599,12 @@ int main(int argc, char** argv) { if(backup_name.ends_with(".tar")) { backup_name = backup_name.substr(0, backup_name.size() - 4); } else { - LOG_WARN(1059, ctx.username, "Backup upload used invalid file extension"); - return json_error(400, "Invalid backup file extension. Expected .tar file"); + LOG_WARN(1059, + ctx.username, + "Backup upload used invalid file extension"); + return json_error( + 400, + "Invalid backup file extension. Expected .tar file"); } } file_content = part.body; @@ -591,7 +618,9 @@ int main(int argc, char** argv) { } if(file_content.empty()) { - LOG_WARN(1061, ctx.username, "Backup upload request missing backup file content"); + LOG_WARN(1061, + ctx.username, + "Backup upload request missing backup file content"); return json_error(400, "Missing backup file content"); } @@ -599,7 +628,9 @@ int main(int argc, char** argv) { std::pair result = index_manager.validateBackupName(backup_name); if(!result.first) { - LOG_WARN(1062, ctx.username, "Backup upload request rejected: " << result.second); + LOG_WARN(1062, + ctx.username, + "Backup upload request rejected: " << result.second); return json_error(400, result.second); } @@ -608,7 +639,9 @@ int main(int argc, char** argv) { std::filesystem::create_directories(user_backup_dir); std::string backup_path = user_backup_dir + "/" + backup_name + ".tar"; if(std::filesystem::exists(backup_path)) { - LOG_WARN(1063, ctx.username, "Backup upload conflicts with existing backup " << backup_name); + LOG_WARN(1063, + ctx.username, + "Backup upload conflicts with existing backup " << backup_name); return json_error(409, "Backup with name '" + backup_name + "' already exists"); } @@ -625,8 +658,7 @@ int main(int argc, char** argv) { if(!out.good()) { // Clean up partial file on error std::filesystem::remove(backup_path); - return json_error_500( - ctx.username, req.url, "Failed to write backup file"); + return json_error_500(ctx.username, req.url, "Failed to write backup file"); } return crow::response(201, "Backup uploaded successfully"); @@ -643,7 +675,7 @@ int main(int argc, char** argv) { try { auto active = index_manager.getActiveBackup(ctx.username); crow::json::wvalue response; - if (active) { + if(active) { response["active"] = true; response["backup_name"] = active->backup_name; response["index_id"] = active->index_id; @@ -664,8 +696,10 @@ int main(int argc, char** argv) { auto& ctx = app.get_context(req); try { auto info = index_manager.getBackupInfo(backup_name, ctx.username); - if (info.empty()) { - LOG_WARN(1064, ctx.username, "Backup-info request for missing backup " << backup_name); + if(info.empty()) { + LOG_WARN(1064, + ctx.username, + "Backup-info request for missing backup " << backup_name); return json_error(404, "Backup not found or metadata missing"); } crow::response res; @@ -724,11 +758,17 @@ int main(int argc, char** argv) { if(index_manager.deleteIndex(index_id)) { return crow::response(200, "Index deleted successfully"); } else { - LOG_WARN(1030, ctx.username, index_name, "Delete-index request for missing index"); + LOG_WARN(1030, + ctx.username, + index_name, + "Delete-index request for missing index"); return json_error(404, "Index not found"); } } catch(const std::runtime_error& e) { - LOG_WARN(1031, ctx.username, index_name, "Delete-index request rejected: " << e.what()); + LOG_WARN(1031, + ctx.username, + index_name, + "Delete-index request rejected: " << e.what()); return json_error(400, e.what()); } catch(const std::exception& e) { return json_error_500(ctx.username, @@ -749,12 +789,18 @@ int main(int argc, char** argv) { auto body = crow::json::load(req.body); if(!body || !body.has("k")) { - LOG_WARN(1032, ctx.username, index_name, "Search request missing parameter k or has invalid JSON"); + LOG_WARN(1032, + ctx.username, + index_name, + "Search request missing parameter k or has invalid JSON"); return json_error(400, "Missing required parameters: k"); } if(!body.has("vector") && !body.has("sparse_indices")) { - LOG_WARN(1033, ctx.username, index_name, "Search request missing dense and sparse query vectors"); + LOG_WARN(1033, + ctx.username, + index_name, + "Search request missing dense and sparse query vectors"); return json_error(400, "Missing query vector (dense or sparse)"); } @@ -782,9 +828,9 @@ int main(int argc, char** argv) { if(sparse_indices.size() != sparse_values.size()) { LOG_WARN(1034, - ctx.username, - index_name, - "Search request has mismatched sparse_indices and sparse_values"); + ctx.username, + index_name, + "Search request has mismatched sparse_indices and sparse_values"); return json_error(400, "Mismatch between sparse_indices and sparse_values size"); } @@ -806,44 +852,56 @@ int main(int argc, char** argv) { auto raw_filter = nlohmann::json::parse(body["filter"].s()); // Expect new array-based filter format if(!raw_filter.is_array()) { - LOG_WARN(1036, ctx.username, index_name, "Search request used invalid filter format"); + LOG_WARN(1036, + ctx.username, + index_name, + "Search request used invalid filter format"); return json_error(400, "Filter must be an array. Please use format: " "[{\"field\":{\"$op\":value}}]"); } filter_array = raw_filter; } catch(const std::exception& e) { - LOG_WARN(1037, ctx.username, index_name, "Search request filter JSON parsing failed: " << e.what()); + LOG_WARN(1037, + ctx.username, + index_name, + "Search request filter JSON parsing failed: " << e.what()); return json_error(400, std::string("Invalid filter JSON: ") + e.what()); } } // Extract filter parameters (Option B from chat plan) ndd::FilterParams filter_params; - if (body.has("filter_params")) { - auto fp = body["filter_params"]; - if (fp.has("prefilter_threshold")) { - filter_params.prefilter_threshold = static_cast(fp["prefilter_threshold"].i()); - } - if (fp.has("boost_percentage")) { - filter_params.boost_percentage = static_cast(fp["boost_percentage"].i()); - } + if(body.has("filter_params")) { + auto fp = body["filter_params"]; + if(fp.has("prefilter_threshold")) { + filter_params.prefilter_threshold = + static_cast(fp["prefilter_threshold"].i()); + } + if(fp.has("boost_percentage")) { + filter_params.boost_percentage = + static_cast(fp["boost_percentage"].i()); + } } LOG_DEBUG("Filter: " << filter_array.dump()); try { auto search_response = index_manager.searchKNN(index_id, - query, - sparse_indices, - sparse_values, - k, - filter_array, - filter_params, - include_vectors, - ef); + query, + sparse_indices, + sparse_values, + k, + filter_array, + filter_params, + include_vectors, + ef); if(!search_response) { - LOG_WARN(1038, ctx.username, index_name, "Search request returned no results because the index is missing or search failed"); + LOG_WARN(1038, + ctx.username, + index_name, + "Search request returned no results because the index is missing " + "or search failed"); return json_error(404, "Index not found or search failed"); } @@ -854,15 +912,17 @@ int main(int argc, char** argv) { resp.add_header("Content-Type", "application/msgpack"); return resp; } catch(const std::runtime_error& e) { - LOG_WARN(1039, ctx.username, index_name, "Search request rejected: " << e.what()); + LOG_WARN(1039, + ctx.username, + index_name, + "Search request rejected: " << e.what()); return json_error(400, e.what()); } catch(const std::exception& e) { LOG_DEBUG("Search failed: " << e.what()); - return json_error_500( - ctx.username, - index_name, - req.url, - std::string("Search failed: ") + e.what()); + return json_error_500(ctx.username, + index_name, + req.url, + std::string("Search failed: ") + e.what()); } }); @@ -880,7 +940,10 @@ int main(int argc, char** argv) { if(content_type == "application/json") { auto body = crow::json::load(req.body); if(!body) { - LOG_WARN(1040, ctx.username, index_name, "Insert request contained invalid JSON"); + LOG_WARN(1040, + ctx.username, + index_name, + "Insert request contained invalid JSON"); return json_error(400, "Invalid JSON"); } @@ -942,7 +1005,10 @@ int main(int argc, char** argv) { bool success = index_manager.addVectors(index_id, vectors); return crow::response(success ? 200 : 400); } catch(const std::runtime_error& e) { - LOG_WARN(1041, ctx.username, index_name, "Insert request rejected: " << e.what()); + LOG_WARN(1041, + ctx.username, + index_name, + "Insert request rejected: " << e.what()); return json_error(400, e.what()); } catch(const std::exception& e) { return json_error_500(ctx.username, index_name, req.url, e.what()); @@ -967,14 +1033,20 @@ int main(int argc, char** argv) { return crow::response(success ? 200 : 400); } } catch(const std::runtime_error& e) { - LOG_WARN(1042, ctx.username, index_name, "Insert request rejected: " << e.what()); + LOG_WARN(1042, + ctx.username, + index_name, + "Insert request rejected: " << e.what()); return json_error(400, e.what()); } catch(const std::exception& e) { LOG_DEBUG("Batch insertion failed: " << e.what()); return json_error_500(ctx.username, index_name, req.url, e.what()); } } else { - LOG_WARN(1043, ctx.username, index_name, "Insert request used unsupported Content-Type: " << content_type); + LOG_WARN(1043, + ctx.username, + index_name, + "Insert request used unsupported Content-Type: " << content_type); return crow::response( 400, "Content-Type must be application/msgpack or application/json"); } @@ -983,39 +1055,43 @@ int main(int argc, char** argv) { // Get a single vector CROW_ROUTE(app, "/api/v1/index//vector/get") .CROW_MIDDLEWARES(app, AuthMiddleware) - .methods("POST"_method)( - [&index_manager, &app](const crow::request& req, std::string index_name) { - auto& ctx = app.get_context(req); - std::string index_id = ctx.username + "/" + index_name; - - // Read vector ID from JSON input (still using JSON for ID here) - auto body = crow::json::load(req.body); - if(!body || !body.has("id")) { - LOG_WARN(1044, ctx.username, index_name, "Get-vector request missing vector id"); - return json_error(400, "Missing required parameter 'id'"); - } - std::string vector_id = body["id"].s(); - try { - auto vector = index_manager.getVector(index_id, vector_id); - if(!vector) { - LOG_WARN(1045, ctx.username, index_name, "Get-vector request for missing vector id " << vector_id); - return json_error(404, "Vector with the given ID does not exist"); - } - // Serialize vector as MsgPack - msgpack::sbuffer sbuf; - msgpack::pack(sbuf, vector.value()); - // Return as MessagePack - crow::response resp(200, std::string(sbuf.data(), sbuf.size())); - resp.add_header("Content-Type", "application/msgpack"); - return resp; - } catch(const std::exception& e) { - LOG_DEBUG("Failed to get vector: " << e.what()); - return json_error_500(ctx.username, - index_name, - req.url, - std::string("Failed to get vector: ") + e.what()); - } - }); + .methods("POST"_method)([&index_manager, &app](const crow::request& req, + std::string index_name) { + auto& ctx = app.get_context(req); + std::string index_id = ctx.username + "/" + index_name; + + // Read vector ID from JSON input (still using JSON for ID here) + auto body = crow::json::load(req.body); + if(!body || !body.has("id")) { + LOG_WARN( + 1044, ctx.username, index_name, "Get-vector request missing vector id"); + return json_error(400, "Missing required parameter 'id'"); + } + std::string vector_id = body["id"].s(); + try { + auto vector = index_manager.getVector(index_id, vector_id); + if(!vector) { + LOG_WARN(1045, + ctx.username, + index_name, + "Get-vector request for missing vector id " << vector_id); + return json_error(404, "Vector with the given ID does not exist"); + } + // Serialize vector as MsgPack + msgpack::sbuffer sbuf; + msgpack::pack(sbuf, vector.value()); + // Return as MessagePack + crow::response resp(200, std::string(sbuf.data(), sbuf.size())); + resp.add_header("Content-Type", "application/msgpack"); + return resp; + } catch(const std::exception& e) { + LOG_DEBUG("Failed to get vector: " << e.what()); + return json_error_500(ctx.username, + index_name, + req.url, + std::string("Failed to get vector: ") + e.what()); + } + }); // Delete a vector CROW_ROUTE(app, "/api/v1/index//vector//delete") @@ -1032,11 +1108,17 @@ int main(int argc, char** argv) { if(index_manager.deleteVector(index_id, vector_id)) { return crow::response(200, "Vector deleted successfully"); } else { - LOG_WARN(1046, ctx.username, index_name, "Delete-vector request for missing vector id " << vector_id); + LOG_WARN(1046, + ctx.username, + index_name, + "Delete-vector request for missing vector id " << vector_id); return json_error(404, "Vector with the given ID does not exist"); } } catch(const std::runtime_error& e) { - LOG_WARN(1047, ctx.username, index_name, "Delete-vector request rejected: " << e.what()); + LOG_WARN(1047, + ctx.username, + index_name, + "Delete-vector request rejected: " << e.what()); return json_error(400, e.what()); } catch(const std::exception& e) { LOG_DEBUG("Failed to delete vector: " << e.what()); @@ -1059,18 +1141,27 @@ int main(int argc, char** argv) { try { body = nlohmann::json::parse(req.body); } catch(const std::exception& e) { - LOG_WARN(1048, ctx.username, index_name, "Delete-by-filter request contained invalid JSON"); + LOG_WARN(1048, + ctx.username, + index_name, + "Delete-by-filter request contained invalid JSON"); return json_error(400, "Invalid JSON body"); } if(!body.contains("filter")) { - LOG_WARN(1049, ctx.username, index_name, "Delete-by-filter request is missing filter"); + LOG_WARN(1049, + ctx.username, + index_name, + "Delete-by-filter request is missing filter"); return json_error(400, "Invalid request body - missing filter"); } try { nlohmann::json filter_array = body["filter"]; // Expect new array-based filter format if(!filter_array.is_array()) { - LOG_WARN(1050, ctx.username, index_name, "Delete-by-filter request used invalid filter format"); + LOG_WARN(1050, + ctx.username, + index_name, + "Delete-by-filter request used invalid filter format"); return json_error(400, "Filter must be an array. Please use format: " "[{\"field\":{\"$op\":value}}]"); @@ -1080,7 +1171,10 @@ int main(int argc, char** argv) { return crow::response(200, std::to_string(deleted_count) + " vectors deleted"); } catch(const std::runtime_error& e) { - LOG_WARN(1051, ctx.username, index_name, "Delete-by-filter request rejected: " << e.what()); + LOG_WARN(1051, + ctx.username, + index_name, + "Delete-by-filter request rejected: " << e.what()); return json_error(400, e.what()); } catch(const std::exception& e) { return json_error_500(ctx.username, @@ -1102,12 +1196,18 @@ int main(int argc, char** argv) { try { body = nlohmann::json::parse(req.body); } catch(const std::exception& e) { - LOG_WARN(1052, ctx.username, index_name, "Update-filters request contained invalid JSON"); + LOG_WARN(1052, + ctx.username, + index_name, + "Update-filters request contained invalid JSON"); return json_error(400, "Invalid JSON body"); } if(!body.contains("updates") || !body["updates"].is_array()) { - LOG_WARN(1053, ctx.username, index_name, "Update-filters request missing valid updates array"); + LOG_WARN(1053, + ctx.username, + index_name, + "Update-filters request missing valid updates array"); return json_error(400, "Missing or invalid 'updates' field. Must be a list of {id, " "filter} objects."); @@ -1129,7 +1229,10 @@ int main(int argc, char** argv) { return crow::response(200, std::to_string(count) + " filters updated"); } catch(const std::runtime_error& e) { - LOG_WARN(1054, ctx.username, index_name, "Update-filters request rejected: " << e.what()); + LOG_WARN(1054, + ctx.username, + index_name, + "Update-filters request rejected: " << e.what()); return json_error(400, e.what()); } catch(const std::exception& e) { return json_error_500(ctx.username, @@ -1148,7 +1251,10 @@ int main(int argc, char** argv) { try { auto info = index_manager.getIndexInfo(index_id); if(!info) { - LOG_WARN(1055, ctx.username, index_name, "Index-info request for missing index"); + LOG_WARN(1055, + ctx.username, + index_name, + "Index-info request for missing index"); return json_error(404, "Index does not exist"); } crow::json::wvalue response( @@ -1163,13 +1269,14 @@ int main(int argc, char** argv) { {"lib_token", settings::DEFAULT_LIB_TOKEN}}); return crow::response(200, response.dump()); } catch(const std::runtime_error& e) { - LOG_WARN(1056, ctx.username, index_name, "Index-info request failed: " << e.what()); + LOG_WARN(1056, + ctx.username, + index_name, + "Index-info request failed: " << e.what()); return json_error(404, std::string("Error: ") + e.what()); } catch(const std::exception& e) { - return json_error_500(ctx.username, - index_name, - req.url, - std::string("Error: ") + e.what()); + return json_error_500( + ctx.username, index_name, req.url, std::string("Error: ") + e.what()); } }); diff --git a/src/quant/binary.hpp b/src/quant/binary.hpp index 9ec11c59d..722e7e020 100644 --- a/src/quant/binary.hpp +++ b/src/quant/binary.hpp @@ -680,8 +680,8 @@ namespace ndd { float dist = dist_acc[i]; for(size_t w = 0; w < block_len; ++w) { - dist += __builtin_popcountll( - q_words[block_start + w] ^ v_words[block_start + w]); + dist += __builtin_popcountll(q_words[block_start + w] + ^ v_words[block_start + w]); } dist_acc[i] = dist; diff --git a/src/quant/float16.hpp b/src/quant/float16.hpp index 5ff1c6b44..d3963870d 100644 --- a/src/quant/float16.hpp +++ b/src/quant/float16.hpp @@ -836,7 +836,8 @@ namespace ndd { } for(size_t i = 0; i < count; ++i) { - const uint16_t* v_ptr = static_cast(vectors[i]) + block_start; + const uint16_t* v_ptr = + static_cast(vectors[i]) + block_start; float dot = dot_acc[i]; float vec_sq = l2_metric ? vec_sq_acc[i] : 0.0f; @@ -845,8 +846,10 @@ namespace ndd { __m512 dot_vec = _mm512_setzero_ps(); __m512 sq_vec = _mm512_setzero_ps(); for(; d + 16 <= block_len; d += 16) { - __m256i q_h = _mm256_loadu_si256(reinterpret_cast(q_ptr + d)); - __m256i v_h = _mm256_loadu_si256(reinterpret_cast(v_ptr + d)); + __m256i q_h = + _mm256_loadu_si256(reinterpret_cast(q_ptr + d)); + __m256i v_h = + _mm256_loadu_si256(reinterpret_cast(v_ptr + d)); __m512 qv = _mm512_cvtph_ps(q_h); __m512 vv = _mm512_cvtph_ps(v_h); dot_vec = _mm512_fmadd_ps(qv, vv, dot_vec); @@ -862,21 +865,23 @@ namespace ndd { __m256 dot_vec = _mm256_setzero_ps(); __m256 sq_vec = _mm256_setzero_ps(); for(; d + 8 <= block_len; d += 8) { - __m128i q_h = _mm_loadu_si128(reinterpret_cast(q_ptr + d)); - __m128i v_h = _mm_loadu_si128(reinterpret_cast(v_ptr + d)); + __m128i q_h = + _mm_loadu_si128(reinterpret_cast(q_ptr + d)); + __m128i v_h = + _mm_loadu_si128(reinterpret_cast(v_ptr + d)); __m256 qv = _mm256_cvtph_ps(q_h); __m256 vv = _mm256_cvtph_ps(v_h); -#if defined(__FMA__) +# if defined(__FMA__) dot_vec = _mm256_fmadd_ps(qv, vv, dot_vec); if(l2_metric) { sq_vec = _mm256_fmadd_ps(vv, vv, sq_vec); } -#else +# else dot_vec = _mm256_add_ps(dot_vec, _mm256_mul_ps(qv, vv)); if(l2_metric) { sq_vec = _mm256_add_ps(sq_vec, _mm256_mul_ps(vv, vv)); } -#endif +# endif } { __m128 lo = _mm256_castps256_ps128(dot_vec); @@ -915,9 +920,12 @@ namespace ndd { size_t lane = svcnth(); for(; d + lane <= block_len; d += lane) { svbool_t pg16 = svptrue_b16(); - svbool_t pg32 = svptrue_b32();; - svfloat16_t q_h = svld1_f16(pg16, reinterpret_cast(q_ptr + d)); - svfloat16_t v_h = svld1_f16(pg16, reinterpret_cast(v_ptr + d)); + svbool_t pg32 = svptrue_b32(); + ; + svfloat16_t q_h = + svld1_f16(pg16, reinterpret_cast(q_ptr + d)); + svfloat16_t v_h = + svld1_f16(pg16, reinterpret_cast(v_ptr + d)); svfloat32_t qv = svcvt_f32_f16_x(pg32, q_h); svfloat32_t vv = svcvt_f32_f16_x(pg32, v_h); dot += svaddv_f32(pg32, svmul_f32_x(pg32, qv, vv)); @@ -1045,9 +1053,9 @@ namespace ndd { __m128i p3 = _mm512_cvtepi32_epi8(i3); _mm_storeu_si128((__m128i*)&data_ptr[i], p0); - _mm_storeu_si128((__m128i*)&data_ptr[i+16], p1); - _mm_storeu_si128((__m128i*)&data_ptr[i+32], p2); - _mm_storeu_si128((__m128i*)&data_ptr[i+48], p3); + _mm_storeu_si128((__m128i*)&data_ptr[i + 16], p1); + _mm_storeu_si128((__m128i*)&data_ptr[i + 32], p2); + _mm_storeu_si128((__m128i*)&data_ptr[i + 48], p3); } #elif defined(USE_AVX2) size_t i = 0; diff --git a/src/quant/float32.hpp b/src/quant/float32.hpp index a0553380a..dc75a43fe 100644 --- a/src/quant/float32.hpp +++ b/src/quant/float32.hpp @@ -510,7 +510,8 @@ namespace hnswlib { } for(size_t i = 0; i < count; ++i) { - const float* v_ptr = reinterpret_cast(vectors[i]) + block_start; + const float* v_ptr = + reinterpret_cast(vectors[i]) + block_start; float dot = dot_acc[i]; float vec_sq = l2_metric ? vec_sq_acc[i] : 0.0f; @@ -536,17 +537,17 @@ namespace hnswlib { for(; d + 8 <= block_len; d += 8) { __m256 qv = _mm256_loadu_ps(q_ptr + d); __m256 vv = _mm256_loadu_ps(v_ptr + d); -#if defined(__FMA__) +# if defined(__FMA__) dot_vec = _mm256_fmadd_ps(qv, vv, dot_vec); if(l2_metric) { sq_vec = _mm256_fmadd_ps(vv, vv, sq_vec); } -#else +# else dot_vec = _mm256_add_ps(dot_vec, _mm256_mul_ps(qv, vv)); if(l2_metric) { sq_vec = _mm256_add_ps(sq_vec, _mm256_mul_ps(vv, vv)); } -#endif +# endif } { __m128 lo = _mm256_castps256_ps128(dot_vec); @@ -582,7 +583,8 @@ namespace hnswlib { #elif defined(USE_SVE2) size_t lane = svcntw(); for(; d + lane <= block_len; d += lane) { - svbool_t pg = svptrue_b32();; + svbool_t pg = svptrue_b32(); + ; svfloat32_t qv = svld1_f32(pg, q_ptr + d); svfloat32_t vv = svld1_f32(pg, v_ptr + d); dot += svaddv_f32(pg, svmul_f32_x(pg, qv, vv)); diff --git a/src/quant/int16.hpp b/src/quant/int16.hpp index a80cf8342..22f0bcd8c 100644 --- a/src/quant/int16.hpp +++ b/src/quant/int16.hpp @@ -820,11 +820,9 @@ namespace ndd { __m512i prod1 = _mm512_dpwssd_epi32(_mm512_setzero_si512(), v1_1, v2_1); __m512i prod0_lo = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(prod0)); - __m512i prod0_hi = - _mm512_cvtepi32_epi64(_mm512_extracti32x8_epi32(prod0, 1)); + __m512i prod0_hi = _mm512_cvtepi32_epi64(_mm512_extracti32x8_epi32(prod0, 1)); __m512i prod1_lo = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(prod1)); - __m512i prod1_hi = - _mm512_cvtepi32_epi64(_mm512_extracti32x8_epi32(prod1, 1)); + __m512i prod1_hi = _mm512_cvtepi32_epi64(_mm512_extracti32x8_epi32(prod1, 1)); sum_vec0 = _mm512_add_epi64(sum_vec0, prod0_lo); sum_vec1 = _mm512_add_epi64(sum_vec1, prod0_hi); @@ -1113,8 +1111,8 @@ namespace ndd { _mm512_dpwssd_epi32(_mm512_setzero_si512(), v_i16, v_i16); __m512i sq_i64_lo = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(sq_i32)); - __m512i sq_i64_hi = _mm512_cvtepi32_epi64( - _mm512_extracti32x8_epi32(sq_i32, 1)); + __m512i sq_i64_hi = + _mm512_cvtepi32_epi64(_mm512_extracti32x8_epi32(sq_i32, 1)); sq_vec_lo = _mm512_add_epi64(sq_vec_lo, sq_i64_lo); sq_vec_hi = _mm512_add_epi64(sq_vec_hi, sq_i64_hi); } @@ -1140,7 +1138,8 @@ namespace ndd { __m256i q_i32 = _mm256_cvtepi16_epi32(q_i16); __m256i v_i32 = _mm256_cvtepi16_epi32(v_i16); __m256i dot_i32 = _mm256_mullo_epi32(q_i32, v_i32); - __m256i dot_i64_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(dot_i32)); + __m256i dot_i64_lo = + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(dot_i32)); __m256i dot_i64_hi = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(dot_i32, 1)); dot_vec_lo = _mm256_add_epi64(dot_vec_lo, dot_i64_lo); @@ -1183,13 +1182,16 @@ namespace ndd { int16x8_t v_i16 = vld1q_s16(vec + block_start + d); int32x4_t dot_lo = vmull_s16(vget_low_s16(q_i16), vget_low_s16(v_i16)); - int32x4_t dot_hi = vmull_s16(vget_high_s16(q_i16), vget_high_s16(v_i16)); + int32x4_t dot_hi = + vmull_s16(vget_high_s16(q_i16), vget_high_s16(v_i16)); dot_vec = vpadalq_s32(dot_vec, dot_lo); dot_vec = vpadalq_s32(dot_vec, dot_hi); if(l2_metric) { - int32x4_t sq_lo = vmull_s16(vget_low_s16(v_i16), vget_low_s16(v_i16)); - int32x4_t sq_hi = vmull_s16(vget_high_s16(v_i16), vget_high_s16(v_i16)); + int32x4_t sq_lo = + vmull_s16(vget_low_s16(v_i16), vget_low_s16(v_i16)); + int32x4_t sq_hi = + vmull_s16(vget_high_s16(v_i16), vget_high_s16(v_i16)); sq_vec = vpadalq_s32(sq_vec, sq_lo); sq_vec = vpadalq_s32(sq_vec, sq_hi); } diff --git a/src/quant/int8.hpp b/src/quant/int8.hpp index f0ba9733b..a41a86640 100644 --- a/src/quant/int8.hpp +++ b/src/quant/int8.hpp @@ -688,15 +688,14 @@ namespace ndd { float dot2 = static_cast(svaddv_s32(svptrue_b32(), sum_sq2)); float dot_prod = static_cast(svaddv_s32(svptrue_b32(), sum_prod)); - res = (dot1 * scale1) * scale1 + (dot2 * scale2) * scale2 - - 2.0f * ((dot_prod * scale1) * scale2); + res = (dot1 * scale1) * scale1 + (dot2 * scale2) * scale2 + - 2.0f * ((dot_prod * scale1) * scale2); #elif defined(USE_NEON) // NEON implementation for L2Sqr // Uses the expansion: (a*s1 - b*s2)^2 = a^2*s1^2 + b^2*s2^2 - 2ab*s1*s2 // This allows using integer dot products for the terms. - int32x4_t sum_sq1 = vdupq_n_s32(0); int32x4_t sum_sq2 = vdupq_n_s32(0); int32x4_t sum_prod = vdupq_n_s32(0); @@ -745,8 +744,8 @@ namespace ndd { float dot2 = static_cast(vaddvq_s32(sum_sq2)); float dot_prod = static_cast(vaddvq_s32(sum_prod)); - res = (dot1 * scale1) * scale1 + (dot2 * scale2) * scale2 - - 2.0f * ((dot_prod * scale1) * scale2); + res = (dot1 * scale1) * scale1 + (dot2 * scale2) * scale2 + - 2.0f * ((dot_prod * scale1) * scale2); #endif for(; i < qty; i++) { diff --git a/src/sparse/inverted_index.cpp b/src/sparse/inverted_index.cpp index 1b4e5b8c4..ec19dba30 100644 --- a/src/sparse/inverted_index.cpp +++ b/src/sparse/inverted_index.cpp @@ -21,25 +21,18 @@ namespace ndd { namespace { - template - struct PostingValueAccessor; + template struct PostingValueAccessor; - template <> - struct PostingValueAccessor { + template <> struct PostingValueAccessor { using ValueType = float; - static inline bool isLive(ValueType value) { - return value > 0.0f; - } + static inline bool isLive(ValueType value) { return value > 0.0f; } }; - template <> - struct PostingValueAccessor { + template <> struct PostingValueAccessor { using ValueType = uint8_t; - static inline bool isLive(ValueType value) { - return value > 0; - } + static inline bool isLive(ValueType value) { return value > 0; } }; #ifdef ND_SPARSE_INSTRUMENT @@ -107,16 +100,18 @@ namespace ndd { private: SteadyClock::time_point start_; }; -#endif // ND_SPARSE_INSTRUMENT +#endif // ND_SPARSE_INSTRUMENT } // namespace #ifdef ND_SPARSE_INSTRUMENT void printSparseSearchDebugStats() { SparseSearchDebugStats& stats = sparseSearchDebugStats(); - const uint64_t visited = stats.phase2_iterators_visited.exchange(0, std::memory_order_relaxed); + const uint64_t visited = + stats.phase2_iterators_visited.exchange(0, std::memory_order_relaxed); const uint64_t contributed = stats.phase2_iterators_contributed.exchange(0, std::memory_order_relaxed); - const uint64_t parse_calls = stats.parse_current_kv_calls.exchange(0, std::memory_order_relaxed); + const uint64_t parse_calls = + stats.parse_current_kv_calls.exchange(0, std::memory_order_relaxed); const uint64_t parse_total_ns = stats.parse_current_kv_total_ns.exchange(0, std::memory_order_relaxed); @@ -125,7 +120,8 @@ namespace ndd { LOG_INFO("phase3 iterators contributed: " << contributed); LOG_INFO("phase3 contribution rate(%): " << std::fixed << std::setprecision(3) - << (visited ? (100.0 * static_cast(contributed) / static_cast(visited)) + << (visited ? (100.0 * static_cast(contributed) + / static_cast(visited)) : 0.0)); LOG_INFO("parseCurrentKV count: " << parse_calls); LOG_INFO("parseCurrentKV total(ms): " @@ -141,24 +137,29 @@ namespace ndd { void printSparseUpdateDebugStats() { SparseUpdateDebugStats& stats = sparseUpdateDebugStats(); - const uint64_t add_batch_calls = stats.add_batch_calls.exchange(0, std::memory_order_relaxed); + const uint64_t add_batch_calls = + stats.add_batch_calls.exchange(0, std::memory_order_relaxed); const uint64_t add_batch_docs = stats.add_batch_docs.exchange(0, std::memory_order_relaxed); - const uint64_t add_batch_terms = stats.add_batch_terms.exchange(0, std::memory_order_relaxed); + const uint64_t add_batch_terms = + stats.add_batch_terms.exchange(0, std::memory_order_relaxed); const uint64_t add_batch_raw_updates = stats.add_batch_raw_updates.exchange(0, std::memory_order_relaxed); const uint64_t add_batch_deduped_updates = stats.add_batch_deduped_updates.exchange(0, std::memory_order_relaxed); - const uint64_t add_batch_blocks = stats.add_batch_blocks.exchange(0, std::memory_order_relaxed); + const uint64_t add_batch_blocks = + stats.add_batch_blocks.exchange(0, std::memory_order_relaxed); const uint64_t build_term_updates_total_ns = stats.build_term_updates_total_ns.exchange(0, std::memory_order_relaxed); const uint64_t sort_dedup_total_ns = stats.sort_dedup_total_ns.exchange(0, std::memory_order_relaxed); - const uint64_t load_block_calls = stats.load_block_calls.exchange(0, std::memory_order_relaxed); + const uint64_t load_block_calls = + stats.load_block_calls.exchange(0, std::memory_order_relaxed); const uint64_t load_block_total_ns = stats.load_block_total_ns.exchange(0, std::memory_order_relaxed); const uint64_t load_block_entries_total = stats.load_block_entries_total.exchange(0, std::memory_order_relaxed); - const uint64_t merge_block_calls = stats.merge_block_calls.exchange(0, std::memory_order_relaxed); + const uint64_t merge_block_calls = + stats.merge_block_calls.exchange(0, std::memory_order_relaxed); const uint64_t merge_block_total_ns = stats.merge_block_total_ns.exchange(0, std::memory_order_relaxed); const uint64_t merge_existing_entries_total = @@ -167,7 +168,8 @@ namespace ndd { stats.merge_update_entries_total.exchange(0, std::memory_order_relaxed); const uint64_t merge_output_entries_total = stats.merge_output_entries_total.exchange(0, std::memory_order_relaxed); - const uint64_t save_block_calls = stats.save_block_calls.exchange(0, std::memory_order_relaxed); + const uint64_t save_block_calls = + stats.save_block_calls.exchange(0, std::memory_order_relaxed); const uint64_t save_block_total_ns = stats.save_block_total_ns.exchange(0, std::memory_order_relaxed); const uint64_t save_block_entries_total = @@ -196,89 +198,92 @@ namespace ndd { << (static_cast(load_block_total_ns) / 1'000'000.0)); LOG_INFO("loadBlockEntries avg(us): " << std::fixed << std::setprecision(3) - << (load_block_calls - ? (static_cast(load_block_total_ns) / 1000.0) - / static_cast(load_block_calls) - : 0.0)); + << (load_block_calls ? (static_cast(load_block_total_ns) / 1000.0) + / static_cast(load_block_calls) + : 0.0)); LOG_INFO("loadBlockEntries avg existing entries: " << std::fixed << std::setprecision(3) - << (load_block_calls - ? static_cast(load_block_entries_total) - / static_cast(load_block_calls) - : 0.0)); + << (load_block_calls ? static_cast(load_block_entries_total) + / static_cast(load_block_calls) + : 0.0)); LOG_INFO("merge blocks count: " << merge_block_calls); LOG_INFO("merge blocks total(ms): " << std::fixed << std::setprecision(3) << (static_cast(merge_block_total_ns) / 1'000'000.0)); LOG_INFO("merge blocks avg(us): " << std::fixed << std::setprecision(3) - << (merge_block_calls - ? (static_cast(merge_block_total_ns) / 1000.0) - / static_cast(merge_block_calls) - : 0.0)); + << (merge_block_calls ? (static_cast(merge_block_total_ns) / 1000.0) + / static_cast(merge_block_calls) + : 0.0)); LOG_INFO("merge avg existing entries: " << std::fixed << std::setprecision(3) - << (merge_block_calls - ? static_cast(merge_existing_entries_total) - / static_cast(merge_block_calls) - : 0.0)); + << (merge_block_calls ? static_cast(merge_existing_entries_total) + / static_cast(merge_block_calls) + : 0.0)); LOG_INFO("merge avg update entries: " << std::fixed << std::setprecision(3) - << (merge_block_calls - ? static_cast(merge_update_entries_total) - / static_cast(merge_block_calls) - : 0.0)); + << (merge_block_calls ? static_cast(merge_update_entries_total) + / static_cast(merge_block_calls) + : 0.0)); LOG_INFO("merge avg output entries: " << std::fixed << std::setprecision(3) - << (merge_block_calls - ? static_cast(merge_output_entries_total) - / static_cast(merge_block_calls) - : 0.0)); + << (merge_block_calls ? static_cast(merge_output_entries_total) + / static_cast(merge_block_calls) + : 0.0)); LOG_INFO("saveBlockEntries count: " << save_block_calls); LOG_INFO("saveBlockEntries total(ms): " << std::fixed << std::setprecision(3) << (static_cast(save_block_total_ns) / 1'000'000.0)); LOG_INFO("saveBlockEntries avg(us): " << std::fixed << std::setprecision(3) - << (save_block_calls - ? (static_cast(save_block_total_ns) / 1000.0) - / static_cast(save_block_calls) - : 0.0)); + << (save_block_calls ? (static_cast(save_block_total_ns) / 1000.0) + / static_cast(save_block_calls) + : 0.0)); LOG_INFO("saveBlockEntries avg entries: " << std::fixed << std::setprecision(3) - << (save_block_calls - ? static_cast(save_block_entries_total) - / static_cast(save_block_calls) - : 0.0)); + << (save_block_calls ? static_cast(save_block_entries_total) + / static_cast(save_block_calls) + : 0.0)); LOG_INFO("recomputeGlobalMax count: " << recompute_max_calls); LOG_INFO("recomputeGlobalMax total(ms): " << std::fixed << std::setprecision(3) << (static_cast(recompute_max_total_ns) / 1'000'000.0)); LOG_INFO("recomputeGlobalMax avg(us): " << std::fixed << std::setprecision(3) - << (recompute_max_calls - ? (static_cast(recompute_max_total_ns) / 1000.0) - / static_cast(recompute_max_calls) - : 0.0)); + << (recompute_max_calls ? (static_cast(recompute_max_total_ns) / 1000.0) + / static_cast(recompute_max_calls) + : 0.0)); std::cout << "=================================\n"; } #else - void printSparseSearchDebugStats() {} - void printSparseUpdateDebugStats() {} -#endif // ND_SPARSE_INSTRUMENT + void printSparseSearchDebugStats() { + } + void printSparseUpdateDebugStats() { + } +#endif // ND_SPARSE_INSTRUMENT - InvertedIndex::InvertedIndex(MDBX_env* env, size_t vocab_size, const std::string& index_id) - : env_(env), blocked_term_postings_dbi_(0), vocab_size_(vocab_size), index_id_(index_id) {} + InvertedIndex::InvertedIndex(MDBX_env* env, size_t vocab_size, const std::string& index_id) : + env_(env), + blocked_term_postings_dbi_(0), + vocab_size_(vocab_size), + index_id_(index_id) { + } void InvertedIndex::applyHeaderDelta(PostingListHeader& header, - int64_t total_delta, - int64_t live_delta) { + int64_t total_delta, + int64_t live_delta) { int64_t new_total = static_cast(header.nr_entries) + total_delta; int64_t new_live = static_cast(header.nr_live_entries) + live_delta; - if (new_total < 0) new_total = 0; - if (new_live < 0) new_live = 0; - if (new_live > new_total) new_live = new_total; + if(new_total < 0) { + new_total = 0; + } + if(new_live < 0) { + new_live = 0; + } + if(new_live > new_total) { + new_live = new_total; + } header.nr_entries = static_cast(new_total); header.nr_live_entries = static_cast(new_live); @@ -287,20 +292,21 @@ namespace ndd { bool InvertedIndex::validateSuperBlock(MDBX_txn* txn) { SuperBlock sb; bool sb_found = false; - if (!readSuperBlock(txn, &sb, &sb_found)) { + if(!readSuperBlock(txn, &sb, &sb_found)) { return false; } - if (!sb_found) { + if(!sb_found) { // Check whether the DBI already has data (legacy DB without superblock). MDBX_stat stat; int rc = mdbx_dbi_stat(txn, blocked_term_postings_dbi_, &stat, sizeof(stat)); - if (rc == MDBX_SUCCESS && stat.ms_entries > 0) { + if(rc == MDBX_SUCCESS && stat.ms_entries > 0) { LOG_ERROR(2201, index_id_, - "Sparse index database exists without a superblock; it was created by an older incompatible version"); + "Sparse index database exists without a superblock; it was created by an " + "older incompatible version"); throw std::runtime_error( - "Incompatible sparse index: database has no superblock (legacy format)"); + "Incompatible sparse index: database has no superblock (legacy format)"); } // Fresh database — write the superblock. @@ -309,23 +315,22 @@ namespace ndd { index_id_, "Writing fresh sparse superblock (version=" << static_cast(settings::SPARSE_ONDISK_VERSION) << ")"); - if (!writeSuperBlock(txn, sb)) { + if(!writeSuperBlock(txn, sb)) { return false; } return true; } - if (sb.format_version != settings::SPARSE_ONDISK_VERSION) { + if(sb.format_version != settings::SPARSE_ONDISK_VERSION) { LOG_ERROR(2203, index_id_, "Sparse index format version mismatch: on-disk=" << static_cast(sb.format_version) << " compiled=" << static_cast(settings::SPARSE_ONDISK_VERSION)); - throw std::runtime_error( - "Incompatible sparse index: format version " - + std::to_string(sb.format_version) - + " does not match compiled version " - + std::to_string(settings::SPARSE_ONDISK_VERSION)); + throw std::runtime_error("Incompatible sparse index: format version " + + std::to_string(sb.format_version) + + " does not match compiled version " + + std::to_string(settings::SPARSE_ONDISK_VERSION)); } return true; @@ -336,52 +341,56 @@ namespace ndd { MDBX_txn* txn = nullptr; int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); - if (rc != MDBX_SUCCESS) { - LOG_ERROR(2204, index_id_, "Failed to begin sparse index init transaction: " << mdbx_strerror(rc)); + if(rc != MDBX_SUCCESS) { + LOG_ERROR(2204, + index_id_, + "Failed to begin sparse index init transaction: " << mdbx_strerror(rc)); return false; } rc = mdbx_dbi_open(txn, - "blocked_term_postings", - MDBX_CREATE | MDBX_INTEGERKEY, - &blocked_term_postings_dbi_); - if (rc != MDBX_SUCCESS) { - LOG_ERROR(2205, index_id_, "Failed to open blocked_term_postings DBI: " << mdbx_strerror(rc)); + "blocked_term_postings", + MDBX_CREATE | MDBX_INTEGERKEY, + &blocked_term_postings_dbi_); + if(rc != MDBX_SUCCESS) { + LOG_ERROR(2205, + index_id_, + "Failed to open blocked_term_postings DBI: " << mdbx_strerror(rc)); mdbx_txn_abort(txn); return false; } - if (!validateSuperBlock(txn)) { + if(!validateSuperBlock(txn)) { mdbx_txn_abort(txn); return false; } rc = mdbx_txn_commit(txn); - if (rc != MDBX_SUCCESS) { - LOG_ERROR(2206, index_id_, "Failed to commit sparse index init transaction: " << mdbx_strerror(rc)); + if(rc != MDBX_SUCCESS) { + LOG_ERROR(2206, + index_id_, + "Failed to commit sparse index init transaction: " << mdbx_strerror(rc)); return false; } - if (!loadTermInfo()) { + if(!loadTermInfo()) { return false; } - LOG_INFO(2207, index_id_, "Sparse index initialized with " << term_info_.size() << " loaded terms"); + LOG_INFO(2207, + index_id_, + "Sparse index initialized with " << term_info_.size() << " loaded terms"); return true; } - bool InvertedIndex::addDocumentsBatch( - MDBX_txn* txn, - const std::vector>& docs) - { + bool + InvertedIndex::addDocumentsBatch(MDBX_txn* txn, + const std::vector>& docs) { std::unique_lock lock(mutex_); return addDocumentsBatchInternal(txn, docs); } - bool InvertedIndex::removeDocument(MDBX_txn* txn, - ndd::idInt doc_id, - const SparseVector& vec) - { + bool InvertedIndex::removeDocument(MDBX_txn* txn, ndd::idInt doc_id, const SparseVector& vec) { std::unique_lock lock(mutex_); return removeDocumentInternal(txn, doc_id, vec); } @@ -396,12 +405,11 @@ namespace ndd { template bool InvertedIndex::accumulateBatchScores(PostingListIterator* it, - ndd::idInt batch_start, - uint32_t batch_end_block_nr, - BlockOffset batch_end_block_offset, - float* scores_buf, - float term_weight) - { + ndd::idInt batch_start, + uint32_t batch_end_block_nr, + BlockOffset batch_end_block_offset, + float* scores_buf, + float term_weight) { using Accessor = PostingValueAccessor; using ValueType = typename Accessor::ValueType; @@ -412,23 +420,24 @@ namespace ndd { float block_max_value = it->max_value; bool contributed = false; - while (true) { - if (it->current_block_nr > batch_end_block_nr) { + while(true) { + if(it->current_block_nr > batch_end_block_nr) { break; } const bool consume_full_block = it->current_block_nr < batch_end_block_nr; - const int64_t local_base = - static_cast(it->currentBlockBaseDocId()) - static_cast(batch_start); + const int64_t local_base = static_cast(it->currentBlockBaseDocId()) + - static_cast(batch_start); const uint32_t before = idx; - while (idx < sz && (consume_full_block || offsets[idx] <= batch_end_block_offset)) { + while(idx < sz && (consume_full_block || offsets[idx] <= batch_end_block_offset)) { const ValueType value = vals[idx]; - if (Accessor::isLive(value)) { + if(Accessor::isLive(value)) { const size_t local = static_cast(local_base + offsets[idx]); - if constexpr (StoreFloats) { + if constexpr(StoreFloats) { scores_buf[local] += value * term_weight; } else { - scores_buf[local] += InvertedIndex::dequantize(value, block_max_value) * term_weight; + scores_buf[local] += + InvertedIndex::dequantize(value, block_max_value) * term_weight; } contributed = true; } @@ -436,12 +445,12 @@ namespace ndd { } it->consumeEntries(idx - before); - if (idx < sz) { + if(idx < sz) { break; } it->current_entry_idx = idx; - if (!it->loadNextBlock()) { + if(!it->loadNextBlock()) { break; } @@ -451,11 +460,9 @@ namespace ndd { idx = 0; sz = it->data_size; - if (it->current_block_nr > batch_end_block_nr - || (it->current_block_nr == batch_end_block_nr - && sz > 0 - && offsets[0] > batch_end_block_offset)) - { + if(it->current_block_nr > batch_end_block_nr + || (it->current_block_nr == batch_end_block_nr && sz > 0 + && offsets[0] > batch_end_block_offset)) { break; } } @@ -466,20 +473,19 @@ namespace ndd { } std::vector> - InvertedIndex::search(const SparseVector& query, - size_t k, - const ndd::RoaringBitmap* filter) - { + InvertedIndex::search(const SparseVector& query, size_t k, const ndd::RoaringBitmap* filter) { std::shared_lock lock(mutex_); MDBX_txn* txn = nullptr; int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn); - if (rc != MDBX_SUCCESS) { - LOG_ERROR(2208, index_id_, "Failed to begin sparse search transaction: " << mdbx_strerror(rc)); + if(rc != MDBX_SUCCESS) { + LOG_ERROR(2208, + index_id_, + "Failed to begin sparse search transaction: " << mdbx_strerror(rc)); return {}; } - if (query.empty() || k == 0) { + if(query.empty() || k == 0) { mdbx_txn_abort(txn); return {}; } @@ -495,66 +501,63 @@ namespace ndd { { LOG_TIME("search phase 1"); - // Build one iterator per live query term. Each iterator owns a cursor and lazily - // streams the term's block rows instead of pulling the whole posting list in memory. - for (size_t qi = 0; qi < query.indices.size(); qi++) { - uint32_t term_id = query.indices[qi]; - if (term_id == kMetadataTermId) continue; - - float qw = query.values[qi]; - if (qw <= 0.0f) continue; + // Build one iterator per live query term. Each iterator owns a cursor and lazily + // streams the term's block rows instead of pulling the whole posting list in memory. + for(size_t qi = 0; qi < query.indices.size(); qi++) { + uint32_t term_id = query.indices[qi]; + if(term_id == kMetadataTermId) { + continue; + } + float qw = query.values[qi]; + if(qw <= 0.0f) { + continue; + } - auto info_it = term_info_.find(term_id); - if (info_it == term_info_.end()) { - LOG_WARN(2209, index_id_, "Search skipped unknown query term_id=" << term_id); - continue; - } + auto info_it = term_info_.find(term_id); + if(info_it == term_info_.end()) { + LOG_WARN(2209, index_id_, "Search skipped unknown query term_id=" << term_id); + continue; + } - bool header_found = false; - PostingListHeader header = readPostingListHeader(txn, term_id, &header_found); - if (!header_found || header.nr_entries == 0 || header.nr_live_entries == 0) { - continue; - } + bool header_found = false; + PostingListHeader header = readPostingListHeader(txn, term_id, &header_found); + if(!header_found || header.nr_entries == 0 || header.nr_live_entries == 0) { + continue; + } - MDBX_cursor* cursor = nullptr; - rc = mdbx_cursor_open(txn, blocked_term_postings_dbi_, &cursor); - if (rc != MDBX_SUCCESS) { - LOG_ERROR(2210, - index_id_, - "Failed to open sparse search cursor for term " - << term_id << ": " << mdbx_strerror(rc)); - continue; - } + MDBX_cursor* cursor = nullptr; + rc = mdbx_cursor_open(txn, blocked_term_postings_dbi_, &cursor); + if(rc != MDBX_SUCCESS) { + LOG_ERROR(2210, + index_id_, + "Failed to open sparse search cursor for term " << term_id << ": " + << mdbx_strerror(rc)); + continue; + } - PostingListIterator it; - it.init(cursor, - term_id, - qw, - info_it->second, - header.nr_entries, - this); + PostingListIterator it; + it.init(cursor, term_id, qw, info_it->second, header.nr_entries, this); - if (it.current_doc_id != EXHAUSTED_DOC_ID) { - iters_storage.push_back(it); - cursors.push_back(cursor); - } else { - mdbx_cursor_close(cursor); + if(it.current_doc_id != EXHAUSTED_DOC_ID) { + iters_storage.push_back(it); + cursors.push_back(cursor); + } else { + mdbx_cursor_close(cursor); + } } - } - for (size_t i = 0; i < iters_storage.size(); i++) { - iters.push_back(&iters_storage[i]); - } + for(size_t i = 0; i < iters_storage.size(); i++) { + iters.push_back(&iters_storage[i]); + } - if (iters.empty()) { - mdbx_txn_abort(txn); - return {}; - } + if(iters.empty()) { + mdbx_txn_abort(txn); + return {}; + } - //END OF PHASE 1 + //END OF PHASE 1 } - bool use_pruning = (iters.size() > 1); float best_min_score = 0.0f; @@ -565,8 +568,8 @@ namespace ndd { auto minIterDocId = [&iters]() -> ndd::idInt { ndd::idInt min_id = EXHAUSTED_DOC_ID; - for (size_t i = 0; i < iters.size(); i++) { - if (iters[i]->current_doc_id < min_id) { + for(size_t i = 0; i < iters.size(); i++) { + if(iters[i]->current_doc_id < min_id) { min_id = iters[i]->current_doc_id; } } @@ -577,135 +580,140 @@ namespace ndd { // Process the index in doc-id windows. The accumulator is dense within the current // window even though the posting lists themselves stay sparse and block-based. - while (min_id != EXHAUSTED_DOC_ID) { + while(min_id != EXHAUSTED_DOC_ID) { ndd::idInt batch_start = min_id; - ndd::idInt batch_end = batch_start - + (ndd::idInt)settings::INV_IDX_SEARCH_BATCH_SZ - 1; - if (batch_end < batch_start) { + ndd::idInt batch_end = batch_start + (ndd::idInt)settings::INV_IDX_SEARCH_BATCH_SZ - 1; + if(batch_end < batch_start) { batch_end = EXHAUSTED_DOC_ID - 1; } const uint32_t batch_end_block_nr = docToBlockNr(batch_end); const BlockOffset batch_end_block_offset = docToBlockOffset(batch_end); size_t batch_len = (size_t)(batch_end - batch_start) + 1; - if (batch_len > scores_buf.size()) { + if(batch_len > scores_buf.size()) { scores_buf.resize(batch_len); } std::memset(scores_buf.data(), 0, batch_len * sizeof(float)); { - LOG_TIME("search phase 2"); - // Consume all postings that fall into this batch. The iterator keeps absolute doc_ids - // implicit as (current_block_nr, doc_offsets[idx]) to avoid rebuilding them eagerly. - for (size_t i = 0; i < iters.size(); i++) { - PostingListIterator* it = iters[i]; + LOG_TIME("search phase 2"); + // Consume all postings that fall into this batch. The iterator keeps absolute + // doc_ids implicit as (current_block_nr, doc_offsets[idx]) to avoid rebuilding them + // eagerly. + for(size_t i = 0; i < iters.size(); i++) { + PostingListIterator* it = iters[i]; #ifdef ND_SPARSE_INSTRUMENT - sparseSearchDebugStats().phase2_iterators_visited.fetch_add(1, std::memory_order_relaxed); -#endif // ND_SPARSE_INSTRUMENT - if (it->current_doc_id > batch_end) { - continue; - } - [[maybe_unused]] const bool phase3_contributed = + sparseSearchDebugStats().phase2_iterators_visited.fetch_add( + 1, std::memory_order_relaxed); +#endif // ND_SPARSE_INSTRUMENT + if(it->current_doc_id > batch_end) { + continue; + } + [[maybe_unused]] const bool phase3_contributed = #if defined(NDD_INV_IDX_STORE_FLOATS) - accumulateBatchScores( - it, - batch_start, - batch_end_block_nr, - batch_end_block_offset, - scores_buf.data(), - it->term_weight); + accumulateBatchScores(it, + batch_start, + batch_end_block_nr, + batch_end_block_offset, + scores_buf.data(), + it->term_weight); #else - accumulateBatchScores( - it, - batch_start, - batch_end_block_nr, - batch_end_block_offset, - scores_buf.data(), - it->term_weight); -#endif // NDD_INV_IDX_STORE_FLOATS + accumulateBatchScores(it, + batch_start, + batch_end_block_nr, + batch_end_block_offset, + scores_buf.data(), + it->term_weight); +#endif // NDD_INV_IDX_STORE_FLOATS #ifdef ND_SPARSE_INSTRUMENT - if (phase3_contributed) { - sparseSearchDebugStats().phase2_iterators_contributed.fetch_add( - 1, std::memory_order_relaxed); + if(phase3_contributed) { + sparseSearchDebugStats().phase2_iterators_contributed.fetch_add( + 1, std::memory_order_relaxed); + } +#endif // ND_SPARSE_INSTRUMENT } -#endif // ND_SPARSE_INSTRUMENT - } - //END OF SEARCH PHASE 2 + //END OF SEARCH PHASE 2 } { LOG_TIME("search phase 3"); - // Only scores inside the current batch can be non-zero, so convert that temporary - // dense buffer into top-k candidates before moving to the next window. - for (size_t local = 0; local < batch_len; local++) { - float s = scores_buf[local]; - if (s == 0.0f || s <= threshold) continue; - - ndd::idInt doc_id = batch_start + (ndd::idInt)local; - if (filter && !filter->contains(doc_id)) continue; - - if (top_results.size() < k) { - top_results.emplace(doc_id, s); - if (top_results.size() == k) { + // Only scores inside the current batch can be non-zero, so convert that temporary + // dense buffer into top-k candidates before moving to the next window. + for(size_t local = 0; local < batch_len; local++) { + float s = scores_buf[local]; + if(s == 0.0f || s <= threshold) { + continue; + } + + ndd::idInt doc_id = batch_start + (ndd::idInt)local; + if(filter && !filter->contains(doc_id)) { + continue; + } + + if(top_results.size() < k) { + top_results.emplace(doc_id, s); + if(top_results.size() == k) { + threshold = top_results.top().score; + } + } else if(s > threshold) { + top_results.pop(); + top_results.emplace(doc_id, s); threshold = top_results.top().score; } - } else if (s > threshold) { - top_results.pop(); - top_results.emplace(doc_id, s); - threshold = top_results.top().score; } - } - //END OF SEARCH PHASE 3 + //END OF SEARCH PHASE 3 } { LOG_TIME("search phase 4"); - // Compact away exhausted iterators, then optionally prune the longest remaining list - // when its best possible future contribution cannot beat the current threshold. - size_t write_idx = 0; - for (size_t i = 0; i < iters.size(); i++) { - if (iters[i]->current_doc_id != EXHAUSTED_DOC_ID) { - iters[write_idx++] = iters[i]; + // Compact away exhausted iterators, then optionally prune the longest remaining + // list when its best possible future contribution cannot beat the current + // threshold. + size_t write_idx = 0; + for(size_t i = 0; i < iters.size(); i++) { + if(iters[i]->current_doc_id != EXHAUSTED_DOC_ID) { + iters[write_idx++] = iters[i]; + } + } + iters.resize(write_idx); + if(iters.empty()) { + break; } - } - iters.resize(write_idx); - if (iters.empty()) break; - min_id = minIterDocId(); + min_id = minIterDocId(); - if (use_pruning && top_results.size() >= k) { - float new_min_score = threshold; - if (!nearEqual(new_min_score, best_min_score)) { - best_min_score = new_min_score; - pruneLongest(iters, new_min_score); - min_id = minIterDocId(); + if(use_pruning && top_results.size() >= k) { + float new_min_score = threshold; + if(!nearEqual(new_min_score, best_min_score)) { + best_min_score = new_min_score; + pruneLongest(iters, new_min_score); + min_id = minIterDocId(); + } } - } - //END OF SEARCH PHASE 4 + //END OF SEARCH PHASE 4 } } #ifdef NDD_INV_IDX_PRUNE_DEBUG - for (const PostingListIterator& it : iters_storage) { + for(const PostingListIterator& it : iters_storage) { LOG_INFO(2229, index_id_, "Sparse prune stats: term_id=" << it.term_id - << " posting_list_len=" << it.initial_entries - << " pruned_len=" << it.pruned_entries); + << " posting_list_len=" << it.initial_entries + << " pruned_len=" << it.pruned_entries); } -#endif // NDD_INV_IDX_PRUNE_DEBUG +#endif // NDD_INV_IDX_PRUNE_DEBUG - for (MDBX_cursor* cursor : cursors) { + for(MDBX_cursor* cursor : cursors) { mdbx_cursor_close(cursor); } mdbx_txn_abort(txn); std::vector> results; results.reserve(top_results.size()); - while (!top_results.empty()) { - results.push_back( - std::make_pair(top_results.top().doc_id, top_results.top().score)); + while(!top_results.empty()) { + results.push_back(std::make_pair(top_results.top().doc_id, top_results.top().score)); top_results.pop(); } std::reverse(results.begin(), results.end()); @@ -713,27 +721,31 @@ namespace ndd { } inline uint8_t InvertedIndex::quantize(float val, float max_val) { - if (max_val <= settings::NEAR_ZERO) + if(max_val <= settings::NEAR_ZERO) { return 0; + } float scaled = (val / max_val) * UINT8_MAX; - if (scaled >= UINT8_MAX) + if(scaled >= UINT8_MAX) { return UINT8_MAX; - if (scaled <= 0.0f) + } + if(scaled <= 0.0f) { return 0; + } uint8_t result = (uint8_t)(scaled + 0.5f); /** * Since a 0 weight is considered deleted, * we change it to 1 - */ + */ return result == 0 ? 1 : result; } inline float InvertedIndex::dequantize(uint8_t val, float max_val) { - if (max_val <= settings::NEAR_ZERO) + if(max_val <= settings::NEAR_ZERO) { return 0.0f; + } return (float)val * (max_val / UINT8_MAX); } @@ -742,21 +754,20 @@ namespace ndd { // ========================================================================= size_t InvertedIndex::findDocIdSIMD(const uint32_t* doc_ids, - size_t size, - size_t start_idx, - uint32_t target) const - { + size_t size, + size_t start_idx, + uint32_t target) const { size_t idx = start_idx; #if defined(USE_AVX512) const size_t simd_width = 16; __m512i target_vec = _mm512_set1_epi32((int)target); - while (idx + simd_width <= size) { + while(idx + simd_width <= size) { __m512i data_vec = _mm512_loadu_si512(doc_ids + idx); __mmask16 mask = _mm512_cmpge_epu32_mask(data_vec, target_vec); - if (mask != 0) { + if(mask != 0) { return idx + __builtin_ctz(mask); } idx += simd_width; @@ -765,20 +776,19 @@ namespace ndd { const size_t simd_width = 8; __m256i target_vec = _mm256_set1_epi32((int)target); - while (idx + simd_width <= size) { + while(idx + simd_width <= size) { __builtin_prefetch(doc_ids + idx + 32); - if (doc_ids[idx + simd_width - 1] < target) { + if(doc_ids[idx + simd_width - 1] < target) { idx += simd_width; continue; } - __m256i data_vec = - _mm256_loadu_si256((const __m256i*)(doc_ids + idx)); + __m256i data_vec = _mm256_loadu_si256((const __m256i*)(doc_ids + idx)); __m256i max_vec = _mm256_max_epu32(data_vec, target_vec); __m256i cmp = _mm256_cmpeq_epi32(max_vec, data_vec); int mask = _mm256_movemask_ps(_mm256_castsi256_ps(cmp)); - if (mask != 0) { + if(mask != 0) { return idx + __builtin_ctz(mask); } idx += simd_width; @@ -787,11 +797,11 @@ namespace ndd { svbool_t pg = svwhilelt_b32(idx, size); svuint32_t target_vec = svdup_u32(target); - while (svptest_any(svptrue_b32(), pg)) { + while(svptest_any(svptrue_b32(), pg)) { svuint32_t data_vec = svld1_u32(pg, doc_ids + idx); svbool_t cmp = svcmpge_u32(pg, data_vec, target_vec); - if (svptest_any(pg, cmp)) { + if(svptest_any(pg, cmp)) { svbool_t before_match = svbrkb_z(pg, cmp); uint64_t count = svcntp_b32(pg, before_match); return idx + count; @@ -804,42 +814,40 @@ namespace ndd { const size_t simd_width = 4; uint32x4_t target_vec = vdupq_n_u32(target); - while (idx + simd_width <= size) { + while(idx + simd_width <= size) { uint32x4_t data_vec = vld1q_u32(doc_ids + idx); uint32x4_t cmp = vcgeq_u32(data_vec, target_vec); - if (vmaxvq_u32(cmp) != 0) { - for (size_t i = 0; i < simd_width; i++) { - if (doc_ids[idx + i] >= target) { + if(vmaxvq_u32(cmp) != 0) { + for(size_t i = 0; i < simd_width; i++) { + if(doc_ids[idx + i] >= target) { return idx + i; } } } idx += simd_width; } -#endif // USE_AVX512 +#endif // USE_AVX512 - while (idx < size && doc_ids[idx] < target) { + while(idx < size && doc_ids[idx] < target) { idx++; } return idx; } - size_t InvertedIndex::findNextLiveSIMD(const uint8_t* values, - size_t size, - size_t start_idx) const - { + size_t + InvertedIndex::findNextLiveSIMD(const uint8_t* values, size_t size, size_t start_idx) const { size_t idx = start_idx; #if defined(USE_AVX512) const size_t simd_width = 64; __m512i zero_vec = _mm512_setzero_si512(); - while (idx + simd_width <= size) { + while(idx + simd_width <= size) { __m512i data_vec = _mm512_loadu_si512(values + idx); __mmask64 mask = _mm512_cmpneq_epu8_mask(data_vec, zero_vec); - if (mask != 0) { + if(mask != 0) { return idx + __builtin_ctzll(mask); } idx += simd_width; @@ -848,13 +856,12 @@ namespace ndd { const size_t simd_width = 32; __m256i zero_vec = _mm256_setzero_si256(); - while (idx + simd_width <= size) { - __m256i data_vec = - _mm256_loadu_si256((const __m256i*)(values + idx)); + while(idx + simd_width <= size) { + __m256i data_vec = _mm256_loadu_si256((const __m256i*)(values + idx)); __m256i cmp = _mm256_cmpeq_epi8(data_vec, zero_vec); int mask = _mm256_movemask_epi8(cmp); - if ((uint32_t)mask != 0xFFFFFFFF) { + if((uint32_t)mask != 0xFFFFFFFF) { return idx + __builtin_ctz(~mask); } idx += simd_width; @@ -863,13 +870,13 @@ namespace ndd { const size_t simd_width = 16; uint8x16_t zero_vec = vdupq_n_u8(0); - while (idx + simd_width <= size) { + while(idx + simd_width <= size) { uint8x16_t data_vec = vld1q_u8(values + idx); uint8x16_t cmp = vceqq_u8(data_vec, zero_vec); - if (vminvq_u8(cmp) == 0) { - for (size_t i = 0; i < simd_width; i++) { - if (values[idx + i] != 0) { + if(vminvq_u8(cmp) == 0) { + for(size_t i = 0; i < simd_width; i++) { + if(values[idx + i] != 0) { return idx + i; } } @@ -878,11 +885,11 @@ namespace ndd { } #elif defined(USE_SVE2) svbool_t pg = svwhilelt_b8(idx, size); - while (svptest_any(svptrue_b8(), pg)) { + while(svptest_any(svptrue_b8(), pg)) { svuint8_t data_vec = svld1_u8(pg, values + idx); svbool_t cmp = svcmpne_n_u8(pg, data_vec, 0); - if (svptest_any(pg, cmp)) { + if(svptest_any(pg, cmp)) { svbool_t before_match = svbrkb_z(pg, cmp); return idx + svcntp_b8(pg, before_match); } @@ -890,10 +897,12 @@ namespace ndd { pg = svwhilelt_b8(idx, size); } return idx; -#endif // USE_AVX512 +#endif // USE_AVX512 - while (idx < size) { - if (values[idx] != 0) return idx; + while(idx < size) { + if(values[idx] != 0) { + return idx; + } idx++; } return idx; @@ -903,30 +912,32 @@ namespace ndd { // Superblock helpers // ========================================================================= - bool InvertedIndex::readSuperBlock(MDBX_txn* txn, - SuperBlock* out, - bool* out_found) const { - if (out_found) *out_found = false; + bool InvertedIndex::readSuperBlock(MDBX_txn* txn, SuperBlock* out, bool* out_found) const { + if(out_found) { + *out_found = false; + } uint64_t packed = packPostingKey(kMetadataTermId, kSuperBlockBlockNr); MDBX_val key{&packed, sizeof(packed)}; MDBX_val data; int rc = mdbx_get(txn, blocked_term_postings_dbi_, &key, &data); - if (rc == MDBX_NOTFOUND) { + if(rc == MDBX_NOTFOUND) { return true; } - if (rc != MDBX_SUCCESS) { + if(rc != MDBX_SUCCESS) { LOG_ERROR(2211, index_id_, "readSuperBlock MDBX lookup failed: " << mdbx_strerror(rc)); return false; } - if (data.iov_len < sizeof(SuperBlock)) { + if(data.iov_len < sizeof(SuperBlock)) { LOG_ERROR(2212, index_id_, "Corrupt sparse superblock: payload too small"); return false; } std::memcpy(out, data.iov_base, sizeof(SuperBlock)); - if (out_found) *out_found = true; + if(out_found) { + *out_found = true; + } return true; } @@ -936,7 +947,7 @@ namespace ndd { MDBX_val data{const_cast(&sb), sizeof(SuperBlock)}; int rc = mdbx_put(txn, blocked_term_postings_dbi_, &key, &data, MDBX_UPSERT); - if (rc != MDBX_SUCCESS) { + if(rc != MDBX_SUCCESS) { LOG_ERROR(2213, index_id_, "writeSuperBlock MDBX put failed: " << mdbx_strerror(rc)); return false; } @@ -947,13 +958,14 @@ namespace ndd { // Metadata and block helpers // ========================================================================= - PostingListHeader InvertedIndex::readPostingListHeader(MDBX_txn* txn, - uint32_t term_id, - bool* out_found) const { + PostingListHeader + InvertedIndex::readPostingListHeader(MDBX_txn* txn, uint32_t term_id, bool* out_found) const { PostingListHeader header; - if (out_found) *out_found = false; + if(out_found) { + *out_found = false; + } - if (term_id == kMetadataTermId) { + if(term_id == kMetadataTermId) { return header; } @@ -962,19 +974,23 @@ namespace ndd { MDBX_val data; int rc = mdbx_get(txn, blocked_term_postings_dbi_, &key, &data); - if (rc == MDBX_SUCCESS && data.iov_len >= sizeof(PostingListHeader)) { + if(rc == MDBX_SUCCESS && data.iov_len >= sizeof(PostingListHeader)) { std::memcpy(&header, data.iov_base, sizeof(PostingListHeader)); - if (out_found) *out_found = true; + if(out_found) { + *out_found = true; + } } return header; } bool InvertedIndex::writePostingListHeader(MDBX_txn* txn, - uint32_t term_id, - const PostingListHeader& header) { - if (term_id == kMetadataTermId) { - LOG_ERROR(2214, index_id_, "Refusing to write a posting-list header for the reserved metadata term"); + uint32_t term_id, + const PostingListHeader& header) { + if(term_id == kMetadataTermId) { + LOG_ERROR(2214, + index_id_, + "Refusing to write a posting-list header for the reserved metadata term"); return false; } @@ -983,11 +999,11 @@ namespace ndd { MDBX_val data{const_cast(&header), sizeof(PostingListHeader)}; int rc = mdbx_put(txn, blocked_term_postings_dbi_, &key, &data, MDBX_UPSERT); - if (rc != MDBX_SUCCESS) { + if(rc != MDBX_SUCCESS) { LOG_ERROR(2215, index_id_, - "Failed to write posting-list header for term " - << term_id << ": " << mdbx_strerror(rc)); + "Failed to write posting-list header for term " << term_id << ": " + << mdbx_strerror(rc)); return false; } @@ -1005,8 +1021,12 @@ namespace ndd { bool InvertedIndex::parseBlockViewFromValue(const MDBX_val& data, uint32_t block_nr, BlockView* out_view) const { - if (!out_view) return false; - if (data.iov_len < sizeof(BlockHeader)) return false; + if(!out_view) { + return false; + } + if(data.iov_len < sizeof(BlockHeader)) { + return false; + } const BlockHeader* header = (const BlockHeader*)data.iov_base; uint32_t n = header->nr_entries; @@ -1018,18 +1038,14 @@ namespace ndd { #if defined(NDD_INV_IDX_STORE_FLOATS) uint8_t vbits = 32; const void* values = ptr; - size_t required = sizeof(BlockHeader) - + n * sizeof(BlockOffset) - + n * sizeof(float); + size_t required = sizeof(BlockHeader) + n * sizeof(BlockOffset) + n * sizeof(float); #else uint8_t vbits = 8; const void* values = ptr; - size_t required = sizeof(BlockHeader) - + n * sizeof(BlockOffset) - + n * sizeof(uint8_t); -#endif // NDD_INV_IDX_STORE_FLOATS + size_t required = sizeof(BlockHeader) + n * sizeof(BlockOffset) + n * sizeof(uint8_t); +#endif // NDD_INV_IDX_STORE_FLOATS - if (data.iov_len < required) { + if(data.iov_len < required) { LOG_ERROR(2216, index_id_, "Corrupt sparse block payload: fewer bytes than expected"); return false; } @@ -1043,19 +1059,26 @@ namespace ndd { } bool InvertedIndex::loadBlockEntries(MDBX_txn* txn, - uint32_t term_id, - uint32_t block_nr, - std::vector* entries, - uint32_t* out_live_in_block, - float* out_max_value, - bool* out_found) const - { - if (entries) entries->clear(); - if (out_live_in_block) *out_live_in_block = 0; - if (out_max_value) *out_max_value = 0.0f; - if (out_found) *out_found = false; - - if (term_id == kMetadataTermId || block_nr == kMetadataBlockNr) { + uint32_t term_id, + uint32_t block_nr, + std::vector* entries, + uint32_t* out_live_in_block, + float* out_max_value, + bool* out_found) const { + if(entries) { + entries->clear(); + } + if(out_live_in_block) { + *out_live_in_block = 0; + } + if(out_max_value) { + *out_max_value = 0.0f; + } + if(out_found) { + *out_found = false; + } + + if(term_id == kMetadataTermId || block_nr == kMetadataBlockNr) { return false; } @@ -1064,10 +1087,10 @@ namespace ndd { MDBX_val data; int rc = mdbx_get(txn, blocked_term_postings_dbi_, &key, &data); - if (rc == MDBX_NOTFOUND) { + if(rc == MDBX_NOTFOUND) { return true; } - if (rc != MDBX_SUCCESS) { + if(rc != MDBX_SUCCESS) { LOG_ERROR(2217, index_id_, "loadBlockEntries MDBX lookup failed for term " @@ -1076,17 +1099,25 @@ namespace ndd { } BlockView view; - if (!parseBlockViewFromValue(data, block_nr, &view)) { - LOG_ERROR(2218, index_id_, "Corrupt block payload for term " << term_id << " block " << block_nr); + if(!parseBlockViewFromValue(data, block_nr, &view)) { + LOG_ERROR(2218, + index_id_, + "Corrupt block payload for term " << term_id << " block " << block_nr); return false; } const BlockHeader* header = (const BlockHeader*)data.iov_base; - if (out_live_in_block) *out_live_in_block = header->nr_live_in_block; - if (out_max_value) *out_max_value = header->max_value; - if (out_found) *out_found = true; + if(out_live_in_block) { + *out_live_in_block = header->nr_live_in_block; + } + if(out_max_value) { + *out_max_value = header->max_value; + } + if(out_found) { + *out_found = true; + } - if (!entries) { + if(!entries) { return true; } @@ -1097,15 +1128,15 @@ namespace ndd { const float* vals = (const float*)view.values; #else const uint8_t* vals = (const uint8_t*)view.values; -#endif // NDD_INV_IDX_STORE_FLOATS +#endif // NDD_INV_IDX_STORE_FLOATS - for (uint32_t i = 0; i < view.count; i++) { + for(uint32_t i = 0; i < view.count; i++) { entries->at(i).doc_id = blockOffsetToDocId(block_nr, view.doc_offsets[i]); #if defined(NDD_INV_IDX_STORE_FLOATS) entries->at(i).value = vals[i]; #else entries->at(i).value = dequantize(vals[i], header->max_value); -#endif // NDD_INV_IDX_STORE_FLOATS +#endif // NDD_INV_IDX_STORE_FLOATS } return true; @@ -1115,22 +1146,23 @@ namespace ndd { * Saves the block header and entries */ bool InvertedIndex::saveBlockEntries(MDBX_txn* txn, - uint32_t term_id, - uint32_t block_nr, - const std::vector& entries, - uint32_t live_in_block, - float max_val) - { - if (term_id == kMetadataTermId || block_nr == kMetadataBlockNr) { - LOG_ERROR(2219, index_id_, "Refusing to save a reserved metadata key as a sparse data block"); + uint32_t term_id, + uint32_t block_nr, + const std::vector& entries, + uint32_t live_in_block, + float max_val) { + if(term_id == kMetadataTermId || block_nr == kMetadataBlockNr) { + LOG_ERROR(2219, + index_id_, + "Refusing to save a reserved metadata key as a sparse data block"); return false; } - if (entries.empty()) { + if(entries.empty()) { return deleteBlock(txn, term_id, block_nr); } - if (entries.size() > kBlockCapacity) { + if(entries.size() > kBlockCapacity) { LOG_ERROR(2220, index_id_, "Block for term " << term_id << " block " << block_nr @@ -1147,11 +1179,11 @@ namespace ndd { size_t value_size = sizeof(float); #else size_t value_size = sizeof(uint8_t); -#endif // NDD_INV_IDX_STORE_FLOATS +#endif // NDD_INV_IDX_STORE_FLOATS size_t total_size = sizeof(BlockHeader) - + (entries.size() * sizeof(BlockOffset)) //doc-local offsets - + (entries.size() * value_size); //doc weights + + (entries.size() * sizeof(BlockOffset)) //doc-local offsets + + (entries.size() * value_size); //doc weights std::vector buffer(total_size); // Serialize back into the compact on-disk layout used by the search iterator. @@ -1163,8 +1195,8 @@ namespace ndd { BlockOffset prev_offset = 0; bool has_prev = false; - for (size_t i = 0; i < entries.size(); i++) { - if (docToBlockNr(entries[i].doc_id) != block_nr) { + for(size_t i = 0; i < entries.size(); i++) { + if(docToBlockNr(entries[i].doc_id) != block_nr) { LOG_ERROR(2221, index_id_, "Entry doc_id " << entries[i].doc_id << " does not belong to term " @@ -1173,7 +1205,7 @@ namespace ndd { } BlockOffset offset = docToBlockOffset(entries[i].doc_id); - if (has_prev && offset <= prev_offset) { + if(has_prev && offset <= prev_offset) { LOG_ERROR(2222, index_id_, "Block entries must be strictly sorted by doc offset"); return false; } @@ -1184,22 +1216,22 @@ namespace ndd { #if defined(NDD_INV_IDX_STORE_FLOATS) float* vals_out = (float*)ptr; - for (size_t i = 0; i < entries.size(); i++) { + for(size_t i = 0; i < entries.size(); i++) { vals_out[i] = entries[i].value; } #else uint8_t* vals_out = ptr; - for (size_t i = 0; i < entries.size(); i++) { + for(size_t i = 0; i < entries.size(); i++) { vals_out[i] = quantize(entries[i].value, max_val); } -#endif // NDD_INV_IDX_STORE_FLOATS +#endif // NDD_INV_IDX_STORE_FLOATS uint64_t packed = packPostingKey(term_id, block_nr); MDBX_val key{&packed, sizeof(packed)}; MDBX_val value{buffer.data(), buffer.size()}; int rc = mdbx_put(txn, blocked_term_postings_dbi_, &key, &value, MDBX_UPSERT); - if (rc != MDBX_SUCCESS) { + if(rc != MDBX_SUCCESS) { LOG_ERROR(2223, index_id_, "saveBlockEntries MDBX put failed for term " @@ -1219,14 +1251,14 @@ namespace ndd { } bool InvertedIndex::iterateTermBlocks( - MDBX_txn* txn, - uint32_t term_id, - const std::function& callback) const { + MDBX_txn* txn, + uint32_t term_id, + const std::function& callback) const { // Because keys are packed as (term_id, block_nr), all rows for one term are contiguous. // A single seek is enough to walk every block that belongs to that term. MDBX_cursor* cursor = nullptr; int rc = mdbx_cursor_open(txn, blocked_term_postings_dbi_, &cursor); - if (rc != MDBX_SUCCESS) { + if(rc != MDBX_SUCCESS) { return false; } @@ -1235,8 +1267,8 @@ namespace ndd { MDBX_val data; rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); - while (rc == MDBX_SUCCESS) { - if (key.iov_len != sizeof(uint64_t)) { + while(rc == MDBX_SUCCESS) { + if(key.iov_len != sizeof(uint64_t)) { break; } @@ -1245,15 +1277,15 @@ namespace ndd { uint32_t key_term = unpackTermId(packed_key); uint32_t block_nr = unpackBlockNr(packed_key); - if (key_term != term_id) { + if(key_term != term_id) { break; } - if (block_nr == kMetadataBlockNr) { + if(block_nr == kMetadataBlockNr) { break; } - if (!callback(block_nr, data)) { + if(!callback(block_nr, data)) { mdbx_cursor_close(cursor); return false; } @@ -1270,21 +1302,19 @@ namespace ndd { // Only needed when the previous global max may have been lowered by an in-place update // or delete. We then rescan block headers to find the true max for the term. - bool ok = iterateTermBlocks(txn, - term_id, - [&recomputed_max](uint32_t block_nr, const MDBX_val& data) { - if (data.iov_len < sizeof(BlockHeader)) { - return false; - } - const BlockHeader* header = - (const BlockHeader*)data.iov_base; - if (header->max_value > recomputed_max) { - recomputed_max = header->max_value; - } - return true; - }); - - if (!ok) { + bool ok = iterateTermBlocks( + txn, term_id, [&recomputed_max](uint32_t block_nr, const MDBX_val& data) { + if(data.iov_len < sizeof(BlockHeader)) { + return false; + } + const BlockHeader* header = (const BlockHeader*)data.iov_base; + if(header->max_value > recomputed_max) { + recomputed_max = header->max_value; + } + return true; + }); + + if(!ok) { return 0.0f; } @@ -1300,35 +1330,36 @@ namespace ndd { MDBX_txn* txn = nullptr; int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn); - if (rc != MDBX_SUCCESS) { - LOG_ERROR(2224, index_id_, "Failed to begin loadTermInfo transaction: " << mdbx_strerror(rc)); + if(rc != MDBX_SUCCESS) { + LOG_ERROR(2224, + index_id_, + "Failed to begin loadTermInfo transaction: " << mdbx_strerror(rc)); return false; } MDBX_cursor* cursor = nullptr; rc = mdbx_cursor_open(txn, blocked_term_postings_dbi_, &cursor); - if (rc != MDBX_SUCCESS) { + if(rc != MDBX_SUCCESS) { mdbx_txn_abort(txn); return false; } MDBX_val key, data; rc = mdbx_cursor_get(cursor, &key, &data, MDBX_FIRST); - while (rc == MDBX_SUCCESS) { - if (key.iov_len == sizeof(uint64_t)) { + while(rc == MDBX_SUCCESS) { + if(key.iov_len == sizeof(uint64_t)) { uint64_t packed_key; std::memcpy(&packed_key, key.iov_base, sizeof(uint64_t)); uint32_t term_id = unpackTermId(packed_key); uint32_t block_nr = unpackBlockNr(packed_key); - if (term_id != kMetadataTermId - && block_nr == kMetadataBlockNr - && data.iov_len >= sizeof(PostingListHeader)) { + if(term_id != kMetadataTermId && block_nr == kMetadataBlockNr + && data.iov_len >= sizeof(PostingListHeader)) { PostingListHeader header; std::memcpy(&header, data.iov_base, sizeof(PostingListHeader)); - if (header.nr_live_entries > 0 && header.max_value > settings::NEAR_ZERO) { + if(header.nr_live_entries > 0 && header.max_value > settings::NEAR_ZERO) { term_info_[term_id] = header.max_value; } } @@ -1348,10 +1379,10 @@ namespace ndd { // ========================================================================= bool InvertedIndex::addDocumentsBatchInternal( - MDBX_txn* txn, - const std::vector>& docs) - { - if (docs.empty()) return true; + MDBX_txn* txn, const std::vector>& docs) { + if(docs.empty()) { + return true; + } // Reorganize the batch by term so each term can be merged into its posting list // independently. The on-disk structure is term-major. @@ -1361,18 +1392,19 @@ namespace ndd { update_stats.add_batch_docs.fetch_add(docs.size(), std::memory_order_relaxed); uint64_t raw_update_count = 0; const auto build_term_updates_start = SteadyClock::now(); -#endif // ND_SPARSE_INSTRUMENT +#endif // ND_SPARSE_INSTRUMENT std::unordered_map>> term_updates; - for (const auto& [doc_id, sparse_vec] : docs) { + for(const auto& [doc_id, sparse_vec] : docs) { #ifdef ND_SPARSE_INSTRUMENT raw_update_count += sparse_vec.indices.size(); -#endif // ND_SPARSE_INSTRUMENT - for (size_t i = 0; i < sparse_vec.indices.size(); i++) { +#endif // ND_SPARSE_INSTRUMENT + for(size_t i = 0; i < sparse_vec.indices.size(); i++) { uint32_t term_id = sparse_vec.indices[i]; - if (term_id == kMetadataTermId) { - LOG_ERROR(2226, index_id_, "term_id UINT32_MAX is reserved for sparse metadata"); + if(term_id == kMetadataTermId) { + LOG_ERROR( + 2226, index_id_, "term_id UINT32_MAX is reserved for sparse metadata"); return false; } term_updates[term_id].push_back(std::make_pair(doc_id, sparse_vec.values[i])); @@ -1382,24 +1414,25 @@ namespace ndd { #ifdef ND_SPARSE_INSTRUMENT update_stats.add_batch_raw_updates.fetch_add(raw_update_count, std::memory_order_relaxed); update_stats.add_batch_terms.fetch_add(term_updates.size(), std::memory_order_relaxed); - update_stats.build_term_updates_total_ns.fetch_add( - elapsedNsSince(build_term_updates_start), std::memory_order_relaxed); -#endif // ND_SPARSE_INSTRUMENT + update_stats.build_term_updates_total_ns.fetch_add(elapsedNsSince(build_term_updates_start), + std::memory_order_relaxed); +#endif // ND_SPARSE_INSTRUMENT - for (auto& [term_id, updates] : term_updates) { + for(auto& [term_id, updates] : term_updates) { #ifdef ND_SPARSE_INSTRUMENT const auto sort_dedup_start = SteadyClock::now(); -#endif // ND_SPARSE_INSTRUMENT +#endif // ND_SPARSE_INSTRUMENT // Merge logic below assumes doc_ids are sorted and unique per term within this batch. - std::sort(updates.begin(), updates.end(), - [](const auto& a, const auto& b) { return a.first < b.first; }); + std::sort(updates.begin(), updates.end(), [](const auto& a, const auto& b) { + return a.first < b.first; + }); // Keep only the last update per doc_id if duplicates are found. std::vector> deduped; deduped.reserve(updates.size()); - for (const auto& u : updates) { - if (!deduped.empty() && deduped.back().first == u.first) { + for(const auto& u : updates) { + if(!deduped.empty() && deduped.back().first == u.first) { deduped.back().second = u.second; } else { deduped.push_back(u); @@ -1407,11 +1440,11 @@ namespace ndd { } #ifdef ND_SPARSE_INSTRUMENT - update_stats.sort_dedup_total_ns.fetch_add( - elapsedNsSince(sort_dedup_start), std::memory_order_relaxed); - update_stats.add_batch_deduped_updates.fetch_add( - deduped.size(), std::memory_order_relaxed); -#endif // ND_SPARSE_INSTRUMENT + update_stats.sort_dedup_total_ns.fetch_add(elapsedNsSince(sort_dedup_start), + std::memory_order_relaxed); + update_stats.add_batch_deduped_updates.fetch_add(deduped.size(), + std::memory_order_relaxed); +#endif // ND_SPARSE_INSTRUMENT bool header_found = false; PostingListHeader header = readPostingListHeader(txn, term_id, &header_found); @@ -1419,21 +1452,21 @@ namespace ndd { bool need_recompute_max = false; size_t ui = 0; - while (ui < deduped.size()) { + while(ui < deduped.size()) { uint32_t block_nr = docToBlockNr(deduped[ui].first); size_t block_begin = ui; - while (ui < deduped.size() && docToBlockNr(deduped[ui].first) == block_nr) { + while(ui < deduped.size() && docToBlockNr(deduped[ui].first) == block_nr) { ui++; } #ifdef ND_SPARSE_INSTRUMENT update_stats.add_batch_blocks.fetch_add(1, std::memory_order_relaxed); -#endif // ND_SPARSE_INSTRUMENT +#endif // ND_SPARSE_INSTRUMENT // One MDBX record stores exactly one (term, block_nr) slice, so split the // term's updates into block-local chunks before merging. std::vector> block_updates( - deduped.begin() + block_begin, deduped.begin() + ui); + deduped.begin() + block_begin, deduped.begin() + ui); std::vector existing; uint32_t old_live_in_block = 0; @@ -1442,7 +1475,7 @@ namespace ndd { #ifdef ND_SPARSE_INSTRUMENT const auto load_block_start = SteadyClock::now(); -#endif // ND_SPARSE_INSTRUMENT +#endif // ND_SPARSE_INSTRUMENT bool load_ok = loadBlockEntries(txn, term_id, block_nr, @@ -1452,33 +1485,33 @@ namespace ndd { &block_found); #ifdef ND_SPARSE_INSTRUMENT update_stats.load_block_calls.fetch_add(1, std::memory_order_relaxed); - update_stats.load_block_total_ns.fetch_add( - elapsedNsSince(load_block_start), std::memory_order_relaxed); - update_stats.load_block_entries_total.fetch_add( - existing.size(), std::memory_order_relaxed); -#endif // ND_SPARSE_INSTRUMENT - if (!load_ok) { + update_stats.load_block_total_ns.fetch_add(elapsedNsSince(load_block_start), + std::memory_order_relaxed); + update_stats.load_block_entries_total.fetch_add(existing.size(), + std::memory_order_relaxed); +#endif // ND_SPARSE_INSTRUMENT + if(!load_ok) { return false; } #ifdef ND_SPARSE_INSTRUMENT const auto merge_start = SteadyClock::now(); -#endif // ND_SPARSE_INSTRUMENT - // Classic merge of two sorted streams: existing postings in the block and the - // incoming updates for that same block. +#endif // ND_SPARSE_INSTRUMENT + // Classic merge of two sorted streams: existing postings in the block and the + // incoming updates for that same block. std::vector merged; merged.reserve(existing.size() + block_updates.size()); size_t ei = 0; size_t bi = 0; - while (ei < existing.size() && bi < block_updates.size()) { + while(ei < existing.size() && bi < block_updates.size()) { ndd::idInt existing_id = existing[ei].doc_id; ndd::idInt update_id = block_updates[bi].first; - if (existing_id < update_id) { + if(existing_id < update_id) { merged.push_back(existing[ei]); ei++; - } else if (existing_id > update_id) { + } else if(existing_id > update_id) { merged.push_back(PostingListEntry(update_id, block_updates[bi].second)); bi++; } else { @@ -1487,23 +1520,25 @@ namespace ndd { bi++; } } - while (ei < existing.size()) { + while(ei < existing.size()) { merged.push_back(existing[ei]); ei++; } - while (bi < block_updates.size()) { - merged.push_back(PostingListEntry(block_updates[bi].first, - block_updates[bi].second)); + while(bi < block_updates.size()) { + merged.push_back( + PostingListEntry(block_updates[bi].first, block_updates[bi].second)); bi++; } uint32_t new_live_in_block = 0; float new_block_max = 0.0f; - for (const auto& e : merged) { - if (e.value > 0.0f) { + for(const auto& e : merged) { + if(e.value > 0.0f) { new_live_in_block++; - if (e.value > new_block_max) new_block_max = e.value; - } else if (e.value == 0.0f) { + if(e.value > new_block_max) { + new_block_max = e.value; + } + } else if(e.value == 0.0f) { LOG_WARN(2227, index_id_, "addDocumentsBatch received zero value for term " @@ -1511,56 +1546,54 @@ namespace ndd { } else { LOG_WARN(2228, index_id_, - "addDocumentsBatch received negative value " << e.value - << " for term " << term_id - << "; treating as dead"); + "addDocumentsBatch received negative value " + << e.value << " for term " << term_id + << "; treating as dead"); } } #ifdef ND_SPARSE_INSTRUMENT update_stats.merge_block_calls.fetch_add(1, std::memory_order_relaxed); - update_stats.merge_block_total_ns.fetch_add( - elapsedNsSince(merge_start), std::memory_order_relaxed); - update_stats.merge_existing_entries_total.fetch_add( - existing.size(), std::memory_order_relaxed); - update_stats.merge_update_entries_total.fetch_add( - block_updates.size(), std::memory_order_relaxed); - update_stats.merge_output_entries_total.fetch_add( - merged.size(), std::memory_order_relaxed); -#endif // ND_SPARSE_INSTRUMENT + update_stats.merge_block_total_ns.fetch_add(elapsedNsSince(merge_start), + std::memory_order_relaxed); + update_stats.merge_existing_entries_total.fetch_add(existing.size(), + std::memory_order_relaxed); + update_stats.merge_update_entries_total.fetch_add(block_updates.size(), + std::memory_order_relaxed); + update_stats.merge_output_entries_total.fetch_add(merged.size(), + std::memory_order_relaxed); +#endif // ND_SPARSE_INSTRUMENT uint32_t old_total = static_cast(existing.size()); uint32_t new_total = static_cast(merged.size()); applyHeaderDelta(header, - static_cast(new_total) - static_cast(old_total), - static_cast(new_live_in_block) - - static_cast(old_live_in_block)); + static_cast(new_total) - static_cast(old_total), + static_cast(new_live_in_block) + - static_cast(old_live_in_block)); - if (merged.empty()) { - if (!deleteBlock(txn, term_id, block_nr)) return false; + if(merged.empty()) { + if(!deleteBlock(txn, term_id, block_nr)) { + return false; + } } else { #ifdef ND_SPARSE_INSTRUMENT const auto save_block_start = SteadyClock::now(); -#endif // ND_SPARSE_INSTRUMENT - bool save_ok = saveBlockEntries(txn, - term_id, - block_nr, - merged, - new_live_in_block, - new_block_max); +#endif // ND_SPARSE_INSTRUMENT + bool save_ok = saveBlockEntries( + txn, term_id, block_nr, merged, new_live_in_block, new_block_max); #ifdef ND_SPARSE_INSTRUMENT update_stats.save_block_calls.fetch_add(1, std::memory_order_relaxed); - update_stats.save_block_total_ns.fetch_add( - elapsedNsSince(save_block_start), std::memory_order_relaxed); - update_stats.save_block_entries_total.fetch_add( - merged.size(), std::memory_order_relaxed); -#endif // ND_SPARSE_INSTRUMENT - if (!save_ok) { + update_stats.save_block_total_ns.fetch_add(elapsedNsSince(save_block_start), + std::memory_order_relaxed); + update_stats.save_block_entries_total.fetch_add(merged.size(), + std::memory_order_relaxed); +#endif // ND_SPARSE_INSTRUMENT + if(!save_ok) { return false; } } - if (new_block_max > header.max_value) { + if(new_block_max > header.max_value) { header.max_value = new_block_max; } @@ -1573,40 +1606,42 @@ namespace ndd { * recompute global max once all the blocks have been updated * from this document batch. */ - if (old_block_max > 0.0f && nearEqual(old_block_max, old_global_max) - && new_block_max + settings::NEAR_ZERO < old_global_max) { + if(old_block_max > 0.0f && nearEqual(old_block_max, old_global_max) + && new_block_max + settings::NEAR_ZERO < old_global_max) { need_recompute_max = true; } } - if (header.nr_entries == 0) { - if (!deletePostingListHeader(txn, term_id)) return false; - term_info_.erase(term_id); - continue; + if(header.nr_entries == 0) { + if(!deletePostingListHeader(txn, term_id)) { + return false; } + term_info_.erase(term_id); + continue; + } // Recompute the term max only when the previous max might have been invalidated. - if (need_recompute_max) { + if(need_recompute_max) { #ifdef ND_SPARSE_INSTRUMENT const auto recompute_max_start = SteadyClock::now(); -#endif // ND_SPARSE_INSTRUMENT +#endif // ND_SPARSE_INSTRUMENT header.max_value = recomputeGlobalMaxFromBlocks(txn, term_id); #ifdef ND_SPARSE_INSTRUMENT update_stats.recompute_max_calls.fetch_add(1, std::memory_order_relaxed); - update_stats.recompute_max_total_ns.fetch_add( - elapsedNsSince(recompute_max_start), std::memory_order_relaxed); -#endif // ND_SPARSE_INSTRUMENT - } //while (ui < deduped.size()) + update_stats.recompute_max_total_ns.fetch_add(elapsedNsSince(recompute_max_start), + std::memory_order_relaxed); +#endif // ND_SPARSE_INSTRUMENT + } //while (ui < deduped.size()) - if (header.nr_live_entries == 0) { + if(header.nr_live_entries == 0) { header.max_value = 0.0f; } - if (!writePostingListHeader(txn, term_id, header)) { + if(!writePostingListHeader(txn, term_id, header)) { return false; } - if (header.nr_live_entries > 0 && header.max_value > settings::NEAR_ZERO) { + if(header.nr_live_entries > 0 && header.max_value > settings::NEAR_ZERO) { term_info_[term_id] = header.max_value; } else { term_info_.erase(term_id); @@ -1616,22 +1651,24 @@ namespace ndd { return true; } - bool InvertedIndex::removeDocumentInternal(MDBX_txn* txn, - ndd::idInt doc_id, - const SparseVector& vec) - { + ndd::idInt doc_id, + const SparseVector& vec) { /** * NOTE: This can be slow right now since we provide a single vector to delete * at once. It should ideally be faster with a batch. */ - for (size_t i = 0; i < vec.indices.size(); i++) { + for(size_t i = 0; i < vec.indices.size(); i++) { uint32_t term_id = vec.indices[i]; - if (term_id == kMetadataTermId) continue; + if(term_id == kMetadataTermId) { + continue; + } bool header_found = false; PostingListHeader header = readPostingListHeader(txn, term_id, &header_found); - if (!header_found || header.nr_entries == 0) continue; + if(!header_found || header.nr_entries == 0) { + continue; + } uint32_t block_nr = docToBlockNr(doc_id); @@ -1640,33 +1677,35 @@ namespace ndd { float old_block_max = 0.0f; bool block_found = false; - if (!loadBlockEntries(txn, - term_id, - block_nr, - &entries, - &old_live_in_block, - &old_block_max, - &block_found)) { + if(!loadBlockEntries(txn, + term_id, + block_nr, + &entries, + &old_live_in_block, + &old_block_max, + &block_found)) { return false; } - if (!block_found || entries.empty()) continue; + if(!block_found || entries.empty()) { + continue; + } size_t lo = 0; size_t hi = entries.size(); - while (lo < hi) { + while(lo < hi) { size_t mid = lo + (hi - lo) / 2; - if (entries[mid].doc_id < doc_id) { + if(entries[mid].doc_id < doc_id) { lo = mid + 1; } else { hi = mid; } } - if (lo >= entries.size() || entries[lo].doc_id != doc_id) { + if(lo >= entries.size() || entries[lo].doc_id != doc_id) { continue; } - if (entries[lo].value <= 0.0f) { + if(entries[lo].value <= 0.0f) { continue; } @@ -1677,15 +1716,15 @@ namespace ndd { uint32_t new_live_in_block = old_live_in_block > 0 ? old_live_in_block - 1 : 0; uint32_t old_total = static_cast(entries.size()); - float tombstone_ratio = old_total > 0 - ? (float)(old_total - new_live_in_block) / (float)old_total - : 0.0f; + float tombstone_ratio = + old_total > 0 ? (float)(old_total - new_live_in_block) / (float)old_total + : 0.0f; - if (tombstone_ratio >= settings::INV_IDX_COMPACTION_TOMBSTONE_RATIO) { + if(tombstone_ratio >= settings::INV_IDX_COMPACTION_TOMBSTONE_RATIO) { //Compact deleted entries size_t write = 0; - for (size_t j = 0; j < entries.size(); j++) { - if (entries[j].value > 0.0f) { + for(size_t j = 0; j < entries.size(); j++) { + if(entries[j].value > 0.0f) { entries[write++] = entries[j]; } } @@ -1694,57 +1733,59 @@ namespace ndd { new_live_in_block = 0; float new_block_max = 0.0f; - for (const auto& e : entries) { - if (e.value > 0.0f) { + for(const auto& e : entries) { + if(e.value > 0.0f) { new_live_in_block++; - if (e.value > new_block_max) new_block_max = e.value; + if(e.value > new_block_max) { + new_block_max = e.value; + } } } uint32_t new_total = static_cast(entries.size()); applyHeaderDelta(header, - static_cast(new_total) - static_cast(old_total), - static_cast(new_live_in_block) - - static_cast(old_live_in_block)); + static_cast(new_total) - static_cast(old_total), + static_cast(new_live_in_block) + - static_cast(old_live_in_block)); bool need_recompute_max = false; - if (old_block_max > 0.0f && nearEqual(old_block_max, header.max_value) - && new_block_max + settings::NEAR_ZERO < header.max_value) { + if(old_block_max > 0.0f && nearEqual(old_block_max, header.max_value) + && new_block_max + settings::NEAR_ZERO < header.max_value) { need_recompute_max = true; } - if (entries.empty()) { - if (!deleteBlock(txn, term_id, block_nr)) return false; + if(entries.empty()) { + if(!deleteBlock(txn, term_id, block_nr)) { + return false; + } } else { - if (!saveBlockEntries(txn, - term_id, - block_nr, - entries, - new_live_in_block, - new_block_max)) { + if(!saveBlockEntries( + txn, term_id, block_nr, entries, new_live_in_block, new_block_max)) { return false; } } - if (header.nr_entries == 0) { - if (!deletePostingListHeader(txn, term_id)) return false; + if(header.nr_entries == 0) { + if(!deletePostingListHeader(txn, term_id)) { + return false; + } term_info_.erase(term_id); continue; } - if (need_recompute_max) { + if(need_recompute_max) { header.max_value = recomputeGlobalMaxFromBlocks(txn, term_id); } - if (header.nr_live_entries == 0) { + if(header.nr_live_entries == 0) { header.max_value = 0.0f; } - if (!writePostingListHeader(txn, term_id, header)) { + if(!writePostingListHeader(txn, term_id, header)) { return false; } - if (header.nr_live_entries > 0 && header.max_value > settings::NEAR_ZERO) { + if(header.nr_live_entries > 0 && header.max_value > settings::NEAR_ZERO) { term_info_[term_id] = header.max_value; } else { term_info_.erase(term_id); @@ -1758,62 +1799,65 @@ namespace ndd { // Pruning // ========================================================================= - void InvertedIndex::pruneLongest(std::vector& iters, - float min_score) - { - if (iters.size() < 2) return; + void InvertedIndex::pruneLongest(std::vector& iters, float min_score) { + if(iters.size() < 2) { + return; + } // Pruning only ever advances the single longest remaining list. That keeps the rule // simple: if even its maximum possible future contribution cannot beat the current // threshold, skip ahead to where the other lists resume. size_t longest_idx = 0; uint32_t longest_rem = 0; - for (size_t i = 0; i < iters.size(); i++) { + for(size_t i = 0; i < iters.size(); i++) { uint32_t rem = iters[i]->remainingEntries(); - if (rem > longest_rem) { + if(rem > longest_rem) { longest_rem = rem; longest_idx = i; } } - if (longest_idx != 0) { + if(longest_idx != 0) { PostingListIterator* tmp = iters[0]; iters[0] = iters[longest_idx]; iters[longest_idx] = tmp; } PostingListIterator* longest = iters[0]; - if (longest->current_doc_id == EXHAUSTED_DOC_ID) return; + if(longest->current_doc_id == EXHAUSTED_DOC_ID) { + return; + } ndd::idInt longest_doc = longest->current_doc_id; ndd::idInt others_min_doc_id = EXHAUSTED_DOC_ID; - for (size_t i = 1; i < iters.size(); i++) { - if (iters[i]->current_doc_id < others_min_doc_id) { + for(size_t i = 1; i < iters.size(); i++) { + if(iters[i]->current_doc_id < others_min_doc_id) { others_min_doc_id = iters[i]->current_doc_id; } } - if (others_min_doc_id <= longest_doc) return; + if(others_min_doc_id <= longest_doc) { + return; + } float max_possible = longest->upperBound(); - if (max_possible <= min_score) { + if(max_possible <= min_score) { #ifdef NDD_INV_IDX_PRUNE_DEBUG uint32_t remaining_before_prune = longest->remaining_entries; -#endif // NDD_INV_IDX_PRUNE_DEBUG - if (others_min_doc_id == EXHAUSTED_DOC_ID) { +#endif // NDD_INV_IDX_PRUNE_DEBUG + if(others_min_doc_id == EXHAUSTED_DOC_ID) { longest->current_doc_id = EXHAUSTED_DOC_ID; longest->remaining_entries = 0; } else { longest->advance(others_min_doc_id); } #ifdef NDD_INV_IDX_PRUNE_DEBUG - if (remaining_before_prune > longest->remaining_entries) { - longest->pruned_entries += - (remaining_before_prune - longest->remaining_entries); + if(remaining_before_prune > longest->remaining_entries) { + longest->pruned_entries += (remaining_before_prune - longest->remaining_entries); } -#endif // NDD_INV_IDX_PRUNE_DEBUG +#endif // NDD_INV_IDX_PRUNE_DEBUG } } @@ -1822,11 +1866,11 @@ namespace ndd { // ========================================================================= void InvertedIndex::PostingListIterator::init(MDBX_cursor* cursor_in, - uint32_t tid, - float tw, - float gmax, - uint32_t total_entries, - const InvertedIndex* idx) { + uint32_t tid, + float tw, + float gmax, + uint32_t total_entries, + const InvertedIndex* idx) { cursor = cursor_in; term_id = tid; term_weight = tw; @@ -1846,11 +1890,11 @@ namespace ndd { #ifdef NDD_INV_IDX_PRUNE_DEBUG initial_entries = total_entries; pruned_entries = 0; -#endif // NDD_INV_IDX_PRUNE_DEBUG +#endif // NDD_INV_IDX_PRUNE_DEBUG // Position the iterator on the first non-empty block and then on the first live entry // inside that block. - if (!loadFirstBlock()) { + if(!loadFirstBlock()) { current_doc_id = EXHAUSTED_DOC_ID; remaining_entries = 0; return; @@ -1864,8 +1908,8 @@ namespace ndd { const MDBX_val& data) { #ifdef ND_SPARSE_INSTRUMENT ParseCurrentKVTimer parse_timer; -#endif // ND_SPARSE_INSTRUMENT - if (key.iov_len != sizeof(uint64_t)) { +#endif // ND_SPARSE_INSTRUMENT + if(key.iov_len != sizeof(uint64_t)) { return false; } @@ -1874,12 +1918,12 @@ namespace ndd { uint32_t key_term = unpackTermId(packed_key); uint32_t block_nr = unpackBlockNr(packed_key); - if (key_term != term_id || block_nr == kMetadataBlockNr) { + if(key_term != term_id || block_nr == kMetadataBlockNr) { return false; } BlockView view; - if (!index->parseBlockViewFromValue(data, block_nr, &view)) { + if(!index->parseBlockViewFromValue(data, block_nr, &view)) { return false; } @@ -1903,23 +1947,25 @@ namespace ndd { // Seek once into the contiguous key range for this term, then skip any empty blocks. int rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); - while (rc == MDBX_SUCCESS) { - if (key.iov_len != sizeof(uint64_t)) return false; + while(rc == MDBX_SUCCESS) { + if(key.iov_len != sizeof(uint64_t)) { + return false; + } uint64_t packed_key; std::memcpy(&packed_key, key.iov_base, sizeof(uint64_t)); uint32_t key_term = unpackTermId(packed_key); uint32_t block_nr = unpackBlockNr(packed_key); - if (key_term != term_id || block_nr == kMetadataBlockNr) { + if(key_term != term_id || block_nr == kMetadataBlockNr) { return false; } - if (!parseCurrentKV(key, data)) { + if(!parseCurrentKV(key, data)) { return false; } - if (data_size == 0) { + if(data_size == 0) { rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT); continue; } @@ -1938,8 +1984,8 @@ namespace ndd { // Stop as soon as the cursor leaves this term's key range. The next term or metadata row // belongs to a different posting list. - while (rc == MDBX_SUCCESS) { - if (key.iov_len != sizeof(uint64_t)) { + while(rc == MDBX_SUCCESS) { + if(key.iov_len != sizeof(uint64_t)) { current_doc_id = EXHAUSTED_DOC_ID; data_size = 0; return false; @@ -1950,19 +1996,19 @@ namespace ndd { uint32_t key_term = unpackTermId(packed_key); uint32_t block_nr = unpackBlockNr(packed_key); - if (key_term != term_id || block_nr == kMetadataBlockNr) { + if(key_term != term_id || block_nr == kMetadataBlockNr) { current_doc_id = EXHAUSTED_DOC_ID; data_size = 0; return false; } - if (!parseCurrentKV(key, data)) { + if(!parseCurrentKV(key, data)) { current_doc_id = EXHAUSTED_DOC_ID; data_size = 0; return false; } - if (data_size == 0) { + if(data_size == 0) { rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT); continue; } @@ -1977,31 +2023,29 @@ namespace ndd { void InvertedIndex::PostingListIterator::advanceToNextLive() { // LOG_TIME("advanceToNextLive"); //this function is also not slow - while (true) { - if (value_bits == 32) { + while(true) { + if(value_bits == 32) { const float* vals = (const float*)values_ptr; - while (current_entry_idx < data_size && vals[current_entry_idx] <= 0.0f) { + while(current_entry_idx < data_size && vals[current_entry_idx] <= 0.0f) { consumeEntries(1); current_entry_idx++; } } else { uint32_t next_live = static_cast(index->findNextLiveSIMD( - (const uint8_t*)values_ptr, - data_size, - current_entry_idx)); + (const uint8_t*)values_ptr, data_size, current_entry_idx)); consumeEntries(next_live - current_entry_idx); current_entry_idx = next_live; } - if (current_entry_idx < data_size) { + if(current_entry_idx < data_size) { // Found the next non-zero value in the current block. current_doc_id = docIdAt(current_entry_idx); return; } - // Current block is exhausted; keep scanning forward until we find another non-empty block - // or run out of rows for this term. - if (!loadNextBlock()) { + // Current block is exhausted; keep scanning forward until we find another non-empty + // block or run out of rows for this term. + if(!loadNextBlock()) { current_doc_id = EXHAUSTED_DOC_ID; return; } @@ -2011,25 +2055,29 @@ namespace ndd { } void InvertedIndex::PostingListIterator::next() { - if (current_doc_id == EXHAUSTED_DOC_ID) return; + if(current_doc_id == EXHAUSTED_DOC_ID) { + return; + } consumeEntries(1); current_entry_idx++; advanceToNextLive(); } void InvertedIndex::PostingListIterator::advance(ndd::idInt target_doc_id) { - if (current_doc_id == EXHAUSTED_DOC_ID || current_doc_id >= target_doc_id) { + if(current_doc_id == EXHAUSTED_DOC_ID || current_doc_id >= target_doc_id) { return; } - while (true) { - if (current_doc_id == EXHAUSTED_DOC_ID) return; + while(true) { + if(current_doc_id == EXHAUSTED_DOC_ID) { + return; + } const uint32_t target_block_nr = docToBlockNr(target_doc_id); - if (current_block_nr < target_block_nr) { + if(current_block_nr < target_block_nr) { // Target is in a later block, so skip the remainder of the current block at once. consumeEntries(data_size - current_entry_idx); - if (!loadNextBlock()) { + if(!loadNextBlock()) { current_doc_id = EXHAUSTED_DOC_ID; break; } @@ -2037,7 +2085,7 @@ namespace ndd { continue; } - if (current_block_nr > target_block_nr) { + if(current_block_nr > target_block_nr) { current_entry_idx = 0; advanceToNextLive(); break; @@ -2048,8 +2096,7 @@ namespace ndd { // doc_id without decoding the entire block into absolute ids. const BlockOffset* begin = doc_offsets + current_entry_idx; const BlockOffset* end = doc_offsets + data_size; - const BlockOffset* next = - std::lower_bound(begin, end, target_offset); + const BlockOffset* next = std::lower_bound(begin, end, target_offset); uint32_t next_idx = static_cast(next - doc_offsets); consumeEntries(next_idx - current_entry_idx); diff --git a/src/sparse/inverted_index.hpp b/src/sparse/inverted_index.hpp index 0627cf7a3..be2f9c87f 100644 --- a/src/sparse/inverted_index.hpp +++ b/src/sparse/inverted_index.hpp @@ -20,7 +20,7 @@ # include # endif # include -#endif // defined(__x86_64__) || defined(_M_X64) +#endif // defined(__x86_64__) || defined(_M_X64) #include "mdbx/mdbx.h" #include "../core/types.hpp" @@ -59,15 +59,21 @@ namespace ndd { ndd::idInt doc_id; float value; - PostingListEntry() : doc_id(0), value(0.0f) {} - PostingListEntry(ndd::idInt id, float val) : doc_id(id), value(val) {} + PostingListEntry() : + doc_id(0), + value(0.0f) {} + PostingListEntry(ndd::idInt id, float val) : + doc_id(id), + value(val) {} }; struct ScoredDoc { ndd::idInt doc_id; float score; - ScoredDoc(ndd::idInt id, float s) : doc_id(id), score(s) {} + ScoredDoc(ndd::idInt id, float s) : + doc_id(id), + score(s) {} bool operator<(const ScoredDoc& other) const { // Reverse ordering so std::priority_queue behaves like a min-heap on score. @@ -86,16 +92,15 @@ namespace ndd { bool initialize(); bool addDocumentsBatch(MDBX_txn* txn, - const std::vector>& docs); + const std::vector>& docs); bool removeDocument(MDBX_txn* txn, ndd::idInt doc_id, const SparseVector& vec); size_t getTermCount() const; size_t getVocabSize() const; - std::vector>search(const SparseVector& query, - size_t k, - const ndd::RoaringBitmap* filter = nullptr); + std::vector> + search(const SparseVector& query, size_t k, const ndd::RoaringBitmap* filter = nullptr); private: friend class InvertedIndexTestPeer; @@ -148,8 +153,7 @@ namespace ndd { } static inline ndd::idInt blockOffsetToDocId(uint32_t block_nr, BlockOffset block_offset) { - uint64_t base = static_cast(block_nr) - * static_cast(kBlockCapacity); + uint64_t base = static_cast(block_nr) * static_cast(kBlockCapacity); return static_cast(base + static_cast(block_offset)); } @@ -192,43 +196,41 @@ namespace ndd { #ifdef NDD_INV_IDX_PRUNE_DEBUG uint32_t initial_entries; uint32_t pruned_entries; -#endif // NDD_INV_IDX_PRUNE_DEBUG +#endif // NDD_INV_IDX_PRUNE_DEBUG void init(MDBX_cursor* cursor, - uint32_t term_id, - float term_weight, - float global_max, - uint32_t total_entries, - const InvertedIndex* index); + uint32_t term_id, + float term_weight, + float global_max, + uint32_t total_entries, + const InvertedIndex* index); inline float valueAt(uint32_t idx) const { - if (value_bits == 32) { + if(value_bits == 32) { return ((const float*)values_ptr)[idx]; } return dequantize(((const uint8_t*)values_ptr)[idx], max_value); } inline bool isLiveAt(uint32_t idx) const { - if (value_bits == 32) { + if(value_bits == 32) { return ((const float*)values_ptr)[idx] > 0.0f; } return ((const uint8_t*)values_ptr)[idx] > 0; } - inline float currentValue() const { - return valueAt(current_entry_idx); - } + inline float currentValue() const { return valueAt(current_entry_idx); } void advanceToNextLive(); void next(); void advance(ndd::idInt target_doc_id); - float upperBound() const { - return global_max * term_weight; - } + float upperBound() const { return global_max * term_weight; } uint32_t remainingEntries() const { - if (current_doc_id == EXHAUSTED_DOC_ID) return 0; + if(current_doc_id == EXHAUSTED_DOC_ID) { + return 0; + } return remaining_entries; } @@ -238,7 +240,7 @@ namespace ndd { inline void consumeEntries(uint32_t count) { // Pruning relies on remaining_entries being conservative and monotonic. - if (count >= remaining_entries) { + if(count >= remaining_entries) { remaining_entries = 0; } else { remaining_entries -= count; @@ -255,19 +257,19 @@ namespace ndd { private: static inline float dequantize(uint8_t val, float max_val) { - if (max_val <= settings::NEAR_ZERO) return 0.0f; + if(max_val <= settings::NEAR_ZERO) { + return 0.0f; + } return (float)val * (max_val / UINT8_MAX); } }; size_t findDocIdSIMD(const uint32_t* doc_ids, - size_t size, - size_t start_idx, - uint32_t target) const; + size_t size, + size_t start_idx, + uint32_t target) const; - size_t findNextLiveSIMD(const uint8_t* values, - size_t size, - size_t start_idx) const; + size_t findNextLiveSIMD(const uint8_t* values, size_t size, size_t start_idx) const; template static bool accumulateBatchScores(PostingListIterator* it, @@ -277,47 +279,43 @@ namespace ndd { float* scores_buf, float term_weight); - PostingListHeader readPostingListHeader(MDBX_txn* txn, - uint32_t term_id, - bool* out_found = nullptr) const; + PostingListHeader + readPostingListHeader(MDBX_txn* txn, uint32_t term_id, bool* out_found = nullptr) const; - bool writePostingListHeader(MDBX_txn* txn, - uint32_t term_id, - const PostingListHeader& header); + bool + writePostingListHeader(MDBX_txn* txn, uint32_t term_id, const PostingListHeader& header); bool deletePostingListHeader(MDBX_txn* txn, uint32_t term_id); bool loadBlockEntries(MDBX_txn* txn, - uint32_t term_id, - uint32_t block_nr, - std::vector* entries, - uint32_t* out_live_in_block, - float* out_max_value, - bool* out_found) const; + uint32_t term_id, + uint32_t block_nr, + std::vector* entries, + uint32_t* out_live_in_block, + float* out_max_value, + bool* out_found) const; bool saveBlockEntries(MDBX_txn* txn, - uint32_t term_id, - uint32_t block_nr, - const std::vector& entries, - uint32_t live_in_block, - float max_val); + uint32_t term_id, + uint32_t block_nr, + const std::vector& entries, + uint32_t live_in_block, + float max_val); bool deleteBlock(MDBX_txn* txn, uint32_t term_id, uint32_t block_nr); - bool parseBlockViewFromValue(const MDBX_val& data, - uint32_t block_nr, - BlockView* out_view) const; + bool + parseBlockViewFromValue(const MDBX_val& data, uint32_t block_nr, BlockView* out_view) const; bool iterateTermBlocks( - MDBX_txn* txn, - uint32_t term_id, - const std::function& callback) const; + MDBX_txn* txn, + uint32_t term_id, + const std::function& callback) const; float recomputeGlobalMaxFromBlocks(MDBX_txn* txn, uint32_t term_id) const; - static void applyHeaderDelta(PostingListHeader& header, - int64_t total_delta, - int64_t live_delta); + static void + applyHeaderDelta(PostingListHeader& header, int64_t total_delta, int64_t live_delta); bool loadTermInfo(); @@ -325,13 +323,11 @@ namespace ndd { bool writeSuperBlock(MDBX_txn* txn, const SuperBlock& sb); bool validateSuperBlock(MDBX_txn* txn); - bool addDocumentsBatchInternal( - MDBX_txn* txn, - const std::vector>& docs); + bool + addDocumentsBatchInternal(MDBX_txn* txn, + const std::vector>& docs); - bool removeDocumentInternal(MDBX_txn* txn, - ndd::idInt doc_id, - const SparseVector& vec); + bool removeDocumentInternal(MDBX_txn* txn, ndd::idInt doc_id, const SparseVector& vec); void pruneLongest(std::vector& iters, float min_score); }; diff --git a/src/sparse/sparse_storage.hpp b/src/sparse/sparse_storage.hpp index e55c48e44..9c5010a65 100644 --- a/src/sparse/sparse_storage.hpp +++ b/src/sparse/sparse_storage.hpp @@ -40,7 +40,8 @@ namespace ndd { updateVectorCount(); LOG_INFO(2241, index_id_, - "SparseVectorStorage initialized at " << db_path_ << " with " << vector_count_ << " vectors"); + "SparseVectorStorage initialized at " << db_path_ << " with " << vector_count_ + << " vectors"); return true; } @@ -56,7 +57,7 @@ namespace ndd { storage_->env_, nullptr, static_cast(flags), &txn_); if(rc != 0) { throw std::runtime_error("Failed to begin transaction: " - + std::string(mdbx_strerror(rc))); + + std::string(mdbx_strerror(rc))); } } @@ -113,7 +114,6 @@ namespace ndd { return storage_->getVectorInternal(txn_, doc_id); } - bool delete_vector(ndd::idInt doc_id) { if(read_only_) { return false; @@ -123,7 +123,9 @@ namespace ndd { // terms from the inverted index, then delete the raw payload row. auto vec = get_vector(doc_id); if(!vec) { - LOG_WARN(2242, storage_->index_id_, "delete_vector could not find doc_id=" << doc_id); + LOG_WARN(2242, + storage_->index_id_, + "delete_vector could not find doc_id=" << doc_id); return false; } @@ -171,7 +173,9 @@ namespace ndd { for(const auto& [doc_id, sparse_vec] : batch) { if(!storeVectorInternal(txn->getTxn(), doc_id, sparse_vec)) { - LOG_ERROR(2243, index_id_, "store_vectors_batch failed to store doc_id=" << doc_id); + LOG_ERROR(2243, + index_id_, + "store_vectors_batch failed to store doc_id=" << doc_id); txn->abort(); return false; } @@ -212,12 +216,10 @@ namespace ndd { } return txn->commit(); } -#endif //if 0 +#endif //if 0 - std::vector> search(const SparseVector& query, - size_t k, - const ndd::RoaringBitmap* filter = nullptr) - { + std::vector> + search(const SparseVector& query, size_t k, const ndd::RoaringBitmap* filter = nullptr) { return sparse_index_->search(query, k, filter); } @@ -262,7 +264,9 @@ namespace ndd { std::error_code ec; std::filesystem::create_directories(db_path_, ec); if(ec) { - LOG_ERROR(2248, index_id_, "create_directories failed for " << db_path_ << ": " << ec.message()); + LOG_ERROR(2248, + index_id_, + "create_directories failed for " << db_path_ << ": " << ec.message()); return false; } @@ -286,7 +290,9 @@ namespace ndd { rc = mdbx_dbi_open(txn, "sparse_docs", MDBX_CREATE | MDBX_INTEGERKEY, &docs_dbi_); if(rc != 0) { - LOG_ERROR(2251, index_id_, "mdbx_dbi_open failed for sparse_docs: " << mdbx_strerror(rc)); + LOG_ERROR(2251, + index_id_, + "mdbx_dbi_open failed for sparse_docs: " << mdbx_strerror(rc)); mdbx_txn_abort(txn); return false; } @@ -315,11 +321,11 @@ namespace ndd { data.iov_len = packed.size(); int rc = mdbx_put(txn, docs_dbi_, &key, &data, MDBX_UPSERT); - if (rc != 0) { + if(rc != 0) { LOG_ERROR(2253, index_id_, - "storeVectorInternal MDBX put failed for doc_id=" - << doc_id << ": " << mdbx_strerror(rc)); + "storeVectorInternal MDBX put failed for doc_id=" << doc_id << ": " + << mdbx_strerror(rc)); } return rc == 0; } @@ -341,7 +347,7 @@ namespace ndd { key.iov_base = &doc_id; key.iov_len = sizeof(ndd::idInt); int rc = mdbx_del(txn, docs_dbi_, &key, nullptr); - if (rc != 0 && rc != MDBX_NOTFOUND) { + if(rc != 0 && rc != MDBX_NOTFOUND) { LOG_ERROR(2254, index_id_, "deleteVectorInternal MDBX delete failed for doc_id=" diff --git a/src/sparse/sparse_vector.hpp b/src/sparse/sparse_vector.hpp index e93019b1d..d3dab895a 100644 --- a/src/sparse/sparse_vector.hpp +++ b/src/sparse/sparse_vector.hpp @@ -20,7 +20,8 @@ namespace ndd { // Constructor from packed data SparseVector(const uint8_t* data, size_t data_size) { if(data_size < sizeof(uint16_t)) { - throw std::runtime_error("Invalid packed data: insufficient size for nr_nonzero field"); + throw std::runtime_error( + "Invalid packed data: insufficient size for nr_nonzero field"); } const uint8_t* ptr = data; @@ -32,7 +33,8 @@ namespace ndd { ptr += sizeof(uint16_t); // Validate remaining data size: nr_nonzero * (4 + 2) bytes - size_t expected_size = sizeof(uint16_t) + (nr_nonzero * (sizeof(uint32_t) + sizeof(uint16_t))); + size_t expected_size = + sizeof(uint16_t) + (nr_nonzero * (sizeof(uint32_t) + sizeof(uint16_t))); if(data_size != expected_size) { throw std::runtime_error("Invalid packed data: size mismatch"); } @@ -74,8 +76,8 @@ namespace ndd { uint16_t nr_nonzero = static_cast(indices.size()); // Calculate total size: nr_nonzero(2) + term_ids(4*nr_nonzero) + values(2*nr_nonzero) - size_t total_size = - sizeof(uint16_t) + (nr_nonzero * sizeof(uint32_t)) + (nr_nonzero * sizeof(uint16_t)); + size_t total_size = sizeof(uint16_t) + (nr_nonzero * sizeof(uint32_t)) + + (nr_nonzero * sizeof(uint16_t)); // Serialize contiguously so the vector can be written to MDBX as one value blob. std::vector packed(total_size); @@ -173,7 +175,7 @@ namespace ndd { float dot(const std::vector& packed_data) const { return dot(packed_data.data(), packed_data.size()); } -#endif //if 0 +#endif //if 0 // Utility methods bool empty() const { return indices.empty(); } diff --git a/src/storage/backup_store.hpp b/src/storage/backup_store.hpp index 9600c2d96..aa7c64cbc 100644 --- a/src/storage/backup_store.hpp +++ b/src/storage/backup_store.hpp @@ -31,8 +31,8 @@ class BackupStore { mutable std::mutex backup_state_mutex_; public: - BackupStore(const std::string& data_dir) - : data_dir_(data_dir) { + BackupStore(const std::string& data_dir) : + data_dir_(data_dir) { std::filesystem::create_directories(data_dir + "/backups"); cleanupTempDir(); } @@ -144,14 +144,16 @@ class BackupStore { nlohmann::json readBackupJson(const std::string& username) { std::string path = getBackupJsonPath(username); - if (!std::filesystem::exists(path)) return nlohmann::json::object(); + if(!std::filesystem::exists(path)) { + return nlohmann::json::object(); + } try { std::ifstream f(path); return nlohmann::json::parse(f); - } catch (const std::exception& e) { + } catch(const std::exception& e) { LOG_WARN(1304, - username, - "Failed to parse backup metadata file " << path << ": " << e.what()); + username, + "Failed to parse backup metadata file " << path << ": " << e.what()); return nlohmann::json::object(); } } @@ -166,11 +168,11 @@ class BackupStore { void cleanupTempDir() { std::string temp_dir = data_dir_ + "/backups/.tmp"; - if (std::filesystem::exists(temp_dir)) { + if(std::filesystem::exists(temp_dir)) { try { std::filesystem::remove_all(temp_dir); LOG_INFO(1301, "Cleaned up backup temp directory"); - } catch (const std::exception& e) { + } catch(const std::exception& e) { LOG_ERROR(1302, "Failed to clean up backup temp directory: " << e.what()); } } @@ -178,7 +180,9 @@ class BackupStore { // Active backup tracking - void setActiveBackup(const std::string& username, const std::string& index_id, const std::string& backup_name) { + void setActiveBackup(const std::string& username, + const std::string& index_id, + const std::string& backup_name) { std::lock_guard lock(backup_state_mutex_); active_user_backups_[username] = {index_id, backup_name}; } @@ -231,8 +235,8 @@ class BackupStore { if(entry.is_regular_file()) { std::string filename = entry.path().filename().string(); - if(filename.size() > 4 && filename.substr(filename.size() - 4) == ".tar" && - !filename.starts_with(".tmp_")) { + if(filename.size() > 4 && filename.substr(filename.size() - 4) == ".tar" + && !filename.starts_with(".tmp_")) { std::string backup_name = filename.substr(0, filename.size() - 4); backups.push_back(backup_name); } @@ -244,7 +248,7 @@ class BackupStore { // Backup deletion std::pair deleteBackup(const std::string& backup_name, - const std::string& username) { + const std::string& username) { std::pair result = validateBackupName(backup_name); if(!result.first) { return result; @@ -271,7 +275,9 @@ class BackupStore { std::optional getActiveBackup(const std::string& username) { std::lock_guard lock(backup_state_mutex_); auto it = active_user_backups_.find(username); - if (it != active_user_backups_.end()) return it->second; + if(it != active_user_backups_.end()) { + return it->second; + } return std::nullopt; } @@ -279,7 +285,7 @@ class BackupStore { nlohmann::json getBackupInfo(const std::string& backup_name, const std::string& username) { nlohmann::json backup_db = readBackupJson(username); - if (backup_db.contains(backup_name)) { + if(backup_db.contains(backup_name)) { return backup_db[backup_name]; } return nlohmann::json(); diff --git a/src/storage/index_meta.hpp b/src/storage/index_meta.hpp index 1011c1370..48d4494de 100644 --- a/src/storage/index_meta.hpp +++ b/src/storage/index_meta.hpp @@ -97,15 +97,15 @@ class MetadataManager { rc = mdbx_put(txn, metadata_dbi_, &db_key, &data, MDBX_UPSERT); if(rc != 0) { mdbx_txn_abort(txn); - LOG_ERROR( - 1502, index_id, "Failed to store metadata: " << mdbx_strerror(rc)); + LOG_ERROR(1502, index_id, "Failed to store metadata: " << mdbx_strerror(rc)); return false; } rc = mdbx_txn_commit(txn); if(rc != 0) { - LOG_ERROR( - 1503, index_id, "Failed to commit metadata transaction: " << mdbx_strerror(rc)); + LOG_ERROR(1503, + index_id, + "Failed to commit metadata transaction: " << mdbx_strerror(rc)); return false; } @@ -135,8 +135,9 @@ class MetadataManager { MDBX_txn* txn; int rc = mdbx_txn_begin(metadata_env_, nullptr, MDBX_TXN_RDONLY, &txn); if(rc != 0) { - LOG_ERROR( - 1506, index_id, "Failed to begin metadata read transaction: " << mdbx_strerror(rc)); + LOG_ERROR(1506, + index_id, + "Failed to begin metadata read transaction: " << mdbx_strerror(rc)); return std::nullopt; } @@ -148,8 +149,7 @@ class MetadataManager { if(rc != 0) { mdbx_txn_abort(txn); if(rc != MDBX_NOTFOUND) { - LOG_ERROR( - 1507, index_id, "Failed to retrieve metadata: " << mdbx_strerror(rc)); + LOG_ERROR(1507, index_id, "Failed to retrieve metadata: " << mdbx_strerror(rc)); } return std::nullopt; } @@ -171,8 +171,9 @@ class MetadataManager { MDBX_txn* txn; int rc = mdbx_txn_begin(metadata_env_, nullptr, MDBX_TXN_READWRITE, &txn); if(rc != MDBX_SUCCESS) { - LOG_ERROR( - 1509, index_id, "Failed to begin metadata delete transaction: " << mdbx_strerror(rc)); + LOG_ERROR(1509, + index_id, + "Failed to begin metadata delete transaction: " << mdbx_strerror(rc)); return false; } @@ -182,15 +183,15 @@ class MetadataManager { rc = mdbx_del(txn, metadata_dbi_, &db_key, nullptr); if(rc != MDBX_SUCCESS && rc != MDBX_NOTFOUND) { mdbx_txn_abort(txn); - LOG_ERROR( - 1510, index_id, "Failed to delete metadata: " << mdbx_strerror(rc)); + LOG_ERROR(1510, index_id, "Failed to delete metadata: " << mdbx_strerror(rc)); return false; } rc = mdbx_txn_commit(txn); if(rc != MDBX_SUCCESS) { - LOG_ERROR( - 1511, index_id, "Failed to commit metadata delete transaction: " << mdbx_strerror(rc)); + LOG_ERROR(1511, + index_id, + "Failed to commit metadata delete transaction: " << mdbx_strerror(rc)); return false; } @@ -209,8 +210,7 @@ class MetadataManager { MDBX_txn* txn; int rc = mdbx_txn_begin(metadata_env_, nullptr, MDBX_TXN_RDONLY, &txn); if(rc != 0) { - LOG_ERROR( - 1513, "Failed to begin list-all metadata transaction: " << mdbx_strerror(rc)); + LOG_ERROR(1513, "Failed to begin list-all metadata transaction: " << mdbx_strerror(rc)); return result; } @@ -230,7 +230,8 @@ class MetadataManager { result.push_back( {key_str, IndexMetadata::from_json(nlohmann::json::parse(json_str))}); } catch(const std::exception& e) { - LOG_ERROR(1515, "Failed to parse metadata while listing all metadata: " << e.what()); + LOG_ERROR(1515, + "Failed to parse metadata while listing all metadata: " << e.what()); // Skip invalid entries } } @@ -249,8 +250,9 @@ class MetadataManager { MDBX_txn* txn; int rc = mdbx_txn_begin(metadata_env_, nullptr, MDBX_TXN_RDONLY, &txn); if(rc != 0) { - LOG_ERROR( - 1516, username, "Failed to begin list-user metadata transaction: " << mdbx_strerror(rc)); + LOG_ERROR(1516, + username, + "Failed to begin list-user metadata transaction: " << mdbx_strerror(rc)); return indexes; } @@ -280,8 +282,7 @@ class MetadataManager { // Add to result indexes.emplace_back(index_name, std::move(metadata)); } catch(const std::exception& e) { - LOG_ERROR( - 1518, key_str, "Failed to parse metadata for index: " << e.what()); + LOG_ERROR(1518, key_str, "Failed to parse metadata for index: " << e.what()); // Skip invalid entries } } @@ -298,8 +299,7 @@ class MetadataManager { MDBX_txn* txn; int rc = mdbx_txn_begin(metadata_env_, nullptr, MDBX_TXN_RDONLY, &txn); if(rc != 0) { - LOG_ERROR( - 1519, "Failed to begin list-all indexes transaction: " << mdbx_strerror(rc)); + LOG_ERROR(1519, "Failed to begin list-all indexes transaction: " << mdbx_strerror(rc)); return result; } diff --git a/src/storage/vector_storage.hpp b/src/storage/vector_storage.hpp index e452888b9..3f54c9613 100644 --- a/src/storage/vector_storage.hpp +++ b/src/storage/vector_storage.hpp @@ -57,7 +57,8 @@ class VectorStore { throw std::runtime_error("Failed to begin transaction"); } - rc = mdbx_dbi_open(txn, settings::DEFAULT_SUBINDEX.c_str(), MDBX_CREATE | MDBX_INTEGERKEY, &dbi_); + rc = mdbx_dbi_open( + txn, settings::DEFAULT_SUBINDEX.c_str(), MDBX_CREATE | MDBX_INTEGERKEY, &dbi_); if(rc != MDBX_SUCCESS) { mdbx_txn_abort(txn); throw std::runtime_error("Failed to open database"); @@ -66,7 +67,7 @@ class VectorStore { rc = mdbx_txn_commit(txn); if(rc != MDBX_SUCCESS) { throw std::runtime_error("Failed to commit transaction: " - + std::string(mdbx_strerror(rc))); + + std::string(mdbx_strerror(rc))); } } @@ -124,9 +125,9 @@ class VectorStore { if(key.iov_len != sizeof(ndd::idInt)) { LOG_ERROR(1601, - index_id_, - "Invalid key size " << key.iov_len << ", expected " - << sizeof(ndd::idInt)); + index_id_, + "Invalid key size " << key.iov_len << ", expected " + << sizeof(ndd::idInt)); throw std::runtime_error("Invalid key size in LMDB entry"); } @@ -222,14 +223,20 @@ class VectorStore { // buffers: pre-allocated flat buffer of size (count * bytes_per_vector_) // success: output array of bool indicating which fetches succeeded // Returns number of successful fetches - size_t get_vectors_batch_into(const ndd::idInt* labels, uint8_t* buffers, - bool* success, size_t count) const { - if(count == 0) return 0; + size_t get_vectors_batch_into(const ndd::idInt* labels, + uint8_t* buffers, + bool* success, + size_t count) const { + if(count == 0) { + return 0; + } MDBX_txn* txn; int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn); if(rc != MDBX_SUCCESS) { - for(size_t i = 0; i < count; i++) success[i] = false; + for(size_t i = 0; i < count; i++) { + success[i] = false; + } return 0; } @@ -727,8 +734,10 @@ class VectorStorage { } // Batch fetch: multiple vectors in one MDBX txn - size_t get_vectors_batch_into(const ndd::idInt* labels, uint8_t* buffers, - bool* success, size_t count) const { + size_t get_vectors_batch_into(const ndd::idInt* labels, + uint8_t* buffers, + bool* success, + size_t count) const { return vector_store_->get_vectors_batch_into(labels, buffers, success, count); } diff --git a/src/utils/log.hpp b/src/utils/log.hpp index 5c4b5bbaf..51e92206a 100644 --- a/src/utils/log.hpp +++ b/src/utils/log.hpp @@ -106,61 +106,66 @@ inline std::mutex FunctionTimer::mutex; // Production logs share one formatter so every call site emits stable operational output. namespace ndd::log { -constexpr int kNoCode = -1; + constexpr int kNoCode = -1; -struct Context { - std::string username{"-"}; - std::string index_name{"-"}; -}; + struct Context { + std::string username{"-"}; + std::string index_name{"-"}; + }; -// Logs always render username/index_name, using "-" placeholders when scope is missing. -inline std::string normalizeContextPart(std::string value) { - if(value.empty()) { - return "-"; + // Logs always render username/index_name, using "-" placeholders when scope is missing. + inline std::string normalizeContextPart(std::string value) { + if(value.empty()) { + return "-"; + } + return value; } - return value; -} - -inline Context makeContext(const std::string& username, const std::string& index_name) { - return {normalizeContextPart(username), normalizeContextPart(index_name)}; -} -inline Context makeUserContext(const std::string& username) { return makeContext(username, "-"); } + inline Context makeContext(const std::string& username, const std::string& index_name) { + return {normalizeContextPart(username), normalizeContextPart(index_name)}; + } -inline Context makeGlobalContext() { return makeContext("-", "-"); } + inline Context makeUserContext(const std::string& username) { + return makeContext(username, "-"); + } -inline Context contextFromIndexId(const std::string& index_id) { - const size_t slash_pos = index_id.find('/'); - if(slash_pos == std::string::npos) { - return makeGlobalContext(); + inline Context makeGlobalContext() { + return makeContext("-", "-"); } - return makeContext(index_id.substr(0, slash_pos), index_id.substr(slash_pos + 1)); -} + inline Context contextFromIndexId(const std::string& index_id) { + const size_t slash_pos = index_id.find('/'); + if(slash_pos == std::string::npos) { + return makeGlobalContext(); + } + + return makeContext(index_id.substr(0, slash_pos), index_id.substr(slash_pos + 1)); + } -inline Context contextFromString(const std::string& context) { - if(context.empty() || context == "-" || context == "-/-") { - return makeGlobalContext(); + inline Context contextFromString(const std::string& context) { + if(context.empty() || context == "-" || context == "-/-") { + return makeGlobalContext(); + } + if(context.find('/') != std::string::npos) { + return contextFromIndexId(context); + } + return makeUserContext(context); } - if(context.find('/') != std::string::npos) { - return contextFromIndexId(context); + + inline std::string formatContext(const Context& context) { + return normalizeContextPart(context.username) + "/" + + normalizeContextPart(context.index_name); } - return makeUserContext(context); -} - -inline std::string formatContext(const Context& context) { - return normalizeContextPart(context.username) + "/" - + normalizeContextPart(context.index_name); -} - -// Prefixes are either LEVEL_code for explicit codes or LEVEL for intentional code-less logs. -inline void emit(const char* level, int code, const Context& context, const std::string& message) { - std::cerr << level; - if(code != kNoCode) { - std::cerr << "_" << code; + + // Prefixes are either LEVEL_code for explicit codes or LEVEL for intentional code-less logs. + inline void + emit(const char* level, int code, const Context& context, const std::string& message) { + std::cerr << level; + if(code != kNoCode) { + std::cerr << "_" << code; + } + std::cerr << ": " << formatContext(context) << ": " << message << std::endl; } - std::cerr << ": " << formatContext(context) << ": " << message << std::endl; -} } // namespace ndd::log #define NDD_LOG_EMIT(level, code, context, msg) \ @@ -170,12 +175,12 @@ inline void emit(const char* level, int code, const Context& context, const std: ndd::log::emit(level, code, context, __log_ss__.str()); \ } while(0) -// Arity dispatch keeps the public macros simple while selecting global, user, index, or explicit context. +// Arity dispatch keeps the public macros simple while selecting global, user, index, or explicit +// context. #define NDD_LOG_1(level, msg) \ NDD_LOG_EMIT(level, ndd::log::kNoCode, ndd::log::makeGlobalContext(), msg) -#define NDD_LOG_2(level, code, msg) \ - NDD_LOG_EMIT(level, code, ndd::log::makeGlobalContext(), msg) +#define NDD_LOG_2(level, code, msg) NDD_LOG_EMIT(level, code, ndd::log::makeGlobalContext(), msg) #define NDD_LOG_3(level, code, context, msg) \ NDD_LOG_EMIT(level, code, ndd::log::contextFromString(context), msg) diff --git a/src/utils/settings.hpp b/src/utils/settings.hpp index 1c13a3a45..693ca8b71 100644 --- a/src/utils/settings.hpp +++ b/src/utils/settings.hpp @@ -29,7 +29,7 @@ namespace settings { constexpr size_t MAX_M = 512; constexpr size_t DEFAULT_EF_CONSTRUCT = 128; constexpr size_t MIN_EF_CONSTRUCT = 8; - constexpr size_t BACKFILL_BUFFER = 4; // Keep 4 slots free for high quality neighbors + constexpr size_t BACKFILL_BUFFER = 4; // Keep 4 slots free for high quality neighbors constexpr size_t MAX_EF_CONSTRUCT = 4096; constexpr size_t DEFAULT_EF_SEARCH = 128; constexpr size_t MIN_K = 1; @@ -90,13 +90,13 @@ namespace settings { const std::string DEFAULT_SERVER_TYPE = "OSS"; const std::string DEFAULT_DATA_DIR = "/mnt/data"; const std::string DEFAULT_SUBINDEX = "default"; - constexpr size_t MAX_NR_SUBINDEX = 100; //Maximum number of subindexes + constexpr size_t MAX_NR_SUBINDEX = 100; //Maximum number of subindexes constexpr size_t DEFAULT_MAX_ACTIVE_INDICES = 64; constexpr size_t DEFAULT_MAX_ELEMENTS = 100'000; constexpr size_t DEFAULT_MAX_ELEMENTS_INCREMENT = 100'000; constexpr size_t DEFAULT_MAX_ELEMENTS_INCREMENT_TRIGGER = 50'000; constexpr size_t DEFAULT_VECTOR_CACHE_PERCENTAGE = 15; - constexpr size_t DEFAULT_VECTOR_CACHE_MIN_BITS = 17; // Minimum 128K entries in cache + constexpr size_t DEFAULT_VECTOR_CACHE_MIN_BITS = 17; // Minimum 128K entries in cache const std::string DEFAULT_SERVER_ID = "unknown"; //For Backups @@ -113,14 +113,14 @@ namespace settings { inline static size_t NUM_SERVER_THREADS = [] { const char* env = std::getenv("NDD_NUM_SERVER_THREADS"); - if (env) { + if(env) { return (size_t)std::stoull(env); } // If no env var, check if default is 0 (auto-detect) - if (DEFAULT_NUM_SERVER_THREADS == 0) { + if(DEFAULT_NUM_SERVER_THREADS == 0) { unsigned int hw = std::thread::hardware_concurrency() * 2; - return hw > 0 ? (size_t)hw : 1; // Fallback to 1 if hardware_concurrency returns 0 + return hw > 0 ? (size_t)hw : 1; // Fallback to 1 if hardware_concurrency returns 0 } return (size_t)DEFAULT_NUM_SERVER_THREADS; diff --git a/tests/filter_test.cpp b/tests/filter_test.cpp index 101be3403..ccd2c2736 100644 --- a/tests/filter_test.cpp +++ b/tests/filter_test.cpp @@ -4,7 +4,7 @@ #include #include "filter/filter.hpp" #include "json/nlohmann_json.hpp" -#include "filter/numeric_index.hpp" // For Bucket test +#include "filter/numeric_index.hpp" // For Bucket test namespace fs = std::filesystem; using json = nlohmann::json; @@ -14,10 +14,10 @@ TEST(BucketTest, Serialization) { b.base_value = 100; b.add(105, 1); b.add(110, 2); - + auto bytes = b.serialize(); EXPECT_GT(bytes.size(), 6); - + auto b2 = ndd::filter::Bucket::deserialize(bytes.data(), bytes.size(), 100); EXPECT_EQ(b2.ids.size(), 2); EXPECT_EQ(b2.ids[0], 1); @@ -32,18 +32,18 @@ class FilterTest : public ::testing::Test { void SetUp() override { // Create a unique temporary directory for each test db_path = "./test_db_" + std::to_string(rand()); - if (fs::exists(db_path)) { + if(fs::exists(db_path)) { fs::remove_all(db_path); } - + // Initialize Filter filter = std::make_unique(db_path); } void TearDown() override { // Clean up - filter.reset(); // Close DB environment first - if (fs::exists(db_path)) { + filter.reset(); // Close DB environment first + if(fs::exists(db_path)) { fs::remove_all(db_path); } } @@ -54,18 +54,16 @@ TEST_F(FilterTest, CategoryFilterBasics) { // ID 1: City=Paris // ID 2: City=London // ID 3: City=Paris - + filter->add_to_filter("city", "Paris", 1); filter->add_to_filter("city", "London", 2); filter->add_to_filter("city", "Paris", 3); // Query for City=Paris - json query = json::array({ - {{"city", {{"$eq", "Paris"}}}} - }); + json query = json::array({{{"city", {{"$eq", "Paris"}}}}}); std::vector ids = filter->getIdsMatchingFilter(query); - + // Should find 1 and 3 EXPECT_EQ(ids.size(), 2); EXPECT_NE(std::find(ids.begin(), ids.end(), 1), ids.end()); @@ -77,25 +75,21 @@ TEST_F(FilterTest, BooleanFilterBasics) { // Boolean is just a special category "0" or "1" // ID 10: Active=true // ID 11: Active=false - + // Using JSON add interface for variety filter->add_filters_from_json(10, R"({"is_active": true})"); filter->add_filters_from_json(11, R"({"is_active": false})"); // Query Active=true - json query_true = json::array({ - {{"is_active", {{"$eq", true}}}} - }); - + json query_true = json::array({{{"is_active", {{"$eq", true}}}}}); + auto ids_true = filter->getIdsMatchingFilter(query_true); EXPECT_EQ(ids_true.size(), 1); EXPECT_EQ(ids_true[0], 10); // Query Active=false - json query_false = json::array({ - {{"is_active", {{"$eq", false}}}} - }); - + json query_false = json::array({{{"is_active", {{"$eq", false}}}}}); + auto ids_false = filter->getIdsMatchingFilter(query_false); EXPECT_EQ(ids_false.size(), 1); EXPECT_EQ(ids_false[0], 11); @@ -105,24 +99,26 @@ TEST_F(FilterTest, NumericFilterBasics) { // ID 100: Age=25 // ID 101: Age=30 // ID 102: Age=35 - + filter->add_filters_from_json(100, R"({"age": 25})"); filter->add_filters_from_json(101, R"({"age": 30})"); filter->add_filters_from_json(102, R"({"age": 35})"); // Range Query: 20 <= Age <= 32 - json query_range = json::array({ - {{"age", {{"$range", {20, 32}}}}} - }); + json query_range = json::array({{{"age", {{"$range", {20, 32}}}}}}); auto ids = filter->getIdsMatchingFilter(query_range); - + // Should match 100 (25) and 101 (30) EXPECT_EQ(ids.size(), 2); bool found100 = false, found101 = false; for(auto id : ids) { - if(id == 100) found100 = true; - if(id == 101) found101 = true; + if(id == 100) { + found100 = true; + } + if(id == 101) { + found101 = true; + } } EXPECT_TRUE(found100); EXPECT_TRUE(found101); @@ -131,13 +127,11 @@ TEST_F(FilterTest, NumericFilterBasics) { TEST_F(FilterTest, FloatNumericFilter) { // ID 1: Price=10.5 // ID 2: Price=20.0 - + filter->add_filters_from_json(1, R"({"price": 10.5})"); filter->add_filters_from_json(2, R"({"price": 20.0})"); - json query = json::array({ - {{"price", {{"$range", {10.0, 15.0}}}}} - }); + json query = json::array({{{"price", {{"$range", {10.0, 15.0}}}}}}); auto ids = filter->getIdsMatchingFilter(query); EXPECT_EQ(ids.size(), 1); @@ -148,16 +142,13 @@ TEST_F(FilterTest, MixedAndLogic) { // ID 1: City=NY, Age=30 (Match) // ID 2: City=NY, Age=40 (Age fail) // ID 3: City=LA, Age=30 (City fail) - + filter->add_filters_from_json(1, R"({"city": "NY", "age": 30})"); filter->add_filters_from_json(2, R"({"city": "NY", "age": 40})"); filter->add_filters_from_json(3, R"({"city": "LA", "age": 30})"); // Filter: City=NY AND Age < 35 - json query = json::array({ - {{"city", {{"$eq", "NY"}}}}, - {{"age", {{"$range", {0, 35}}}}} - }); + json query = json::array({{{"city", {{"$eq", "NY"}}}}, {{"age", {{"$range", {0, 35}}}}}}); auto ids = filter->getIdsMatchingFilter(query); EXPECT_EQ(ids.size(), 1); @@ -168,52 +159,46 @@ TEST_F(FilterTest, InOperator) { // ID 1: Color=Red // ID 2: Color=Blue // ID 3: Color=Green - + filter->add_to_filter("color", "Red", 1); filter->add_to_filter("color", "Blue", 2); filter->add_to_filter("color", "Green", 3); // Query: Color IN [Red, Green] - json query = json::array({ - {{"color", {{"$in", {"Red", "Green"}}}}} - }); + json query = json::array({{{"color", {{"$in", {"Red", "Green"}}}}}}); auto ids = filter->getIdsMatchingFilter(query); - EXPECT_EQ(ids.size(), 2); // 1 and 3 + EXPECT_EQ(ids.size(), 2); // 1 and 3 } TEST_F(FilterTest, DeleteFilter) { // ID 1: Tag=A filter->add_to_filter("tag", "A", 1); - - json query = json::array({ - {{"tag", {{"$eq", "A"}}}} - }); - + + json query = json::array({{{"tag", {{"$eq", "A"}}}}}); + EXPECT_EQ(filter->countIdsMatchingFilter(query), 1); - + // Remove functionality test // Usually removal requires us to know what to remove or we remove entire ID? // The Filter class has: remove_from_filter(field, value, id) - + filter->remove_from_filter("tag", "A", 1); - + EXPECT_EQ(filter->countIdsMatchingFilter(query), 0); } TEST_F(FilterTest, NumericDelete) { // ID 1: Score=100 filter->add_filters_from_json(1, R"({"score": 100})"); - + // Check it exists - json query = json::array({ - {{"score", {{"$eq", 100}}}} - }); + json query = json::array({{{"score", {{"$eq", 100}}}}}); EXPECT_EQ(filter->countIdsMatchingFilter(query), 1); - + // Remove // remove_filters_from_json uses the whole object filter->remove_filters_from_json(1, R"({"score": 100})"); - + EXPECT_EQ(filter->countIdsMatchingFilter(query), 0); } From f66c45c65f5cf18daad18fcf643346bdf90d79b2 Mon Sep 17 00:00:00 2001 From: Omnish_Kumar Date: Thu, 12 Mar 2026 10:06:12 +0530 Subject: [PATCH 2/8] branch name change in ci yml file --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 47ab914a2..ae4c11595 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,7 +8,7 @@ name: Continuous Integration on: # PUSH EVENT push: - branches: [ feature/linting_tests ] + branches: [ feature/continuous_integration ] # PULL REQUEST EVENT pull_request: branches: [ feature/continuous_integration] From 9e292af9bdbabea436bb68dd9f3a38f719f54d6f Mon Sep 17 00:00:00 2001 From: Omnish_Kumar Date: Thu, 12 Mar 2026 10:46:51 +0530 Subject: [PATCH 3/8] removed non-third party check of clang-format --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ae4c11595..e2495d9a2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -45,7 +45,7 @@ jobs: # DRY RUN MAKE SURE IT WON'T CHANGE CODE ONLY IT WILL CHECK FORMATTING - name: C++ and HPP formatting run: | - find src/ \( -name "*.cpp" -o -name "*.hpp" \) -not -path "*/third_party/*" | xargs --no-run-if-empty clang-format --dry-run --Werror + find src/ \( -name "*.cpp" -o -name "*.hpp" \) | xargs --no-run-if-empty clang-format --dry-run --Werror # CHECK FOR BUGS AND BAD PRACTICES USING CLANG-TIDY - name: C++ Static Analysis From b1ccd7125dd8e0e3f5a78f6d886dec7d55fb3191 Mon Sep 17 00:00:00 2001 From: Omnish_Kumar Date: Fri, 13 Mar 2026 10:36:39 +0530 Subject: [PATCH 4/8] working on benchmark menu --- .github/workflows/benchmark_menu.yml | 62 ++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 .github/workflows/benchmark_menu.yml diff --git a/.github/workflows/benchmark_menu.yml b/.github/workflows/benchmark_menu.yml new file mode 100644 index 000000000..97a135ed2 --- /dev/null +++ b/.github/workflows/benchmark_menu.yml @@ -0,0 +1,62 @@ + +# ==================== BIRD VIEW OF THIS FILE ========================= +# PR opened +# ↓ +# CI workflow runs (lint, unit tests) +# ↓ passes +# This workflow wakes up +# ↓ +# "Was it a PR? Did CI pass? Can I find the PR?" +# ↓ yes to all +# Posts a comment on the PR with benchmark menu +# Sets a yellow pending status on the commit + +name: Benchmark Menu + +# ========================= TRIGGER EVENT ========================= +on: + + # ---- TRIGGER WHEN PREVIOUS WORKFLOW COMPLETED ( PREVIOUS WORKFLOW WAS CONTINUOUS INTEGRATION) + workflow_run: + workflows: [ Continuous Integration ] + types: [ completed ] + +# ========================= JOBS ========================= +jobs: + + # ---- JOB 1: FIGURE OUT WHICH PR CAUSED THIS CI RUN + # WHEN WORKFLOW_RUN TRIGGER EVENT IS FIRES, WE GET INTO DETACHED CONTEXT. SO WE LOST CONTEXT OF THE PR TRIGGERED CI. + # WE NEED TO FIGURE OUT WHICH PR CAUSED THIS CI RUN. + + PR Information: + name: Retrieve PR Information + runs-on: ubuntu-latest + if: | + github.event.workflow_run.conclusion == 'success' && + github.event.workflow_run.event == 'pull_request' + + outputs: + # PR NUMBER IS PULL REQUEST NUMBER UNIQUE IDENTIFIER + # HEAD SHA IS THE SPECIFIC HASH OF THE LAS COMMIT IN SOURCE BRANCH + # HEAD REF IS THE NAME OF THE SOURCE BRANCH BEING MERGED + pr_number: ${{ steps.find-pr.outputs.pr_number }} + head_sha: ${{ steps.find-pr.outputs.head_sha }} + head_ref: ${{ steps.find-pr.outputs.head_ref }} + + steps: + - name: Find PR for this CI run + # USES CONTAIN SOME FUNCTIONS IT'S LIKE IMPORT AND WITH IS HOW YOU CALL THAT WITH ARGUMENTS + id: find-pr + # GITHUB-SCRIPT@V7 IS SPECIAL ACTION IT NEEDS JAVASCRIPT CODE AS ARGUMENT + # ALSO WITH + uses: actions/github-script@v7 + with: + script: | + const headSha = context.payload.workflow_run.head_sha; + const headRef = context.payload.workflow_run.head_ref; + + core.setOutput('head_sha', headSha); + core.setOutput('head_ref', headRef); + + + From 9d53fc29094ee333c29d58ddcdb5640cd21d04a9 Mon Sep 17 00:00:00 2001 From: Omnish_Kumar Date: Fri, 13 Mar 2026 12:34:43 +0530 Subject: [PATCH 5/8] Benchmark Menu option created on PR page --- .github/workflows/benchmark_menu.yml | 193 ++++++++++++++++++++++++++- 1 file changed, 190 insertions(+), 3 deletions(-) diff --git a/.github/workflows/benchmark_menu.yml b/.github/workflows/benchmark_menu.yml index 97a135ed2..000288087 100644 --- a/.github/workflows/benchmark_menu.yml +++ b/.github/workflows/benchmark_menu.yml @@ -28,7 +28,7 @@ jobs: # WHEN WORKFLOW_RUN TRIGGER EVENT IS FIRES, WE GET INTO DETACHED CONTEXT. SO WE LOST CONTEXT OF THE PR TRIGGERED CI. # WE NEED TO FIGURE OUT WHICH PR CAUSED THIS CI RUN. - PR Information: + resolve-pr: name: Retrieve PR Information runs-on: ubuntu-latest if: | @@ -47,8 +47,68 @@ jobs: - name: Find PR for this CI run # USES CONTAIN SOME FUNCTIONS IT'S LIKE IMPORT AND WITH IS HOW YOU CALL THAT WITH ARGUMENTS id: find-pr - # GITHUB-SCRIPT@V7 IS SPECIAL ACTION IT NEEDS JAVASCRIPT CODE AS ARGUMENT - # ALSO WITH + # GITHUB-SCRIPT@V7 IS SPECIAL ACTION IT NEEDS SCRIPT AND RUNS IT AS JAVASCRIPT + # ALSO WITH PRE-INJECTED ARGUMENTS THAT IS GITHUB, CONTEXT, AND CORE + # GITHUB: AUTHENTICATED GITHUB API CLIENT + # CONTEXT: INFO ABOUT CURRENT WORKFLOW RUN, REPO AND EVENT PAYLOAD + # context + # ├── repo + # │ ├── owner → 'my-org' + # │ └── repo → 'my-repo' + # │ + # ├── payload → the raw webhook event that triggered the workflow + # │ └── workflow_run (specific to your workflow_run event) + # │ ├── head_sha → full commit SHA + # │ ├── head_branch → branch name + # │ ├── conclusion → 'success' | 'failure' | null + # │ ├── html_url → link to the CI run + # │ └── id → run ID + # │ + # ├── eventName → 'workflow_run' | 'pull_request' | 'push' etc. + # ├── sha → commit SHA of the current workflow + # ├── ref → 'refs/heads/main' + # ├── workflow → name of the current workflow + # ├── runId → unique ID of this run + # ├── runNumber → incrementing number (1, 2, 3...) + # │ + # ├── actor → username who triggered the workflow + # ├── job → current job id + # │ + # └── issue → (available on PR/issue events) + # ├── owner + # ├── repo + # └── number → PR or issue number + # CORE: UTILITY FUNCTIONS FOR WORKFLOW ACTIONS LIKE SETOUTPUT, INFO AND WARNING LOGGING + # CORE IS BRIDGE BETWEEN WORKFLOW AND SCRIPT WHICH WILL RUN IN THE GITHUB ACTIONS RUNNER SO THAT WE ABLE TO SEE IT'S OUTPUT + # core + # │ + # ├── OUTPUTS + # │ └── setOutput(name, value) → sends value out to workflow outputs + # │ + # ├── LOGGING + # │ ├── info('message') → plain white log line + # │ ├── warning('message') → yellow job continues + # │ ├── error('message') → red job continues + # │ ├── debug('message') → only visible if debug mode is on + # │ └── notice('message') → blue highlighted in log + # │ + # ├── FAILURE CONTROL + # │ └── setFailed('message') → marks step failed + stops job + # │ + # ├── INPUT READING + # │ └── getInput('name') → reads a `with:` input if action has one + # │ + # ├── ENVIRONMENT + # │ ├── exportVariable(name, val) → sets an env variable for next steps + # │ └── addPath(path) → adds to PATH for next steps + # │ + # ├── MASKING + # │ └── setSecret('value') → masks a value in all logs (shows as ***) + # │ + # └── GROUPING (log formatting) + # ├── startGroup('title') → collapses log lines under a title + # └── endGroup() → closes the group + uses: actions/github-script@v7 with: script: | @@ -58,5 +118,132 @@ jobs: core.setOutput('head_sha', headSha); core.setOutput('head_ref', headRef); + # FIND THE OPEN PR WHOSE HEAD MATCHES THIS SHA + const { data: prs } = await github.rest.pulls.list({ + owner: context.repo.owner, + repo: context.repo.repo, + state: 'open', + head: `${context.repo.owner}:${headRef}`, + }); + + const pr = prs.find(pr => p.head.sha === headSha); + if (!pr) { + core.warning(`No Open PR Found for SHA ${headSha} - skipping`); + core.setOutput('pr_number', ''); + return + } + core.info(`Found PR #${pr.number}`); + core.setOutput('pr_number', String(pr.number)); + + + # ========================= POST BENCHMARK OPTIONS COMMENT ( CI PASSED, PR FOUND) ========================= + + post-menu: + name: Benchmark Options + # NEEDS DO TWO THING- A) CONTROLS EXECUTION - RESOLVE-PR RUN FIRST THEN POST-MENU OTHERWISE BOTH RUN SIMULTANEOUSLY + # B) GIVE ACCESS TO THAT JOB'S OUTPUT: `needs.resolve-pr.outputs.*` inside `post-menu`. Without declaring `needs`, WE CANNOT ANOTHER JOB'S OUTPUTS — the reference would be empty. + # needs.resolve-pr + # ├── result → 'success' | 'failure' | 'skipped' | 'cancelled' + # └── outputs + # ├── pr_number → '42' + # ├── head_sha → 'abc123...' + # └── head_ref → 'feature/my-branch' + needs: resolve-pr + runs-on: ubuntu-latest + if: needs.resolve-pr.outputs.pr_number != '' + + steps: + - name: Post or Update Benchmark Menu comment + uses: actions/github-script@v7 + with: + script: | + const owner = context.repo.owner; + const repo = context.repo.repo; + const prNumber = Number('${{ needs.resolve-pr.outputs.pr_number }}'); + const sha = '${{ needs.resolve-pr.outputs.head_sha }}'; + const branch = '${{ needs.resolve-pr.outputs.head_ref }}'; + const ciRunUrl = context.payload.workflow_run.html_url; + + const body = [ + `## VectorDB Benchmark - Ready To Run`, + ``, + `> **CI Passed** ([lint + unit tests] (${ ciRunUrl })) - benchmark options unlocked.`, + ``, + `Post one of the command below. Only members with **write access** can trigger runs.`, + ``, + `--------`, + ``, + `### Available Modes`, + ``, + `| Mode | Command | What runs |`, + `|------|---------|-----------|`, + `| 🔵 Dense | \`/benchmark dense\` | HNSW insert throughput · query P50/P95/P99 · recall@10 · concurrent QPS |`, + `| 🟣 Hybrid | \`/benchmark hybrid\` | Dense + sparse BM25 fusion · same suite + fusion latency overhead |`, + `| ⚡ Both | \`/benchmark all\` | Runs dense then hybrid · posts a side-by-side comparison table |`, + ``, + `---`, + ``, + `### Infrastructure`, + ``, + `| Server | Role | Instance |`, + `|--------|------|----------|`, + `| Server A | Endee VectorDB — code from this branch | \`c5.2xlarge\` |`, + `| Server B | Benchmark runner | \`r6i.2xlarge\` |`, + ``, + `Both servers start on demand and are **always terminated** after the run — pass or fail.`, + ``, + `---`, + ``, + `### How It Works`, + `\`\`\``, + `1. Post /benchmark `, + `2. Server A starts → this branch's code deployed → Endee starts in chosen mode`, + `3. Server B starts → benchmark suite transferred`, + `4. Server B runs benchmark against Endee on Server A`, + `5. Results posted back here → pass/fail + full metrics table`, + `6. Both servers terminated → always, even on failure`, + `\`\`\``, + ``, + `> After a new push, CI must pass again before this menu reappears.`, + ].join('\n'); + // UPSERT: UPDATE EXISTING COMMENT OR CREATE NEW ONE + // FETCH ALL COMMENTS FOR THIS PR + const { data: comments } = await github.rest.issues.listComments({ + owner, repo, issue_number: prNumber + }); + // When GitHub Actions runs a workflow, it acts on behalf of a special built-in account called github-actions[bot] + const existing = comments.find(c => + c.user.login === 'github-actions[bot]' && + c.body.includes('VectorDB Benchmark') + ); + if (existing) { + await github.rest.issues.updateComment({ + + owner, repo, comment_id: existing.id, body + }) + } else { + await github.rest.issues.createComment({ + owner, repo, issue_number: prNumber, body, + }); + core.info(`Updated existing comment #${existing.id}`); + } else { + await github.rest.issues.createComment({ + owner, repo, issue_number: prNumber, body, + }); + core.info(`Created New Benchmark Menu Comment`); + } + + - name: Set benchmark commit status -> pending + uses: actions/github-script@v7 + with: + script: | + await github.rest.repos.createCommitStatus({ + owner: context.repo.owner, + repo: context.repo.repo, + sha: '${{ needs.resolve-pr.outputs.head_sha }}', + state: 'pending', + description: 'CI passed — waiting for /benchmark command', + context: 'ci/vectordb-benchmark', + }); \ No newline at end of file From d00cd770f70ffa2ed6bc54ec44f7144437de67cb Mon Sep 17 00:00:00 2001 From: Omnish_Kumar Date: Fri, 13 Mar 2026 12:52:20 +0530 Subject: [PATCH 6/8] bugs fixed --- .github/workflows/benchmark_menu.yml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/benchmark_menu.yml b/.github/workflows/benchmark_menu.yml index 000288087..281492155 100644 --- a/.github/workflows/benchmark_menu.yml +++ b/.github/workflows/benchmark_menu.yml @@ -113,7 +113,7 @@ jobs: with: script: | const headSha = context.payload.workflow_run.head_sha; - const headRef = context.payload.workflow_run.head_ref; + const headRef = context.payload.workflow_run.head_branch; core.setOutput('head_sha', headSha); core.setOutput('head_ref', headRef); @@ -126,7 +126,7 @@ jobs: head: `${context.repo.owner}:${headRef}`, }); - const pr = prs.find(pr => p.head.sha === headSha); + const pr = prs.find(p => p.head.sha === headSha); if (!pr) { core.warning(`No Open PR Found for SHA ${headSha} - skipping`); core.setOutput('pr_number', ''); @@ -222,10 +222,6 @@ jobs: await github.rest.issues.updateComment({ owner, repo, comment_id: existing.id, body - }) - } else { - await github.rest.issues.createComment({ - owner, repo, issue_number: prNumber, body, }); core.info(`Updated existing comment #${existing.id}`); } else { From c2f5eae8f8c1a5d5f56e7d739358359cc41c2ff6 Mon Sep 17 00:00:00 2001 From: Omnish_Kumar Date: Fri, 13 Mar 2026 17:30:05 +0530 Subject: [PATCH 7/8] updated workflow files --- .github/workflows/benchmark_menu.yml | 12 +++++++++--- .github/workflows/ci.yml | 8 ++++---- .github/workflows/vectordb_benchmark.yml | 0 3 files changed, 13 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/vectordb_benchmark.yml diff --git a/.github/workflows/benchmark_menu.yml b/.github/workflows/benchmark_menu.yml index 281492155..6cf0ca95d 100644 --- a/.github/workflows/benchmark_menu.yml +++ b/.github/workflows/benchmark_menu.yml @@ -16,11 +16,17 @@ name: Benchmark Menu # ========================= TRIGGER EVENT ========================= on: - # ---- TRIGGER WHEN PREVIOUS WORKFLOW COMPLETED ( PREVIOUS WORKFLOW WAS CONTINUOUS INTEGRATION) + # ---- TRIGGER WHEN PREVIOUS WORKFLOW COMPLETED ( PREVIOUS WORKFLOW WAS CONTINUOUS INTEGRATION) workflow_run: workflows: [ Continuous Integration ] types: [ completed ] + branches: [ feature/continuous_integration ] +permissions: + contents: read + pull-requests: read + issues: write + statuses: write # ========================= JOBS ========================= jobs: @@ -118,7 +124,7 @@ jobs: core.setOutput('head_sha', headSha); core.setOutput('head_ref', headRef); - # FIND THE OPEN PR WHOSE HEAD MATCHES THIS SHA + // FIND THE OPEN PR WHOSE HEAD MATCHES THIS SHA const { data: prs } = await github.rest.pulls.list({ owner: context.repo.owner, repo: context.repo.repo, @@ -167,7 +173,7 @@ jobs: const body = [ `## VectorDB Benchmark - Ready To Run`, ``, - `> **CI Passed** ([lint + unit tests] (${ ciRunUrl })) - benchmark options unlocked.`, + `> **CI Passed** ([lint + unit tests] (${ciRunUrl})) - benchmark options unlocked.`, ``, `Post one of the command below. Only members with **write access** can trigger runs.`, ``, diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e2495d9a2..d73ec5c91 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,12 +6,12 @@ name: Continuous Integration # ========== TRIGGER EVENT ========== on: - # PUSH EVENT - push: - branches: [ feature/continuous_integration ] + # # PUSH EVENT + # push: + # branches: [ base_ci ] # PULL REQUEST EVENT pull_request: - branches: [ feature/continuous_integration] + branches: [ feature/continuous_integration ] # ========== JOBS ========== diff --git a/.github/workflows/vectordb_benchmark.yml b/.github/workflows/vectordb_benchmark.yml new file mode 100644 index 000000000..e69de29bb From a4f4fed48fb2f00efe3e8a4dbba66ccb02721966 Mon Sep 17 00:00:00 2001 From: Omnish_Kumar Date: Fri, 13 Mar 2026 17:31:12 +0530 Subject: [PATCH 8/8] branches updated --- .github/workflows/benchmark_menu.yml | 2 +- .github/workflows/ci.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmark_menu.yml b/.github/workflows/benchmark_menu.yml index 6cf0ca95d..4896f7160 100644 --- a/.github/workflows/benchmark_menu.yml +++ b/.github/workflows/benchmark_menu.yml @@ -20,7 +20,7 @@ on: workflow_run: workflows: [ Continuous Integration ] types: [ completed ] - branches: [ feature/continuous_integration ] + branches: [ feature/ci_main ] permissions: contents: read diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d73ec5c91..de107a54f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,7 +11,7 @@ on: # branches: [ base_ci ] # PULL REQUEST EVENT pull_request: - branches: [ feature/continuous_integration ] + branches: [ feature/ci_main ] # ========== JOBS ==========