diff --git a/docs/filter.md b/docs/filter.md index 734084040..81d479a95 100644 --- a/docs/filter.md +++ b/docs/filter.md @@ -43,6 +43,108 @@ The database (LMDB) acts as a coarse-grained B+ Tree. * **Split Triggers:** Count > 1024 OR Delta > 65,535. * **Sliding Split:** To ensure Key Uniqueness in LMDB, splits do not strictly occur at the median. The split point "slides" right to find the first value divergence, ensuring `Key(RightBucket) != Key(LeftBucket)`. +### 2.4. Comparison Operators + +The system supports four comparison operators for numeric fields, enabling flexible single-boundary range queries: + +#### `$gt` - Greater Than (Exclusive) +Returns documents where the field value is **strictly greater than** the specified value. + +**Syntax:** +```json +[{"field_name": {"$gt": value}}] +``` + +**Supported Types:** Numeric fields only (integers and floats) + +**Examples:** +```json +// Find users older than 25 +[{"age": {"$gt": 25}}] + +// Find products with price greater than 99.99 +[{"price": {"$gt": 99.99}}] + +// Combine with other filters: users in NY older than 30 +[ + {"city": {"$eq": "NY"}}, + {"age": {"$gt": 30}} +] +``` + +**Implementation:** Uses `range(value+1, UINT32_MAX)` after sortable conversion. Edge case: returns empty bitmap if value equals maximum sortable value. + +#### `$gte` - Greater Than or Equal To (Inclusive) +Returns documents where the field value is **greater than or equal to** the specified value. + +**Syntax:** +```json +[{"field_name": {"$gte": value}}] +``` + +**Supported Types:** Numeric fields only (integers and floats) + +**Examples:** +```json +// Find users 25 or older +[{"age": {"$gte": 25}}] + +// Find products with price at least 100 +[{"price": {"$gte": 100.0}}] +``` + +**Implementation:** Uses `range(value, UINT32_MAX)` after sortable conversion. + +#### `$lt` - Less Than (Exclusive) +Returns documents where the field value is **strictly less than** the specified value. 
+ +**Syntax:** +```json +[{"field_name": {"$lt": value}}] +``` + +**Supported Types:** Numeric fields only (integers and floats) + +**Examples:** +```json +// Find users younger than 30 +[{"age": {"$lt": 30}}] + +// Find products with price less than 100 +[{"price": {"$lt": 100.0}}] +``` + +**Implementation:** Uses `range(0, value-1)` after sortable conversion. Edge case: returns empty bitmap if value equals minimum sortable value (0). + +#### `$lte` - Less Than or Equal To (Inclusive) +Returns documents where the field value is **less than or equal to** the specified value. + +**Syntax:** +```json +[{"field_name": {"$lte": value}}] +``` + +**Supported Types:** Numeric fields only (integers and floats) + +**Examples:** +```json +// Find users 30 or younger +[{"age": {"$lte": 30}}] + +// Find products with price up to 100 (inclusive) +[{"price": {"$lte": 100.0}}] + +// Combine: find products between 10 and 100 (inclusive) +[ + {"price": {"$gte": 10.0}}, + {"price": {"$lte": 100.0}} +] +``` + +**Implementation:** Uses `range(0, value)` after sortable conversion. + +**Note:** All comparison operators work with both positive and negative numbers, including floats. You can combine operators to create precise ranges (e.g., `$gte` + `$lte` for inclusive range, `$gt` + `$lt` for exclusive range). + --- ## 3. Category Filter Design diff --git a/src/filter/filter.hpp b/src/filter/filter.hpp index a6e1c4ef8..dcfef7297 100644 --- a/src/filter/filter.hpp +++ b/src/filter/filter.hpp @@ -114,6 +114,11 @@ class Filter { std::lock_guard lock(schema_mutex_); auto it = schema_cache_.find(field); if(it != schema_cache_.end()) { + if(it->second != type) { + LOG_WARN(1222, index_id_, "Schema type conflict for field '" << field + << "': expected=" << static_cast(it->second) + << ", got=" << static_cast(type)); + } return it->second == type; } @@ -176,6 +181,167 @@ class Filter { return field + ":" + value; } + /** + * Converts JSON numeric value to sortable uint32_t representation. 
+     * Handles both integers and floats.
+     * @throws runtime_error if value is not a number
+     */
+    uint32_t json_to_sortable_numeric(const nlohmann::json& val) const {
+        if(val.is_number_integer()) {
+            return ndd::filter::int_to_sortable(val.get());
+        }
+        if(val.is_number()) {
+            return ndd::filter::float_to_sortable(val.get());
+        }
+        throw std::runtime_error("Value must be a number");
+    }
+
+    /**
+     * Process $eq (equality) operator for a field.
+     * Handles both numeric and category fields.
+     * @throws runtime_error on a wrong value type or a category value over 255 chars
+     */
+    ndd::RoaringBitmap process_eq_operator(
+        const std::string& field,
+        const nlohmann::json& val,
+        FieldType type) const {
+        if(type == FieldType::Number) {
+            const uint32_t sortable_val = json_to_sortable_numeric(val);
+            return numeric_index_->range(field, sortable_val, sortable_val);
+        }
+        // Category/boolean handling
+        if(!val.is_string() && !val.is_number_integer() && !val.is_boolean()) {
+            throw std::runtime_error("$eq value must be string, integer or boolean");
+        }
+        std::string str_val;
+        if(val.is_string()) {
+            str_val = val.get();
+        } else if(val.is_boolean()) {
+            str_val = val.get() ? "1" : "0";
+        } else {
+            str_val = std::to_string(val.get());
+        }
+        // The length limit applies to every category value (strings included), as in $in.
+        if(str_val.size() > 255) throw std::runtime_error("Category value too long");
+        std::string key = format_filter_key(field, str_val);
+        return category_index_->get_bitmap_by_key(key);
+    }
+
+    /**
+     * Process $in (set membership) operator for a field.
+     * Returns union of all matching values.
+     */
+    ndd::RoaringBitmap process_in_operator(
+        const std::string& field,
+        const nlohmann::json& val,
+        FieldType type) const {
+        if(!val.is_array()) {
+            throw std::runtime_error("$in must be array");
+        }
+
+        ndd::RoaringBitmap matches;
+        if(val.empty()) {
+            LOG_DEBUG("Empty $in array for field: " << field);
+            return matches;
+        }
+
+        const bool numeric_field = (type == FieldType::Number);
+        for(const auto& item : val) {
+            if(numeric_field) {
+                // Each numeric candidate is a single-point range lookup.
+                const uint32_t point = json_to_sortable_numeric(item);
+                matches |= numeric_index_->range(field, point, point);
+                continue;
+            }
+            if(!item.is_string() && !item.is_number_integer() && !item.is_boolean()) {
+                throw std::runtime_error("$in values must be string, integer or boolean");
+            }
+            std::string text;
+            if(item.is_string()) {
+                text = item.get();
+            } else if(item.is_boolean()) {
+                text = item.get() ? "1" : "0";
+            } else {
+                text = std::to_string(item.get());
+            }
+            // Empty category values are skipped rather than looked up.
+            if(text.empty()) {
+                continue;
+            }
+            if(text.size() > 255) throw std::runtime_error("Category value too long");
+            std::string lookup_key = format_filter_key(field, text);
+            matches |= category_index_->get_bitmap_by_key(lookup_key);
+        }
+        return matches;
+    }
+
+    /**
+     * Process $range operator: numeric values within [start, end], both ends inclusive.
+     * @throws runtime_error on a malformed array, a non-numeric field, or start > end
+     */
+    ndd::RoaringBitmap process_range_operator(
+        const std::string& field,
+        const nlohmann::json& val,
+        FieldType type) const {
+        const bool well_formed = val.is_array() && val.size() == 2;
+        if(!well_formed) {
+            throw std::runtime_error("$range must be [start, end] array with exactly 2 elements");
+        }
+        if(type != FieldType::Number) {
+            throw std::runtime_error("$range operator is only supported for numeric fields");
+        }
+
+        const uint32_t lower = json_to_sortable_numeric(val[0]);
+        const uint32_t upper = json_to_sortable_numeric(val[1]);
+        if(lower > upper) {
+            throw std::runtime_error("Invalid range: start > end");
+        }
+
+        return numeric_index_->range(field, lower, upper);
+    }
+
+    /**
+     * Process comparison operators: $gt, $gte, $lt, $lte.
+     * Each one is answered with a single range query over sortable values.
+     */
+    ndd::RoaringBitmap process_comparison_operator(
+        const std::string& field,
+        const nlohmann::json& val,
+        FieldType type,
+        const std::string& op) const {
+        if(type != FieldType::Number) {
+            throw std::runtime_error(op + " operator is only supported for numeric fields");
+        }
+
+        // Bounds live in sortable uint32_t space: 0 is the minimum, UINT32_MAX the maximum.
+        const uint32_t pivot = json_to_sortable_numeric(val);
+
+        if(op == "$gte") {
+            // Inclusive lower bound: [value, MAX]
+            return numeric_index_->range(field, pivot, UINT32_MAX);
+        }
+        if(op == "$lte") {
+            // Inclusive upper bound: [0, value]
+            return numeric_index_->range(field, 0, pivot);
+        }
+        if(op == "$gt") {
+            // Exclusive lower bound: (value, MAX]; empty when value is already MAX
+            if(pivot == UINT32_MAX) {
+                return ndd::RoaringBitmap();
+            }
+            return numeric_index_->range(field, pivot + 1, UINT32_MAX);
+        }
+        if(op == "$lt") {
+            // Exclusive upper bound: [0, value); empty when value is already 0
+            if(pivot == 0) {
+                return ndd::RoaringBitmap();
+            }
+            return numeric_index_->range(field, 0, pivot - 1);
+        }
+
+        throw std::runtime_error("Unsupported comparison operator: " + op);
+    }
+
 public:
     Filter(const std::string& path, const std::string& index_id) :
         index_id_(index_id),
@@ -189,7 +355,19 @@ class Filter {
         mdbx_env_close(env_);
     }
 
-    // Compute the filter bitmap based on the provided JSON filter array
+    /**
+     * Computes a RoaringBitmap of IDs matching all filter conditions using AND semantics.
+     *
+     * Filter format: array of conditions, e.g.:
+     * [{"field1": {"$eq": "value"}}, {"field2": {"$gt": 10}}]
+     *
+     * Each condition is evaluated independently to produce a bitmap, then all bitmaps
+     * are intersected (AND) to get the final result.
+     *
+     * Optimization: Conditions are sorted by cardinality (smallest first) before
+     * intersection to minimize work on subsequent AND operations.
+ */ + ndd::RoaringBitmap computeFilterBitmap(const nlohmann::json& filter_array) const { if(!filter_array.is_array()) { throw std::runtime_error("Filter must be an array"); @@ -234,111 +412,27 @@ class Filter { const std::string op = expr.begin().key(); const auto& val = expr.begin().value(); - if(op == "$eq") { - if(type == FieldType::Number) { - uint32_t sortable_val; - if(val.is_number_integer()) { - sortable_val = ndd::filter::int_to_sortable(val.get()); - } else if(val.is_number()) { - sortable_val = ndd::filter::float_to_sortable(val.get()); - } else { - throw std::runtime_error("$eq value for numeric field must be a number"); - } - or_result = numeric_index_->range(field, sortable_val, sortable_val); - } else { - if(!val.is_string() && !val.is_number_integer() && !val.is_boolean()) { - throw std::runtime_error("$eq value must be string, integer or boolean"); - } - std::string str_val; - if(val.is_string()) { - str_val = val.get(); - } else if(val.is_boolean()) { - str_val = val.get() ? 
"1" : "0"; - } else { - str_val = std::to_string(val.get()); - if (str_val.size() > 255) throw std::runtime_error("Category value too long"); - } - std::string key = format_filter_key(field, str_val); - or_result = category_index_->get_bitmap_by_key(key); - } - } else if(op == "$in") { - if(!val.is_array()) { - throw std::runtime_error("$in must be array"); - } - if(val.empty()) { - LOG_DEBUG("Empty $in array for field: " << field); - } else { - for(const auto& v : val) { - if(type == FieldType::Number) { - uint32_t sortable_val; - if(v.is_number_integer()) { - sortable_val = ndd::filter::int_to_sortable(v.get()); - } else if(v.is_number()) { - sortable_val = ndd::filter::float_to_sortable(v.get()); - } else { - throw std::runtime_error( - "$in value for numeric field must be a number"); - } - or_result |= numeric_index_->range(field, sortable_val, sortable_val); - } else { - if(!v.is_string() && !v.is_number_integer() && !v.is_boolean()) { - throw std::runtime_error( - "$in values must be string, integer or boolean"); - } - std::string str_val; - if(v.is_string()) { - str_val = v.get(); - } else if(v.is_boolean()) { - str_val = v.get() ? 
"1" : "0"; - } else { - str_val = std::to_string(v.get()); - } - if(!str_val.empty()) { - if (str_val.size() > 255) throw std::runtime_error("Category value too long"); - std::string key = format_filter_key(field, str_val); - or_result |= category_index_->get_bitmap_by_key(key); - } - } - } - } - } else if(op == "$range") { - if(!val.is_array() || val.size() != 2) { - throw std::runtime_error( - "$range must be [start, end] array with exactly 2 elements"); - } - - if(type == FieldType::Number) { - uint32_t start_val, end_val; - - if(val[0].is_number_integer()) { - start_val = ndd::filter::int_to_sortable(val[0].get()); - } else if(val[0].is_number()) { - start_val = ndd::filter::float_to_sortable(val[0].get()); - } else { - throw std::runtime_error("Range start must be a number"); - } - - if(val[1].is_number_integer()) { - end_val = ndd::filter::int_to_sortable(val[1].get()); - } else if(val[1].is_number()) { - end_val = ndd::filter::float_to_sortable(val[1].get()); - } else { - throw std::runtime_error("Range end must be a number"); - } - - if(start_val > end_val) { - throw std::runtime_error("Invalid range: start > end"); - } + LOG_INFO(1215, index_id_, "Processing operator " << op << " for field '" << field << "'"); - or_result = numeric_index_->range(field, start_val, end_val); + // Dispatch to operator-specific helper functions + try { + if(op == "$eq") { + or_result = process_eq_operator(field, val, type); + } else if(op == "$in") { + or_result = process_in_operator(field, val, type); + } else if(op == "$range") { + or_result = process_range_operator(field, val, type); + } else if(op == "$gt" || op == "$gte" || op == "$lt" || op == "$lte") { + or_result = process_comparison_operator(field, val, type, op); } else { - throw std::runtime_error( - "$range operator is only supported for numeric fields"); + LOG_ERROR(1217, index_id_, "Unsupported operator: " << op << " for field '" << field << "'"); + throw std::runtime_error("Unsupported operator: " + op); } - } 
else {
-                    throw std::runtime_error("Unsupported operator: " + op);
+                } catch(const std::exception& e) {
+                    LOG_ERROR(1216, index_id_, "Operator " << op << " failed for field '" << field << "': " << e.what());
+                    throw;
                 }
-                
+
                 partial_results.push_back(std::move(or_result));
             }
 
@@ -350,11 +444,16 @@ class Filter {
 
         if (partial_results.empty()) return ndd::RoaringBitmap();
 
+        // AND all conditions together. Start with smallest bitmap (index 0),
+        // then intersect with each subsequent bitmap. Short-circuit on empty result.
         ndd::RoaringBitmap final_result = partial_results[0];
         for(size_t i = 1; i < partial_results.size(); ++i) {
             final_result &= partial_results[i];
             // If result becomes empty, stop early
-            if(final_result.isEmpty()) return final_result;
+            if(final_result.isEmpty()) {
+                LOG_INFO(1219, index_id_, "Filter computation complete: early termination with empty result");
+                return final_result;
+            }
         }
 
         return final_result;
diff --git a/tests/filter_test.cpp b/tests/filter_test.cpp
index 101be3403..65dadf897 100644
--- a/tests/filter_test.cpp
+++ b/tests/filter_test.cpp
@@ -214,6 +214,250 @@ TEST_F(FilterTest, NumericDelete) {
     // Remove
     // remove_filters_from_json uses the whole object
     filter->remove_filters_from_json(1, R"({"score": 100})");
-    
+
     EXPECT_EQ(filter->countIdsMatchingFilter(query), 0);
 }
+
+TEST_F(FilterTest, GtOperatorInteger) {
+    // Setup: ID 100: age=20, ID 101: age=25, ID 102: age=30, ID 103: age=35
+    filter->add_filters_from_json(100, R"({"age": 20})");
+    filter->add_filters_from_json(101, R"({"age": 25})");
+    filter->add_filters_from_json(102, R"({"age": 30})");
+    filter->add_filters_from_json(103, R"({"age": 35})");
+
+    // Query: age > 25
+    json query = json::array({
+        {{"age", {{"$gt", 25}}}}
+    });
+
+    auto ids = filter->getIdsMatchingFilter(query);
+
+    // Should match 102 (30) and 103 (35), NOT 101 (25)
+    ASSERT_EQ(ids.size(), 2);  // fatal: ids[0] and ids[1] are read below
+    std::sort(ids.begin(), ids.end());
+    EXPECT_EQ(ids[0], 102);
+    EXPECT_EQ(ids[1], 103);
+}
+
+TEST_F(FilterTest, GeOperatorInteger) {
+    // Setup: ID 100: age=20, ID 101: age=25, ID 102: age=30, ID 103: age=35
+    filter->add_filters_from_json(100, R"({"age": 20})");
+    filter->add_filters_from_json(101, R"({"age": 25})");
+    filter->add_filters_from_json(102, R"({"age": 30})");
+    filter->add_filters_from_json(103, R"({"age": 35})");
+
+    // Query: age >= 25
+    json query = json::array({
+        {{"age", {{"$gte", 25}}}}
+    });
+
+    auto ids = filter->getIdsMatchingFilter(query);
+
+    // Should match 101 (25), 102 (30), and 103 (35)
+    ASSERT_EQ(ids.size(), 3);  // fatal: ids[0..2] are read below
+    std::sort(ids.begin(), ids.end());
+    EXPECT_EQ(ids[0], 101);
+    EXPECT_EQ(ids[1], 102);
+    EXPECT_EQ(ids[2], 103);
+}
+
+TEST_F(FilterTest, GtGeOperatorFloat) {
+    // Setup with float values
+    filter->add_filters_from_json(1, R"({"price": 9.99})");
+    filter->add_filters_from_json(2, R"({"price": 10.5})");
+    filter->add_filters_from_json(3, R"({"price": 15.0})");
+    filter->add_filters_from_json(4, R"({"price": 20.25})");
+
+    // Test $gt
+    json query_gt = json::array({
+        {{"price", {{"$gt", 10.5}}}}
+    });
+    auto ids_gt = filter->getIdsMatchingFilter(query_gt);
+    EXPECT_EQ(ids_gt.size(), 2); // IDs 3, 4
+
+    // Test $gte
+    json query_ge = json::array({
+        {{"price", {{"$gte", 10.5}}}}
+    });
+    auto ids_ge = filter->getIdsMatchingFilter(query_ge);
+    EXPECT_EQ(ids_ge.size(), 3); // IDs 2, 3, 4
+}
+
+TEST_F(FilterTest, GtOperatorEdgeCaseMax) {
+    // Setup
+    filter->add_filters_from_json(1, R"({"value": 2147483647})"); // INT32_MAX
+    filter->add_filters_from_json(2, R"({"value": 2147483646})");
+
+    // Query: value > INT32_MAX
+    json query = json::array({
+        {{"value", {{"$gt", 2147483647}}}}
+    });
+
+    auto ids = filter->getIdsMatchingFilter(query);
+
+    // Should return empty (no value greater than max int)
+    EXPECT_EQ(ids.size(), 0);
+}
+
+TEST_F(FilterTest, GtWithAndLogic) {
+    // Setup: ID 1: city=NY, age=30, ID 2: city=NY, age=40, ID 3: city=LA, age=40
+    filter->add_filters_from_json(1, R"({"city": "NY", "age": 30})");
+    filter->add_filters_from_json(2, R"({"city": "NY", "age": 40})");
+    filter->add_filters_from_json(3, R"({"city": "LA", "age": 40})");
+
+    // Query: city=NY AND age > 35
+    json query = json::array({
+        {{"city", {{"$eq", "NY"}}}},
+        {{"age", {{"$gt", 35}}}}
+    });
+
+    auto ids = filter->getIdsMatchingFilter(query);
+
+    // Should match only ID 2 (NY + age 40)
+    ASSERT_EQ(ids.size(), 1);  // fatal: ids[0] is read below
+    EXPECT_EQ(ids[0], 2);
+}
+
+TEST_F(FilterTest, GtOperatorErrorNonNumeric) {
+    // Setup string field
+    filter->add_to_filter("city", "Paris", 1);
+
+    // Query: city > "Paris" (should throw error)
+    json query = json::array({
+        {{"city", {{"$gt", "Paris"}}}}
+    });
+
+    EXPECT_THROW(
+        filter->getIdsMatchingFilter(query),
+        std::runtime_error
+    );
+}
+
+TEST_F(FilterTest, GtGeOperatorNegativeNumbers) {
+    // Setup with negative numbers
+    filter->add_filters_from_json(1, R"({"temperature": -10})");
+    filter->add_filters_from_json(2, R"({"temperature": -5})");
+    filter->add_filters_from_json(3, R"({"temperature": 0})");
+    filter->add_filters_from_json(4, R"({"temperature": 5})");
+
+    // Query: temperature > -5
+    json query_gt = json::array({
+        {{"temperature", {{"$gt", -5}}}}
+    });
+    auto ids_gt = filter->getIdsMatchingFilter(query_gt);
+    EXPECT_EQ(ids_gt.size(), 2); // IDs 3, 4 (0 and 5)
+
+    // Query: temperature >= -5
+    json query_ge = json::array({
+        {{"temperature", {{"$gte", -5}}}}
+    });
+    auto ids_ge = filter->getIdsMatchingFilter(query_ge);
+    EXPECT_EQ(ids_ge.size(), 3); // IDs 2, 3, 4 (-5, 0, 5)
+}
+
+TEST_F(FilterTest, LtOperatorInteger) {
+    // Setup: ID 100: age=20, ID 101: age=25, ID 102: age=30, ID 103: age=35
+    filter->add_filters_from_json(100, R"({"age": 20})");
+    filter->add_filters_from_json(101, R"({"age": 25})");
+    filter->add_filters_from_json(102, R"({"age": 30})");
+    filter->add_filters_from_json(103, R"({"age": 35})");
+
+    // Query: age < 30
+    json query = json::array({
+        {{"age", {{"$lt", 30}}}}
+    });
+
+    auto ids = filter->getIdsMatchingFilter(query);
+
+    // Should match 100 (20) and 101 (25), NOT 102 (30)
+    ASSERT_EQ(ids.size(), 2);  // fatal: ids[0] and ids[1] are read below
+    std::sort(ids.begin(), ids.end());
+    EXPECT_EQ(ids[0], 100);
+    EXPECT_EQ(ids[1], 101);
+}
+
+TEST_F(FilterTest, LeOperatorInteger) {
+    // Setup: ID 100: age=20, ID 101: age=25, ID 102: age=30, ID 103: age=35
+    filter->add_filters_from_json(100, R"({"age": 20})");
+    filter->add_filters_from_json(101, R"({"age": 25})");
+    filter->add_filters_from_json(102, R"({"age": 30})");
+    filter->add_filters_from_json(103, R"({"age": 35})");
+
+    // Query: age <= 30
+    json query = json::array({
+        {{"age", {{"$lte", 30}}}}
+    });
+
+    auto ids = filter->getIdsMatchingFilter(query);
+
+    // Should match 100 (20), 101 (25), and 102 (30)
+    ASSERT_EQ(ids.size(), 3);  // fatal: ids[0..2] are read below
+    std::sort(ids.begin(), ids.end());
+    EXPECT_EQ(ids[0], 100);
+    EXPECT_EQ(ids[1], 101);
+    EXPECT_EQ(ids[2], 102);
+}
+
+TEST_F(FilterTest, LtLeOperatorFloat) {
+    // Setup with float values
+    filter->add_filters_from_json(1, R"({"price": 9.99})");
+    filter->add_filters_from_json(2, R"({"price": 10.5})");
+    filter->add_filters_from_json(3, R"({"price": 15.0})");
+    filter->add_filters_from_json(4, R"({"price": 20.25})");
+
+    // Test $lt
+    json query_lt = json::array({
+        {{"price", {{"$lt", 15.0}}}}
+    });
+    auto ids_lt = filter->getIdsMatchingFilter(query_lt);
+    EXPECT_EQ(ids_lt.size(), 2); // IDs 1, 2
+
+    // Test $lte
+    json query_le = json::array({
+        {{"price", {{"$lte", 15.0}}}}
+    });
+    auto ids_le = filter->getIdsMatchingFilter(query_le);
+    EXPECT_EQ(ids_le.size(), 3); // IDs 1, 2, 3
+}
+
+TEST_F(FilterTest, LtOperatorEdgeCaseMin) {
+    // Setup
+    filter->add_filters_from_json(1, R"({"value": -2147483648})"); // INT32_MIN
+    filter->add_filters_from_json(2, R"({"value": -2147483647})");
+
+    // Query: value < INT32_MIN
+    json query = json::array({
+        {{"value", {{"$lt", -2147483648}}}}
+    });
+
+    auto ids = filter->getIdsMatchingFilter(query);
+
+    // Should return empty (no value less than min int)
+    EXPECT_EQ(ids.size(), 0);
+}
+
+TEST_F(FilterTest, ComparisonRangeEquivalence) {
+    // Setup
+    filter->add_filters_from_json(1, R"({"age": 20})");
+    filter->add_filters_from_json(2, R"({"age": 25})");
+    filter->add_filters_from_json(3, R"({"age": 30})");
+    filter->add_filters_from_json(4, R"({"age": 35})");
+
+    // Test: $gte 25 AND $lte 30 should equal $range [25, 30]
+    json query_comparison = json::array({
+        {{"age", {{"$gte", 25}}}},
+        {{"age", {{"$lte", 30}}}}
+    });
+    auto ids_comp = filter->getIdsMatchingFilter(query_comparison);
+
+    json query_range = json::array({
+        {{"age", {{"$range", {25, 30}}}}}
+    });
+    auto ids_range = filter->getIdsMatchingFilter(query_range);
+
+    // Should produce identical results
+    EXPECT_EQ(ids_comp.size(), ids_range.size());
+    std::sort(ids_comp.begin(), ids_comp.end());
+    std::sort(ids_range.begin(), ids_range.end());
+    EXPECT_EQ(ids_comp, ids_range);
+}