From e57d9f20b4546cc787fe2980488a732efb9757f9 Mon Sep 17 00:00:00 2001 From: csun5285 Date: Thu, 6 Mar 2025 17:18:32 +0800 Subject: [PATCH 1/4] [enhance](variant) add ut for schema::util --- .../segment_v2/variant_column_writer_impl.cpp | 5 +- be/src/vec/common/schema_util.cpp | 42 +- be/src/vec/common/schema_util.h | 10 +- be/test/vec/common/schema_util_test.cpp | 514 ++++++++++++++++++ 4 files changed, 524 insertions(+), 47 deletions(-) create mode 100644 be/test/vec/common/schema_util_test.cpp diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp index 34fe6e085ecdc4..2c9f4afd02f1c3 100644 --- a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp +++ b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp @@ -403,9 +403,8 @@ Status VariantColumnWriterImpl::_process_sparse_column( return status; } VLOG_DEBUG << "dump sparse " - << vectorized::schema_util::dump_column( - vectorized::ColumnObject::get_sparse_column_type(), - ptr->get_sparse_column()); + << vectorized::Block::dump_column(ptr->get_sparse_column(), + vectorized::ColumnObject::get_sparse_column_type()); RETURN_IF_ERROR( _sparse_column_writer->append(column->get_nullmap(), column->get_data(), num_rows)); ++column_id; diff --git a/be/src/vec/common/schema_util.cpp b/be/src/vec/common/schema_util.cpp index 59d6d9bc1e9c94..a19469c1bcdab8 100644 --- a/be/src/vec/common/schema_util.cpp +++ b/be/src/vec/common/schema_util.cpp @@ -573,43 +573,6 @@ vectorized::ColumnObject::Subcolumns get_sorted_subcolumns( return sorted; } -// --------------------------- - -std::string dump_column(DataTypePtr type, const ColumnPtr& col) { - Block tmp; - tmp.insert(ColumnWithTypeAndName {col, type, col->get_name()}); - return tmp.dump_data(0, tmp.rows()); -} - -// --------------------------- -Status extract(ColumnPtr source, const PathInData& path, MutableColumnPtr& dst) { - auto type_string = std::make_shared(); - std::string jsonpath = path.to_jsonpath(); - bool is_nullable = source->is_nullable(); - auto json_type = is_nullable ? make_nullable(std::make_shared()) - : std::make_shared(); - ColumnsWithTypeAndName arguments { - {source, json_type, ""}, - {type_string->create_column_const(1, Field(String(jsonpath.data(), jsonpath.size()))), - type_string, ""}}; - auto function = - SimpleFunctionFactory::instance().get_function("jsonb_extract", arguments, json_type); - if (!function) { - return Status::InternalError("Not found function jsonb_extract"); - } - Block tmp_block {arguments}; - vectorized::ColumnNumbers argnum; - argnum.emplace_back(0); - argnum.emplace_back(1); - size_t result_column = tmp_block.columns(); - tmp_block.insert({nullptr, json_type, ""}); - RETURN_IF_ERROR(function->execute(nullptr, tmp_block, argnum, result_column, source->size())); - dst = tmp_block.get_by_position(result_column) - .column->convert_to_full_column_if_const() - ->assume_mutable(); - return Status::OK(); -} - bool has_schema_index_diff(const TabletSchema* new_schema, const TabletSchema* old_schema, int32_t new_col_idx, int32_t old_col_idx) { const auto& column_new = new_schema->column(new_col_idx); @@ -816,6 +779,8 @@ Status get_compaction_schema(const std::vector& rowsets, void calculate_variant_stats(const IColumn& encoded_sparse_column, segment_v2::VariantStatisticsPB* stats, size_t row_pos, size_t num_rows) { + int limit = stats->sparse_column_non_null_size().size() - + VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE; // Cast input column to ColumnMap type since sparse column is stored as a map const auto& map_column = assert_cast(encoded_sparse_column); @@ -840,8 +805,7 @@ void calculate_variant_stats(const IColumn& encoded_sparse_column, } // If path doesn't exist and we haven't hit the max statistics size limit, // add it with count 1 - else if (sparse_data_paths_statistics.size() < - VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE) { + else if (sparse_data_paths_statistics.size() < limit) { sparse_data_paths_statistics.emplace(path, 1); } } diff --git a/be/src/vec/common/schema_util.h b/be/src/vec/common/schema_util.h index a4101883fc97ae..065af0dcb859cb 100644 --- a/be/src/vec/common/schema_util.h +++ b/be/src/vec/common/schema_util.h @@ -122,17 +122,17 @@ void inherit_column_attributes(const TabletColumn& source, TabletColumn& target, vectorized::ColumnObject::Subcolumns get_sorted_subcolumns( const vectorized::ColumnObject::Subcolumns& subcolumns); -// Extract json data from source with path -Status extract(ColumnPtr source, const PathInData& path, MutableColumnPtr& dst); - -std::string dump_column(DataTypePtr type, const ColumnPtr& col); - bool has_schema_index_diff(const TabletSchema* new_schema, const TabletSchema* old_schema, int32_t new_col_idx, int32_t old_col_idx); // create ColumnMap TabletColumn create_sparse_column(const TabletColumn& variant); +// get the subpaths and sparse paths for the variant column +void get_subpaths(const TabletColumn& variant, + const std::unordered_map& path_stats, + std::unordered_map& uid_to_paths_set_info) + // Build the temporary schema for compaction, this will reduce the memory usage of compacting variant columns Status get_compaction_schema(const std::vector& rowsets, TabletSchemaSPtr& target); diff --git a/be/test/vec/common/schema_util_test.cpp b/be/test/vec/common/schema_util_test.cpp new file mode 100644 index 00000000000000..76debf7c79216c --- /dev/null +++ b/be/test/vec/common/schema_util_test.cpp @@ -0,0 +1,514 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/common/schema_util.h" + +#include +#include + +#include "olap/rowset/segment_v2/variant_column_writer_impl.h" + +#include "vec/json/parse2column.h" +#include "olap/rowset/beta_rowset_writer.h" +#include "olap/rowset/rowset_factory.h" +#include "olap/tablet_schema.h" +#include "olap/storage_engine.h" + +using namespace doris::vectorized; + +using namespace doris::segment_v2; + +using namespace doris; + +constexpr static uint32_t MAX_PATH_LEN = 1024; +constexpr static std::string_view dest_dir = "./ut_dir/schema_util_test"; +constexpr static std::string_view tmp_dir = "./ut_dir/tmp"; + +class SchemaUtilTest : public testing::Test { +protected: + void SetUp() override { + // absolute dir + char buffer[MAX_PATH_LEN]; + EXPECT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr); + _curreent_dir = std::string(buffer); + _absolute_dir = _curreent_dir + std::string(dest_dir); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); + EXPECT_TRUE(io::global_local_filesystem()->create_directory(_absolute_dir).ok()); + + // tmp dir + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok()); + EXPECT_TRUE(io::global_local_filesystem()->create_directory(tmp_dir).ok()); + std::vector paths; + paths.emplace_back(std::string(tmp_dir), 1024000000); + auto tmp_file_dirs = std::make_unique(paths); + EXPECT_TRUE(tmp_file_dirs->init().ok()); + ExecEnv::GetInstance()->set_tmp_file_dir(std::move(tmp_file_dirs)); + + // storage engine + doris::EngineOptions options; + auto engine = std::make_unique(options); + _engine_ref = engine.get(); + std::cout << "absolute_dir: " << _absolute_dir << std::endl; + _data_dir = std::make_unique(*_engine_ref, _absolute_dir); + static_cast(_data_dir->update_capacity()); + EXPECT_TRUE(_data_dir->init(true).ok()); + ExecEnv::GetInstance()->set_storage_engine(std::move(engine)); + } + void TearDown() override { + //EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok()); + _engine_ref = nullptr; + ExecEnv::GetInstance()->set_storage_engine(nullptr); + } + +public: + SchemaUtilTest() = default; + virtual ~SchemaUtilTest() = default; + +private: + StorageEngine* _engine_ref = nullptr; + std::unique_ptr _data_dir = nullptr; + TabletSharedPtr _tablet = nullptr; + std::string _absolute_dir; + std::string _curreent_dir; +}; + +void construct_column(ColumnPB* column_pb, TabletIndexPB* tablet_index, int64_t index_id, + const std::string& index_name, int32_t col_unique_id, + const std::string& column_type, const std::string& column_name, + const IndexType& index_type) { + column_pb->set_unique_id(col_unique_id); + column_pb->set_name(column_name); + column_pb->set_type(column_type); + column_pb->set_is_nullable(false); + column_pb->set_is_bf_column(true); + tablet_index->set_index_id(index_id); + tablet_index->set_index_name(index_name); + tablet_index->set_index_type(index_type); + tablet_index->add_col_unique_id(col_unique_id); +} + +void construct_subcolumn(TabletSchemaSPtr schema, const FieldType& type, int32_t col_unique_id, + std::string_view path, std::vector* subcolumns) { + TabletColumn subcol; + subcol.set_type(type); + subcol.set_is_nullable(true); + subcol.set_unique_id(-1); + subcol.set_parent_unique_id(col_unique_id); + vectorized::PathInData col_path(path); + subcol.set_path_info(col_path); + subcol.set_name(col_path.get_path()); + schema->append_column(subcol); + subcolumns->emplace_back(std::move(subcol)); +} + +TEST_F(SchemaUtilTest, inherit_column_attributes) { + TabletSchemaPB schema_pb; + schema_pb.set_keys_type(KeysType::DUP_KEYS); + schema_pb.set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2); + + construct_column(schema_pb.add_column(), schema_pb.add_index(), 10000, "key_index", 0, "INT", + "key", IndexType::INVERTED); + construct_column(schema_pb.add_column(), schema_pb.add_index(), 10001, "v1_index", 1, "VARIANT", + "v1", IndexType::INVERTED); + construct_column(schema_pb.add_column(), schema_pb.add_index(), 10003, "v3_index", 3, "VARIANT", + "v3", IndexType::INVERTED); + + TabletSchemaSPtr tablet_schema = std::make_shared(); + tablet_schema->init_from_pb(schema_pb); + std::vector subcolumns; + + construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_STRING, 1, "v1.b", &subcolumns); + construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_INT, 1, "v1.c", &subcolumns); + + construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_ARRAY, 3, "v3.d", &subcolumns); + construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_FLOAT, 3, "v3.a", &subcolumns); + + schema_util::inherit_column_attributes(tablet_schema); + for (const auto& col : subcolumns) { + switch (col._parent_col_unique_id) { + case 1: + EXPECT_TRUE(tablet_schema->inverted_index(col) != nullptr); + break; + case 3: + EXPECT_TRUE(tablet_schema->inverted_index(col) == nullptr); + break; + default: + EXPECT_TRUE(false); + } + } + EXPECT_EQ(tablet_schema->inverted_indexes().size(), 7); + + for (const auto& col : tablet_schema->_cols) { + if (!col->is_extracted_column()) { + continue; + } + switch (col->_parent_col_unique_id) { + case 1: + EXPECT_TRUE(col->is_bf_column()); + break; + case 3: + EXPECT_TRUE(!col->is_bf_column()); + break; + default: + EXPECT_TRUE(false); + } + } +} + +std::unordered_map construct_column_map_with_random_values( + auto& column_map, int key_size, int value_size, const std::string& prefix) { + std::unordered_map key_value_counts; + auto& key = assert_cast(column_map->get_keys()); + auto& value = assert_cast(column_map->get_values()); + auto& offsets = column_map->get_offsets(); + + std::srand(42); + + for (int i = 0; i < key_size; ++i) { + std::string current_key = prefix + std::to_string(i); + + int value_count = std::rand() % value_size + 1; + key_value_counts[current_key] = value_count; + + for (int j = 0; j < value_count; ++j) { + key.insert_data(current_key.data(), current_key.size()); + auto value_str = prefix + std::to_string(j); + value.insert_data(value_str.data(), value_str.size()); + } + offsets.push_back(key.size()); + } + + return key_value_counts; +} + +TEST_F(SchemaUtilTest, calculate_variant_stats) { + VariantStatisticsPB stats; + auto column_map = ColumnMap::create( + ColumnString::create(), ColumnString::create(), ColumnArray::ColumnOffsets::create()); + + const auto& key_value_counts = construct_column_map_with_random_values(column_map, 200, 100, "key_"); + + schema_util::calculate_variant_stats(*column_map, &stats); + EXPECT_EQ(stats.sparse_column_non_null_size_size(), key_value_counts.size()); + + for (const auto& kv : key_value_counts) { + auto it = stats.sparse_column_non_null_size().find(kv.first); + EXPECT_NE(it, stats.sparse_column_non_null_size().end()); + EXPECT_EQ(it->second, kv.second); + } + + column_map->clear(); + const auto& key_value_counts2 = construct_column_map_with_random_values(column_map, 3000, 100, "key_"); + schema_util::calculate_variant_stats(*column_map, &stats); + EXPECT_EQ(stats.sparse_column_non_null_size_size(), 3000); + + for (const auto& [path, size] : stats.sparse_column_non_null_size()) { + auto first_size = key_value_counts.find(path) == key_value_counts.end() ? 0 : key_value_counts.find(path)->second; + auto second_size = key_value_counts2.find(path) == key_value_counts2.end() ? 0 : key_value_counts2.find(path)->second; + EXPECT_EQ(size, first_size + second_size); + } + + column_map->clear(); + const auto& key_value_counts3 = construct_column_map_with_random_values(column_map, VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE, 5, "key2_"); + schema_util::calculate_variant_stats(*column_map, &stats); + EXPECT_EQ(VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE, stats.sparse_column_non_null_size_size()); + + for (const auto& [path, size] : stats.sparse_column_non_null_size()) { + auto first_size = key_value_counts.find(path) == key_value_counts.end() ? 0 : key_value_counts.find(path)->second; + auto second_size = key_value_counts2.find(path) == key_value_counts2.end() ? 0 : key_value_counts2.find(path)->second; + auto third_size = key_value_counts3.find(path) == key_value_counts3.end() ? 0 : key_value_counts3.find(path)->second; + EXPECT_EQ(size, first_size + second_size + third_size); + } +} + +TEST_F(SchemaUtilTest, get_subpaths) { + TabletColumn variant; + variant.set_unique_id(1); + variant.set_variant_max_subcolumns_count(3); + + std::unordered_map path_stats; + path_stats[1] = { + {"path1", 1000}, + {"path2", 800}, + {"path3", 500}, + {"path4", 300}, + {"path5", 200} + }; + + std::unordered_map uid_to_paths_set_info; + schema_util::get_subpaths(variant, path_stats, uid_to_paths_set_info); + + EXPECT_EQ(uid_to_paths_set_info[1].sub_path_set.size(), 3); + EXPECT_EQ(uid_to_paths_set_info[1].sparse_path_set.size(), 2); + + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path1") != uid_to_paths_set_info[1].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path2") != uid_to_paths_set_info[1].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path3") != uid_to_paths_set_info[1].sub_path_set.end()); + + EXPECT_TRUE(uid_to_paths_set_info[1].sparse_path_set.find("path4") != uid_to_paths_set_info[1].sparse_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sparse_path_set.find("path5") != uid_to_paths_set_info[1].sparse_path_set.end()); +} + +TEST_F(SchemaUtilTest, get_subpaths_equal_to_max) { + TabletColumn variant; + variant.set_unique_id(1); + variant.set_variant_max_subcolumns_count(3); + + std::unordered_map path_stats; + path_stats[1] = { + {"path1", 1000}, + {"path2", 800}, + {"path3", 500} + }; + + std::unordered_map uid_to_paths_set_info; + schema_util::get_subpaths(variant, path_stats, uid_to_paths_set_info); + + EXPECT_EQ(uid_to_paths_set_info[1].sub_path_set.size(), 3); + EXPECT_EQ(uid_to_paths_set_info[1].sparse_path_set.size(), 0); + + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path1") != uid_to_paths_set_info[1].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path2") != uid_to_paths_set_info[1].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path3") != uid_to_paths_set_info[1].sub_path_set.end()); +} + +TEST_F(SchemaUtilTest, get_subpaths_multiple_variants) { + TabletColumn variant1; + variant1.set_unique_id(1); + variant1.set_variant_max_subcolumns_count(3); + + TabletColumn variant2; + variant2.set_unique_id(2); + variant2.set_variant_max_subcolumns_count(2); + + TabletColumn variant3; + variant3.set_unique_id(3); + variant3.set_variant_max_subcolumns_count(4); + + std::unordered_map path_stats; + path_stats[1] = { + {"path1", 1000}, + {"path2", 800}, + {"path3", 500}, + {"path4", 300}, + {"path5", 200} + }; + path_stats[2] = { + {"path1", 1000}, + {"path2", 800} + }; + path_stats[3] = { + {"path1", 1000}, + {"path2", 800}, + {"path3", 500}, + {"path4", 300} + }; + path_stats[4] = { + {"path1", 1000}, + {"path2", 800}, + {"path3", 500}, + {"path4", 300}, + {"path5", 200} + }; + + std::unordered_map uid_to_paths_set_info; + schema_util::get_subpaths(variant1, path_stats, uid_to_paths_set_info); + schema_util::get_subpaths(variant2, path_stats, uid_to_paths_set_info); + schema_util::get_subpaths(variant3, path_stats, uid_to_paths_set_info); + + EXPECT_EQ(uid_to_paths_set_info[1].sub_path_set.size(), 3); + EXPECT_EQ(uid_to_paths_set_info[1].sparse_path_set.size(), 2); + + EXPECT_EQ(uid_to_paths_set_info[2].sub_path_set.size(), 2); + EXPECT_EQ(uid_to_paths_set_info[2].sparse_path_set.size(), 0); + + EXPECT_EQ(uid_to_paths_set_info[3].sub_path_set.size(), 4); + EXPECT_EQ(uid_to_paths_set_info[3].sparse_path_set.size(), 0); + + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path1") != uid_to_paths_set_info[1].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path2") != uid_to_paths_set_info[1].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path3") != uid_to_paths_set_info[1].sub_path_set.end()); + + EXPECT_TRUE(uid_to_paths_set_info[1].sparse_path_set.find("path4") != uid_to_paths_set_info[1].sparse_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sparse_path_set.find("path5") != uid_to_paths_set_info[1].sparse_path_set.end()); + + EXPECT_TRUE(uid_to_paths_set_info[2].sub_path_set.find("path1") != uid_to_paths_set_info[2].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[2].sub_path_set.find("path2") != uid_to_paths_set_info[2].sub_path_set.end()); + + EXPECT_TRUE(uid_to_paths_set_info[3].sub_path_set.find("path1") != uid_to_paths_set_info[3].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[3].sub_path_set.find("path2") != uid_to_paths_set_info[3].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[3].sub_path_set.find("path3") != uid_to_paths_set_info[3].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[3].sub_path_set.find("path4") != uid_to_paths_set_info[3].sub_path_set.end()); +} + +TEST_F(SchemaUtilTest, get_subpaths_no_path_stats) { + TabletColumn variant; + variant.set_unique_id(1); + variant.set_variant_max_subcolumns_count(3); + + std::unordered_map path_stats; + path_stats[2] = { + {"path1", 1000}, + {"path2", 800} + }; + + std::unordered_map uid_to_paths_set_info; + schema_util::get_subpaths(variant, path_stats, uid_to_paths_set_info); + + EXPECT_EQ(uid_to_paths_set_info[1].sub_path_set.size(), 0); + EXPECT_EQ(uid_to_paths_set_info[1].sparse_path_set.size(), 0); +} + +void construct_column(ColumnPB* column_pb, int32_t col_unique_id, + const std::string& column_type, const std::string& column_name, bool is_key = false) { + column_pb->set_unique_id(col_unique_id); + column_pb->set_name(column_name); + column_pb->set_type(column_type); + column_pb->set_is_key(is_key); + column_pb->set_is_nullable(false); + if (column_type == "VARIANT") { + column_pb->set_variant_max_subcolumns_count(3); + } +} + +void construct_tablet_index(TabletIndexPB* tablet_index, int64_t index_id, const std::string& index_name, int32_t col_unique_id) { + tablet_index->set_index_id(index_id); + tablet_index->set_index_name(index_name); + tablet_index->set_index_type(IndexType::INVERTED); + tablet_index->add_col_unique_id(col_unique_id); +} + +void fill_string_column_with_test_data(auto& column_string, int size) { + std::srand(42); + for (int i = 0; i < size; i++) { + std::string json_str = "{"; + int num_pairs = std::rand() % 10 + 1; + for (int j = 0; j < num_pairs; j++) { + std::string key = "key" + std::to_string(j); + if (std::rand() % 2 == 0) { + int value = std::rand() % 100; + json_str += "\"" + key + "\" : " + std::to_string(value); + } else { + std::string value = "str" + std::to_string(std::rand() % 100); + json_str += "\"" + key + "\" : \"" + value + "\""; + } + if (j < num_pairs - 1) { + json_str += ", "; + } + } + json_str += "}"; + vectorized::Field str(json_str); + column_string->insert_data(json_str.data(), json_str.size()); + } +} + +void fill_varaint_column(auto& variant_column, int size) { + auto type_string =std::make_shared(); + auto column = type_string->create_column(); + auto column_string = assert_cast(column.get()); + fill_string_column_with_test_data(column_string, size); + vectorized::ParseConfig config; + config.enable_flatten_nested = false; + //ColumnObject* variant_column_object = assert_cast(variant_column.get()); + parse_json_to_variant(*variant_column, *column_string, config); +} + +void fill_block_with_test_data(vectorized::Block* block, int size) { + auto columns = block->mutate_columns(); + // insert key + for (int i = 0; i < size; i++) { + vectorized::Field key = i; + columns[0]->insert(key); + } + + // insert v1 + fill_varaint_column(columns[1], size); + + // insert v2 + for (int i = 0; i < size; i++) { + vectorized::Field v2("V2"); + columns[2]->insert(v2); + } + + //insert v3 + fill_varaint_column(columns[3], size); + + // insert v4 + for (int i = 0; i < size; i++) { + vectorized::Field v4(i); + columns[4]->insert(v4); + } +} +static int64_t inc_id = 1000; +static RowsetWriterContext rowset_writer_context(const std::unique_ptr& data_dir, + const TabletSchemaSPtr& schema, + const std::string& tablet_path) { + RowsetWriterContext context; + RowsetId rowset_id; + rowset_id.init(inc_id++); + context.rowset_id = rowset_id; + context.rowset_type = BETA_ROWSET; + context.data_dir = data_dir.get(); + context.rowset_state = VISIBLE; + context.tablet_schema = schema; + context.tablet_path = tablet_path; + context.version = Version(inc_id, inc_id); + context.max_rows_per_segment = 200; + return context; + } + +TEST_F(SchemaUtilTest, collect_path_stats) { + TabletSchemaPB schema_pb; + + construct_column(schema_pb.add_column(), 0, "INT", "key", true); + construct_column(schema_pb.add_column(), 1, "VARIANT", "v1"); + construct_column(schema_pb.add_column(), 2, "STRING", "v2"); + construct_column(schema_pb.add_column(), 3, "VARIANT", "v3"); + construct_column(schema_pb.add_column(), 4, "INT", "v4"); + + TabletSchemaSPtr tablet_schema = std::make_shared(); + tablet_schema->init_from_pb(schema_pb); + + TabletMetaSharedPtr tablet_meta(new TabletMeta(tablet_schema)); + + _tablet = std::make_shared(*_engine_ref, tablet_meta, _data_dir.get()); + EXPECT_TRUE(_tablet->init().ok()); + + + const auto& res = RowsetFactory::create_rowset_writer( + *_engine_ref, rowset_writer_context(_data_dir, tablet_schema, _tablet->tablet_path()), + false); + EXPECT_TRUE(res.has_value()) << res.error(); + const auto& rowset_writer = res.value(); + + vectorized::Block block = tablet_schema->create_block(); + + fill_block_with_test_data(&block, 1000); + auto st = rowset_writer->add_block(&block); + EXPECT_TRUE(st.ok()) << st.msg(); + st = rowset_writer->flush(); + EXPECT_TRUE(st.ok()) << st.msg(); + + RowsetSharedPtr rowset; + EXPECT_TRUE(rowset_writer->build(rowset).ok()); + EXPECT_TRUE(_tablet->add_rowset(rowset).ok()); + EXPECT_TRUE(rowset->num_segments() == 5); +} + + From ff50412bfe8a0c6f0308a0884d5ef230907def75 Mon Sep 17 00:00:00 2001 From: csun5285 Date: Fri, 7 Mar 2025 12:26:58 +0800 Subject: [PATCH 2/4] [enhance](ut) add ut for schema util --- .../segment_v2/variant_column_writer_impl.cpp | 8 +- be/src/vec/common/schema_util.cpp | 11 +- be/src/vec/common/schema_util.h | 9 +- be/src/vec/olap/olap_data_convertor.cpp | 8 +- be/src/vec/olap/olap_data_convertor.h | 6 + be/test/common/schema_util_test.cpp | 109 ------ .../vec/common/schema_util_rowset_test.cpp | 265 +++++++++++++ be/test/vec/common/schema_util_test.cpp | 370 +++++------------- 8 files changed, 388 insertions(+), 398 deletions(-) delete mode 100644 be/test/common/schema_util_test.cpp create mode 100644 be/test/vec/common/schema_util_rowset_test.cpp diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp index 2c9f4afd02f1c3..2c13dc877550ea 100644 --- a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp +++ b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp @@ -403,7 +403,8 @@ Status VariantColumnWriterImpl::_process_sparse_column( return status; } VLOG_DEBUG << "dump sparse " - << vectorized::Block::dump_column(ptr->get_sparse_column(), + << vectorized::Block::dump_column( + ptr->get_sparse_column(), vectorized::ColumnObject::get_sparse_column_type()); RETURN_IF_ERROR( _sparse_column_writer->append(column->get_nullmap(), column->get_data(), num_rows)); @@ -497,10 +498,11 @@ bool VariantColumnWriterImpl::is_finalized() const { Status VariantColumnWriterImpl::append_data(const uint8_t** ptr, size_t num_rows) { DCHECK(!is_finalized()); - const auto& src = *reinterpret_cast(*ptr); + const auto* column = reinterpret_cast(*ptr); + const auto& src = *reinterpret_cast(column->column_data); auto* dst_ptr = assert_cast(_column.get()); // TODO: if direct write we could avoid copy - dst_ptr->insert_range_from(src, 0, num_rows); + dst_ptr->insert_range_from(src, column->row_pos, num_rows); return Status::OK(); } diff --git a/be/src/vec/common/schema_util.cpp b/be/src/vec/common/schema_util.cpp index a19469c1bcdab8..9a42ed4ffd2fee 100644 --- a/be/src/vec/common/schema_util.cpp +++ b/be/src/vec/common/schema_util.cpp @@ -530,7 +530,8 @@ Status _parse_variant_columns(Block& block, const std::vector& variant_pos, } if (scalar_root_column->is_column_string()) { - variant_column = ColumnObject::create(var.max_subcolumns_count()); + // now, subcolumns have not been set, so we set it to 0 + variant_column = ColumnObject::create(0); parse_json_to_variant(*variant_column.get(), assert_cast(*scalar_root_column), config); } else { @@ -604,8 +605,6 @@ TabletColumn create_sparse_column(const TabletColumn& variant) { return res; } -using PathToNoneNullValues = std::unordered_map; - Status collect_path_stats(const RowsetSharedPtr& rs, std::unordered_map& uid_to_path_stats) { SegmentCacheHandle segment_cache; @@ -779,8 +778,10 @@ Status get_compaction_schema(const std::vector& rowsets, void calculate_variant_stats(const IColumn& encoded_sparse_column, segment_v2::VariantStatisticsPB* stats, size_t row_pos, size_t num_rows) { - int limit = stats->sparse_column_non_null_size().size() - - VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE; + DCHECK(stats->sparse_column_non_null_size_size() <= + VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE); + int limit = VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE - + stats->sparse_column_non_null_size_size(); // Cast input column to ColumnMap type since sparse column is stored as a map const auto& map_column = assert_cast(encoded_sparse_column); diff --git a/be/src/vec/common/schema_util.h b/be/src/vec/common/schema_util.h index 065af0dcb859cb..7f673d88f3df87 100644 --- a/be/src/vec/common/schema_util.h +++ b/be/src/vec/common/schema_util.h @@ -54,6 +54,9 @@ struct ColumnWithTypeAndName; const std::string SPARSE_COLUMN_PATH = "__DORIS_VARIANT_SPARSE__"; namespace doris::vectorized::schema_util { + +using PathToNoneNullValues = std::unordered_map; + /// Returns number of dimensions in Array type. 0 if type is not array. size_t get_number_of_dimensions(const IDataType& type); @@ -131,7 +134,11 @@ TabletColumn create_sparse_column(const TabletColumn& variant); // get the subpaths and sparse paths for the variant column void get_subpaths(const TabletColumn& variant, const std::unordered_map& path_stats, - std::unordered_map& uid_to_paths_set_info) + std::unordered_map& uid_to_paths_set_info); + +// collect path stats from the rowset +Status collect_path_stats(const RowsetSharedPtr& rs, + std::unordered_map& uid_to_path_stats); // Build the temporary schema for compaction, this will reduce the memory usage of compacting variant columns Status get_compaction_schema(const std::vector& rowsets, TabletSchemaSPtr& target); diff --git a/be/src/vec/olap/olap_data_convertor.cpp b/be/src/vec/olap/olap_data_convertor.cpp index 099f9af080c772..a5264dd09e339e 100644 --- a/be/src/vec/olap/olap_data_convertor.cpp +++ b/be/src/vec/olap/olap_data_convertor.cpp @@ -1186,6 +1186,8 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorVariant::convert_to_olap() } // Do nothing, the column writer will finally do finalize and write subcolumns one by one // since we are not sure the final column(type and columns) until the end of the last block + // need to return the position of the column data + _variant_column_data = std::make_unique(_value_ptr, _row_pos); return Status::OK(); } @@ -1193,9 +1195,9 @@ const void* OlapBlockDataConvertor::OlapColumnDataConvertorVariant::get_data() c if (!_value_ptr) { return _root_data_convertor->get_data(); } - // return the ptr of original column, see VariantColumnWriterImpl::append_data - // which will cast to ColumnObject - return _value_ptr; + // return the ptr of VariantColumnData, see VariantColumnWriterImpl::append_data + // which will cast to VariantColumnData + return _variant_column_data.get(); } const void* OlapBlockDataConvertor::OlapColumnDataConvertorVariant::get_data_at( size_t offset) const { diff --git a/be/src/vec/olap/olap_data_convertor.h b/be/src/vec/olap/olap_data_convertor.h index 1b19f419b40f27..f7ee108c29d3aa 100644 --- a/be/src/vec/olap/olap_data_convertor.h +++ b/be/src/vec/olap/olap_data_convertor.h @@ -72,6 +72,11 @@ class IOlapColumnDataAccessor { virtual ~IOlapColumnDataAccessor() = default; }; +struct VariantColumnData { + const void* column_data; + size_t row_pos; +}; + class OlapBlockDataConvertor { public: OlapBlockDataConvertor() = default; @@ -533,6 +538,7 @@ class OlapBlockDataConvertor { private: const void* _value_ptr; std::unique_ptr _root_data_convertor; + std::unique_ptr _variant_column_data; }; private: diff --git a/be/test/common/schema_util_test.cpp b/be/test/common/schema_util_test.cpp deleted file mode 100644 index faf30ce1de6767..00000000000000 --- a/be/test/common/schema_util_test.cpp +++ /dev/null @@ -1,109 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "vec/common/schema_util.h" - -#include - -namespace doris { - -class SchemaUtilTest : public testing::Test {}; - -void construct_column(ColumnPB* column_pb, TabletIndexPB* tablet_index, int64_t index_id, - const std::string& index_name, int32_t col_unique_id, - const std::string& column_type, const std::string& column_name, - const IndexType& index_type) { - column_pb->set_unique_id(col_unique_id); - column_pb->set_name(column_name); - column_pb->set_type(column_type); - column_pb->set_is_nullable(true); - column_pb->set_is_bf_column(true); - tablet_index->set_index_id(index_id); - tablet_index->set_index_name(index_name); - tablet_index->set_index_type(index_type); - tablet_index->add_col_unique_id(col_unique_id); -} - -void construct_subcolumn(TabletSchemaSPtr schema, const FieldType& type, int32_t col_unique_id, - std::string_view path, std::vector* subcolumns) { - TabletColumn subcol; - subcol.set_type(type); - subcol.set_is_nullable(true); - subcol.set_unique_id(-1); - subcol.set_parent_unique_id(col_unique_id); - vectorized::PathInData col_path(path); - subcol.set_path_info(col_path); - subcol.set_name(col_path.get_path()); - schema->append_column(subcol); - subcolumns->emplace_back(std::move(subcol)); -} - -TEST_F(SchemaUtilTest, inherit_column_attributes) { - TabletSchemaPB schema_pb; - schema_pb.set_keys_type(KeysType::DUP_KEYS); - schema_pb.set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2); - - construct_column(schema_pb.add_column(), schema_pb.add_index(), 10000, "key_index", 0, "INT", - "key", IndexType::INVERTED); - construct_column(schema_pb.add_column(), schema_pb.add_index(), 10001, "v1_index", 1, "VARIANT", - "v1", IndexType::INVERTED); - construct_column(schema_pb.add_column(), schema_pb.add_index(), 10003, "v3_index", 3, "VARIANT", - "v3", IndexType::INVERTED); - - TabletSchemaSPtr tablet_schema = std::make_shared(); - tablet_schema->init_from_pb(schema_pb); - std::vector subcolumns; - - construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_STRING, 1, "v1.b", &subcolumns); - construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_INT, 1, "v1.c", &subcolumns); - - construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_ARRAY, 3, "v3.d", &subcolumns); - construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_FLOAT, 3, "v3.a", &subcolumns); - - vectorized::schema_util::inherit_column_attributes(tablet_schema); - for (const auto& col : subcolumns) { - switch (col._parent_col_unique_id) { - case 1: - EXPECT_TRUE(tablet_schema->inverted_index(col) != nullptr); - break; - case 3: - EXPECT_TRUE(tablet_schema->inverted_index(col) == nullptr); - break; - default: - EXPECT_TRUE(false); - } - } - EXPECT_EQ(tablet_schema->inverted_indexes().size(), 7); - - for (const auto& col : tablet_schema->_cols) { - if (!col->is_extracted_column()) { - continue; - } - switch (col->_parent_col_unique_id) { - case 1: - EXPECT_TRUE(col->is_bf_column()); - break; - case 3: - EXPECT_TRUE(!col->is_bf_column()); - break; - default: - EXPECT_TRUE(false); - } - } -} - -} // namespace doris diff --git a/be/test/vec/common/schema_util_rowset_test.cpp b/be/test/vec/common/schema_util_rowset_test.cpp new file mode 100644 index 00000000000000..eb4b2ad4c39b6c --- /dev/null +++ b/be/test/vec/common/schema_util_rowset_test.cpp @@ -0,0 +1,265 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "olap/rowset/beta_rowset_writer.h" +#include "olap/rowset/rowset_factory.h" +#include "olap/rowset/segment_v2/variant_column_writer_impl.h" +#include "olap/storage_engine.h" +#include "olap/tablet_schema.h" +#include "vec/common/schema_util.h" +#include "vec/json/parse2column.h" + +using namespace doris::vectorized; + +using namespace doris::segment_v2; + +using namespace doris; + +constexpr static uint32_t MAX_PATH_LEN = 1024; +constexpr static std::string_view dest_dir = "/ut_dir/schema_util_test"; +constexpr static std::string_view tmp_dir = "./ut_dir/tmp"; + +class SchemaUtilRowsetTest : public testing::Test { +protected: + void SetUp() override { + // absolute dir + char buffer[MAX_PATH_LEN]; + EXPECT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr); + _curreent_dir = std::string(buffer); + _absolute_dir = _curreent_dir + std::string(dest_dir); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); + EXPECT_TRUE(io::global_local_filesystem()->create_directory(_absolute_dir).ok()); + + // tmp dir + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok()); + EXPECT_TRUE(io::global_local_filesystem()->create_directory(tmp_dir).ok()); + std::vector paths; + paths.emplace_back(std::string(tmp_dir), 1024000000); + auto tmp_file_dirs = std::make_unique(paths); + EXPECT_TRUE(tmp_file_dirs->init().ok()); + ExecEnv::GetInstance()->set_tmp_file_dir(std::move(tmp_file_dirs)); + + // storage engine + doris::EngineOptions options; + auto engine = std::make_unique(options); + _engine_ref = engine.get(); + _data_dir = std::make_unique(*_engine_ref, _absolute_dir); + static_cast(_data_dir->update_capacity()); + EXPECT_TRUE(_data_dir->init(true).ok()); + ExecEnv::GetInstance()->set_storage_engine(std::move(engine)); + } + void TearDown() override { + //EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok()); + _engine_ref = nullptr; + ExecEnv::GetInstance()->set_storage_engine(nullptr); + } + +public: + SchemaUtilRowsetTest() = default; + virtual ~SchemaUtilRowsetTest() = default; + +private: + StorageEngine* _engine_ref = nullptr; + std::unique_ptr _data_dir = nullptr; + TabletSharedPtr _tablet = nullptr; + std::string _absolute_dir; + std::string _curreent_dir; +}; + +static void construct_column(ColumnPB* column_pb, int32_t col_unique_id, + const std::string& column_type, const std::string& column_name, + bool is_key = false) { + column_pb->set_unique_id(col_unique_id); + column_pb->set_name(column_name); + column_pb->set_type(column_type); + column_pb->set_is_key(is_key); + column_pb->set_is_nullable(false); + if (column_type == "VARIANT") { + column_pb->set_variant_max_subcolumns_count(3); + } +} + +// static void construct_tablet_index(TabletIndexPB* tablet_index, int64_t index_id, const std::string& index_name, int32_t col_unique_id) { +// tablet_index->set_index_id(index_id); +// tablet_index->set_index_name(index_name); +// tablet_index->set_index_type(IndexType::INVERTED); +// tablet_index->add_col_unique_id(col_unique_id); +// } + +static std::unordered_map all_path_stats; +static void fill_string_column_with_test_data(auto& column_string, int size, int uid) { + std::srand(42); + for (int i = 0; i < size; i++) { + std::string json_str = "{"; + int num_pairs = std::rand() % 10 + 1; + for (int j = 0; j < num_pairs; j++) { + std::string key = "key" + std::to_string(j); + if (std::rand() % 2 == 0) { + int value = std::rand() % 100; + json_str += "\"" + key + "\" : " + std::to_string(value); + } else { + std::string value = "str" + std::to_string(std::rand() % 100); + json_str += "\"" + key + "\" : \"" + value + "\""; + } + if (j < num_pairs - 1) { + json_str += ", "; + } + all_path_stats[uid][key] += 1; + } + json_str += "}"; + vectorized::Field str(json_str); + column_string->insert_data(json_str.data(), json_str.size()); + } +} + +static void fill_varaint_column(auto& variant_column, int size, int uid) { + auto type_string = std::make_shared(); + auto column = type_string->create_column(); + auto column_string = assert_cast(column.get()); + fill_string_column_with_test_data(column_string, size, uid); + vectorized::ParseConfig config; + config.enable_flatten_nested = false; + parse_json_to_variant(*variant_column, *column_string, config); +} + +static void fill_block_with_test_data(vectorized::Block* block, int size) { + auto columns = block->mutate_columns(); + // insert key + for (int i = 0; i < size; i++) { + vectorized::Field key = i; + columns[0]->insert(key); + } + + // insert v1 + fill_varaint_column(columns[1], size, 1); + + // insert v2 + for (int i = 0; i < size; i++) { + vectorized::Field v2("V2"); + columns[2]->insert(v2); + } + + // insert v3 + fill_varaint_column(columns[3], size, 3); + + // insert v4 + for (int i = 0; i < size; i++) { + vectorized::Field v4(i); + columns[4]->insert(v4); + } +} +static int64_t inc_id = 1000; +static RowsetWriterContext rowset_writer_context(const std::unique_ptr& data_dir, + const TabletSchemaSPtr& schema, + const std::string& tablet_path) { + RowsetWriterContext context; + RowsetId rowset_id; + rowset_id.init(inc_id); + context.rowset_id = rowset_id; + context.rowset_type = BETA_ROWSET; + context.data_dir = data_dir.get(); + context.rowset_state = VISIBLE; + context.tablet_schema = schema; + context.tablet_path = tablet_path; + context.version = Version(inc_id, inc_id); + context.max_rows_per_segment = 200; + inc_id++; + return context; +} + +static RowsetSharedPtr create_rowset(auto& rowset_writer, const TabletSchemaSPtr& tablet_schema) { + vectorized::Block block = tablet_schema->create_block(); + fill_block_with_test_data(&block, 1000); + auto st = rowset_writer->add_block(&block); + EXPECT_TRUE(st.ok()) << st.msg(); + st = rowset_writer->flush(); + EXPECT_TRUE(st.ok()) << st.msg(); + + RowsetSharedPtr rowset; + EXPECT_TRUE(rowset_writer->build(rowset).ok()); + EXPECT_TRUE(rowset->num_segments() == 5); + return rowset; +} + +TEST_F(SchemaUtilRowsetTest, collect_path_stats_and_get_compaction_schema) { + // 1.create tablet schema + TabletSchemaPB schema_pb; + construct_column(schema_pb.add_column(), 0, "INT", "key", true); + construct_column(schema_pb.add_column(), 1, "VARIANT", "v1"); + construct_column(schema_pb.add_column(), 2, "STRING", "v2"); + construct_column(schema_pb.add_column(), 3, "VARIANT", "v3"); + construct_column(schema_pb.add_column(), 4, "INT", "v4"); + TabletSchemaSPtr tablet_schema = std::make_shared(); + tablet_schema->init_from_pb(schema_pb); + + // 2. create tablet + TabletMetaSharedPtr tablet_meta(new TabletMeta(tablet_schema)); + _tablet = std::make_shared(*_engine_ref, tablet_meta, _data_dir.get()); + EXPECT_TRUE(_tablet->init().ok()); + EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok()); + + // 3. create rowset + std::vector rowsets; + for (int i = 0; i < 5; i++) { + const auto& res = RowsetFactory::create_rowset_writer( + *_engine_ref, + rowset_writer_context(_data_dir, tablet_schema, _tablet->tablet_path()), false); + EXPECT_TRUE(res.has_value()) << res.error(); + const auto& rowset_writer = res.value(); + auto rowset = create_rowset(rowset_writer, tablet_schema); + EXPECT_TRUE(_tablet->add_rowset(rowset).ok()); + rowsets.push_back(rowset); + } + + std::unordered_map path_stats; + for (const auto& rowset : rowsets) { + auto st = schema_util::collect_path_stats(rowset, path_stats); + EXPECT_TRUE(st.ok()) << st.msg(); + } + + for (const auto& [uid, path_stats] : path_stats) { + for (const auto& [path, size] : path_stats) { + EXPECT_EQ(all_path_stats[uid][path], size); + } + } + + // 4. get compaction schema + TabletSchemaSPtr compaction_schema = tablet_schema; + auto st = schema_util::get_compaction_schema(rowsets, compaction_schema); + EXPECT_TRUE(st.ok()) << st.msg(); + + // 5. check compaction schema + std::unordered_map> compaction_schema_map; + for (const auto& column : compaction_schema->columns()) { + if (column->parent_unique_id() > 0) { + compaction_schema_map[column->parent_unique_id()].push_back(column->name()); + } + } + for (auto& [uid, paths] : compaction_schema_map) { + EXPECT_EQ(paths.size(), 4); + std::sort(paths.begin(), paths.end()); + EXPECT_TRUE(paths[0].ends_with("__DORIS_VARIANT_SPARSE__")); + EXPECT_TRUE(paths[1].ends_with("key0")); + EXPECT_TRUE(paths[2].ends_with("key1")); + EXPECT_TRUE(paths[3].ends_with("key2")); + } +} diff --git a/be/test/vec/common/schema_util_test.cpp b/be/test/vec/common/schema_util_test.cpp index 76debf7c79216c..f5bd75d6d55245 100644 --- a/be/test/vec/common/schema_util_test.cpp +++ b/be/test/vec/common/schema_util_test.cpp @@ -22,76 +22,22 @@ #include "olap/rowset/segment_v2/variant_column_writer_impl.h" -#include "vec/json/parse2column.h" -#include "olap/rowset/beta_rowset_writer.h" -#include "olap/rowset/rowset_factory.h" -#include "olap/tablet_schema.h" -#include "olap/storage_engine.h" - using namespace doris::vectorized; using namespace doris::segment_v2; using namespace doris; -constexpr static uint32_t MAX_PATH_LEN = 1024; -constexpr static std::string_view dest_dir = "./ut_dir/schema_util_test"; -constexpr static std::string_view tmp_dir = "./ut_dir/tmp"; - class SchemaUtilTest : public testing::Test { -protected: - void SetUp() override { - // absolute dir - char buffer[MAX_PATH_LEN]; - EXPECT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr); - _curreent_dir = std::string(buffer); - _absolute_dir = _curreent_dir + std::string(dest_dir); - EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); - EXPECT_TRUE(io::global_local_filesystem()->create_directory(_absolute_dir).ok()); - - // tmp dir - EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok()); - EXPECT_TRUE(io::global_local_filesystem()->create_directory(tmp_dir).ok()); - std::vector paths; - paths.emplace_back(std::string(tmp_dir), 1024000000); - auto tmp_file_dirs = std::make_unique(paths); - EXPECT_TRUE(tmp_file_dirs->init().ok()); - ExecEnv::GetInstance()->set_tmp_file_dir(std::move(tmp_file_dirs)); - - // storage engine - doris::EngineOptions options; - auto engine = std::make_unique(options); - _engine_ref = engine.get(); - std::cout << "absolute_dir: " << _absolute_dir << std::endl; - _data_dir = std::make_unique(*_engine_ref, _absolute_dir); - static_cast(_data_dir->update_capacity()); - EXPECT_TRUE(_data_dir->init(true).ok()); - ExecEnv::GetInstance()->set_storage_engine(std::move(engine)); - } - void TearDown() override { - //EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); - EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); - EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok()); - _engine_ref = nullptr; - ExecEnv::GetInstance()->set_storage_engine(nullptr); - } - public: SchemaUtilTest() = default; virtual ~SchemaUtilTest() = default; - -private: - StorageEngine* _engine_ref = nullptr; - std::unique_ptr _data_dir = nullptr; - TabletSharedPtr _tablet = nullptr; - std::string _absolute_dir; - std::string _curreent_dir; }; -void construct_column(ColumnPB* column_pb, TabletIndexPB* tablet_index, int64_t index_id, - const std::string& index_name, int32_t col_unique_id, - const std::string& column_type, const std::string& column_name, - const IndexType& index_type) { +static void construct_column(ColumnPB* column_pb, TabletIndexPB* tablet_index, int64_t index_id, + const std::string& index_name, int32_t col_unique_id, + const std::string& column_type, const std::string& column_name, + const IndexType& index_type) { column_pb->set_unique_id(col_unique_id); column_pb->set_name(column_name); column_pb->set_type(column_type); @@ -103,8 +49,9 @@ void construct_column(ColumnPB* column_pb, TabletIndexPB* tablet_index, int64_t tablet_index->add_col_unique_id(col_unique_id); } -void construct_subcolumn(TabletSchemaSPtr schema, const FieldType& type, int32_t col_unique_id, - std::string_view path, std::vector* subcolumns) { +static void construct_subcolumn(TabletSchemaSPtr schema, const FieldType& type, + int32_t col_unique_id, std::string_view path, + std::vector* subcolumns) { TabletColumn subcol; subcol.set_type(type); subcol.set_is_nullable(true); @@ -171,7 +118,7 @@ TEST_F(SchemaUtilTest, inherit_column_attributes) { } } -std::unordered_map construct_column_map_with_random_values( +static std::unordered_map construct_column_map_with_random_values( auto& column_map, int key_size, int value_size, const std::string& prefix) { std::unordered_map key_value_counts; auto& key = assert_cast(column_map->get_keys()); @@ -179,13 +126,13 @@ std::unordered_map construct_column_map_with_random_values( auto& offsets = column_map->get_offsets(); std::srand(42); - + for (int i = 0; i < key_size; ++i) { std::string current_key = prefix + std::to_string(i); int value_count = std::rand() % value_size + 1; key_value_counts[current_key] = value_count; - + for (int j = 0; j < value_count; ++j) { key.insert_data(current_key.data(), current_key.size()); auto value_str = prefix + std::to_string(j); @@ -193,46 +140,63 @@ std::unordered_map construct_column_map_with_random_values( } offsets.push_back(key.size()); } - + return key_value_counts; } TEST_F(SchemaUtilTest, calculate_variant_stats) { VariantStatisticsPB stats; - auto column_map = ColumnMap::create( - ColumnString::create(), ColumnString::create(), ColumnArray::ColumnOffsets::create()); - - const auto& key_value_counts = construct_column_map_with_random_values(column_map, 200, 100, "key_"); - + auto column_map = ColumnMap::create(ColumnString::create(), ColumnString::create(), + ColumnArray::ColumnOffsets::create()); + + const auto& key_value_counts = + construct_column_map_with_random_values(column_map, 200, 100, "key_"); + + // calculate stats schema_util::calculate_variant_stats(*column_map, &stats); EXPECT_EQ(stats.sparse_column_non_null_size_size(), key_value_counts.size()); - + for (const auto& kv : key_value_counts) { auto it = stats.sparse_column_non_null_size().find(kv.first); EXPECT_NE(it, stats.sparse_column_non_null_size().end()); EXPECT_EQ(it->second, kv.second); } + // test with different key size column_map->clear(); - const auto& key_value_counts2 = construct_column_map_with_random_values(column_map, 3000, 100, "key_"); + const auto& key_value_counts2 = + construct_column_map_with_random_values(column_map, 3000, 100, "key_"); schema_util::calculate_variant_stats(*column_map, &stats); EXPECT_EQ(stats.sparse_column_non_null_size_size(), 3000); for (const auto& [path, size] : stats.sparse_column_non_null_size()) { - auto first_size = key_value_counts.find(path) == key_value_counts.end() ? 0 : key_value_counts.find(path)->second; - auto second_size = key_value_counts2.find(path) == key_value_counts2.end() ? 0 : key_value_counts2.find(path)->second; + auto first_size = key_value_counts.find(path) == key_value_counts.end() + ? 0 + : key_value_counts.find(path)->second; + auto second_size = key_value_counts2.find(path) == key_value_counts2.end() + ? 0 + : key_value_counts2.find(path)->second; EXPECT_EQ(size, first_size + second_size); } + // test with max size column_map->clear(); - const auto& key_value_counts3 = construct_column_map_with_random_values(column_map, VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE, 5, "key2_"); + const auto& key_value_counts3 = construct_column_map_with_random_values( + column_map, VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE, 5, "key2_"); schema_util::calculate_variant_stats(*column_map, &stats); - EXPECT_EQ(VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE, stats.sparse_column_non_null_size_size()); + EXPECT_EQ(VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE, + stats.sparse_column_non_null_size_size()); for (const auto& [path, size] : stats.sparse_column_non_null_size()) { - auto first_size = key_value_counts.find(path) == key_value_counts.end() ? 0 : key_value_counts.find(path)->second; - auto second_size = key_value_counts2.find(path) == key_value_counts2.end() ? 0 : key_value_counts2.find(path)->second; - auto third_size = key_value_counts3.find(path) == key_value_counts3.end() ? 0 : key_value_counts3.find(path)->second; + auto first_size = key_value_counts.find(path) == key_value_counts.end() + ? 0 + : key_value_counts.find(path)->second; + auto second_size = key_value_counts2.find(path) == key_value_counts2.end() + ? 0 + : key_value_counts2.find(path)->second; + auto third_size = key_value_counts3.find(path) == key_value_counts3.end() + ? 0 + : key_value_counts3.find(path)->second; EXPECT_EQ(size, first_size + second_size + third_size); } } @@ -240,29 +204,30 @@ TEST_F(SchemaUtilTest, calculate_variant_stats) { TEST_F(SchemaUtilTest, get_subpaths) { TabletColumn variant; variant.set_unique_id(1); - variant.set_variant_max_subcolumns_count(3); + variant.set_variant_max_subcolumns_count(3); std::unordered_map path_stats; path_stats[1] = { - {"path1", 1000}, - {"path2", 800}, - {"path3", 500}, - {"path4", 300}, - {"path5", 200} - }; + {"path1", 1000}, {"path2", 800}, {"path3", 500}, {"path4", 300}, {"path5", 200}}; + // get subpaths std::unordered_map uid_to_paths_set_info; schema_util::get_subpaths(variant, path_stats, uid_to_paths_set_info); EXPECT_EQ(uid_to_paths_set_info[1].sub_path_set.size(), 3); EXPECT_EQ(uid_to_paths_set_info[1].sparse_path_set.size(), 2); - EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path1") != uid_to_paths_set_info[1].sub_path_set.end()); - EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path2") != uid_to_paths_set_info[1].sub_path_set.end()); - EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path3") != uid_to_paths_set_info[1].sub_path_set.end()); - - EXPECT_TRUE(uid_to_paths_set_info[1].sparse_path_set.find("path4") != uid_to_paths_set_info[1].sparse_path_set.end()); - EXPECT_TRUE(uid_to_paths_set_info[1].sparse_path_set.find("path5") != uid_to_paths_set_info[1].sparse_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path1") != + uid_to_paths_set_info[1].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path2") != + uid_to_paths_set_info[1].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path3") != + uid_to_paths_set_info[1].sub_path_set.end()); + + EXPECT_TRUE(uid_to_paths_set_info[1].sparse_path_set.find("path4") != + uid_to_paths_set_info[1].sparse_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sparse_path_set.find("path5") != + uid_to_paths_set_info[1].sparse_path_set.end()); } TEST_F(SchemaUtilTest, get_subpaths_equal_to_max) { @@ -271,11 +236,7 @@ TEST_F(SchemaUtilTest, get_subpaths_equal_to_max) { variant.set_variant_max_subcolumns_count(3); std::unordered_map path_stats; - path_stats[1] = { - {"path1", 1000}, - {"path2", 800}, - {"path3", 500} - }; + path_stats[1] = {{"path1", 1000}, {"path2", 800}, {"path3", 500}}; std::unordered_map uid_to_paths_set_info; schema_util::get_subpaths(variant, path_stats, uid_to_paths_set_info); @@ -283,9 +244,12 @@ TEST_F(SchemaUtilTest, get_subpaths_equal_to_max) { EXPECT_EQ(uid_to_paths_set_info[1].sub_path_set.size(), 3); EXPECT_EQ(uid_to_paths_set_info[1].sparse_path_set.size(), 0); - EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path1") != uid_to_paths_set_info[1].sub_path_set.end()); - EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path2") != uid_to_paths_set_info[1].sub_path_set.end()); - EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path3") != uid_to_paths_set_info[1].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path1") != + uid_to_paths_set_info[1].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path2") != + uid_to_paths_set_info[1].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path3") != + uid_to_paths_set_info[1].sub_path_set.end()); } TEST_F(SchemaUtilTest, get_subpaths_multiple_variants) { @@ -303,29 +267,11 @@ TEST_F(SchemaUtilTest, get_subpaths_multiple_variants) { std::unordered_map path_stats; path_stats[1] = { - {"path1", 1000}, - {"path2", 800}, - {"path3", 500}, - {"path4", 300}, - {"path5", 200} - }; - path_stats[2] = { - {"path1", 1000}, - {"path2", 800} - }; - path_stats[3] = { - {"path1", 1000}, - {"path2", 800}, - {"path3", 500}, - {"path4", 300} - }; + {"path1", 1000}, {"path2", 800}, {"path3", 500}, {"path4", 300}, {"path5", 200}}; + path_stats[2] = {{"path1", 1000}, {"path2", 800}}; + path_stats[3] = {{"path1", 1000}, {"path2", 800}, {"path3", 500}, {"path4", 300}}; path_stats[4] = { - {"path1", 1000}, - {"path2", 800}, - {"path3", 500}, - {"path4", 300}, - {"path5", 200} - }; + {"path1", 1000}, {"path2", 800}, {"path3", 500}, {"path4", 300}, {"path5", 200}}; std::unordered_map uid_to_paths_set_info; schema_util::get_subpaths(variant1, path_stats, uid_to_paths_set_info); @@ -341,20 +287,31 @@ TEST_F(SchemaUtilTest, get_subpaths_multiple_variants) { EXPECT_EQ(uid_to_paths_set_info[3].sub_path_set.size(), 4); EXPECT_EQ(uid_to_paths_set_info[3].sparse_path_set.size(), 0); - EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path1") != uid_to_paths_set_info[1].sub_path_set.end()); - EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path2") != uid_to_paths_set_info[1].sub_path_set.end()); - EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path3") != uid_to_paths_set_info[1].sub_path_set.end()); - - EXPECT_TRUE(uid_to_paths_set_info[1].sparse_path_set.find("path4") != uid_to_paths_set_info[1].sparse_path_set.end()); - EXPECT_TRUE(uid_to_paths_set_info[1].sparse_path_set.find("path5") != uid_to_paths_set_info[1].sparse_path_set.end()); - - EXPECT_TRUE(uid_to_paths_set_info[2].sub_path_set.find("path1") != uid_to_paths_set_info[2].sub_path_set.end()); - EXPECT_TRUE(uid_to_paths_set_info[2].sub_path_set.find("path2") != uid_to_paths_set_info[2].sub_path_set.end()); - - EXPECT_TRUE(uid_to_paths_set_info[3].sub_path_set.find("path1") != uid_to_paths_set_info[3].sub_path_set.end()); - EXPECT_TRUE(uid_to_paths_set_info[3].sub_path_set.find("path2") != uid_to_paths_set_info[3].sub_path_set.end()); - EXPECT_TRUE(uid_to_paths_set_info[3].sub_path_set.find("path3") != uid_to_paths_set_info[3].sub_path_set.end()); - EXPECT_TRUE(uid_to_paths_set_info[3].sub_path_set.find("path4") != uid_to_paths_set_info[3].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path1") != + uid_to_paths_set_info[1].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path2") != + uid_to_paths_set_info[1].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path3") != + uid_to_paths_set_info[1].sub_path_set.end()); + + EXPECT_TRUE(uid_to_paths_set_info[1].sparse_path_set.find("path4") != + uid_to_paths_set_info[1].sparse_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sparse_path_set.find("path5") != + uid_to_paths_set_info[1].sparse_path_set.end()); + + EXPECT_TRUE(uid_to_paths_set_info[2].sub_path_set.find("path1") != + uid_to_paths_set_info[2].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[2].sub_path_set.find("path2") != + uid_to_paths_set_info[2].sub_path_set.end()); + + EXPECT_TRUE(uid_to_paths_set_info[3].sub_path_set.find("path1") != + uid_to_paths_set_info[3].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[3].sub_path_set.find("path2") != + uid_to_paths_set_info[3].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[3].sub_path_set.find("path3") != + uid_to_paths_set_info[3].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[3].sub_path_set.find("path4") != + uid_to_paths_set_info[3].sub_path_set.end()); } TEST_F(SchemaUtilTest, get_subpaths_no_path_stats) { @@ -363,10 +320,7 @@ TEST_F(SchemaUtilTest, get_subpaths_no_path_stats) { variant.set_variant_max_subcolumns_count(3); std::unordered_map path_stats; - path_stats[2] = { - {"path1", 1000}, - {"path2", 800} - }; + path_stats[2] = {{"path1", 1000}, {"path2", 800}}; std::unordered_map uid_to_paths_set_info; schema_util::get_subpaths(variant, path_stats, uid_to_paths_set_info); @@ -374,141 +328,3 @@ TEST_F(SchemaUtilTest, get_subpaths_no_path_stats) { EXPECT_EQ(uid_to_paths_set_info[1].sub_path_set.size(), 0); EXPECT_EQ(uid_to_paths_set_info[1].sparse_path_set.size(), 0); } - -void construct_column(ColumnPB* column_pb, int32_t col_unique_id, - const std::string& column_type, const std::string& column_name, bool is_key = false) { - column_pb->set_unique_id(col_unique_id); - column_pb->set_name(column_name); - column_pb->set_type(column_type); - column_pb->set_is_key(is_key); - column_pb->set_is_nullable(false); - if (column_type == "VARIANT") { - column_pb->set_variant_max_subcolumns_count(3); - } -} - -void construct_tablet_index(TabletIndexPB* tablet_index, int64_t index_id, const std::string& index_name, int32_t col_unique_id) { - tablet_index->set_index_id(index_id); - tablet_index->set_index_name(index_name); - tablet_index->set_index_type(IndexType::INVERTED); - tablet_index->add_col_unique_id(col_unique_id); -} - -void fill_string_column_with_test_data(auto& column_string, int size) { - std::srand(42); - for (int i = 0; i < size; i++) { - std::string json_str = "{"; - int num_pairs = std::rand() % 10 + 1; - for (int j = 0; j < num_pairs; j++) { - std::string key = "key" + std::to_string(j); - if (std::rand() % 2 == 0) { - int value = std::rand() % 100; - json_str += "\"" + key + "\" : " + std::to_string(value); - } else { - std::string value = "str" + std::to_string(std::rand() % 100); - json_str += "\"" + key + "\" : \"" + value + "\""; - } - if (j < num_pairs - 1) { - json_str += ", "; - } - } - json_str += "}"; - vectorized::Field str(json_str); - column_string->insert_data(json_str.data(), json_str.size()); - } -} - -void fill_varaint_column(auto& variant_column, int size) { - auto type_string =std::make_shared(); - auto column = type_string->create_column(); - auto column_string = assert_cast(column.get()); - fill_string_column_with_test_data(column_string, size); - vectorized::ParseConfig config; - config.enable_flatten_nested = false; - //ColumnObject* variant_column_object = assert_cast(variant_column.get()); - parse_json_to_variant(*variant_column, *column_string, config); -} - -void fill_block_with_test_data(vectorized::Block* block, int size) { - auto columns = block->mutate_columns(); - // insert key - for (int i = 0; i < size; i++) { - vectorized::Field key = i; - columns[0]->insert(key); - } - - // insert v1 - fill_varaint_column(columns[1], size); - - // insert v2 - for (int i = 0; i < size; i++) { - vectorized::Field v2("V2"); - columns[2]->insert(v2); - } - - //insert v3 - fill_varaint_column(columns[3], size); - - // insert v4 - for (int i = 0; i < size; i++) { - vectorized::Field v4(i); - columns[4]->insert(v4); - } -} -static int64_t inc_id = 1000; -static RowsetWriterContext rowset_writer_context(const std::unique_ptr& data_dir, - const TabletSchemaSPtr& schema, - const std::string& tablet_path) { - RowsetWriterContext context; - RowsetId rowset_id; - rowset_id.init(inc_id++); - context.rowset_id = rowset_id; - context.rowset_type = BETA_ROWSET; - context.data_dir = data_dir.get(); - context.rowset_state = VISIBLE; - context.tablet_schema = schema; - context.tablet_path = tablet_path; - context.version = Version(inc_id, inc_id); - context.max_rows_per_segment = 200; - return context; - } - -TEST_F(SchemaUtilTest, collect_path_stats) { - TabletSchemaPB schema_pb; - - construct_column(schema_pb.add_column(), 0, "INT", "key", true); - construct_column(schema_pb.add_column(), 1, "VARIANT", "v1"); - construct_column(schema_pb.add_column(), 2, "STRING", "v2"); - construct_column(schema_pb.add_column(), 3, "VARIANT", "v3"); - construct_column(schema_pb.add_column(), 4, "INT", "v4"); - - TabletSchemaSPtr tablet_schema = std::make_shared(); - tablet_schema->init_from_pb(schema_pb); - - TabletMetaSharedPtr tablet_meta(new TabletMeta(tablet_schema)); - - _tablet = std::make_shared(*_engine_ref, tablet_meta, _data_dir.get()); - EXPECT_TRUE(_tablet->init().ok()); - - - const auto& res = RowsetFactory::create_rowset_writer( - *_engine_ref, rowset_writer_context(_data_dir, tablet_schema, _tablet->tablet_path()), - false); - EXPECT_TRUE(res.has_value()) << res.error(); - const auto& rowset_writer = res.value(); - - vectorized::Block block = tablet_schema->create_block(); - - fill_block_with_test_data(&block, 1000); - auto st = rowset_writer->add_block(&block); - EXPECT_TRUE(st.ok()) << st.msg(); - st = rowset_writer->flush(); - EXPECT_TRUE(st.ok()) << st.msg(); - - RowsetSharedPtr rowset; - EXPECT_TRUE(rowset_writer->build(rowset).ok()); - EXPECT_TRUE(_tablet->add_rowset(rowset).ok()); - EXPECT_TRUE(rowset->num_segments() == 5); -} - - From 9d508c8a7ad1542708873bdc235ec4e6b053244b Mon Sep 17 00:00:00 2001 From: csun5285 Date: Mon, 10 Mar 2025 11:03:31 +0800 Subject: [PATCH 3/4] [fix](expression) fix Dynamically compute function signature --- .../functions/ComputeSignatureHelper.java | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/ComputeSignatureHelper.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/ComputeSignatureHelper.java index 662185bc8eb9d6..252ca08bcaa636 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/ComputeSignatureHelper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/ComputeSignatureHelper.java @@ -429,12 +429,27 @@ private static FunctionSignature defaultDateTimeV2PrecisionPromotion( return signature; } - /** dynamicComputeVariantArgs */ + /** + * Dynamically compute function signature for variant type arguments. + * This method handles cases where the function signature contains variant types + * and needs to be adjusted based on the actual argument types. + * + * @param signature Original function signature + * @param arguments List of actual arguments passed to the function + * @return Updated function signature with resolved variant types + */ public static FunctionSignature dynamicComputeVariantArgs( FunctionSignature signature, List arguments) { + // If return type is not variant, no need to compute + if (!(signature.returnType instanceof VariantType)) { + return signature; + } + List newArgTypes = Lists.newArrayListWithCapacity(arguments.size()); boolean findVariantType = false; + for (int i = 0; i < arguments.size(); i++) { + // Get signature type for current argument position DataType sigType; if (i >= signature.argumentsTypes.size()) { sigType = signature.getVarArgType().orElseThrow( @@ -442,15 +457,23 @@ public static FunctionSignature dynamicComputeVariantArgs( } else { sigType = signature.argumentsTypes.get(i); } + + // Get actual type of the argument expression DataType expressionType = arguments.get(i).getDataType(); + + // If both signature type and expression type are variant, + // use expression type and update return type if (sigType instanceof VariantType && expressionType instanceof VariantType) { newArgTypes.add(expressionType); signature = signature.withReturnType(expressionType); findVariantType = true; } else { + // Otherwise keep original signature type newArgTypes.add(sigType); } } + + // Update signature with new argument types if any variant type was found if (findVariantType) { signature = signature.withArgumentTypes(signature.hasVarArgs, newArgTypes); } From 332112b3bfe24ab4b480afd1bfa5e9701d37960d Mon Sep 17 00:00:00 2001 From: csun5285 Date: Mon, 10 Mar 2025 11:28:38 +0800 Subject: [PATCH 4/4] fix build --- .../compaction/util/index_compaction_utils.cpp | 2 +- be/test/vec/common/schema_util_test.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp b/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp index 530dca8054c19a..36160518d29467 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp @@ -343,7 +343,7 @@ class IndexCompactionUtils { // only base compaction can handle delete predicate BaseCompaction compaction(*engine_ref, tablet); compaction._input_rowsets = std::move(rowsets); - compaction.build_basic_info(); + RETURN_IF_ERROR(compaction.build_basic_info()); std::vector input_rs_readers; create_input_rowsets_readers(compaction, input_rs_readers); diff --git a/be/test/vec/common/schema_util_test.cpp b/be/test/vec/common/schema_util_test.cpp index f5bd75d6d55245..dc916eae1062e3 100644 --- a/be/test/vec/common/schema_util_test.cpp +++ b/be/test/vec/common/schema_util_test.cpp @@ -153,7 +153,7 @@ TEST_F(SchemaUtilTest, calculate_variant_stats) { construct_column_map_with_random_values(column_map, 200, 100, "key_"); // calculate stats - schema_util::calculate_variant_stats(*column_map, &stats); + schema_util::calculate_variant_stats(*column_map, &stats, 0, 200); EXPECT_EQ(stats.sparse_column_non_null_size_size(), key_value_counts.size()); for (const auto& kv : key_value_counts) { @@ -166,7 +166,7 @@ TEST_F(SchemaUtilTest, calculate_variant_stats) { column_map->clear(); const auto& key_value_counts2 = construct_column_map_with_random_values(column_map, 3000, 100, "key_"); - schema_util::calculate_variant_stats(*column_map, &stats); + schema_util::calculate_variant_stats(*column_map, &stats, 0, 3000); EXPECT_EQ(stats.sparse_column_non_null_size_size(), 3000); for (const auto& [path, size] : stats.sparse_column_non_null_size()) { @@ -183,7 +183,7 @@ TEST_F(SchemaUtilTest, calculate_variant_stats) { column_map->clear(); const auto& key_value_counts3 = construct_column_map_with_random_values( column_map, VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE, 5, "key2_"); - schema_util::calculate_variant_stats(*column_map, &stats); + schema_util::calculate_variant_stats(*column_map, &stats, 0, VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE); EXPECT_EQ(VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE, stats.sparse_column_non_null_size_size());