Skip to content

Commit c5668d9

Browse files
committed
schema: add per-table tablet options
Unlike with vnodes, each tablet is served by only a single shard, and it is associated with a memtable that, when flushed, creates sstables whose token range is confined to the tablet owning them. On one hand, this allows for far better agility and elasticity, since migration of tablets between nodes or shards does not require rewriting most if not all of the sstables, as required with vnodes (at the cleanup phase). On the other hand, having too few tablets might limit performance, either because the table is not served by all shards or because of imbalance between shards caused by quantization. The number of tablets per table has to be a power of 2 with the current design, and when divided by the number of shards, some shards will serve N tablets, while others may serve N+1, and when N is small, (N+1)/N may be significantly larger than 1. For example, with N=1, some shards will serve 2 tablet replicas and some will serve only 1, causing an imbalance of 100%. Now, simply allocating a lot more tablets for each table may theoretically address this problem, but practically: a. Each tablet has memory overhead, and having too many tablets in a system with many tables and many tablets for each of them may overwhelm the system and cause out-of-memory errors. b. Too-small tablets cause a proliferation of small sstables that are less efficient to access, have higher metadata overhead (due to per-sstable overhead), and might exhaust the system's open file-descriptor limit. The options introduced in this change can help the user tune the system in two ways: 1. Sizing the table to prevent unnecessary tablet splits and migrations. This can be done when the table is created, or later on, using ALTER TABLE. 2. Controlling min_per_shard_tablet_count to improve tablet balancing, for hot tables. Signed-off-by: Benny Halevy <[email protected]>
1 parent ad8b064 commit c5668d9

17 files changed

+390
-13
lines changed

configure.py

+1
Original file line numberDiff line numberDiff line change
@@ -1012,6 +1012,7 @@ def find_ninja():
10121012
'db/view/view_update_generator.cc',
10131013
'db/virtual_table.cc',
10141014
'db/virtual_tables.cc',
1015+
'db/tablet_options.cc',
10151016
'index/secondary_index_manager.cc',
10161017
'index/secondary_index.cc',
10171018
'utils/UUID_gen.cc',

cql3/statements/cf_prop_defs.cc

+36-1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
*/
1010

1111
#include "cql3/statements/cf_prop_defs.hh"
12+
#include "cql3/statements/request_validations.hh"
1213
#include "data_dictionary/data_dictionary.hh"
1314
#include "db/extensions.hh"
1415
#include "db/tags/extension.hh"
@@ -20,6 +21,7 @@
2021
#include "tombstone_gc.hh"
2122
#include "db/per_partition_rate_limit_extension.hh"
2223
#include "db/per_partition_rate_limit_options.hh"
24+
#include "db/tablet_options.hh"
2325
#include "utils/bloom_calculations.hh"
2426

2527
#include <boost/algorithm/string/predicate.hpp>
@@ -52,6 +54,8 @@ const sstring cf_prop_defs::COMPACTION_STRATEGY_CLASS_KEY = "class";
5254

5355
const sstring cf_prop_defs::COMPACTION_ENABLED_KEY = "enabled";
5456

57+
const sstring cf_prop_defs::KW_TABLETS = "tablets";
58+
5559
schema::extensions_map cf_prop_defs::make_schema_extensions(const db::extensions& exts) const {
5660
schema::extensions_map er;
5761
for (auto& p : exts.schema_extensions()) {
@@ -68,20 +72,30 @@ schema::extensions_map cf_prop_defs::make_schema_extensions(const db::extensions
6872
return er;
6973
}
7074

75+
// Looks up `ks_name` in `db` and returns the keyspace handle.
// A data_dictionary::no_such_keyspace failure is not propagated as-is: it is rethrown as
// the exception built by request_validations::invalid_request(), carrying the same message.
data_dictionary::keyspace cf_prop_defs::find_keyspace(const data_dictionary::database db, std::string_view ks_name) {
    try {
        return db.find_keyspace(ks_name);
    } catch (const data_dictionary::no_such_keyspace& not_found) {
        // Re-report the lookup failure through the request-validation error path.
        throw request_validations::invalid_request("{}", not_found.what());
    }
}
82+
7183
void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name, const schema::extensions_map& schema_extensions) const {
7284
// Skip validation if the compaction strategy class is already set as it means we've already
7385
// prepared (and redoing it would set strategyClass back to null, which we don't want)
7486
if (_compaction_strategy_class) {
7587
return;
7688
}
7789

90+
const auto& ks = find_keyspace(db, ks_name);
91+
7892
static std::set<sstring> keywords({
7993
KW_COMMENT,
8094
KW_GCGRACESECONDS, KW_CACHING, KW_DEFAULT_TIME_TO_LIVE,
8195
KW_MIN_INDEX_INTERVAL, KW_MAX_INDEX_INTERVAL, KW_SPECULATIVE_RETRY,
8296
KW_BF_FP_CHANCE, KW_MEMTABLE_FLUSH_PERIOD, KW_COMPACTION,
8397
KW_COMPRESSION, KW_CRC_CHECK_CHANCE, KW_ID, KW_PAXOSGRACESECONDS,
84-
KW_SYNCHRONOUS_UPDATES
98+
KW_SYNCHRONOUS_UPDATES, KW_TABLETS,
8599
});
86100
static std::set<sstring> obsolete_keywords({
87101
sstring("index_interval"),
@@ -162,6 +176,16 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
162176
}
163177

164178
speculative_retry::from_sstring(get_string(KW_SPECULATIVE_RETRY, speculative_retry(speculative_retry::type::NONE, 0).to_sstring()));
179+
180+
if (auto tablet_options_map = get_tablet_options()) {
181+
if (!ks.uses_tablets()) {
182+
throw exceptions::configuration_exception("tablet options cannot be used when tablets are disabled for the keyspace");
183+
}
184+
if (!db.features().tablet_options) {
185+
throw exceptions::configuration_exception("tablet options cannot be used until all nodes in the cluster enable this feature");
186+
}
187+
db::tablet_options::validate(*tablet_options_map);
188+
}
165189
}
166190

167191
std::map<sstring, sstring> cf_prop_defs::get_compaction_type_options() const {
@@ -252,6 +276,13 @@ const db::per_partition_rate_limit_options* cf_prop_defs::get_per_partition_rate
252276
return &ext->get_options();
253277
}
254278

279+
std::optional<db::tablet_options::map_type> cf_prop_defs::get_tablet_options() const {
280+
if (auto tablet_options = get_map(KW_TABLETS)) {
281+
return tablet_options.value();
282+
}
283+
return std::nullopt;
284+
}
285+
255286
void cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_map schema_extensions, const data_dictionary::database& db, sstring ks_name) const {
256287
if (has_property(KW_COMMENT)) {
257288
builder.set_comment(get_string(KW_COMMENT, ""));
@@ -351,6 +382,10 @@ void cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_
351382

352383
builder.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(tags_map));
353384
}
385+
386+
if (auto tablet_options_opt = get_map(KW_TABLETS)) {
387+
builder.set_tablet_options(std::move(*tablet_options_opt));
388+
}
354389
}
355390

356391
void cf_prop_defs::validate_minimum_int(const sstring& field, int32_t minimum_value, int32_t default_value) const

cql3/statements/cf_prop_defs.hh

+6
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,14 @@
1818

1919
namespace data_dictionary {
2020
class database;
21+
class keyspace;
2122
}
2223

2324
class tombstone_gc_options;
2425

2526
namespace db {
2627
class extensions;
28+
class tablet_options;
2729
}
2830
namespace cdc {
2931
class options;
@@ -60,6 +62,8 @@ public:
6062
static const sstring COMPACTION_STRATEGY_CLASS_KEY;
6163
static const sstring COMPACTION_ENABLED_KEY;
6264

65+
static const sstring KW_TABLETS;
66+
6367
// FIXME: In origin the following consts are in CFMetaData.
6468
static constexpr int32_t DEFAULT_DEFAULT_TIME_TO_LIVE = 0;
6569
static constexpr int32_t DEFAULT_MIN_INDEX_INTERVAL = 128;
@@ -70,6 +74,7 @@ public:
7074

7175
private:
7276
mutable std::optional<sstables::compaction_strategy_type> _compaction_strategy_class;
77+
static data_dictionary::keyspace find_keyspace(const data_dictionary::database db, std::string_view ks_name);
7378
public:
7479
std::optional<sstables::compaction_strategy_type> get_compaction_strategy_class() const;
7580

@@ -103,6 +108,7 @@ public:
103108
int32_t get_paxos_grace_seconds() const;
104109
std::optional<table_id> get_id() const;
105110
bool get_synchronous_updates_flag() const;
111+
std::optional<db::tablet_options::map_type> get_tablet_options() const;
106112

107113
void apply_to_builder(schema_builder& builder, schema::extensions_map schema_extensions, const data_dictionary::database& db, sstring ks_name) const;
108114
void validate_minimum_int(const sstring& field, int32_t minimum_value, int32_t default_value) const;

data_dictionary/data_dictionary.cc

+5
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,11 @@ keyspace::is_internal() const {
5555
return _ops->is_internal(*this);
5656
}
5757

58+
bool
59+
keyspace::uses_tablets() const {
60+
return metadata()->uses_tablets();
61+
}
62+
5863
const locator::abstract_replication_strategy&
5964
keyspace::get_replication_strategy() const {
6065
return _ops->get_replication_strategy(*this);

data_dictionary/data_dictionary.hh

+1
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ private:
8787
keyspace(const impl* ops, const void* keyspace);
8888
public:
8989
bool is_internal() const;
90+
bool uses_tablets() const;
9091
lw_shared_ptr<keyspace_metadata> metadata() const;
9192
const user_types_metadata& user_types() const;
9293
const locator::abstract_replication_strategy& get_replication_strategy() const;

data_dictionary/keyspace_metadata.hh

+3
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,9 @@ public:
6565
std::optional<unsigned> initial_tablets() const {
6666
return _initial_tablets;
6767
}
68+
bool uses_tablets() const noexcept {
69+
return _initial_tablets.has_value();
70+
}
6871
const std::unordered_map<sstring, schema_ptr>& cf_meta_data() const {
6972
return _cf_meta_data;
7073
}

db/CMakeLists.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,8 @@ target_sources(db
3838
snapshot/backup_task.cc
3939
rate_limiter.cc
4040
per_partition_rate_limit_options.cc
41-
row_cache.cc)
41+
row_cache.cc,
42+
tablet_options.cc)
4243
target_include_directories(db
4344
PUBLIC
4445
${CMAKE_SOURCE_DIR})

db/schema_tables.cc

+39-7
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,11 @@ schema_ptr scylla_tables(schema_features features) {
336336
// In this case, for non-system tables, `version` is null and `schema::version()` will be a hash.
337337
sb.with_column("committed_by_group0", boolean_type);
338338
}
339+
340+
// It is safe to add the `tablets` column unconditionally,
341+
// since it is written to only after the cluster feature is enabled.
342+
sb.with_column("tablets", map_type_impl::get_instance(utf8_type, utf8_type, false));
343+
339344
sb.with_hash_version();
340345
s = sb.build();
341346
}
@@ -1733,6 +1738,19 @@ mutation make_scylla_tables_mutation(schema_ptr table, api::timestamp_type times
17331738
auto& cdef = *scylla_tables()->get_column_definition("partitioner");
17341739
m.set_clustered_cell(ckey, cdef, atomic_cell::make_dead(timestamp, gc_clock::now()));
17351740
}
1741+
// A table will have engaged tablet options
1742+
// only after they were set by CREATE TABLE or ALTER TABLE,
1743+
// Meaning the cluster feature is enabled, so it is safe to write
1744+
// to this column.
1745+
if (table->has_tablet_options()) {
1746+
auto& map = table->raw_tablet_options();
1747+
auto& cdef = *scylla_tables()->get_column_definition("tablets");
1748+
if (map.empty()) {
1749+
m.set_clustered_cell(ckey, cdef, atomic_cell::make_dead(timestamp, gc_clock::now()));
1750+
} else {
1751+
m.set_clustered_cell(ckey, cdef, make_map_mutation(map, cdef, timestamp));
1752+
}
1753+
}
17361754
// In-memory tables are deprecated since scylla-2024.1.0
17371755
// FIXME: delete the column when there's no live version supporting it anymore.
17381756
// Writing it here breaks upgrade rollback to versions that do not support the in_memory schema_feature
@@ -2154,6 +2172,19 @@ static void prepare_builder_from_table_row(const schema_ctxt& ctxt, schema_build
21542172
}
21552173
}
21562174

2175+
// Applies the settings stored in a scylla_tables row to `builder`:
//  - "in_memory": a missing value is treated as false; a true value logs a deprecation
//    warning. The resulting flag is always set on the builder.
//  - "tablets": when the map column has a value, it is parsed into db::tablet_options and
//    the map form of the parsed options is installed on the builder.
// `ctxt` is not used by this function.
static void prepare_builder_from_scylla_tables_row(const schema_ctxt& ctxt, schema_builder& builder, const query::result_set_row& table_row) {
    const bool in_memory = table_row.get<bool>("in_memory").value_or(false);
    if (in_memory) {
        slogger.warn("Support for in_memory tables has been deprecated.");
    }
    builder.set_in_memory(in_memory);

    auto raw_tablets = get_map<sstring, sstring>(table_row, "tablets");
    if (raw_tablets) {
        // Round-trip through db::tablet_options so the builder receives the parsed form.
        builder.set_tablet_options(db::tablet_options(*raw_tablets).to_map());
    }
}
2187+
21572188
schema_ptr create_table_from_mutations(const schema_ctxt& ctxt, schema_mutations sm, std::optional<table_schema_version> version)
21582189
{
21592190
slogger.trace("create_table_from_mutations: version={}, {}", version, sm);
@@ -2208,13 +2239,7 @@ schema_ptr create_table_from_mutations(const schema_ctxt& ctxt, schema_mutations
22082239
if (sm.scylla_tables()) {
22092240
table_rs = query::result_set(*sm.scylla_tables());
22102241
if (!table_rs.empty()) {
2211-
query::result_set_row table_row = table_rs.row(0);
2212-
auto in_mem = table_row.get<bool>("in_memory");
2213-
auto in_mem_enabled = in_mem.value_or(false);
2214-
if (in_mem_enabled) {
2215-
slogger.warn("Support for in_memory tables has been deprecated.");
2216-
}
2217-
builder.set_in_memory(in_mem_enabled);
2242+
prepare_builder_from_scylla_tables_row(ctxt, builder, table_rs.row(0));
22182243
}
22192244
}
22202245
v3_columns columns(std::move(column_defs), is_dense, is_compound);
@@ -2445,6 +2470,13 @@ view_ptr create_view_from_mutations(const schema_ctxt& ctxt, schema_mutations sm
24452470
schema_builder builder{ks_name, cf_name, id};
24462471
prepare_builder_from_table_row(ctxt, builder, row);
24472472

2473+
if (sm.scylla_tables()) {
2474+
table_rs = query::result_set(*sm.scylla_tables());
2475+
if (!table_rs.empty()) {
2476+
prepare_builder_from_scylla_tables_row(ctxt, builder, table_rs.row(0));
2477+
}
2478+
}
2479+
24482480
auto computed_columns = get_computed_columns(sm);
24492481
auto column_defs = create_columns_from_column_rows(ctxt, query::result_set(sm.columns_mutation()), ks_name, cf_name, false, column_view_virtual::no, computed_columns);
24502482
for (auto&& cdef : column_defs) {

db/tablet_options.cc

+96
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
/*
2+
* Copyright 2025-present ScyllaDB
3+
*/
4+
/*
5+
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
6+
*/
7+
8+
#include <cstdlib>
9+
10+
#include "exceptions/exceptions.hh"
11+
#include "db/tablet_options.hh"
12+
#include "utils/log.hh"
13+
14+
extern logging::logger dblog;
15+
16+
namespace db {
17+
18+
// Builds the options from their string-map form.
// Each key is resolved with from_string(), which throws for an unknown name. The value is
// parsed with std::atol (integer options) or std::atof (min_per_shard_tablet_count) and is
// stored only when the parsed number is non-zero; a value that parses to zero leaves the
// option unset.
tablet_options::tablet_options(const map_type& map) {
    for (const auto& entry : map) {
        const auto& text = entry.second;
        switch (tablet_options::from_string(entry.first)) {
        case tablet_option_type::min_tablet_count: {
            const auto parsed = std::atol(text.c_str());
            if (parsed != 0) {
                min_tablet_count.emplace(parsed);
            }
            break;
        }
        case tablet_option_type::min_per_shard_tablet_count: {
            const auto parsed = std::atof(text.c_str());
            if (parsed != 0) {
                min_per_shard_tablet_count.emplace(parsed);
            }
            break;
        }
        case tablet_option_type::expected_data_size_in_gb: {
            const auto parsed = std::atol(text.c_str());
            if (parsed != 0) {
                expected_data_size_in_gb.emplace(parsed);
            }
            break;
        }
        }
    }
}
39+
40+
// Returns the option name used in the `tablets` property map for `hint`.
// The names are the exact strings accepted by from_string().
sstring tablet_options::to_string(tablet_option_type hint) {
    switch (hint) {
    case tablet_option_type::min_tablet_count: return "min_tablet_count";
    case tablet_option_type::min_per_shard_tablet_count: return "min_per_shard_tablet_count";
    case tablet_option_type::expected_data_size_in_gb: return "expected_data_size_in_gb";
    }
    // Every enumerator listed above returns. Getting here means `hint` holds a value
    // outside that set; without this, control would flow off the end of a non-void
    // function, which is undefined behavior. Fail in a defined way instead.
    std::abort();
}
47+
48+
// Maps an option name to its tablet_option_type.
// Matching is exact; any other name raises exceptions::syntax_exception.
tablet_option_type tablet_options::from_string(sstring hint_desc) {
    if (hint_desc == "min_tablet_count") {
        return tablet_option_type::min_tablet_count;
    }
    if (hint_desc == "min_per_shard_tablet_count") {
        return tablet_option_type::min_per_shard_tablet_count;
    }
    if (hint_desc == "expected_data_size_in_gb") {
        return tablet_option_type::expected_data_size_in_gb;
    }
    throw exceptions::syntax_exception(fmt::format("Unknown tablet hint '{}'", hint_desc));
}
59+
60+
std::map<sstring, sstring> tablet_options::to_map() const {
61+
std::map<sstring, sstring> res;
62+
if (min_tablet_count) {
63+
res[to_string(tablet_option_type::min_tablet_count)] = fmt::to_string(*min_tablet_count);
64+
}
65+
if (min_per_shard_tablet_count) {
66+
res[to_string(tablet_option_type::min_per_shard_tablet_count)] = fmt::to_string(*min_per_shard_tablet_count);
67+
}
68+
if (expected_data_size_in_gb) {
69+
res[to_string(tablet_option_type::expected_data_size_in_gb)] = fmt::to_string(*expected_data_size_in_gb);
70+
}
71+
return res;
72+
}
73+
74+
void tablet_options::validate(const map_type& map) {
75+
for (auto& [key, value_str] : map) {
76+
switch (tablet_options::from_string(key)) {
77+
case tablet_option_type::min_tablet_count:
78+
if (auto value = std::atol(value_str.c_str()); value < 0) {
79+
throw exceptions::configuration_exception(format("Invalid value '{}' for min_tablet_count", value));
80+
}
81+
break;
82+
case tablet_option_type::min_per_shard_tablet_count:
83+
if (auto value = std::atof(value_str.c_str()); value < 0) {
84+
throw exceptions::configuration_exception(format("Invalid value '{}' for min_per_shard_tablet_count", value));
85+
}
86+
break;
87+
case tablet_option_type::expected_data_size_in_gb:
88+
if (auto value = std::atol(value_str.c_str()); value < 0) {
89+
throw exceptions::configuration_exception(format("Invalid value '{}' for expected_data_size_in_gb", value));
90+
}
91+
break;
92+
}
93+
}
94+
}
95+
96+
} // namespace db

0 commit comments

Comments
 (0)