Skip to content

Commit 46d06b4

Browse files
author
liuminjian
committed
Feature: When the cluster capacity is almost full, make the cluster read only
Signed-off-by: liuminjian <[email protected]>
1 parent b184f47 commit 46d06b4

31 files changed

+365
-102
lines changed

conf/chunkserver.conf

+7
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,8 @@ copyset.sync_chunk_limits=2097152
128128
copyset.sync_threshold=65536
129129
# check syncing interval
130130
copyset.check_syncing_interval_ms=500
131+
# interval (in ms) to wait before retrying when disk space is insufficient
132+
copyset.wait_for_disk_freed_interval_ms=60000
131133

132134
#
133135
# Clone settings
@@ -215,6 +217,11 @@ chunkfilepool.allocate_percent=80
215217
chunkfilepool.chunk_file_pool_size=1GB
216218
# The thread num for format chunks
217219
chunkfilepool.thread_num=1
220+
# Disk usage threshold (percent): when the chunkserver's disk usage exceeds it, the heartbeat reports the disk as full
221+
chunkfilepool.disk_usage_percent_limit=95
222+
# Number of chunks to keep in reserve. When the number of available chunks drops to this value or below,
223+
# write operations return read-only to the client, so that chunkfilepool and walfilepool can still obtain chunks.
224+
chunkfilepool.chunk_reserved=100
218225

219226
#
220227
# WAL file pool

proto/chunk.proto

+1
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ enum CHUNK_OP_STATUS {
8585
CHUNK_OP_STATUS_BACKWARD = 10; // the requested version is behind the chunk's current version
8686
CHUNK_OP_STATUS_CHUNK_EXIST = 11; // the chunk already exists
8787
CHUNK_OP_STATUS_EPOCH_TOO_OLD = 12; // request epoch too old
88+
CHUNK_OP_STATUS_READONLY = 13; // If there is insufficient disk space, set the chunkserver to read-only
8889
};
8990

9091
message ChunkResponse {

proto/heartbeat.proto

+6-1
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,13 @@ message CopysetStatistics {
7171
required uint32 writeIOPS = 4;
7272
}
7373

74+
// Disk error categories; used as the type of DiskState.errType.
enum ErrorType {
75+
// No disk error.
NORMAL = 0;
76+
// NOTE(review): presumably reported once disk usage exceeds the configured
// limit -- confirm against the heartbeat sender.
DISKFULL = 1;
77+
}
78+
7479
message DiskState {
75-
required uint32 errType = 1;
80+
required ErrorType errType = 1;
7681
required string errMsg = 2;
7782
}
7883

proto/topology.proto

+1
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ enum ChunkServerStatus {
4848
// Disk state values used by the topology.
enum DiskState {
4949
DISKNORMAL = 0;
5050
DISKERROR = 1;
51+
// NOTE(review): presumably mirrors heartbeat ErrorType.DISKFULL (disk almost
// full) -- confirm where this state is set.
DISKFULL = 2;
5152
}
5253

5354
enum OnlineState {

src/chunkserver/chunkserver.cpp

+9
Original file line numberDiff line numberDiff line change
@@ -710,6 +710,12 @@ void ChunkServer::InitCopysetNodeOptions(
710710
LOG_IF(FATAL, !conf->GetUInt32Value("copyset.sync_trigger_seconds",
711711
&copysetNodeOptions->syncTriggerSeconds));
712712
}
713+
LOG_IF(FATAL, !conf->GetUInt32Value(
714+
"copyset.wait_for_disk_freed_interval_ms",
715+
&copysetNodeOptions->waitForDiskFreedIntervalMs));
716+
// The reserved-chunk count is defined as chunkfilepool.chunk_reserved in
// conf/chunkserver.conf, so read that key; a value that cannot be read
// triggers the FATAL log below.
LOG_IF(FATAL, !conf->GetUInt32Value(
717+
"chunkfilepool.chunk_reserved",
718+
&copysetNodeOptions->chunkReserved));
713719
}
714720

715721
void ChunkServer::InitCopyerOptions(
@@ -781,6 +787,9 @@ void ChunkServer::InitHeartbeatOptions(
781787
&heartbeatOptions->intervalSec));
782788
LOG_IF(FATAL, !conf->GetUInt32Value("mds.heartbeat_timeout",
783789
&heartbeatOptions->timeout));
790+
LOG_IF(FATAL, !conf->GetUInt32Value(
791+
"chunkfilepool.disk_usage_percent_limit",
792+
&heartbeatOptions->chunkserverDiskLimit));
784793
}
785794

786795
void ChunkServer::InitRegisterOptions(

src/chunkserver/config_info.h

+4
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,10 @@ struct CopysetNodeOptions {
140140
uint64_t syncThreshold = 64 * 1024;
141141
// check syncing interval
142142
uint32_t checkSyncingIntervalMs = 500u;
143+
// wait for retry time when disk space is insufficient
144+
uint32_t waitForDiskFreedIntervalMs = 60000;
145+
// reserve part of the chunk number
146+
uint32_t chunkReserved = 100;
143147

144148
CopysetNodeOptions();
145149
};

src/chunkserver/copyset_node.cpp

+7
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,9 @@ int CopysetNode::Init(const CopysetNodeOptions &options) {
135135
dsOptions.locationLimit = options.locationLimit;
136136
dsOptions.enableOdsyncWhenOpenChunkFile =
137137
options.enableOdsyncWhenOpenChunkFile;
138+
dsOptions.waitForDiskFreedIntervalMs =
139+
options.waitForDiskFreedIntervalMs;
140+
dsOptions.chunkReserved = options.chunkReserved;
138141
dataStore_ = std::make_shared<CSDataStore>(options.localFileSystem,
139142
options.chunkFilePool,
140143
dsOptions);
@@ -345,6 +348,10 @@ void CopysetNode::WaitSnapshotDone() {
345348
}
346349
}
347350

351+
// Reports whether this copyset should be treated as read-only: returns true
// exactly when the data store reports that it does not have enough chunks.
bool CopysetNode::ReadOnly() const {
352+
return !dataStore_->EnoughChunk();
353+
}
354+
348355
void CopysetNode::save_snapshot_background(::braft::SnapshotWriter *writer,
349356
::braft::Closure *done) {
350357
brpc::ClosureGuard doneGuard(done);

src/chunkserver/copyset_node.h

+2
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,8 @@ class CopysetNode : public braft::StateMachine,
469469

470470
void WaitSnapshotDone();
471471

472+
bool ReadOnly() const;
473+
472474
private:
473475
inline std::string GroupId() {
474476
return ToGroupId(logicPoolId_, copysetId_);

src/chunkserver/datastore/chunkserver_chunkfile.cpp

+7-3
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
* File Created: Thursday, 6th September 2018 10:49:53 am
2020
* Author: yangyaokai
2121
*/
22+
#include <errno.h>
2223
#include <fcntl.h>
2324
#include <algorithm>
2425
#include <memory>
@@ -207,7 +208,8 @@ CSErrorCode CSChunkFile::Open(bool createFile) {
207208
if (rc != 0 && rc != -EEXIST) {
208209
LOG(ERROR) << "Error occured when create file."
209210
<< " filepath = " << chunkFilePath;
210-
return CSErrorCode::InternalError;
211+
return rc == -ENOSPC ? CSErrorCode::NoSpaceError :
212+
CSErrorCode::InternalError;
211213
}
212214
}
213215
int rc = -1;
@@ -400,7 +402,8 @@ CSErrorCode CSChunkFile::Write(SequenceNum sn,
400402
<< "ChunkID: " << chunkId_
401403
<< ",request sn: " << sn
402404
<< ",chunk sn: " << metaPage_.sn;
403-
return CSErrorCode::InternalError;
405+
return rc == -ENOSPC ? CSErrorCode::NoSpaceError :
406+
CSErrorCode::InternalError;
404407
}
405408
// If it is a clone chunk, the bitmap will be updated
406409
CSErrorCode errorCode = flush();
@@ -478,7 +481,8 @@ CSErrorCode CSChunkFile::Paste(const char * buf, off_t offset, size_t length) {
478481
<< "ChunkID: " << chunkId_
479482
<< ", offset: " << offset
480483
<< ", length: " << length;
481-
return CSErrorCode::InternalError;
484+
return rc == -ENOSPC ? CSErrorCode::NoSpaceError :
485+
CSErrorCode::InternalError;
482486
}
483487
}
484488

src/chunkserver/datastore/chunkserver_datastore.cpp

+3-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,9 @@ CSDataStore::CSDataStore(std::shared_ptr<LocalFileSystem> lfs,
4444
baseDir_(options.baseDir),
4545
chunkFilePool_(chunkFilePool),
4646
lfs_(lfs),
47-
enableOdsyncWhenOpenChunkFile_(options.enableOdsyncWhenOpenChunkFile) {
47+
enableOdsyncWhenOpenChunkFile_(options.enableOdsyncWhenOpenChunkFile),
48+
waitForDiskFreedIntervalMs_(options.waitForDiskFreedIntervalMs),
49+
chunkReserved_(options.chunkReserved) {
4850
CHECK(!baseDir_.empty()) << "Create datastore failed";
4951
CHECK(lfs_ != nullptr) << "Create datastore failed";
5052
CHECK(chunkFilePool_ != nullptr) << "Create datastore failed";

src/chunkserver/datastore/chunkserver_datastore.h

+14
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ struct DataStoreOptions {
6363
PageSizeType metaPageSize;
6464
uint32_t locationLimit;
6565
bool enableOdsyncWhenOpenChunkFile;
66+
uint32_t waitForDiskFreedIntervalMs;
67+
uint32_t chunkReserved;
6668
};
6769

6870
/**
@@ -336,6 +338,14 @@ class CSDataStore {
336338
metaCache_.SetSyncChunkLimits(limit, threshold);
337339
}
338340

341+
// Sleeps the calling bthread for the configured retry interval
// (waitForDiskFreedIntervalMs_, in milliseconds).
void WaitForDiskFreed() {
342+
// bthread_usleep() takes microseconds, so convert from milliseconds;
// the 64-bit multiply avoids overflow for large configured intervals.
bthread_usleep(waitForDiskFreedIntervalMs_ * 1000ULL);
343+
}
344+
345+
// Returns true while chunkFilePool_->Size() is strictly greater than the
// reserved chunk count (chunkReserved_); false once it is equal or lower.
// NOTE(review): Size() is presumably the number of chunks still available in
// the pool -- confirm against the file pool implementation.
bool EnoughChunk() {
346+
return chunkFilePool_->Size() > chunkReserved_;
347+
}
348+
339349
private:
340350
CSErrorCode loadChunkFile(ChunkID id);
341351
CSErrorCode CreateChunkFile(const ChunkOptions & ops,
@@ -362,6 +372,10 @@ class CSDataStore {
362372
DataStoreMetricPtr metric_;
363373
// enable O_DSYNC When Open ChunkFile
364374
bool enableOdsyncWhenOpenChunkFile_;
375+
// wait for retry time when disk space is insufficient
376+
uint32_t waitForDiskFreedIntervalMs_;
377+
// reserve part of the chunk number
378+
uint32_t chunkReserved_;
365379
};
366380

367381
} // namespace chunkserver

src/chunkserver/datastore/chunkserver_snapshot.cpp

+5-2
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
* Author: yangyaokai
2121
*/
2222

23+
#include <errno.h>
2324
#include <memory>
2425
#include "src/chunkserver/datastore/chunkserver_datastore.h"
2526
#include "src/chunkserver/datastore/chunkserver_snapshot.h"
@@ -154,7 +155,8 @@ CSErrorCode CSSnapshot::Open(bool createFile) {
154155
if (ret != 0) {
155156
LOG(ERROR) << "Error occured when create snapshot."
156157
<< " filepath = " << snapshotPath;
157-
return CSErrorCode::InternalError;
158+
return ret == -ENOSPC ? CSErrorCode::NoSpaceError :
159+
CSErrorCode::InternalError;
158160
}
159161
}
160162
int rc = lfs_->Open(snapshotPath, O_RDWR|O_NOATIME|O_DSYNC);
@@ -216,7 +218,8 @@ CSErrorCode CSSnapshot::Write(const char * buf, off_t offset, size_t length) {
216218
LOG(ERROR) << "Write snapshot failed."
217219
<< "ChunkID: " << chunkId_
218220
<< ",snapshot sn: " << metaPage_.sn;
219-
return CSErrorCode::InternalError;
221+
return rc == -ENOSPC ? CSErrorCode::NoSpaceError :
222+
CSErrorCode::InternalError;
220223
}
221224
uint32_t pageBeginIndex = offset / blockSize_;
222225
uint32_t pageEndIndex = (offset + length - 1) / blockSize_;

src/chunkserver/datastore/define.h

+2
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ enum CSErrorCode {
7373
// The page has not been written, it will appear when the page that has not
7474
// been written is read when the clone chunk is read
7575
PageNerverWrittenError = 13,
76+
// ENOSPC error
77+
NoSpaceError = 14,
7678
};
7779

7880
// Chunk details

0 commit comments

Comments
 (0)