-
Notifications
You must be signed in to change notification settings - Fork 120
Expand file tree
/
Copy pathdisk_io_monitor.h
More file actions
379 lines (316 loc) · 11.7 KB
/
disk_io_monitor.h
File metadata and controls
379 lines (316 loc) · 11.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
/**
* disk_io_monitor.h - Disk I/O saturation monitoring for VoIPmonitor sniffer
*
* Monitors disk write performance and detects saturation using:
* 1. Throughput comparison with calibrated knee_throughput
* 2. Write latency from /proc/diskstats (with baseline calibration)
* 3. Queue depth and utilization from /proc/diskstats
* 4. Buffer level from sniffer heap (asyncwrite)
*
* Key features:
* - Calibration profile: Stored in .disk_io_calibration.conf
* - Background calibration thread: Non-blocking startup
* - Capacity-based detection: Shows % of sustainable throughput used
* - UUID tracking: Auto-recalibrate when disk changes
*
* Output format: IO[B0.5|L1.2|U45|C75|W125|R10|WI1.2k|RI500]
* B = baseline latency (calibrated, ms)
* L = current latency (ms)
* U = disk utilization %
* C = capacity % (current throughput / knee throughput)
* W = write throughput MB/s
* R = read throughput MB/s
* WI = write IOPS (k = thousands)
* RI = read IOPS (k = thousands)
*/
#ifndef DISK_IO_MONITOR_H
#define DISK_IO_MONITOR_H
#include <string>
#include <vector>
#include <stdint.h>
#include <pthread.h>
// C++11 atomic support - fallback to volatile for older compilers
// Use DIOM_ prefix to avoid conflict with sync.h macros
//
// The macros below select one of two implementations based on __cplusplus:
//   - C++11 and later: std::atomic<T> types and their member functions.
//   - older compilers: volatile-qualified plain types plus the GCC
//     __sync_bool_compare_and_swap builtin for compare-and-swap.
//
// NOTE(review): the two DIOM_ATOMIC_CAS variants are not exact equivalents.
// std::atomic::compare_exchange_strong takes `expected` by reference (it must
// be an lvalue and is overwritten with the observed value on failure), whereas
// __sync_bool_compare_and_swap takes it by value and never modifies it.
// Callers should pass a named variable and not rely on its value afterwards.
//
// NOTE(review): in the fallback branch DIOM_ATOMIC_LOAD / DIOM_ATOMIC_LOAD_PTR
// are plain volatile reads; volatile alone does not give atomicity or
// inter-thread ordering guarantees.
#if __cplusplus >= 201103L
#include <atomic>
#define DIOM_ATOMIC_BOOL std::atomic<bool>
#define DIOM_ATOMIC_INT std::atomic<int>
#define DIOM_ATOMIC_UINT64 std::atomic<uint64_t>
// Read the current value of an atomic object / of an atomic behind a pointer.
#define DIOM_ATOMIC_LOAD(x) (x).load()
#define DIOM_ATOMIC_LOAD_PTR(x) (x)->load()
// Evaluates to true when x held `expected` and was replaced by `desired`.
#define DIOM_ATOMIC_CAS(x, expected, desired) (x).compare_exchange_strong(expected, desired)
#else
#define DIOM_ATOMIC_BOOL volatile bool
#define DIOM_ATOMIC_INT volatile int
#define DIOM_ATOMIC_UINT64 volatile uint64_t
// Read the current value of the variable / of the variable behind a pointer.
#define DIOM_ATOMIC_LOAD(x) (x)
#define DIOM_ATOMIC_LOAD_PTR(x) (*(x))
// Evaluates to true when x held `expected` and was replaced by `desired`.
#define DIOM_ATOMIC_CAS(x, expected, desired) __sync_bool_compare_and_swap(&(x), expected, desired)
#endif
// String helpers for pre-C++11 compatibility
/**
 * Return the last character of s (pre-C++11 stand-in for std::string::back()).
 *
 * @param s String to inspect.
 * @return Last character of s, or '\0' when s is empty.
 */
inline char str_back(const std::string &s) {
	// Guard the empty case: s.length() - 1 would wrap around to npos and
	// index far out of bounds.
	return s.empty() ? '\0' : s[s.length() - 1];
}
/**
 * Drop the last character of s (pre-C++11 stand-in for std::string::pop_back()).
 * An empty string is left unchanged.
 *
 * @param s String to shorten in place.
 */
inline void str_pop_back(std::string &s) {
	if (s.empty()) {
		return;
	}
	s.resize(s.size() - 1);
}
/**
 * Return the first character of s (pre-C++11 stand-in for std::string::front()).
 *
 * @param s String to inspect.
 * @return First character of s; the terminating '\0' when s is empty.
 */
inline char str_front(const std::string &s) {
	return *s.c_str();
}
// Name of the file the calibration profile is stored in. Only the file name
// is fixed here; the directory it is placed in is not specified in this header.
#define CALIBRATION_FILENAME ".disk_io_calibration.conf"
/**
 * Calibration profile loaded from file.
 *
 * A default-constructed or cleared profile has empty strings, zeroed numeric
 * fields and every flag (including `valid`) set to false.
 */
struct sCalibrationProfile {
	std::string uuid;
	std::string device;
	std::string filesystem;
	std::string spool_path;
	time_t calibration_time;

	// --- Latency (ms) ---
	double baseline_latency_ms;    // at minimal load
	double knee_latency_ms;        // at the knee point
	double saturation_latency_ms;  // when saturated

	// --- Throughput (MB/s) ---
	double knee_throughput_mbs;    // where latency starts increasing
	double max_throughput_mbs;     // maximum measured

	// --- IOPS ---
	double baseline_iops;          // at minimal load (single writer)
	double knee_iops;              // at the knee point
	double max_iops;               // maximum measured

	// --- External I/O load detection (v2026.01.4) ---
	bool calibrated_under_load;    // calibration ran with external I/O activity
	double pre_calibration_util;   // I/O utilization measured before calibration
	bool needs_recalibration;      // request automatic recalibration when idle

	bool valid;

	/** Construct an empty, invalid profile. */
	sCalibrationProfile() {
		clear();
	}

	/** Reset every field to its empty / zero / false state. */
	void clear() {
		// Text fields
		uuid.clear();
		device.clear();
		filesystem.clear();
		spool_path.clear();

		// Numeric fields
		calibration_time = 0;
		baseline_latency_ms = knee_latency_ms = saturation_latency_ms = 0;
		knee_throughput_mbs = max_throughput_mbs = 0;
		baseline_iops = knee_iops = max_iops = 0;
		pre_calibration_util = 0;

		// Flags
		calibrated_under_load = needs_recalibration = valid = false;
	}
};
/**
 * Raw disk statistics from /proc/diskstats.
 *
 * All counters start at zero after construction or clear().
 */
struct sDiskStats {
	uint64_t reads_completed;
	uint64_t writes_completed;
	uint64_t read_time_ms;
	uint64_t write_time_ms;
	uint64_t io_in_progress;
	uint64_t io_time_ms;           // time spent doing I/O (for utilization)
	uint64_t weighted_io_time_ms;  // weighted I/O time (for queue depth)
	uint64_t sectors_read;
	uint64_t sectors_written;
	uint64_t timestamp_ms;

	/** Construct a sample with every counter zeroed. */
	sDiskStats() {
		clear();
	}

	/** Zero every counter and the timestamp. */
	void clear() {
		reads_completed = 0;
		writes_completed = 0;
		read_time_ms = 0;
		write_time_ms = 0;
		io_in_progress = 0;
		io_time_ms = 0;
		weighted_io_time_ms = 0;
		sectors_read = 0;
		sectors_written = 0;
		timestamp_ms = 0;
	}
};
/**
 * Saturation state reported alongside the computed I/O metrics.
 */
enum eSaturationState {
	STATE_OK             = 0,  // everything normal
	STATE_CALIBRATING    = 1,  // background calibration in progress
	STATE_WARNING        = 2,  // approaching capacity limits
	STATE_DISK_SATURATED = 3   // disk cannot keep up
};
/**
 * Calibration state machine (for external I/O load detection).
 */
enum eCalibrationState {
	CALIB_STATE_NONE       = 0,  // no calibration performed yet
	CALIB_STATE_WAITING    = 1,  // waiting for idle disk
	CALIB_STATE_RUNNING    = 2,  // calibration in progress
	CALIB_STATE_DONE       = 3,  // calibration completed (clean)
	CALIB_STATE_DONE_DIRTY = 4   // completed under load, needs recalibration
};
/**
 * Computed I/O metrics.
 *
 * After construction or clear() every numeric field is zero,
 * `buffer_growing` is false and `state` is STATE_CALIBRATING.
 */
struct sIOMetrics {
	// --- Throughput ---
	double write_throughput_mbs;  // current write throughput MB/s
	double read_throughput_mbs;   // current read throughput MB/s
	double capacity_pct;          // throughput as % of knee_throughput (0-100+)
	double reserve_pct;           // headroom before knee (100 - capacity_pct, min 0)

	// --- IOPS ---
	double write_iops;            // current write IOPS
	double read_iops;             // current read IOPS

	// --- Latency (from /proc/diskstats) ---
	double write_latency_ms;      // current write latency
	double latency_ratio;         // current / baseline
	double baseline_latency_ms;   // calibrated baseline latency

	// --- Other disk metrics ---
	double queue_depth;           // average I/O queue depth
	double utilization_pct;       // % of time disk was busy (0-100)

	// --- Buffer (from sniffer) ---
	double buffer_level_pct;      // current asyncwrite buffer fill %
	bool buffer_growing;          // is buffer level increasing?

	// --- Status ---
	eSaturationState state;

	/** Construct a zeroed metrics record in the STATE_CALIBRATING state. */
	sIOMetrics() {
		clear();
	}

	/** Zero all measurements and set the state back to STATE_CALIBRATING. */
	void clear() {
		write_throughput_mbs = 0;
		read_throughput_mbs = 0;
		capacity_pct = 0;
		reserve_pct = 0;
		write_iops = 0;
		read_iops = 0;
		write_latency_ms = 0;
		latency_ratio = 0;
		baseline_latency_ms = 0;
		queue_depth = 0;
		utilization_pct = 0;
		buffer_level_pct = 0;
		buffer_growing = false;
		state = STATE_CALIBRATING;
	}

	/**
	 * Short textual label for `state`.
	 *
	 * @return "DISK_SAT", "WARN" or "calibrating" for the matching states;
	 *         an empty string for any other state (including STATE_OK).
	 */
	const char* getStateString() const {
		if (state == STATE_DISK_SATURATED) {
			return "DISK_SAT";
		}
		if (state == STATE_WARNING) {
			return "WARN";
		}
		if (state == STATE_CALIBRATING) {
			return "calibrating";
		}
		return "";
	}
};
/**
* Disk I/O Monitor class
*/
class cDiskIOMonitor {
public:
cDiskIOMonitor();
~cDiskIOMonitor();
/**
* Initialize monitoring for the given spool directory.
* Loads calibration profile or starts background calibration.
*
* @param spool_path Path to spool directory
* @param allow_calibration If false, skip calibration even if profile missing
* (use for read-from-file mode, sender mode, etc.)
* @return true if successful
*/
bool init(const char *spool_path, bool allow_calibration = true);
/**
* Update all metrics. Call periodically (~10 seconds).
*
* @param buffer_level_pct Current asyncwrite buffer level (0-100%)
*/
void update(double buffer_level_pct);
/**
* Get current metrics.
*/
sIOMetrics getMetrics() const { return metrics_; }
/**
* Format status string for syslog output.
* Format: IO[85%|L1.2ms×1.1|U45] or IO[97%|L8.5ms×7.5|U100] DISK_SAT
*/
std::string formatStatusString() const;
/**
* Check if monitoring is active (calibration complete).
*/
bool isActive() const { return active_ && !calibrating_; }
/**
* Check if calibration is in progress.
*/
bool isCalibrating() const { return calibrating_; }
/**
* Get calibration progress (0-100%).
*/
int getCalibrationProgress() const { return calibration_progress_; }
/**
* Get detected device name.
*/
std::string getDeviceName() const { return device_name_; }
/**
* Get calibration profile.
*/
sCalibrationProfile getProfile() const { return profile_; }
/**
* Force recalibration.
*/
void forceRecalibrate();
/**
* Get calibration state (for status line).
*/
eCalibrationState getCalibrationState() const { return calibration_state_; }
private:
/**
* Measure external I/O load over 3 seconds.
* @return utilization percentage (0-100)
*/
double measureExternalLoad();
/**
* Check and trigger auto-recalibration if conditions met.
* Called from update().
*/
void checkAutoRecalibration();
// Device detection
std::string detectBlockDevice(const char *path);
std::string getFilesystemUUID(const char *path);
// Calibration profile I/O
bool loadCalibrationProfile();
bool saveCalibrationProfile();
// Background calibration
static void* calibrationThreadFunc(void *arg);
void runCalibration();
// Disk stats
bool readDiskStats(sDiskStats &stats);
void calculateMetrics();
void detectSaturation();
// Utilities
static uint64_t getTimestampMs();
static std::string execCommand(const char *cmd);
private:
// Configuration
std::string spool_path_;
std::string device_name_;
std::string filesystem_uuid_;
// State
bool active_;
DIOM_ATOMIC_BOOL init_started_; // Guard against concurrent init() calls
DIOM_ATOMIC_BOOL calibrating_;
DIOM_ATOMIC_INT calibration_progress_;
// Calibration profile
sCalibrationProfile profile_;
// Calibration thread
pthread_t calibration_thread_;
bool calibration_thread_started_;
// Disk stats for delta calculation
sDiskStats prev_stats_;
sDiskStats curr_stats_;
bool first_sample_;
// Buffer tracking
double prev_buffer_level_;
int buffer_growing_count_; // Consecutive samples with growing buffer
// Computed metrics
sIOMetrics metrics_;
// Thresholds for DISK_SAT detection
static const double CAPACITY_WARNING_PCT; // Show WARN at 80%
static const double CAPACITY_CRITICAL_PCT; // Show DISK_SAT at 95%
static const double LATENCY_CRITICAL_RATIO; // Latency 3× baseline
static const double BUFFER_GROW_THRESHOLD; // % buffer increase
static const int BUFFER_GROW_SAMPLES = 3; // Consecutive samples (int OK inline)
// External I/O load detection thresholds (v2026.01.4)
static const double LOAD_IDLE_THRESHOLD; // <20% = idle, calibrate immediately
static const double LOAD_HEAVY_THRESHOLD; // >50% = heavy, wait for idle
static const int LOAD_MODERATE_WAIT_SEC; // Wait time for moderate load (60s)
static const double AUTO_RECAL_IDLE_THRESHOLD; // <10% for auto-recalibration
static const int AUTO_RECAL_IDLE_TIME_SEC; // 10 minutes idle before auto-recal
// Calibration state tracking
eCalibrationState calibration_state_;
uint64_t idle_time_sec_;
uint64_t last_update_time_ms_;
};
// Global instance. Declared here only; the definition lives in a translation
// unit outside this header.
extern cDiskIOMonitor diskIOMonitor;
#endif // DISK_IO_MONITOR_H