Skip to content

Commit 91a9f70

Browse files
committed
add latency_flatten_window option
Signed-off-by: Sven Nierlein <sven@nierlein.de>
1 parent 3c7833b commit 91a9f70

7 files changed

Lines changed: 78 additions & 10 deletions

File tree

Changes

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ This file documents the revision history for mod_gearman.
22

33
next:
44
- reopen logfile after core rotation
5+
- add latency_flatten_window option
56

67
5.0.2 Sun Feb 5 19:05:17 CET 2023
78
- fix crash when using multiple result worker in neb module

README.asciidoc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -788,6 +788,17 @@ Default is no.
788788
====
789789
790790
791+
latency_flatten_window::
792+
When enabled, reschedules host/service checks if their latency is more than
793+
one second. This value is the maximum delay in seconds applied to hosts/services.
794+
Set to 0 or less than 0 to disable rescheduling.
795+
Default is 30.
796+
+
797+
====
798+
latency_flatten_window=30
799+
====
800+
801+
791802
792803
793804
Worker Options

common/utils.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,7 @@ int set_default_options(mod_gm_opt_t *opt) {
304304
opt->has_starttime = FALSE;
305305
opt->has_finishtime = FALSE;
306306
opt->has_latency = FALSE;
307+
opt->latency_flatten_window = 30;
307308
opt->active = GM_DISABLED;
308309

309310
opt->restrict_command_characters = gm_strdup("$&();<>`\"'|");
@@ -561,6 +562,12 @@ int parse_args_line(mod_gm_opt_t *opt, char * arg, int recursion_level) {
561562
return(GM_OK);
562563
}
563564

565+
/* latency_flatten_window */
566+
else if ( !strcmp( key, "latency_flatten_window" ) ) {
567+
opt->latency_flatten_window = atoi(value);
568+
return(GM_OK);
569+
}
570+
564571
/* enable_embedded_perl */
565572
else if ( !strcmp( key, "enable_embedded_perl" ) ) {
566573
#ifdef EMBEDDEDPERL
@@ -1096,6 +1103,11 @@ void dumpconfig(mod_gm_opt_t *opt, int mode) {
10961103
gm_log( GM_LOG_DEBUG, "result_worker: %d\n", opt->result_workers);
10971104
gm_log( GM_LOG_DEBUG, "do_hostchecks: %s\n", opt->do_hostchecks == GM_ENABLED ? "yes" : "no");
10981105
gm_log( GM_LOG_DEBUG, "route_eventhandler_like_checks: %s\n", opt->route_eventhandler_like_checks == GM_ENABLED ? "yes" : "no");
1106+
if(opt->latency_flatten_window > 0) {
1107+
gm_log( GM_LOG_DEBUG, "latency_flatten_window: %d\n", opt->latency_flatten_window);
1108+
} else {
1109+
gm_log( GM_LOG_DEBUG, "latency_flatten_window: disabled\n");
1110+
}
10991111
}
11001112
if(mode == GM_NEB_MODE || mode == GM_SEND_GEARMAN_MODE) {
11011113
gm_log( GM_LOG_DEBUG, "result_queue: %s\n", opt->result_queue);

include/common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ typedef struct mod_gm_opt_struct {
223223
int orphan_host_checks; /**< generate fake result for orphaned host checks */
224224
int orphan_service_checks; /**< generate fake result for orphaned service checks */
225225
int accept_clear_results; /**< accept unencrypted results */
226+
int latency_flatten_window; /**< postpone high latency checks */
226227
/* worker */
227228
char * identifier; /**< identifier for this worker */
228229
char * pidfile; /**< path to a pidfile */

include/result_thread.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
void *result_worker(void *);
3535
int set_worker( gearman_worker_st **worker );
3636
void *get_results( gearman_job_st *, void *, size_t *, gearman_return_t * );
37+
void reschedule_high_latency(check_result * chk_result);
3738

3839
/**
3940
* @}

neb_module_naemon/mod_gearman.c

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -291,14 +291,18 @@ static void move_results_to_core(struct nm_event_execution_properties *evprop) {
291291
}
292292

293293
gettimeofday(&tval_before, NULL);
294-
gm_log( GM_LOG_DEBUG, "move_results_to_core()\n" );
294+
gm_log( GM_LOG_TRACE3, "move_results_to_core()\n" );
295+
schedule_event(1, move_results_to_core, NULL);
295296

296297
/* safely move result list aside */
297298
pthread_mutex_lock(&mod_gm_result_list_mutex);
298299
tmp_list = mod_gm_result_list;
299300
mod_gm_result_list = NULL;
300301
pthread_mutex_unlock(&mod_gm_result_list_mutex);
301302

303+
if(tmp_list == NULL)
304+
return;
305+
302306
/* process result list */
303307
while(tmp_list) {
304308
cur = tmp_list;
@@ -316,7 +320,6 @@ static void move_results_to_core(struct nm_event_execution_properties *evprop) {
316320
timersub(&tval_after, &tval_before, &tval_result);
317321

318322
gm_log( GM_LOG_DEBUG, "move_results_to_core processed %d results in %ld.%06lds\n", count, (long int)tval_result.tv_sec, (long int)tval_result.tv_usec );
319-
schedule_event(1, move_results_to_core, NULL);
320323
}
321324

322325
/* add list to gearman result list */
@@ -856,7 +859,12 @@ static int handle_host_check( int event_type, void *data ) {
856859
if(hostdata->latency < 0)
857860
hostdata->latency = 0;
858861

859-
gm_log(GM_LOG_DEBUG, "received job for queue %s: %s, check_options: %d latency so far: %.3fs\n", target_queue, hostdata->host_name, check_options, hostdata->latency);
862+
gm_log(GM_LOG_DEBUG, "received job for queue %s: %s, check_options: %d latency so far: %.3fs\n",
863+
target_queue,
864+
hostdata->host_name,
865+
check_options,
866+
hostdata->latency
867+
);
860868

861869
/* as we have to intercept host checks so early
862870
* (we cannot cancel checks otherwise)
@@ -1002,7 +1010,13 @@ static int handle_svc_check( int event_type, void *data ) {
10021010
if(svcdata->latency < 0)
10031011
svcdata->latency = 0;
10041012

1005-
gm_log(GM_LOG_DEBUG, "received job for queue %s: %s - %s, check_options: %d latency so far: %.3fs\n", target_queue, svcdata->host_name, svcdata->service_description, check_options, svcdata->latency);
1013+
gm_log(GM_LOG_DEBUG, "received job for queue %s: %s - %s, check_options: %d latency so far: %.3fs\n",
1014+
target_queue,
1015+
svcdata->host_name,
1016+
svcdata->service_description,
1017+
check_options,
1018+
svcdata->latency
1019+
);
10061020

10071021
/* as we have to intercept service checks so early
10081022
* (we cannot cancel checks otherwise)

neb_module_naemon/result_thread.c

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -309,15 +309,13 @@ void *get_results( gearman_job_st *job, __attribute__((__unused__)) void *contex
309309
if ( chk_result->service_description != NULL ) {
310310
gm_log( GM_LOG_DEBUG, "service job completed: %s %s: exit %d, latency: %0.3f, exec_time: %0.3f\n", chk_result->host_name, chk_result->service_description, chk_result->return_code, chk_result->latency, exec_time );
311311
} else {
312-
if(active_check) {
313-
host * hst = find_host( chk_result->host_name );
314-
if(hst != NULL) {
315-
hst->is_executing = FALSE;
316-
}
317-
}
318312
gm_log( GM_LOG_DEBUG, "host job completed: %s: exit %d, latency: %0.3f, exec_time: %0.3f\n", chk_result->host_name, chk_result->return_code, chk_result->latency, exec_time );
319313
}
320314

315+
/* reschedule next check if latency is to high to flatten curve */
316+
if(active_check)
317+
reschedule_high_latency(chk_result);
318+
321319
/* add result to result list */
322320
mod_gm_add_result_to_list( chk_result );
323321

@@ -359,3 +357,33 @@ int set_worker( gearman_worker_st **worker ) {
359357

360358
return GM_OK;
361359
}
360+
361+
/* reschedule next check if latency is to high to flatten curve */
362+
void reschedule_high_latency(check_result * chk_result) {
363+
if(chk_result->latency < 1)
364+
return;
365+
366+
if(mod_gm_opt->latency_flatten_window <= 0)
367+
return;
368+
369+
time_t current_time = time(NULL);
370+
int delay_max = (int)(chk_result->latency);
371+
if(delay_max > mod_gm_opt->latency_flatten_window)
372+
delay_max = mod_gm_opt->latency_flatten_window;
373+
int delay = ranged_urand(1, delay_max);
374+
if(delay < 1)
375+
delay = 1; // minimum to 1 second
376+
if(chk_result->service_description != NULL) {
377+
service * svc = find_service(chk_result->host_name, chk_result->service_description);
378+
if(svc != NULL && svc->check_interval != 0.0 && svc->next_check > current_time) {
379+
schedule_service_check(svc, svc->next_check + delay, CHECK_OPTION_ALLOW_POSTPONE);
380+
gm_log( GM_LOG_DEBUG, "delayed service %s - %s by %d seconds (latency: %.3fs)\n", chk_result->host_name, chk_result->service_description, delay, chk_result->latency);
381+
}
382+
} else {
383+
host * hst = find_host( chk_result->host_name );
384+
if(hst != NULL && hst->check_interval != 0.0 && hst->next_check > current_time) {
385+
schedule_host_check(hst, hst->next_check + delay, CHECK_OPTION_ALLOW_POSTPONE);
386+
gm_log( GM_LOG_DEBUG, "delayed host %s by %d seconds (latency: %.3fs)\n", chk_result->host_name, delay, chk_result->latency);
387+
}
388+
}
389+
}

0 commit comments

Comments
 (0)