add latency_flatten_window option

sni · sni · commit 91a9f702d1fe · 2023-03-04T21:20:16.000+01:00
Signed-off-by: Sven Nierlein <sven@nierlein.de>
diff --git a/Changes b/Changes
@@ -2,6 +2,7 @@ This file documents the revision history for mod_gearman.
 
 next:
           - reopen logfile after core rotation
+          - add latency_flatten_window option
 
 5.0.2 Sun Feb  5 19:05:17 CET 2023
           - fix crash when using multiple result worker in neb module
diff --git a/README.asciidoc b/README.asciidoc
@@ -788,6 +788,17 @@ Default is no.
 ====
 
 
+latency_flatten_window::
+Postpones the next check of a host/service if the latency of its last check
+was one second or more. The value is the maximum delay in seconds that is applied.
+Set to 0 or a negative value to disable rescheduling.
+Default is 30.
++
+====
+    latency_flatten_window=30
+====
+
+
 
 
 Worker Options
diff --git a/common/utils.c b/common/utils.c
@@ -304,6 +304,7 @@ int set_default_options(mod_gm_opt_t *opt) {
     opt->has_starttime      = FALSE;
     opt->has_finishtime     = FALSE;
     opt->has_latency        = FALSE;
+    opt->latency_flatten_window = 30;
     opt->active             = GM_DISABLED;
 
     opt->restrict_command_characters = gm_strdup("$&();<>`\"'|");
@@ -561,6 +562,12 @@ int parse_args_line(mod_gm_opt_t *opt, char * arg, int recursion_level) {
         return(GM_OK);
     }
 
+    /* latency_flatten_window */
+    else if ( !strcmp( key, "latency_flatten_window" ) ) {
+        opt->latency_flatten_window = atoi(value);
+        return(GM_OK);
+    }
+
     /* enable_embedded_perl */
     else if ( !strcmp( key, "enable_embedded_perl" ) ) {
 #ifdef EMBEDDEDPERL
@@ -1096,6 +1103,11 @@ void dumpconfig(mod_gm_opt_t *opt, int mode) {
             gm_log( GM_LOG_DEBUG, "result_worker:                   %d\n", opt->result_workers);
         gm_log( GM_LOG_DEBUG, "do_hostchecks:                   %s\n", opt->do_hostchecks == GM_ENABLED ? "yes" : "no");
         gm_log( GM_LOG_DEBUG, "route_eventhandler_like_checks:  %s\n", opt->route_eventhandler_like_checks == GM_ENABLED ? "yes" : "no");
+        if(opt->latency_flatten_window > 0) {
+            gm_log( GM_LOG_DEBUG, "latency_flatten_window:          %d\n", opt->latency_flatten_window);
+        } else {
+            gm_log( GM_LOG_DEBUG, "latency_flatten_window:          disabled\n");
+        }
     }
     if(mode == GM_NEB_MODE || mode == GM_SEND_GEARMAN_MODE) {
         gm_log( GM_LOG_DEBUG, "result_queue:                    %s\n", opt->result_queue);
diff --git a/include/common.h b/include/common.h
@@ -223,6 +223,7 @@ typedef struct mod_gm_opt_struct {
     int            orphan_host_checks;                      /**< generate fake result for orphaned host checks */
     int            orphan_service_checks;                   /**< generate fake result for orphaned service checks */
     int            accept_clear_results;                    /**< accept unencrypted results */
+    int            latency_flatten_window;                  /**< postpone high latency checks */
 /* worker */
     char         * identifier;                              /**< identifier for this worker */
     char         * pidfile;                                 /**< path to a pidfile */
diff --git a/include/result_thread.h b/include/result_thread.h
@@ -34,6 +34,7 @@
 void *result_worker(void *);
 int set_worker( gearman_worker_st **worker );
 void *get_results( gearman_job_st *, void *, size_t *, gearman_return_t * );
+void reschedule_high_latency(check_result * chk_result);
 
 /**
  * @}
diff --git a/neb_module_naemon/mod_gearman.c b/neb_module_naemon/mod_gearman.c
@@ -291,14 +291,18 @@ static void move_results_to_core(struct nm_event_execution_properties *evprop) {
     }
 
     gettimeofday(&tval_before, NULL);
-    gm_log( GM_LOG_DEBUG, "move_results_to_core()\n" );
+    gm_log( GM_LOG_TRACE3, "move_results_to_core()\n" );
+    schedule_event(1, move_results_to_core, NULL);
 
     /* safely move result list aside */
     pthread_mutex_lock(&mod_gm_result_list_mutex);
     tmp_list = mod_gm_result_list;
     mod_gm_result_list = NULL;
     pthread_mutex_unlock(&mod_gm_result_list_mutex);
 
+    if(tmp_list == NULL)
+        return;
+
     /* process result list */
     while(tmp_list) {
         cur = tmp_list;
@@ -316,7 +320,6 @@ static void move_results_to_core(struct nm_event_execution_properties *evprop) {
     timersub(&tval_after, &tval_before, &tval_result);
 
     gm_log( GM_LOG_DEBUG, "move_results_to_core processed %d results in %ld.%06lds\n", count, (long int)tval_result.tv_sec, (long int)tval_result.tv_usec );
-    schedule_event(1, move_results_to_core, NULL);
 }
 
 /* add list to gearman result list */
@@ -856,7 +859,12 @@ static int handle_host_check( int event_type, void *data ) {
     if(hostdata->latency < 0)
         hostdata->latency = 0;
 
-    gm_log(GM_LOG_DEBUG, "received job for queue %s: %s, check_options: %d    latency so far: %.3fs\n", target_queue, hostdata->host_name, check_options, hostdata->latency);
+    gm_log(GM_LOG_DEBUG, "received job for queue %s: %s, check_options: %d    latency so far: %.3fs\n",
+            target_queue,
+            hostdata->host_name,
+            check_options,
+            hostdata->latency
+    );
 
     /* as we have to intercept host checks so early
      * (we cannot cancel checks otherwise)
@@ -1002,7 +1010,13 @@ static int handle_svc_check( int event_type, void *data ) {
     if(svcdata->latency < 0)
         svcdata->latency = 0;
 
-    gm_log(GM_LOG_DEBUG, "received job for queue %s: %s - %s, check_options: %d   latency so far: %.3fs\n", target_queue, svcdata->host_name, svcdata->service_description, check_options, svcdata->latency);
+    gm_log(GM_LOG_DEBUG, "received job for queue %s: %s - %s, check_options: %d   latency so far: %.3fs\n",
+        target_queue,
+        svcdata->host_name,
+        svcdata->service_description,
+        check_options,
+        svcdata->latency
+    );
 
     /* as we have to intercept service checks so early
      * (we cannot cancel checks otherwise)
diff --git a/neb_module_naemon/result_thread.c b/neb_module_naemon/result_thread.c
@@ -309,15 +309,13 @@ void *get_results( gearman_job_st *job, __attribute__((__unused__)) void *contex
     if ( chk_result->service_description != NULL ) {
         gm_log( GM_LOG_DEBUG, "service job completed: %s %s: exit %d, latency: %0.3f, exec_time: %0.3f\n", chk_result->host_name, chk_result->service_description, chk_result->return_code, chk_result->latency, exec_time );
     } else {
-        if(active_check) {
-            host * hst = find_host( chk_result->host_name );
-            if(hst != NULL) {
-                hst->is_executing = FALSE;
-            }
-        }
         gm_log( GM_LOG_DEBUG, "host job completed: %s: exit %d, latency: %0.3f, exec_time: %0.3f\n", chk_result->host_name, chk_result->return_code, chk_result->latency, exec_time );
     }
 
+    /* reschedule next check if latency is too high, to flatten the curve */
+    if(active_check)
+        reschedule_high_latency(chk_result);
+
     /* add result to result list */
     mod_gm_add_result_to_list( chk_result );
 
@@ -359,3 +357,33 @@ int set_worker( gearman_worker_st **worker ) {
 
     return GM_OK;
 }
+
+/* postpone the next scheduled check of the host/service behind chk_result if its latency is too high, to flatten the curve */
+void reschedule_high_latency(check_result * chk_result) { /* NOTE(review): invoked from get_results(); confirm the schedule_*_check calls are safe from that context */
+    if(chk_result->latency < 1) /* latency below one second: leave the schedule untouched */
+        return;
+
+    if(mod_gm_opt->latency_flatten_window <= 0) /* zero or negative window disables rescheduling */
+        return;
+
+    time_t current_time = time(NULL);
+    int delay_max = (int)(chk_result->latency); /* whole seconds of latency; the fraction is truncated by the cast */
+    if(delay_max > mod_gm_opt->latency_flatten_window)
+        delay_max = mod_gm_opt->latency_flatten_window; /* never exceed the configured window */
+    int delay = ranged_urand(1, delay_max); /* presumably a random value in the range 1..delay_max - TODO confirm ranged_urand() bounds */
+    if(delay < 1)
+        delay = 1; // never postpone by less than one second
+    if(chk_result->service_description != NULL) { /* a service description is set: this is a service result */
+        service * svc = find_service(chk_result->host_name, chk_result->service_description);
+        if(svc != NULL && svc->check_interval != 0.0 && svc->next_check > current_time) { /* only known services with a non-zero interval whose next check lies in the future */
+            schedule_service_check(svc, svc->next_check + delay, CHECK_OPTION_ALLOW_POSTPONE);
+            gm_log( GM_LOG_DEBUG, "delayed service %s - %s by %d seconds (latency: %.3fs)\n", chk_result->host_name, chk_result->service_description, delay, chk_result->latency);
+        }
+    } else { /* no service description: this is a host result */
+        host * hst = find_host( chk_result->host_name );
+        if(hst != NULL && hst->check_interval != 0.0 && hst->next_check > current_time) { /* same conditions as for services */
+            schedule_host_check(hst, hst->next_check + delay, CHECK_OPTION_ALLOW_POSTPONE);
+            gm_log( GM_LOG_DEBUG, "delayed host %s by %d seconds (latency: %.3fs)\n", chk_result->host_name, delay, chk_result->latency);
+        }
+    }
+}