add memberlist settings to the loki ingesters config

olegbet · olegbet · commit 901ca329e61e · 2025-11-22T18:37:32.000+02:00
Signed-off-by: obetsun <obetsun@redhat.com>

rh-pre-commit.version: 2.3.2
rh-pre-commit.check-secrets: ENABLED
diff --git a/components/vector-kubearchive-log-collector/production/kflux-ocp-p01/loki-helm-prod-values.yaml b/components/vector-kubearchive-log-collector/production/kflux-ocp-p01/loki-helm-prod-values.yaml
@@ -16,6 +16,32 @@ gateway:
 loki:
   commonConfig:
     replication_factor: 3
+  memberlist:
+    join_members: []
+    # How soon a dead node's name can be reclaimed by a member with a new address
+    # Set to 2 minutes for faster cleanup of stale members
+    # This helps remove stale ring instances quickly when pods are restarted
+    dead_node_reclaim_time: 2m
+    # How often to gossip with other nodes (lower = faster detection of failures)
+    # Set to 2s; NOTE(review): upstream default is 200ms, so confirm 2s is intended
+    gossip_interval: 2s
+    # How often to do full state sync with other nodes
+    # Reduced to 5s so nodes sync state faster
+    push_pull_interval: 5s
+    # Number of random nodes to gossip with per interval
+    # Set to 1; NOTE(review): replication_factor above is 3, confirm 1 is intended
+    gossip_nodes: 1
+    # How long to continue gossiping to dead nodes (helps propagate death info)
+    # Reduced to 10s to propagate death info faster
+    gossip_to_dead_nodes_time: 10s
+    # How long to keep ingesters that have left in the ring before removing them
+    # Presumably should exceed terminationGracePeriodSeconds (TODO confirm)
+    # Reduced to 60s for faster cleanup
+    left_ingesters_timeout: 60s
+    max_join_backoff: 1m
+    max_join_retries: 10
+    min_join_backoff: 1s
+    rejoin_interval: 90s
   # Required storage configuration for Helm chart
   storage:
     type: s3
diff --git a/components/vector-kubearchive-log-collector/production/kflux-osp-p01/loki-helm-prod-values.yaml b/components/vector-kubearchive-log-collector/production/kflux-osp-p01/loki-helm-prod-values.yaml
@@ -17,6 +17,32 @@ gateway:
 loki:
   commonConfig:
     replication_factor: 3
+  memberlist:
+    join_members: []
+    # How soon a dead node's name can be reclaimed by a member with a new address
+    # Set to 2 minutes for faster cleanup of stale members
+    # This helps remove stale ring instances quickly when pods are restarted
+    dead_node_reclaim_time: 2m
+    # How often to gossip with other nodes (lower = faster detection of failures)
+    # Set to 2s; NOTE(review): upstream default is 200ms, so confirm 2s is intended
+    gossip_interval: 2s
+    # How often to do full state sync with other nodes
+    # Reduced to 5s so nodes sync state faster
+    push_pull_interval: 5s
+    # Number of random nodes to gossip with per interval
+    # Set to 1; NOTE(review): replication_factor above is 3, confirm 1 is intended
+    gossip_nodes: 1
+    # How long to continue gossiping to dead nodes (helps propagate death info)
+    # Reduced to 10s to propagate death info faster
+    gossip_to_dead_nodes_time: 10s
+    # How long to keep ingesters that have left in the ring before removing them
+    # Presumably should exceed terminationGracePeriodSeconds (TODO confirm)
+    # Reduced to 60s for faster cleanup
+    left_ingesters_timeout: 60s
+    max_join_backoff: 1m
+    max_join_retries: 10
+    min_join_backoff: 1s
+    rejoin_interval: 90s
   # Required storage configuration for Helm chart
   storage:
     type: s3
diff --git a/components/vector-kubearchive-log-collector/production/kflux-prd-rh02/loki-helm-prod-values.yaml b/components/vector-kubearchive-log-collector/production/kflux-prd-rh02/loki-helm-prod-values.yaml
@@ -17,6 +17,32 @@ gateway:
 loki:
   commonConfig:
     replication_factor: 3
+  memberlist:
+    join_members: []
+    # How soon a dead node's name can be reclaimed by a member with a new address
+    # Set to 2 minutes for faster cleanup of stale members
+    # This helps remove stale ring instances quickly when pods are restarted
+    dead_node_reclaim_time: 2m
+    # How often to gossip with other nodes (lower = faster detection of failures)
+    # Set to 2s; NOTE(review): upstream default is 200ms, so confirm 2s is intended
+    gossip_interval: 2s
+    # How often to do full state sync with other nodes
+    # Reduced to 5s so nodes sync state faster
+    push_pull_interval: 5s
+    # Number of random nodes to gossip with per interval
+    # Set to 1; NOTE(review): replication_factor above is 3, confirm 1 is intended
+    gossip_nodes: 1
+    # How long to continue gossiping to dead nodes (helps propagate death info)
+    # Reduced to 10s to propagate death info faster
+    gossip_to_dead_nodes_time: 10s
+    # How long to keep ingesters that have left in the ring before removing them
+    # Presumably should exceed terminationGracePeriodSeconds (TODO confirm)
+    # Reduced to 60s for faster cleanup
+    left_ingesters_timeout: 60s
+    max_join_backoff: 1m
+    max_join_retries: 10
+    min_join_backoff: 1s
+    rejoin_interval: 90s
   # Required storage configuration for Helm chart
   storage:
     type: s3
diff --git a/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/loki-helm-prod-values.yaml b/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/loki-helm-prod-values.yaml
@@ -17,6 +17,32 @@ gateway:
 loki:
   commonConfig:
     replication_factor: 3
+  memberlist:
+    join_members: []
+    # How soon a dead node's name can be reclaimed by a member with a new address
+    # Set to 2 minutes for faster cleanup of stale members
+    # This helps remove stale ring instances quickly when pods are restarted
+    dead_node_reclaim_time: 2m
+    # How often to gossip with other nodes (lower = faster detection of failures)
+    # Set to 2s; NOTE(review): upstream default is 200ms, so confirm 2s is intended
+    gossip_interval: 2s
+    # How often to do full state sync with other nodes
+    # Reduced to 5s so nodes sync state faster
+    push_pull_interval: 5s
+    # Number of random nodes to gossip with per interval
+    # Set to 1; NOTE(review): replication_factor above is 3, confirm 1 is intended
+    gossip_nodes: 1
+    # How long to continue gossiping to dead nodes (helps propagate death info)
+    # Reduced to 10s to propagate death info faster
+    gossip_to_dead_nodes_time: 10s
+    # How long to keep ingesters that have left in the ring before removing them
+    # Presumably should exceed terminationGracePeriodSeconds (TODO confirm)
+    # Reduced to 60s for faster cleanup
+    left_ingesters_timeout: 60s
+    max_join_backoff: 1m
+    max_join_retries: 10
+    min_join_backoff: 1s
+    rejoin_interval: 90s
   # Required storage configuration for Helm chart
   storage:
     type: s3
diff --git a/components/vector-kubearchive-log-collector/production/pentest-p01/loki-helm-prod-values.yaml b/components/vector-kubearchive-log-collector/production/pentest-p01/loki-helm-prod-values.yaml
@@ -47,6 +47,32 @@ gateway:
 loki:
   commonConfig:
     replication_factor: 3
+  memberlist:
+    join_members: []
+    # How soon a dead node's name can be reclaimed by a member with a new address
+    # Set to 2 minutes for faster cleanup of stale members
+    # This helps remove stale ring instances quickly when pods are restarted
+    dead_node_reclaim_time: 2m
+    # How often to gossip with other nodes (lower = faster detection of failures)
+    # Set to 2s; NOTE(review): upstream default is 200ms, so confirm 2s is intended
+    gossip_interval: 2s
+    # How often to do full state sync with other nodes
+    # Reduced to 5s so nodes sync state faster
+    push_pull_interval: 5s
+    # Number of random nodes to gossip with per interval
+    # Set to 1; NOTE(review): replication_factor above is 3, confirm 1 is intended
+    gossip_nodes: 1
+    # How long to continue gossiping to dead nodes (helps propagate death info)
+    # Reduced to 10s to propagate death info faster
+    gossip_to_dead_nodes_time: 10s
+    # How long to keep ingesters that have left in the ring before removing them
+    # Presumably should exceed terminationGracePeriodSeconds (TODO confirm)
+    # Reduced to 60s for faster cleanup
+    left_ingesters_timeout: 60s
+    max_join_backoff: 1m
+    max_join_retries: 10
+    min_join_backoff: 1s
+    rejoin_interval: 90s
   # Required storage configuration for Helm chart
   storage:
     type: s3
diff --git a/components/vector-kubearchive-log-collector/production/stone-prod-p01/loki-helm-prod-values.yaml b/components/vector-kubearchive-log-collector/production/stone-prod-p01/loki-helm-prod-values.yaml
@@ -17,6 +17,32 @@ gateway:
 loki:
   commonConfig:
     replication_factor: 3
+  memberlist:
+    join_members: []
+    # How soon a dead node's name can be reclaimed by a member with a new address
+    # Set to 2 minutes for faster cleanup of stale members
+    # This helps remove stale ring instances quickly when pods are restarted
+    dead_node_reclaim_time: 2m
+    # How often to gossip with other nodes (lower = faster detection of failures)
+    # Set to 2s; NOTE(review): upstream default is 200ms, so confirm 2s is intended
+    gossip_interval: 2s
+    # How often to do full state sync with other nodes
+    # Reduced to 5s so nodes sync state faster
+    push_pull_interval: 5s
+    # Number of random nodes to gossip with per interval
+    # Set to 1; NOTE(review): replication_factor above is 3, confirm 1 is intended
+    gossip_nodes: 1
+    # How long to continue gossiping to dead nodes (helps propagate death info)
+    # Reduced to 10s to propagate death info faster
+    gossip_to_dead_nodes_time: 10s
+    # How long to keep ingesters that have left in the ring before removing them
+    # Presumably should exceed terminationGracePeriodSeconds (TODO confirm)
+    # Reduced to 60s for faster cleanup
+    left_ingesters_timeout: 60s
+    max_join_backoff: 1m
+    max_join_retries: 10
+    min_join_backoff: 1s
+    rejoin_interval: 90s
   # Required storage configuration for Helm chart
   storage:
     type: s3
diff --git a/components/vector-kubearchive-log-collector/production/stone-prod-p02/loki-helm-prod-values.yaml b/components/vector-kubearchive-log-collector/production/stone-prod-p02/loki-helm-prod-values.yaml
@@ -17,6 +17,32 @@ gateway:
 loki:
   commonConfig:
     replication_factor: 3
+  memberlist:
+    join_members: []
+    # How soon a dead node's name can be reclaimed by a member with a new address
+    # Set to 2 minutes for faster cleanup of stale members
+    # This helps remove stale ring instances quickly when pods are restarted
+    dead_node_reclaim_time: 2m
+    # How often to gossip with other nodes (lower = faster detection of failures)
+    # Set to 2s; NOTE(review): upstream default is 200ms, so confirm 2s is intended
+    gossip_interval: 2s
+    # How often to do full state sync with other nodes
+    # Reduced to 5s so nodes sync state faster
+    push_pull_interval: 5s
+    # Number of random nodes to gossip with per interval
+    # Set to 1; NOTE(review): replication_factor above is 3, confirm 1 is intended
+    gossip_nodes: 1
+    # How long to continue gossiping to dead nodes (helps propagate death info)
+    # Reduced to 10s to propagate death info faster
+    gossip_to_dead_nodes_time: 10s
+    # How long to keep ingesters that have left in the ring before removing them
+    # Presumably should exceed terminationGracePeriodSeconds (TODO confirm)
+    # Reduced to 60s for faster cleanup
+    left_ingesters_timeout: 60s
+    max_join_backoff: 1m
+    max_join_retries: 10
+    min_join_backoff: 1s
+    rejoin_interval: 90s
   # Required storage configuration for Helm chart
   storage:
     type: s3
diff --git a/components/vector-kubearchive-log-collector/staging/stone-stage-p01/loki-helm-stg-values.yaml b/components/vector-kubearchive-log-collector/staging/stone-stage-p01/loki-helm-stg-values.yaml
@@ -16,6 +16,32 @@ gateway:
 loki:
   commonConfig:
     replication_factor: 3
+  memberlist:
+    join_members: []
+    # How soon a dead node's name can be reclaimed by a member with a new address
+    # Set to 2 minutes for faster cleanup of stale members
+    # This helps remove stale ring instances quickly when pods are restarted
+    dead_node_reclaim_time: 2m
+    # How often to gossip with other nodes (lower = faster detection of failures)
+    # Set to 2s; NOTE(review): upstream default is 200ms, so confirm 2s is intended
+    gossip_interval: 2s
+    # How often to do full state sync with other nodes
+    # Reduced to 5s so nodes sync state faster
+    push_pull_interval: 5s
+    # Number of random nodes to gossip with per interval
+    # Set to 1; NOTE(review): replication_factor above is 3, confirm 1 is intended
+    gossip_nodes: 1
+    # How long to continue gossiping to dead nodes (helps propagate death info)
+    # Reduced to 10s to propagate death info faster
+    gossip_to_dead_nodes_time: 10s
+    # How long to keep ingesters that have left in the ring before removing them
+    # Presumably should exceed terminationGracePeriodSeconds (TODO confirm)
+    # Reduced to 60s for faster cleanup
+    left_ingesters_timeout: 60s
+    max_join_backoff: 1m
+    max_join_retries: 10
+    min_join_backoff: 1s
+    rejoin_interval: 90s
   storage:
     type: s3
     # bucketNames: Fill it on the generator for each cluster
diff --git a/components/vector-kubearchive-log-collector/staging/stone-stg-rh01/loki-helm-stg-values.yaml b/components/vector-kubearchive-log-collector/staging/stone-stg-rh01/loki-helm-stg-values.yaml
@@ -48,6 +48,32 @@ gateway:
 loki:
   commonConfig:
     replication_factor: 3
+  memberlist:
+    join_members: []
+    # How soon a dead node's name can be reclaimed by a member with a new address
+    # Set to 2 minutes for faster cleanup of stale members
+    # This helps remove stale ring instances quickly when pods are restarted
+    dead_node_reclaim_time: 2m
+    # How often to gossip with other nodes (lower = faster detection of failures)
+    # Set to 2s; NOTE(review): upstream default is 200ms, so confirm 2s is intended
+    gossip_interval: 2s
+    # How often to do full state sync with other nodes
+    # Reduced to 5s so nodes sync state faster
+    push_pull_interval: 5s
+    # Number of random nodes to gossip with per interval
+    # Set to 1; NOTE(review): replication_factor above is 3, confirm 1 is intended
+    gossip_nodes: 1
+    # How long to continue gossiping to dead nodes (helps propagate death info)
+    # Reduced to 10s to propagate death info faster
+    gossip_to_dead_nodes_time: 10s
+    # How long to keep ingesters that have left in the ring before removing them
+    # Presumably should exceed terminationGracePeriodSeconds (TODO confirm)
+    # Reduced to 60s for faster cleanup
+    left_ingesters_timeout: 60s
+    max_join_backoff: 1m
+    max_join_retries: 10
+    min_join_backoff: 1s
+    rejoin_interval: 90s
   storage:
     type: s3
     # bucketNames: Fill it on the generator for each cluster