@@ -28,6 +28,17 @@ patches:
     target:
       kind: SecurityContextConstraints
       name: kubearchive-logging-scc
+  # Patch all Loki PodDisruptionBudgets to allow eviction of unhealthy pods
+  # This is critical for StatefulSets where stale pods should be evicted
+  - patch: |
+      - op: add
+        path: /spec/unhealthyPodEvictionPolicy
+        value: AlwaysAllow
+    target:
+      group: policy
+      version: v1
+      kind: PodDisruptionBudget
+      labelSelector: app.kubernetes.io/name=loki
 
 generators:
   - vector-helm-generator.yaml
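For reference: after kustomize applies this JSON 6902 patch, any PodDisruptionBudget carrying the app.kubernetes.io/name=loki label comes out looking roughly like the sketch below (the name and maxUnavailable value are illustrative chart output, not part of this change). AlwaysAllow matters because a PDB normally blocks eviction of pods that are not ready, which is exactly the state a stale StatefulSet pod is stuck in.

```yaml
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: loki-ingester                 # illustrative; every PDB matching the labelSelector is patched
  labels:
    app.kubernetes.io/name: loki
spec:
  maxUnavailable: 1                   # chart-provided value, unchanged by the patch
  unhealthyPodEvictionPolicy: AlwaysAllow   # added by the patch above
  selector:
    matchLabels:
      app.kubernetes.io/name: loki
```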

@@ -32,29 +32,51 @@ loki:
     replication_factor: 1
   memberlist:
     join_members: []
-    dead_node_reclaim_time: 0s
+    # How long to wait before reclaiming a dead node's tokens
+    # Set to 2 minutes for development (faster cleanup with a single replica)
+    # This helps remove stale ring instances quickly when pods are restarted
+    dead_node_reclaim_time: 2m
+    # How often to gossip with other nodes (lower = faster detection of failures)
+    # Keep at 2s for quick failure detection
     gossip_interval: 2s
-    push_pull_interval: 10s
-    gossip_nodes: 2
-    gossip_to_dead_nodes_time: 15s
-    left_ingesters_timeout: 30s
+    # How often to do a full state sync with other nodes
+    # Reduced for development to sync faster
+    push_pull_interval: 5s
+    # Number of random nodes to gossip with per interval
+    # Set to 1 for development (only 1 ingester replica)
+    gossip_nodes: 1
+    # How long to keep gossiping to dead nodes (helps propagate death info)
+    # Reduced for development to propagate death info faster
+    gossip_to_dead_nodes_time: 10s
+    # How long to wait for an ingester to leave gracefully before considering it dead
+    # Keep this shorter than terminationGracePeriodSeconds so the pod can leave the ring before it is killed
+    # Set to 60s for development (faster cleanup)
+    left_ingesters_timeout: 60s
+    max_join_backoff: 1m
+    max_join_retries: 10
+    min_join_backoff: 1s
+    rejoin_interval: 90s
   storage:
-    bucketNames:
-      chunks: loki-data
-      admin: loki-data
-    s3:
-      endpoint: http://minio:9000
-      region: us-east-1
-      s3ForcePathStyle: true
-      insecure: true
+    # bucketNames:
+    #   chunks: loki-data
+    #   admin: loki-data
+    # s3:
+    #   endpoint: http://minio:9000
+    #   region: us-east-1
+    #   s3ForcePathStyle: true
+    #   insecure: true
+    type: filesystem
   storage_config:
-    aws:
-      s3: s3://loki-chunks
-      s3forcepathstyle: true
-      bucketnames: loki-data
-      region: us-east-1
-      endpoint: minio:9000
-      insecure: true
+    # aws:
+    #   s3: s3://loki-chunks
+    #   s3forcepathstyle: true
+    #   bucketnames: loki-data
+    #   region: us-east-1
+    #   endpoint: minio:9000
+    #   insecure: true
+    # Filesystem storage configuration
+    filesystem:
+      directory: /var/loki/chunks
   limits_config:
     retention_period: 24h # Reduced from 744h for development
     ingestion_rate_mb: 5 # Reduced from 10 for development
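Assuming the chart passes these values through unchanged, the filesystem backend above should render into Loki's runtime config roughly as follows (a sketch; the rules_directory line is an assumed chart default, not set in this diff):

```yaml
common:
  storage:
    filesystem:
      chunks_directory: /var/loki/chunks   # matches storage_config.filesystem.directory
      rules_directory: /var/loki/rules     # assumption: usual chart companion default
storage_config:
  filesystem:
    directory: /var/loki/chunks
```

Because everything lands on the pod's own volume, this only makes sense with replication_factor: 1, which is what this file sets.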
@@ -80,6 +102,7 @@ loki:
     log_stream_creation: false
     log_duplicate_stream_info: true
   ingester:
+    autoforget_unhealthy: true
     chunk_encoding: snappy
     chunk_target_size: 3145728
     chunk_idle_period: 5m
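autoforget_unhealthy is the config-file form of Loki's -ingester.autoforget-unhealthy flag: the ingester drops ring members whose heartbeat has been stale longer than the ring heartbeat timeout, instead of leaving them to fail writes. A sketch of where it sits in the rendered Loki config, with the timeout it keys off (the lifecycler values shown are assumed Loki defaults, not set in this diff):

```yaml
ingester:
  autoforget_unhealthy: true    # forget members whose heartbeat exceeds the ring heartbeat timeout
  lifecycler:
    ring:
      kvstore:
        store: memberlist       # assumption: default ring KV store in this setup
      heartbeat_timeout: 1m     # assumption: Loki default; the staleness threshold autoforget acts on
```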
@@ -123,6 +146,28 @@ ingester:
   podAntiAffinity:
     soft: {}
     hard: {}
+  # AWS credentials for S3/MinIO access
+  # These match the MinIO credentials from loki-helm-minio-values.yaml
+  extraEnv:
+    - name: AWS_ACCESS_KEY_ID
+      value: "loki"
+    - name: AWS_SECRET_ACCESS_KEY
+      value: "supersecret"
+    - name: AWS_DEFAULT_REGION
+      value: "us-east-1"
+  # Graceful shutdown configuration to prevent stale ring instances
+  # Give Loki time to flush chunks and leave the ring gracefully
+  # Set to 90s: longer than left_ingesters_timeout (60s) while still allowing quick cleanup
+  terminationGracePeriodSeconds: 90
+  lifecycle:
+    preStop:
+      exec:
+        # Sleep so the readiness probe fails and the pod is removed from service endpoints,
+        # giving the distributor time to stop sending new requests before shutdown
+        command:
+          - /bin/sh
+          - -c
+          - sleep 10
 
 querier:
   replicas: 1
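The three knobs above form one ordered shutdown window: the 10s preStop sleep drains traffic, Loki then receives SIGTERM and must flush and leave the ring before left_ingesters_timeout (60s) declares it dead, and the kubelet only sends SIGKILL after the full 90s grace period. A sketch of the rendered pod spec fields involved (container name illustrative):

```yaml
spec:
  terminationGracePeriodSeconds: 90   # total budget before SIGKILL
  containers:
    - name: ingester                  # illustrative
      lifecycle:
        preStop:
          exec:
            # readiness fails during the sleep, so endpoints drop the pod
            # before SIGTERM starts the flush-and-leave sequence
            command: ["/bin/sh", "-c", "sleep 10"]
```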
@@ -198,40 +243,40 @@ chunksCache:
   replicas: 1
   batchSize: 256 # Batch size for sending/receiving chunks from cache
   parallelism: 10 # Parallel threads for cache operations
-  maxItemMemory: 30 # MB - Increased from 10MB to handle chunks (3MB target + compression overhead + metadata)
+  maxItemMemory: 10 # MB
   defaultValidity: 12h # How long cached chunks are stored
 
 resultsCache:
   enabled: true
   replicas: 1
-  maxItemMemory: 100 # MB - Increased from 10MB to handle large query results (can be much larger than chunks)
+  maxItemMemory: 10 # MB
   defaultValidity: 12h # How long cached query results are stored
 
 memcached:
   enabled: true
-  maxItemMemory: 30 # MB - Shared default for general memcached instances
+  maxItemMemory: 10 # MB - Shared default for general memcached instances
 
 # Shared memcached configuration (used as defaults for all memcached instances)
 # These don't deploy separate instances - they configure shared settings
 memcachedResults:
   enabled: true
-  maxItemMemory: 100 # MB - For query result caching
+  maxItemMemory: 10 # MB - For query result caching
 
 memcachedChunks:
   enabled: true
-  maxItemMemory: 30 # MB - For chunk caching
+  maxItemMemory: 10 # MB - For chunk caching
 
 memcachedFrontend:
   enabled: true
-  maxItemMemory: 100 # MB - Frontend cache can handle large query results
+  maxItemMemory: 10 # MB - Frontend cache can handle large query results
 
 memcachedIndexQueries:
   enabled: true
-  maxItemMemory: 50 # MB - Index queries can be large
+  maxItemMemory: 10 # MB - Index queries can be large
 
 memcachedIndexWrites:
   enabled: true
-  maxItemMemory: 30 # MB - Index write operations
+  maxItemMemory: 10 # MB - Index write operations
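Reverting every item ceiling to 10 MB still clears the ingester settings above: chunk_target_size (3145728 bytes) targets roughly 3 MiB per compressed chunk, so a cached chunk plus encoding overhead fits a 10 MB memcached item with room to spare. The pair of values side by side (both taken from this diff):

```yaml
loki:
  ingester:
    chunk_target_size: 3145728   # ~3 MiB compressed target per chunk
chunksCache:
  maxItemMemory: 10              # MB per cached item; comfortably above the chunk target
```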



@@ -17,12 +17,14 @@ loki:
   commonConfig:
     path_prefix: /var/loki # This directory will be writable via volume mount
   storage:
-    type: s3
+    # type: s3
+    type: filesystem
   schemaConfig:
     configs:
       - from: "2024-04-01"
         store: tsdb
-        object_store: s3
+        # object_store: s3
+        object_store: filesystem
         schema: v13
         index:
           prefix: loki_index_
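object_store in the schema must agree with the storage type above, and the tsdb store requires a 24h index period; a sketch of the complete rendered schema entry (the period line is inferred from that requirement, since the hunk is cut off after prefix):

```yaml
schema_config:
  configs:
    - from: "2024-04-01"
      store: tsdb
      object_store: filesystem   # must match loki.storage.type
      schema: v13
      index:
        prefix: loki_index_
        period: 24h              # assumption: 24h is mandatory for tsdb schemas
```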

@@ -20,3 +20,14 @@ patches:
     target:
       kind: SecurityContextConstraints
       name: kubearchive-logging-scc
+  # Patch all Loki PodDisruptionBudgets to allow eviction of unhealthy pods
+  # This is critical for StatefulSets where stale pods should be evicted
+  - patch: |
+      - op: add
+        path: /spec/unhealthyPodEvictionPolicy
+        value: AlwaysAllow
+    target:
+      group: policy
+      version: v1
+      kind: PodDisruptionBudget
+      labelSelector: app.kubernetes.io/name=loki

@@ -16,6 +16,32 @@ gateway:
 loki:
   commonConfig:
     replication_factor: 3
+  memberlist:
+    join_members: []
+    # How long to wait before reclaiming a dead node's tokens
+    # Set to 2 minutes for faster cleanup
+    # This helps remove stale ring instances quickly when pods are restarted
+    dead_node_reclaim_time: 2m
+    # How often to gossip with other nodes (lower = faster detection of failures)
+    # Keep at 2s for quick failure detection
+    gossip_interval: 2s
+    # How often to do a full state sync with other nodes
+    # Reduced from the default to sync faster
+    push_pull_interval: 5s
+    # Number of random nodes to gossip with per interval
+    # Kept low to limit gossip fan-out
+    gossip_nodes: 1
+    # How long to keep gossiping to dead nodes (helps propagate death info)
+    # Kept short to propagate death info faster
+    gossip_to_dead_nodes_time: 10s
+    # How long to wait for an ingester to leave gracefully before considering it dead
+    # Keep this shorter than the ingester's terminationGracePeriodSeconds so a graceful exit can finish
+    # Set to 60s for faster cleanup
+    left_ingesters_timeout: 60s
+    max_join_backoff: 1m
+    max_join_retries: 10
+    min_join_backoff: 1s
+    rejoin_interval: 90s
   # Required storage configuration for Helm chart
   storage:
     type: s3
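Read together, these values give a rough timeline for handling a crashed ring member (a back-of-envelope reading of the settings, not measured behavior):

```yaml
memberlist:
  gossip_interval: 2s              # peers exchange state every 2s, so a crash surfaces within seconds
  gossip_to_dead_nodes_time: 10s   # the death keeps being re-announced for 10s so all peers hear it
  left_ingesters_timeout: 60s      # an ingester that never said goodbye is declared dead after 60s
  dead_node_reclaim_time: 2m       # two minutes later its name and tokens can be reclaimed
```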
@@ -54,6 +80,7 @@ loki:
     log_stream_creation: false
     log_duplicate_stream_info: true
   ingester:
+    autoforget_unhealthy: true
     chunk_target_size: 8388608 # 8MB
     chunk_idle_period: 5m
     max_chunk_age: 2h

@@ -17,6 +17,32 @@ gateway:
 loki:
   commonConfig:
     replication_factor: 3
+  memberlist:
+    join_members: []
+    # How long to wait before reclaiming a dead node's tokens
+    # Set to 2 minutes for faster cleanup
+    # This helps remove stale ring instances quickly when pods are restarted
+    dead_node_reclaim_time: 2m
+    # How often to gossip with other nodes (lower = faster detection of failures)
+    # Keep at 2s for quick failure detection
+    gossip_interval: 2s
+    # How often to do a full state sync with other nodes
+    # Reduced from the default to sync faster
+    push_pull_interval: 5s
+    # Number of random nodes to gossip with per interval
+    # Kept low to limit gossip fan-out
+    gossip_nodes: 1
+    # How long to keep gossiping to dead nodes (helps propagate death info)
+    # Kept short to propagate death info faster
+    gossip_to_dead_nodes_time: 10s
+    # How long to wait for an ingester to leave gracefully before considering it dead
+    # Keep this shorter than the ingester's terminationGracePeriodSeconds so a graceful exit can finish
+    # Set to 60s for faster cleanup
+    left_ingesters_timeout: 60s
+    max_join_backoff: 1m
+    max_join_retries: 10
+    min_join_backoff: 1s
+    rejoin_interval: 90s
   # Required storage configuration for Helm chart
   storage:
     type: s3
@@ -55,6 +81,7 @@ loki:
     log_stream_creation: false
     log_duplicate_stream_info: true
   ingester:
+    autoforget_unhealthy: true
    chunk_target_size: 8388608 # 8MB
     chunk_idle_period: 5m
     max_chunk_age: 2h

@@ -17,6 +17,32 @@ gateway:
 loki:
   commonConfig:
     replication_factor: 3
+  memberlist:
+    join_members: []
+    # How long to wait before reclaiming a dead node's tokens
+    # Set to 2 minutes for faster cleanup
+    # This helps remove stale ring instances quickly when pods are restarted
+    dead_node_reclaim_time: 2m
+    # How often to gossip with other nodes (lower = faster detection of failures)
+    # Keep at 2s for quick failure detection
+    gossip_interval: 2s
+    # How often to do a full state sync with other nodes
+    # Reduced from the default to sync faster
+    push_pull_interval: 5s
+    # Number of random nodes to gossip with per interval
+    # Kept low to limit gossip fan-out
+    gossip_nodes: 1
+    # How long to keep gossiping to dead nodes (helps propagate death info)
+    # Kept short to propagate death info faster
+    gossip_to_dead_nodes_time: 10s
+    # How long to wait for an ingester to leave gracefully before considering it dead
+    # Keep this shorter than the ingester's terminationGracePeriodSeconds so a graceful exit can finish
+    # Set to 60s for faster cleanup
+    left_ingesters_timeout: 60s
+    max_join_backoff: 1m
+    max_join_retries: 10
+    min_join_backoff: 1s
+    rejoin_interval: 90s
   # Required storage configuration for Helm chart
   storage:
     type: s3
@@ -55,6 +81,7 @@ loki:
     log_stream_creation: false
     log_duplicate_stream_info: true
   ingester:
+    autoforget_unhealthy: true
     chunk_target_size: 8388608 # 8MB
     chunk_idle_period: 5m
     max_chunk_age: 2h

@@ -17,6 +17,32 @@ gateway:
 loki:
   commonConfig:
     replication_factor: 3
+  memberlist:
+    join_members: []
+    # How long to wait before reclaiming a dead node's tokens
+    # Set to 2 minutes for faster cleanup
+    # This helps remove stale ring instances quickly when pods are restarted
+    dead_node_reclaim_time: 2m
+    # How often to gossip with other nodes (lower = faster detection of failures)
+    # Keep at 2s for quick failure detection
+    gossip_interval: 2s
+    # How often to do a full state sync with other nodes
+    # Reduced from the default to sync faster
+    push_pull_interval: 5s
+    # Number of random nodes to gossip with per interval
+    # Kept low to limit gossip fan-out
+    gossip_nodes: 1
+    # How long to keep gossiping to dead nodes (helps propagate death info)
+    # Kept short to propagate death info faster
+    gossip_to_dead_nodes_time: 10s
+    # How long to wait for an ingester to leave gracefully before considering it dead
+    # Keep this shorter than the ingester's terminationGracePeriodSeconds so a graceful exit can finish
+    # Set to 60s for faster cleanup
+    left_ingesters_timeout: 60s
+    max_join_backoff: 1m
+    max_join_retries: 10
+    min_join_backoff: 1s
+    rejoin_interval: 90s
   # Required storage configuration for Helm chart
   storage:
     type: s3
@@ -55,6 +81,7 @@ loki:
     log_stream_creation: false
     log_duplicate_stream_info: true
   ingester:
+    autoforget_unhealthy: true
     chunk_target_size: 8388608 # 8MB
     chunk_idle_period: 5m
     max_chunk_age: 2h

@@ -17,3 +17,16 @@ resources:
 generators:
   - vector-helm-generator.yaml
   - loki-helm-generator.yaml
+
+patches:
+  # Patch all Loki PodDisruptionBudgets to allow eviction of unhealthy pods
+  # This is critical for StatefulSets where stale pods should be evicted
+  - patch: |
+      - op: add
+        path: /spec/unhealthyPodEvictionPolicy
+        value: AlwaysAllow
+    target:
+      group: policy
+      version: v1
+      kind: PodDisruptionBudget
+      labelSelector: app.kubernetes.io/name=loki